From 7793b3de753f481fa975f33140e20ae78c22aabb Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:16:31 -0800
Subject: [PATCH 01/24] remove entire tensor submodule and modify cmake
 accordingly

---
 dpctl/CMakeLists.txt                          |    1 -
 dpctl/__init__.py                             |    1 -
 dpctl/apis/include/dpctl4pybind11.hpp         |  524 +-
 dpctl/apis/include/dpctl_capi.h               |    3 -
 dpctl/tensor/CMakeLists.txt                   |  316 --
 dpctl/tensor/__init__.pxd                     |   24 -
 dpctl/tensor/__init__.py                      |  428 --
 dpctl/tensor/_accumulation.py                 |  454 --
 dpctl/tensor/_array_api.py                    |  242 -
 dpctl/tensor/_clip.py                         |  763 ---
 dpctl/tensor/_constants.py                    |   24 -
 dpctl/tensor/_copy_utils.py                   | 1147 -----
 dpctl/tensor/_ctors.py                        | 1959 --------
 dpctl/tensor/_data_types.py                   |   96 -
 dpctl/tensor/_device.py                       |  189 -
 dpctl/tensor/_dldevice_conversions.py         |   39 -
 dpctl/tensor/_dlpack.pxd                      |   61 -
 dpctl/tensor/_dlpack.pyx                      | 1233 -----
 dpctl/tensor/_elementwise_common.py           |  990 ----
 dpctl/tensor/_elementwise_funcs.py            | 2268 ---------
 dpctl/tensor/_flags.pyx                       |  163 -
 dpctl/tensor/_indexing_functions.py           |  625 ---
 dpctl/tensor/_linear_algebra_functions.py     | 1003 ----
 dpctl/tensor/_manipulation_functions.py       | 1070 -----
 dpctl/tensor/_numpy_helper.py                 |   32 -
 dpctl/tensor/_print.py                        |  491 --
 dpctl/tensor/_reduction.py                    |  818 ----
 dpctl/tensor/_reshape.py                      |  194 -
 dpctl/tensor/_scalar_utils.py                 |  111 -
 dpctl/tensor/_search_functions.py             |  403 --
 dpctl/tensor/_searchsorted.py                 |  157 -
 dpctl/tensor/_set_functions.py                |  781 ---
 dpctl/tensor/_slicing.pxi                     |  371 --
 dpctl/tensor/_sorting.py                      |  434 --
 dpctl/tensor/_statistical_functions.py        |  367 --
 dpctl/tensor/_stride_utils.pxi                |  302 --
 dpctl/tensor/_testing.py                      |  152 -
 dpctl/tensor/_type_utils.py                   |  981 ----
 dpctl/tensor/_types.pxi                       |  157 -
 dpctl/tensor/_usmarray.pxd                    |   76 -
 dpctl/tensor/_usmarray.pyx                    | 1967 --------
 dpctl/tensor/_utility_functions.py            |  491 --
 dpctl/tensor/include/dlpack/.clang-format     |    2 -
 .../tensor/include/dlpack/LICENSE.third-party |  201 -
 dpctl/tensor/include/dlpack/README.md         |    7 -
 dpctl/tensor/include/dlpack/dlpack.h          |  639 ---
 .../include/kernels/accumulators.hpp          | 1410 ------
 .../libtensor/include/kernels/alignment.hpp   |   45 -
 .../kernels/boolean_advanced_indexing.hpp     |  851 ----
 .../tensor/libtensor/include/kernels/clip.hpp |  349 --
 .../include/kernels/constructors.hpp          |  560 ---
 .../include/kernels/copy_and_cast.hpp         | 1262 -----
 .../include/kernels/copy_as_contiguous.hpp    |  639 ---
 .../include/kernels/dpctl_tensor_types.hpp    |   37 -
 .../kernels/elementwise_functions/abs.hpp     |  232 -
 .../kernels/elementwise_functions/acos.hpp    |  264 -
 .../kernels/elementwise_functions/acosh.hpp   |  293 --
 .../kernels/elementwise_functions/add.hpp     |  674 ---
 .../kernels/elementwise_functions/angle.hpp   |  207 -
 .../kernels/elementwise_functions/asin.hpp    |  285 --
 .../kernels/elementwise_functions/asinh.hpp   |  268 --
 .../kernels/elementwise_functions/atan.hpp    |  278 --
 .../kernels/elementwise_functions/atan2.hpp   |  226 -
 .../kernels/elementwise_functions/atanh.hpp   |  269 --
 .../elementwise_functions/bitwise_and.hpp     |  461 --
 .../elementwise_functions/bitwise_invert.hpp  |  228 -
 .../bitwise_left_shift.hpp                    |  482 --
 .../elementwise_functions/bitwise_or.hpp      |  457 --
 .../bitwise_right_shift.hpp                   |  488 --
 .../elementwise_functions/bitwise_xor.hpp     |  463 --
 .../elementwise_functions/cabs_impl.hpp       |   75 -
 .../kernels/elementwise_functions/cbrt.hpp    |  199 -
 .../kernels/elementwise_functions/ceil.hpp    |  221 -
 .../kernels/elementwise_functions/common.hpp  | 1041 ----
 .../elementwise_functions/common_detail.hpp   |   70 -
 .../elementwise_functions/common_inplace.hpp  |  475 --
 .../kernels/elementwise_functions/conj.hpp    |  227 -
 .../elementwise_functions/copysign.hpp        |  241 -
 .../kernels/elementwise_functions/cos.hpp     |  300 --
 .../kernels/elementwise_functions/cosh.hpp    |  290 --
 .../kernels/elementwise_functions/equal.hpp   |  312 --
 .../kernels/elementwise_functions/exp.hpp     |  258 -
 .../kernels/elementwise_functions/exp2.hpp    |  262 -
 .../kernels/elementwise_functions/expm1.hpp   |  273 --
 .../kernels/elementwise_functions/floor.hpp   |  221 -
 .../elementwise_functions/floor_divide.hpp    |  542 ---
 .../kernels/elementwise_functions/greater.hpp |  315 --
 .../elementwise_functions/greater_equal.hpp   |  319 --
 .../kernels/elementwise_functions/hypot.hpp   |  242 -
 .../kernels/elementwise_functions/imag.hpp    |  225 -
 .../elementwise_functions/isfinite.hpp        |  216 -
 .../kernels/elementwise_functions/isinf.hpp   |  213 -
 .../kernels/elementwise_functions/isnan.hpp   |  211 -
 .../kernels/elementwise_functions/less.hpp    |  310 --
 .../elementwise_functions/less_equal.hpp      |  313 --
 .../kernels/elementwise_functions/log.hpp     |  214 -
 .../kernels/elementwise_functions/log10.hpp   |  234 -
 .../kernels/elementwise_functions/log1p.hpp   |  239 -
 .../kernels/elementwise_functions/log2.hpp    |  235 -
 .../elementwise_functions/logaddexp.hpp       |  261 -
 .../elementwise_functions/logical_and.hpp     |  287 --
 .../elementwise_functions/logical_not.hpp     |  191 -
 .../elementwise_functions/logical_or.hpp      |  283 --
 .../elementwise_functions/logical_xor.hpp     |  288 --
 .../kernels/elementwise_functions/maximum.hpp |  316 --
 .../kernels/elementwise_functions/minimum.hpp |  316 --
 .../elementwise_functions/multiply.hpp        |  636 ---
 .../elementwise_functions/negative.hpp        |  211 -
 .../elementwise_functions/nextafter.hpp       |  241 -
 .../elementwise_functions/not_equal.hpp       |  297 --
 .../elementwise_functions/positive.hpp        |  226 -
 .../kernels/elementwise_functions/pow.hpp     |  590 ---
 .../kernels/elementwise_functions/proj.hpp    |  230 -
 .../kernels/elementwise_functions/real.hpp    |  224 -
 .../elementwise_functions/reciprocal.hpp      |  223 -
 .../elementwise_functions/remainder.hpp       |  565 ---
 .../kernels/elementwise_functions/round.hpp   |  231 -
 .../kernels/elementwise_functions/rsqrt.hpp   |  202 -
 .../kernels/elementwise_functions/sign.hpp    |  247 -
 .../kernels/elementwise_functions/signbit.hpp |  210 -
 .../kernels/elementwise_functions/sin.hpp     |  322 --
 .../kernels/elementwise_functions/sinh.hpp    |  291 --
 .../kernels/elementwise_functions/sqrt.hpp    |  217 -
 .../kernels/elementwise_functions/square.hpp  |  242 -
 .../elementwise_functions/subtract.hpp        |  634 ---
 .../elementwise_functions/sycl_complex.hpp    |   36 -
 .../kernels/elementwise_functions/tan.hpp     |  266 --
 .../kernels/elementwise_functions/tanh.hpp    |  261 -
 .../elementwise_functions/true_divide.hpp     |  663 ---
 .../kernels/elementwise_functions/trunc.hpp   |  218 -
 .../elementwise_functions/vec_size_util.hpp   |   73 -
 .../kernels/integer_advanced_indexing.hpp     |  417 --
 .../kernels/linalg_functions/dot_product.hpp  | 1401 ------
 .../include/kernels/linalg_functions/gemm.hpp | 4228 -----------------
 .../libtensor/include/kernels/reductions.hpp  | 3322 -------------
 .../libtensor/include/kernels/repeat.hpp      |  457 --
 .../include/kernels/sorting/isin.hpp          |  239 -
 .../include/kernels/sorting/merge_sort.hpp    |  833 ----
 .../include/kernels/sorting/radix_sort.hpp    | 1897 --------
 .../kernels/sorting/search_sorted_detail.hpp  |  118 -
 .../include/kernels/sorting/searchsorted.hpp  |  256 -
 .../kernels/sorting/sort_impl_fn_ptr_t.hpp    |   55 -
 .../include/kernels/sorting/sort_utils.hpp    |  145 -
 .../include/kernels/sorting/topk.hpp          |  505 --
 .../libtensor/include/kernels/where.hpp       |  335 --
 .../include/utils/indexing_utils.hpp          |  149 -
 .../libtensor/include/utils/math_utils.hpp    |  138 -
 .../include/utils/memory_overlap.hpp          |  152 -
 .../libtensor/include/utils/offset_utils.hpp  |  781 ---
 .../include/utils/output_validation.hpp       |   78 -
 .../include/utils/rich_comparisons.hpp        |  137 -
 .../libtensor/include/utils/strided_iters.hpp |  985 ----
 .../include/utils/sycl_alloc_utils.hpp        |  220 -
 .../libtensor/include/utils/sycl_utils.hpp    |  646 ---
 .../libtensor/include/utils/type_dispatch.hpp |  133 -
 .../include/utils/type_dispatch_building.hpp  |  291 --
 .../libtensor/include/utils/type_utils.hpp    |  158 -
 .../tensor/libtensor/source/accumulators.cpp  |  401 --
 .../tensor/libtensor/source/accumulators.hpp  |   58 -
 .../accumulators/accumulate_over_axis.hpp     |  454 --
 .../accumulators/accumulators_common.cpp      |   50 -
 .../accumulators/accumulators_common.hpp      |   41 -
 .../accumulators/cumulative_logsumexp.cpp     |  343 --
 .../accumulators/cumulative_logsumexp.hpp     |   41 -
 .../source/accumulators/cumulative_prod.cpp   |  353 --
 .../source/accumulators/cumulative_prod.hpp   |   41 -
 .../source/accumulators/cumulative_sum.cpp    |  351 --
 .../source/accumulators/cumulative_sum.hpp    |   41 -
 .../source/boolean_advanced_indexing.cpp      |  850 ----
 .../source/boolean_advanced_indexing.hpp      |   75 -
 dpctl/tensor/libtensor/source/clip.cpp        |  261 -
 dpctl/tensor/libtensor/source/clip.hpp        |   52 -
 .../source/copy_and_cast_usm_to_usm.cpp       |  300 --
 .../source/copy_and_cast_usm_to_usm.hpp       |   50 -
 .../libtensor/source/copy_as_contig.cpp       |  747 ---
 .../libtensor/source/copy_as_contig.hpp       |   33 -
 .../libtensor/source/copy_for_reshape.cpp     |  177 -
 .../libtensor/source/copy_for_reshape.hpp     |   50 -
 .../tensor/libtensor/source/copy_for_roll.cpp |  393 --
 .../tensor/libtensor/source/copy_for_roll.hpp |   58 -
 .../copy_numpy_ndarray_into_usm_ndarray.cpp   |  359 --
 .../copy_numpy_ndarray_into_usm_ndarray.hpp   |   50 -
 .../source/device_support_queries.cpp         |  168 -
 .../source/device_support_queries.hpp         |   49 -
 .../source/elementwise_functions/abs.cpp      |  119 -
 .../source/elementwise_functions/abs.hpp      |   42 -
 .../source/elementwise_functions/acos.cpp     |  119 -
 .../source/elementwise_functions/acos.hpp     |   42 -
 .../source/elementwise_functions/acosh.cpp    |  121 -
 .../source/elementwise_functions/acosh.hpp    |   42 -
 .../source/elementwise_functions/add.cpp      |  237 -
 .../source/elementwise_functions/add.hpp      |   42 -
 .../source/elementwise_functions/angle.cpp    |  121 -
 .../source/elementwise_functions/angle.hpp    |   42 -
 .../source/elementwise_functions/asin.cpp     |  119 -
 .../source/elementwise_functions/asin.hpp     |   42 -
 .../source/elementwise_functions/asinh.cpp    |  121 -
 .../source/elementwise_functions/asinh.hpp    |   42 -
 .../source/elementwise_functions/atan.cpp     |  119 -
 .../source/elementwise_functions/atan.hpp     |   42 -
 .../source/elementwise_functions/atan2.cpp    |  140 -
 .../source/elementwise_functions/atan2.hpp    |   42 -
 .../source/elementwise_functions/atanh.cpp    |  121 -
 .../source/elementwise_functions/atanh.hpp    |   42 -
 .../elementwise_functions/bitwise_and.cpp     |  200 -
 .../elementwise_functions/bitwise_and.hpp     |   42 -
 .../elementwise_functions/bitwise_invert.cpp  |  123 -
 .../elementwise_functions/bitwise_invert.hpp  |   42 -
 .../bitwise_left_shift.cpp                    |  210 -
 .../bitwise_left_shift.hpp                    |   42 -
 .../elementwise_functions/bitwise_or.cpp      |  200 -
 .../elementwise_functions/bitwise_or.hpp      |   42 -
 .../bitwise_right_shift.cpp                   |  211 -
 .../bitwise_right_shift.hpp                   |   42 -
 .../elementwise_functions/bitwise_xor.cpp     |  200 -
 .../elementwise_functions/bitwise_xor.hpp     |   42 -
 .../source/elementwise_functions/cbrt.cpp     |  119 -
 .../source/elementwise_functions/cbrt.hpp     |   42 -
 .../source/elementwise_functions/ceil.cpp     |  119 -
 .../source/elementwise_functions/ceil.hpp     |   44 -
 .../source/elementwise_functions/conj.cpp     |  119 -
 .../source/elementwise_functions/conj.hpp     |   42 -
 .../source/elementwise_functions/copysign.cpp |  140 -
 .../source/elementwise_functions/copysign.hpp |   42 -
 .../source/elementwise_functions/cos.cpp      |  119 -
 .../source/elementwise_functions/cos.hpp      |   42 -
 .../source/elementwise_functions/cosh.cpp     |  119 -
 .../source/elementwise_functions/cosh.hpp     |   42 -
 .../elementwise_common.cpp                    |  187 -
 .../elementwise_common.hpp                    |   42 -
 .../elementwise_functions.hpp                 |  813 ----
 .../elementwise_functions_type_utils.cpp      |   95 -
 .../elementwise_functions_type_utils.hpp      |   56 -
 .../source/elementwise_functions/equal.cpp    |  140 -
 .../source/elementwise_functions/equal.hpp    |   42 -
 .../source/elementwise_functions/exp.cpp      |  119 -
 .../source/elementwise_functions/exp.hpp      |   42 -
 .../source/elementwise_functions/exp2.cpp     |  119 -
 .../source/elementwise_functions/exp2.hpp     |   42 -
 .../source/elementwise_functions/expm1.cpp    |  121 -
 .../source/elementwise_functions/expm1.hpp    |   42 -
 .../source/elementwise_functions/floor.cpp    |  121 -
 .../source/elementwise_functions/floor.hpp    |   42 -
 .../elementwise_functions/floor_divide.cpp    |  200 -
 .../elementwise_functions/floor_divide.hpp    |   42 -
 .../source/elementwise_functions/greater.cpp  |  140 -
 .../source/elementwise_functions/greater.hpp  |   42 -
 .../elementwise_functions/greater_equal.cpp   |  141 -
 .../elementwise_functions/greater_equal.hpp   |   42 -
 .../source/elementwise_functions/hypot.cpp    |  140 -
 .../source/elementwise_functions/hypot.hpp    |   42 -
 .../source/elementwise_functions/imag.cpp     |  119 -
 .../source/elementwise_functions/imag.hpp     |   42 -
 .../source/elementwise_functions/isfinite.cpp |  122 -
 .../source/elementwise_functions/isfinite.hpp |   42 -
 .../source/elementwise_functions/isinf.cpp    |  121 -
 .../source/elementwise_functions/isinf.hpp    |   42 -
 .../source/elementwise_functions/isnan.cpp    |  121 -
 .../source/elementwise_functions/isnan.hpp    |   42 -
 .../source/elementwise_functions/less.cpp     |  140 -
 .../source/elementwise_functions/less.hpp     |   42 -
 .../elementwise_functions/less_equal.cpp      |  140 -
 .../elementwise_functions/less_equal.hpp      |   42 -
 .../source/elementwise_functions/log.cpp      |  119 -
 .../source/elementwise_functions/log.hpp      |   42 -
 .../source/elementwise_functions/log10.cpp    |  121 -
 .../source/elementwise_functions/log10.hpp    |   42 -
 .../source/elementwise_functions/log1p.cpp    |  121 -
 .../source/elementwise_functions/log1p.hpp    |   42 -
 .../source/elementwise_functions/log2.cpp     |  119 -
 .../source/elementwise_functions/log2.hpp     |   42 -
 .../elementwise_functions/logaddexp.cpp       |  140 -
 .../elementwise_functions/logaddexp.hpp       |   42 -
 .../elementwise_functions/logical_and.cpp     |  140 -
 .../elementwise_functions/logical_and.hpp     |   42 -
 .../elementwise_functions/logical_not.cpp     |  123 -
 .../elementwise_functions/logical_not.hpp     |   42 -
 .../elementwise_functions/logical_or.cpp      |  140 -
 .../elementwise_functions/logical_or.hpp      |   42 -
 .../elementwise_functions/logical_xor.cpp     |  140 -
 .../elementwise_functions/logical_xor.hpp     |   42 -
 .../source/elementwise_functions/maximum.cpp  |  140 -
 .../source/elementwise_functions/maximum.hpp  |   42 -
 .../source/elementwise_functions/minimum.cpp  |  140 -
 .../source/elementwise_functions/minimum.hpp  |   42 -
 .../source/elementwise_functions/multiply.cpp |  238 -
 .../source/elementwise_functions/multiply.hpp |   42 -
 .../source/elementwise_functions/negative.cpp |  122 -
 .../source/elementwise_functions/negative.hpp |   42 -
 .../elementwise_functions/nextafter.cpp       |  140 -
 .../elementwise_functions/nextafter.hpp       |   42 -
 .../elementwise_functions/not_equal.cpp       |  140 -
 .../elementwise_functions/not_equal.hpp       |   42 -
 .../source/elementwise_functions/positive.cpp |  122 -
 .../source/elementwise_functions/positive.hpp |   42 -
 .../source/elementwise_functions/pow.cpp      |  197 -
 .../source/elementwise_functions/pow.hpp      |   42 -
 .../source/elementwise_functions/proj.cpp     |  119 -
 .../source/elementwise_functions/proj.hpp     |   42 -
 .../source/elementwise_functions/real.cpp     |  119 -
 .../source/elementwise_functions/real.hpp     |   42 -
 .../elementwise_functions/reciprocal.cpp      |  123 -
 .../elementwise_functions/reciprocal.hpp      |   42 -
 .../elementwise_functions/remainder.cpp       |  199 -
 .../elementwise_functions/remainder.hpp       |   42 -
 .../source/elementwise_functions/round.cpp    |  121 -
 .../source/elementwise_functions/round.hpp    |   42 -
 .../source/elementwise_functions/rsqrt.cpp    |  121 -
 .../source/elementwise_functions/rsqrt.hpp    |   42 -
 .../source/elementwise_functions/sign.cpp     |  119 -
 .../source/elementwise_functions/sign.hpp     |   42 -
 .../source/elementwise_functions/signbit.cpp  |  122 -
 .../source/elementwise_functions/signbit.hpp  |   42 -
 .../source/elementwise_functions/sin.cpp      |  119 -
 .../source/elementwise_functions/sin.hpp      |   42 -
 .../source/elementwise_functions/sinh.cpp     |  119 -
 .../source/elementwise_functions/sinh.hpp     |   42 -
 .../source/elementwise_functions/sqrt.cpp     |  119 -
 .../source/elementwise_functions/sqrt.hpp     |   42 -
 .../source/elementwise_functions/square.cpp   |  121 -
 .../source/elementwise_functions/square.hpp   |   42 -
 .../source/elementwise_functions/subtract.cpp |  237 -
 .../source/elementwise_functions/subtract.hpp |   42 -
 .../source/elementwise_functions/tan.cpp      |  119 -
 .../source/elementwise_functions/tan.hpp      |   42 -
 .../source/elementwise_functions/tanh.cpp     |  119 -
 .../source/elementwise_functions/tanh.hpp     |   42 -
 .../elementwise_functions/true_divide.cpp     |  492 --
 .../elementwise_functions/true_divide.hpp     |   42 -
 .../source/elementwise_functions/trunc.cpp    |  121 -
 .../source/elementwise_functions/trunc.hpp    |   42 -
 dpctl/tensor/libtensor/source/eye_ctor.cpp    |  136 -
 dpctl/tensor/libtensor/source/eye_ctor.hpp    |   50 -
 dpctl/tensor/libtensor/source/full_ctor.cpp   |  300 --
 dpctl/tensor/libtensor/source/full_ctor.hpp   |   50 -
 .../source/integer_advanced_indexing.cpp      |  811 ----
 .../source/integer_advanced_indexing.hpp      |   63 -
 .../libtensor/source/linalg_functions/dot.cpp |  859 ----
 .../libtensor/source/linalg_functions/dot.hpp |   41 -
 .../linalg_functions/dot_atomic_support.hpp   |   58 -
 .../source/linalg_functions/dot_dispatch.hpp  |  391 --
 .../libtensor/source/linear_sequences.cpp     |  300 --
 .../libtensor/source/linear_sequences.hpp     |   59 -
 .../libtensor/source/reductions/all.cpp       |  155 -
 .../libtensor/source/reductions/all.hpp       |   41 -
 .../libtensor/source/reductions/any.cpp       |  155 -
 .../libtensor/source/reductions/any.hpp       |   41 -
 .../libtensor/source/reductions/argmax.cpp    |  277 --
 .../libtensor/source/reductions/argmax.hpp    |   41 -
 .../libtensor/source/reductions/argmin.cpp    |  278 --
 .../libtensor/source/reductions/argmin.hpp    |   41 -
 .../libtensor/source/reductions/logsumexp.cpp |  253 -
 .../libtensor/source/reductions/logsumexp.hpp |   41 -
 .../libtensor/source/reductions/max.cpp       |  410 --
 .../libtensor/source/reductions/max.hpp       |   41 -
 .../libtensor/source/reductions/min.cpp       |  412 --
 .../libtensor/source/reductions/min.hpp       |   41 -
 .../libtensor/source/reductions/prod.cpp      |  459 --
 .../libtensor/source/reductions/prod.hpp      |   41 -
 .../source/reductions/reduce_hypot.cpp        |  248 -
 .../source/reductions/reduce_hypot.hpp        |   41 -
 .../reductions/reduction_atomic_support.hpp   |  140 -
 .../source/reductions/reduction_common.cpp    |   64 -
 .../source/reductions/reduction_common.hpp    |   41 -
 .../source/reductions/reduction_over_axis.hpp | 1320 -----
 .../libtensor/source/reductions/sum.cpp       |  459 --
 .../libtensor/source/reductions/sum.hpp       |   41 -
 dpctl/tensor/libtensor/source/repeat.cpp      |  817 ----
 dpctl/tensor/libtensor/source/repeat.hpp      |   76 -
 .../source/simplify_iteration_space.cpp       |  535 ---
 .../source/simplify_iteration_space.hpp       |  120 -
 .../tensor/libtensor/source/sorting/isin.cpp  |  319 --
 .../tensor/libtensor/source/sorting/isin.hpp  |   42 -
 .../source/sorting/merge_argsort.cpp          |  152 -
 .../source/sorting/merge_argsort.hpp          |   42 -
 .../libtensor/source/sorting/merge_sort.cpp   |  134 -
 .../libtensor/source/sorting/merge_sort.hpp   |   42 -
 .../source/sorting/py_argsort_common.hpp      |  174 -
 .../source/sorting/py_sort_common.hpp         |  170 -
 .../source/sorting/radix_argsort.cpp          |  187 -
 .../source/sorting/radix_argsort.hpp          |   42 -
 .../libtensor/source/sorting/radix_sort.cpp   |  187 -
 .../libtensor/source/sorting/radix_sort.hpp   |   42 -
 .../source/sorting/radix_sort_support.hpp     |   71 -
 .../libtensor/source/sorting/searchsorted.cpp |  473 --
 .../libtensor/source/sorting/searchsorted.hpp |   42 -
 .../tensor/libtensor/source/sorting/topk.cpp  |  301 --
 .../tensor/libtensor/source/sorting/topk.hpp  |   42 -
 .../libtensor/source/tensor_accumulation.cpp  |   35 -
 .../tensor/libtensor/source/tensor_ctors.cpp  |  492 --
 .../libtensor/source/tensor_elementwise.cpp   |   34 -
 .../tensor/libtensor/source/tensor_linalg.cpp |   34 -
 .../libtensor/source/tensor_reductions.cpp    |   35 -
 .../libtensor/source/tensor_sorting.cpp       |   47 -
 .../libtensor/source/tensor_sorting_radix.cpp |   37 -
 dpctl/tensor/libtensor/source/triul_ctor.cpp  |  243 -
 dpctl/tensor/libtensor/source/triul_ctor.hpp  |   52 -
 dpctl/tensor/libtensor/source/where.cpp       |  262 -
 dpctl/tensor/libtensor/source/where.hpp       |   52 -
 dpctl/tensor/libtensor/source/zeros_ctor.cpp  |  157 -
 dpctl/tensor/libtensor/source/zeros_ctor.hpp  |   49 -
 dpctl/tensor/libtensor/tests/test_copy.py     |  309 --
 dpctl/tensor/libtensor/tests/test_main.cpp    |   33 -
 dpctl/tests/elementwise/__init__.py           |   20 -
 dpctl/tests/elementwise/test_abs.py           |  204 -
 dpctl/tests/elementwise/test_add.py           |  574 ---
 dpctl/tests/elementwise/test_angle.py         |   92 -
 dpctl/tests/elementwise/test_atan2.py         |  506 --
 dpctl/tests/elementwise/test_bitwise_and.py   |  127 -
 .../tests/elementwise/test_bitwise_invert.py  |  129 -
 .../elementwise/test_bitwise_left_shift.py    |  135 -
 dpctl/tests/elementwise/test_bitwise_or.py    |  143 -
 .../elementwise/test_bitwise_right_shift.py   |  151 -
 dpctl/tests/elementwise/test_bitwise_xor.py   |  143 -
 dpctl/tests/elementwise/test_cbrt.py          |   79 -
 dpctl/tests/elementwise/test_complex.py       |  221 -
 dpctl/tests/elementwise/test_copysign.py      |  111 -
 dpctl/tests/elementwise/test_divide.py        |  298 --
 .../elementwise/test_elementwise_classes.py   |  137 -
 dpctl/tests/elementwise/test_equal.py         |  190 -
 dpctl/tests/elementwise/test_exp.py           |  234 -
 dpctl/tests/elementwise/test_exp2.py          |  168 -
 dpctl/tests/elementwise/test_expm1.py         |  168 -
 .../elementwise/test_floor_ceil_trunc.py      |  163 -
 dpctl/tests/elementwise/test_floor_divide.py  |  304 --
 dpctl/tests/elementwise/test_greater.py       |  297 --
 dpctl/tests/elementwise/test_greater_equal.py |  296 --
 dpctl/tests/elementwise/test_hyperbolic.py    |  187 -
 dpctl/tests/elementwise/test_hypot.py         |  193 -
 dpctl/tests/elementwise/test_isfinite.py      |   99 -
 dpctl/tests/elementwise/test_isinf.py         |   93 -
 dpctl/tests/elementwise/test_isnan.py         |   98 -
 dpctl/tests/elementwise/test_less.py          |  297 --
 dpctl/tests/elementwise/test_less_equal.py    |  296 --
 dpctl/tests/elementwise/test_log.py           |  130 -
 dpctl/tests/elementwise/test_log10.py         |  133 -
 dpctl/tests/elementwise/test_log1p.py         |  169 -
 dpctl/tests/elementwise/test_log2.py          |  129 -
 dpctl/tests/elementwise/test_logaddexp.py     |  194 -
 dpctl/tests/elementwise/test_logical_and.py   |  304 --
 dpctl/tests/elementwise/test_logical_not.py   |  179 -
 dpctl/tests/elementwise/test_logical_or.py    |  305 --
 dpctl/tests/elementwise/test_logical_xor.py   |  306 --
 .../tests/elementwise/test_maximum_minimum.py |  314 --
 dpctl/tests/elementwise/test_multiply.py      |  234 -
 dpctl/tests/elementwise/test_negative.py      |   86 -
 dpctl/tests/elementwise/test_nextafter.py     |  151 -
 dpctl/tests/elementwise/test_not_equal.py     |  208 -
 dpctl/tests/elementwise/test_positive.py      |   79 -
 dpctl/tests/elementwise/test_pow.py           |  212 -
 dpctl/tests/elementwise/test_reciprocal.py    |   93 -
 dpctl/tests/elementwise/test_remainder.py     |  260 -
 dpctl/tests/elementwise/test_round.py         |  215 -
 dpctl/tests/elementwise/test_rsqrt.py         |   74 -
 dpctl/tests/elementwise/test_sign.py          |  121 -
 dpctl/tests/elementwise/test_signbit.py       |  108 -
 dpctl/tests/elementwise/test_sqrt.py          |  192 -
 dpctl/tests/elementwise/test_square.py        |   99 -
 dpctl/tests/elementwise/test_subtract.py      |  235 -
 dpctl/tests/elementwise/test_trigonometric.py |  216 -
 dpctl/tests/elementwise/test_type_utils.py    |  239 -
 dpctl/tests/elementwise/utils.py              |   61 -
 dpctl/tests/test_tensor_accumulation.py       |  435 --
 .../tests/test_tensor_array_api_inspection.py |  226 -
 dpctl/tests/test_tensor_asarray.py            |  649 ---
 dpctl/tests/test_tensor_clip.py               |  778 ---
 dpctl/tests/test_tensor_copy_utils.py         |  100 -
 dpctl/tests/test_tensor_diff.py               |  329 --
 dpctl/tests/test_tensor_dtype_routines.py     |  158 -
 dpctl/tests/test_tensor_isin.py               |  266 --
 .../test_tensor_statistical_functions.py      |  255 -
 dpctl/tests/test_tensor_sum.py                |  332 --
 dpctl/tests/test_tensor_testing.py            |  149 -
 dpctl/tests/test_usm_ndarray_ctor.py          | 2786 -----------
 dpctl/tests/test_usm_ndarray_dlpack.py        |  902 ----
 dpctl/tests/test_usm_ndarray_indexing.py      | 2041 --------
 dpctl/tests/test_usm_ndarray_linalg.py        | 1015 ----
 dpctl/tests/test_usm_ndarray_manipulation.py  | 1597 -------
 dpctl/tests/test_usm_ndarray_operators.py     |  142 -
 dpctl/tests/test_usm_ndarray_print.py         |  393 --
 dpctl/tests/test_usm_ndarray_reductions.py    |  690 ---
 .../test_usm_ndarray_search_functions.py      |  579 ---
 dpctl/tests/test_usm_ndarray_searchsorted.py  |  377 --
 dpctl/tests/test_usm_ndarray_sorting.py       |  381 --
 dpctl/tests/test_usm_ndarray_top_k.py         |  315 --
 dpctl/tests/test_usm_ndarray_unique.py        |  345 --
 .../test_usm_ndarray_utility_functions.py     |  167 -
 dpctl/utils/CMakeLists.txt                    |    5 -
 setup.py                                      |    9 -
 489 files changed, 13 insertions(+), 130293 deletions(-)
 delete mode 100644 dpctl/tensor/CMakeLists.txt
 delete mode 100644 dpctl/tensor/__init__.pxd
 delete mode 100644 dpctl/tensor/__init__.py
 delete mode 100644 dpctl/tensor/_accumulation.py
 delete mode 100644 dpctl/tensor/_array_api.py
 delete mode 100644 dpctl/tensor/_clip.py
 delete mode 100644 dpctl/tensor/_constants.py
 delete mode 100644 dpctl/tensor/_copy_utils.py
 delete mode 100644 dpctl/tensor/_ctors.py
 delete mode 100644 dpctl/tensor/_data_types.py
 delete mode 100644 dpctl/tensor/_device.py
 delete mode 100644 dpctl/tensor/_dldevice_conversions.py
 delete mode 100644 dpctl/tensor/_dlpack.pxd
 delete mode 100644 dpctl/tensor/_dlpack.pyx
 delete mode 100644 dpctl/tensor/_elementwise_common.py
 delete mode 100644 dpctl/tensor/_elementwise_funcs.py
 delete mode 100644 dpctl/tensor/_flags.pyx
 delete mode 100644 dpctl/tensor/_indexing_functions.py
 delete mode 100644 dpctl/tensor/_linear_algebra_functions.py
 delete mode 100644 dpctl/tensor/_manipulation_functions.py
 delete mode 100644 dpctl/tensor/_numpy_helper.py
 delete mode 100644 dpctl/tensor/_print.py
 delete mode 100644 dpctl/tensor/_reduction.py
 delete mode 100644 dpctl/tensor/_reshape.py
 delete mode 100644 dpctl/tensor/_scalar_utils.py
 delete mode 100644 dpctl/tensor/_search_functions.py
 delete mode 100644 dpctl/tensor/_searchsorted.py
 delete mode 100644 dpctl/tensor/_set_functions.py
 delete mode 100644 dpctl/tensor/_slicing.pxi
 delete mode 100644 dpctl/tensor/_sorting.py
 delete mode 100644 dpctl/tensor/_statistical_functions.py
 delete mode 100644 dpctl/tensor/_stride_utils.pxi
 delete mode 100644 dpctl/tensor/_testing.py
 delete mode 100644 dpctl/tensor/_type_utils.py
 delete mode 100644 dpctl/tensor/_types.pxi
 delete mode 100644 dpctl/tensor/_usmarray.pxd
 delete mode 100644 dpctl/tensor/_usmarray.pyx
 delete mode 100644 dpctl/tensor/_utility_functions.py
 delete mode 100644 dpctl/tensor/include/dlpack/.clang-format
 delete mode 100644 dpctl/tensor/include/dlpack/LICENSE.third-party
 delete mode 100644 dpctl/tensor/include/dlpack/README.md
 delete mode 100644 dpctl/tensor/include/dlpack/dlpack.h
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/accumulators.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/alignment.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/clip.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/constructors.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/reductions.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/repeat.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/isin.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/kernels/where.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/math_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/offset_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/output_validation.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/rich_comparisons.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/strided_iters.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
 delete mode 100644 dpctl/tensor/libtensor/include/utils/type_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/clip.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/clip.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_as_contig.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_as_contig.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_for_reshape.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_for_reshape.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_for_roll.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_for_roll.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/device_support_queries.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/device_support_queries.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/add.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/angle.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/nextafter.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/nextafter.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/real.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/real.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/round.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/round.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/square.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/square.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/eye_ctor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/eye_ctor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/full_ctor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/full_ctor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/linalg_functions/dot.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/linear_sequences.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/linear_sequences.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/all.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/all.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/any.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/any.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/argmax.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/argmax.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/argmin.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/argmin.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/logsumexp.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/logsumexp.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/max.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/max.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/min.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/min.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/prod.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/prod.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_common.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/sum.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/reductions/sum.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/repeat.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/repeat.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/isin.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/isin.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/merge_argsort.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/merge_argsort.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/merge_sort.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/merge_sort.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/radix_argsort.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/radix_sort.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/radix_sort.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/radix_sort_support.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/searchsorted.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/topk.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/sorting/topk.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_accumulation.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_ctors.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_elementwise.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_linalg.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_reductions.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_sorting.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/tensor_sorting_radix.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/triul_ctor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/triul_ctor.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/where.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/where.hpp
 delete mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.cpp
 delete mode 100644 dpctl/tensor/libtensor/source/zeros_ctor.hpp
 delete mode 100644 dpctl/tensor/libtensor/tests/test_copy.py
 delete mode 100644 dpctl/tensor/libtensor/tests/test_main.cpp
 delete mode 100644 dpctl/tests/elementwise/__init__.py
 delete mode 100644 dpctl/tests/elementwise/test_abs.py
 delete mode 100644 dpctl/tests/elementwise/test_add.py
 delete mode 100644 dpctl/tests/elementwise/test_angle.py
 delete mode 100644 dpctl/tests/elementwise/test_atan2.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_and.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_invert.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_left_shift.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_or.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_right_shift.py
 delete mode 100644 dpctl/tests/elementwise/test_bitwise_xor.py
 delete mode 100644 dpctl/tests/elementwise/test_cbrt.py
 delete mode 100644 dpctl/tests/elementwise/test_complex.py
 delete mode 100644 dpctl/tests/elementwise/test_copysign.py
 delete mode 100644 dpctl/tests/elementwise/test_divide.py
 delete mode 100644 dpctl/tests/elementwise/test_elementwise_classes.py
 delete mode 100644 dpctl/tests/elementwise/test_equal.py
 delete mode 100644 dpctl/tests/elementwise/test_exp.py
 delete mode 100644 dpctl/tests/elementwise/test_exp2.py
 delete mode 100644 dpctl/tests/elementwise/test_expm1.py
 delete mode 100644 dpctl/tests/elementwise/test_floor_ceil_trunc.py
 delete mode 100644 dpctl/tests/elementwise/test_floor_divide.py
 delete mode 100644 dpctl/tests/elementwise/test_greater.py
 delete mode 100644 dpctl/tests/elementwise/test_greater_equal.py
 delete mode 100644 dpctl/tests/elementwise/test_hyperbolic.py
 delete mode 100644 dpctl/tests/elementwise/test_hypot.py
 delete mode 100644 dpctl/tests/elementwise/test_isfinite.py
 delete mode 100644 dpctl/tests/elementwise/test_isinf.py
 delete mode 100644 dpctl/tests/elementwise/test_isnan.py
 delete mode 100644 dpctl/tests/elementwise/test_less.py
 delete mode 100644 dpctl/tests/elementwise/test_less_equal.py
 delete mode 100644 dpctl/tests/elementwise/test_log.py
 delete mode 100644 dpctl/tests/elementwise/test_log10.py
 delete mode 100644 dpctl/tests/elementwise/test_log1p.py
 delete mode 100644 dpctl/tests/elementwise/test_log2.py
 delete mode 100644 dpctl/tests/elementwise/test_logaddexp.py
 delete mode 100644 dpctl/tests/elementwise/test_logical_and.py
 delete mode 100644 dpctl/tests/elementwise/test_logical_not.py
 delete mode 100644 dpctl/tests/elementwise/test_logical_or.py
 delete mode 100644 dpctl/tests/elementwise/test_logical_xor.py
 delete mode 100644 dpctl/tests/elementwise/test_maximum_minimum.py
 delete mode 100644 dpctl/tests/elementwise/test_multiply.py
 delete mode 100644 dpctl/tests/elementwise/test_negative.py
 delete mode 100644 dpctl/tests/elementwise/test_nextafter.py
 delete mode 100644 dpctl/tests/elementwise/test_not_equal.py
 delete mode 100644 dpctl/tests/elementwise/test_positive.py
 delete mode 100644 dpctl/tests/elementwise/test_pow.py
 delete mode 100644 dpctl/tests/elementwise/test_reciprocal.py
 delete mode 100644 dpctl/tests/elementwise/test_remainder.py
 delete mode 100644 dpctl/tests/elementwise/test_round.py
 delete mode 100644 dpctl/tests/elementwise/test_rsqrt.py
 delete mode 100644 dpctl/tests/elementwise/test_sign.py
 delete mode 100644 dpctl/tests/elementwise/test_signbit.py
 delete mode 100644 dpctl/tests/elementwise/test_sqrt.py
 delete mode 100644 dpctl/tests/elementwise/test_square.py
 delete mode 100644 dpctl/tests/elementwise/test_subtract.py
 delete mode 100644 dpctl/tests/elementwise/test_trigonometric.py
 delete mode 100644 dpctl/tests/elementwise/test_type_utils.py
 delete mode 100644 dpctl/tests/elementwise/utils.py
 delete mode 100644 dpctl/tests/test_tensor_accumulation.py
 delete mode 100644 dpctl/tests/test_tensor_array_api_inspection.py
 delete mode 100644 dpctl/tests/test_tensor_asarray.py
 delete mode 100644 dpctl/tests/test_tensor_clip.py
 delete mode 100644 dpctl/tests/test_tensor_copy_utils.py
 delete mode 100644 dpctl/tests/test_tensor_diff.py
 delete mode 100644 dpctl/tests/test_tensor_dtype_routines.py
 delete mode 100644 dpctl/tests/test_tensor_isin.py
 delete mode 100644 dpctl/tests/test_tensor_statistical_functions.py
 delete mode 100644 dpctl/tests/test_tensor_sum.py
 delete mode 100644 dpctl/tests/test_tensor_testing.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_ctor.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_dlpack.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_indexing.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_linalg.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_manipulation.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_operators.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_print.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_reductions.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_search_functions.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_searchsorted.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_sorting.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_top_k.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_unique.py
 delete mode 100644 dpctl/tests/test_usm_ndarray_utility_functions.py

diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index e2914f6394..a24c7443f9 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -205,5 +205,4 @@ target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
 
 add_subdirectory(program)
 add_subdirectory(memory)
-add_subdirectory(tensor)
 add_subdirectory(utils)
diff --git a/dpctl/__init__.py b/dpctl/__init__.py
index 41661a6788..5e442a8fc4 100644
--- a/dpctl/__init__.py
+++ b/dpctl/__init__.py
@@ -126,7 +126,6 @@
 __all__ += [
     "memory",
     "program",
-    "tensor",
     "utils",
 ]
 
diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp
index 3181291b8d..6e1a9fe3da 100644
--- a/dpctl/apis/include/dpctl4pybind11.hpp
+++ b/dpctl/apis/include/dpctl4pybind11.hpp
@@ -74,7 +74,6 @@ class dpctl_capi
     PyTypeObject *PyMemoryUSMDeviceType_;
     PyTypeObject *PyMemoryUSMSharedType_;
     PyTypeObject *PyMemoryUSMHostType_;
-    PyTypeObject *PyUSMArrayType_;
     PyTypeObject *PySyclProgramType_;
     PyTypeObject *PySyclKernelType_;
 
@@ -109,48 +108,6 @@ class dpctl_capi
         PySyclProgramObject *);
     PySyclProgramObject *(*SyclProgram_Make_)(DPCTLSyclKernelBundleRef);
 
-    // tensor
-    char *(*UsmNDArray_GetData_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetNDim_)(PyUSMArrayObject *);
-    py::ssize_t *(*UsmNDArray_GetShape_)(PyUSMArrayObject *);
-    py::ssize_t *(*UsmNDArray_GetStrides_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetTypenum_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetElementSize_)(PyUSMArrayObject *);
-    int (*UsmNDArray_GetFlags_)(PyUSMArrayObject *);
-    DPCTLSyclQueueRef (*UsmNDArray_GetQueueRef_)(PyUSMArrayObject *);
-    py::ssize_t (*UsmNDArray_GetOffset_)(PyUSMArrayObject *);
-    PyObject *(*UsmNDArray_GetUSMData_)(PyUSMArrayObject *);
-    void (*UsmNDArray_SetWritableFlag_)(PyUSMArrayObject *, int);
-    PyObject *(*UsmNDArray_MakeSimpleFromMemory_)(int,
-                                                  const py::ssize_t *,
-                                                  int,
-                                                  Py_MemoryObject *,
-                                                  py::ssize_t,
-                                                  char);
-    PyObject *(*UsmNDArray_MakeSimpleFromPtr_)(size_t,
-                                               int,
-                                               DPCTLSyclUSMRef,
-                                               DPCTLSyclQueueRef,
-                                               PyObject *);
-    PyObject *(*UsmNDArray_MakeFromPtr_)(int,
-                                         const py::ssize_t *,
-                                         int,
-                                         const py::ssize_t *,
-                                         DPCTLSyclUSMRef,
-                                         DPCTLSyclQueueRef,
-                                         py::ssize_t,
-                                         PyObject *);
-
-    int USM_ARRAY_C_CONTIGUOUS_;
-    int USM_ARRAY_F_CONTIGUOUS_;
-    int USM_ARRAY_WRITABLE_;
-    int UAR_BOOL_, UAR_BYTE_, UAR_UBYTE_, UAR_SHORT_, UAR_USHORT_, UAR_INT_,
-        UAR_UINT_, UAR_LONG_, UAR_ULONG_, UAR_LONGLONG_, UAR_ULONGLONG_,
-        UAR_FLOAT_, UAR_DOUBLE_, UAR_CFLOAT_, UAR_CDOUBLE_, UAR_TYPE_SENTINEL_,
-        UAR_HALF_;
-    int UAR_INT8_, UAR_UINT8_, UAR_INT16_, UAR_UINT16_, UAR_INT32_, UAR_UINT32_,
-        UAR_INT64_, UAR_UINT64_;
-
     bool PySyclDevice_Check_(PyObject *obj) const
     {
         return PyObject_TypeCheck(obj, PySyclDeviceType_) != 0;
@@ -179,7 +136,6 @@ class dpctl_capi
     ~dpctl_capi()
     {
         as_usm_memory_.reset();
-        default_usm_ndarray_.reset();
         default_usm_memory_.reset();
         default_sycl_queue_.reset();
     };
@@ -192,7 +148,6 @@ class dpctl_capi
 
     py::object default_sycl_queue_pyobj() { return *default_sycl_queue_; }
     py::object default_usm_memory_pyobj() { return *default_usm_memory_; }
-    py::object default_usm_ndarray_pyobj() { return *default_usm_ndarray_; }
     py::object as_usm_memory_pyobj() { return *as_usm_memory_; }
 
 private:
@@ -216,7 +171,6 @@ class dpctl_capi
 
     std::shared_ptr<py::object> default_sycl_queue_;
     std::shared_ptr<py::object> default_usm_memory_;
-    std::shared_ptr<py::object> default_usm_ndarray_;
     std::shared_ptr<py::object> as_usm_memory_;
 
     dpctl_capi()
@@ -226,42 +180,24 @@ class dpctl_capi
           Py_SyclQueueType_(nullptr), PySyclQueueType_(nullptr),
           Py_MemoryType_(nullptr), PyMemoryUSMDeviceType_(nullptr),
           PyMemoryUSMSharedType_(nullptr), PyMemoryUSMHostType_(nullptr),
-          PyUSMArrayType_(nullptr), PySyclProgramType_(nullptr),
-          PySyclKernelType_(nullptr), SyclDevice_GetDeviceRef_(nullptr),
-          SyclDevice_Make_(nullptr), SyclContext_GetContextRef_(nullptr),
-          SyclContext_Make_(nullptr), SyclEvent_GetEventRef_(nullptr),
-          SyclEvent_Make_(nullptr), SyclQueue_GetQueueRef_(nullptr),
-          SyclQueue_Make_(nullptr), Memory_GetUsmPointer_(nullptr),
-          Memory_GetOpaquePointer_(nullptr), Memory_GetContextRef_(nullptr),
-          Memory_GetQueueRef_(nullptr), Memory_GetNumBytes_(nullptr),
-          Memory_Make_(nullptr), SyclKernel_GetKernelRef_(nullptr),
-          SyclKernel_Make_(nullptr), SyclProgram_GetKernelBundleRef_(nullptr),
-          SyclProgram_Make_(nullptr), UsmNDArray_GetData_(nullptr),
-          UsmNDArray_GetNDim_(nullptr), UsmNDArray_GetShape_(nullptr),
-          UsmNDArray_GetStrides_(nullptr), UsmNDArray_GetTypenum_(nullptr),
-          UsmNDArray_GetElementSize_(nullptr), UsmNDArray_GetFlags_(nullptr),
-          UsmNDArray_GetQueueRef_(nullptr), UsmNDArray_GetOffset_(nullptr),
-          UsmNDArray_GetUSMData_(nullptr), UsmNDArray_SetWritableFlag_(nullptr),
-          UsmNDArray_MakeSimpleFromMemory_(nullptr),
-          UsmNDArray_MakeSimpleFromPtr_(nullptr),
-          UsmNDArray_MakeFromPtr_(nullptr), USM_ARRAY_C_CONTIGUOUS_(0),
-          USM_ARRAY_F_CONTIGUOUS_(0), USM_ARRAY_WRITABLE_(0), UAR_BOOL_(-1),
-          UAR_BYTE_(-1), UAR_UBYTE_(-1), UAR_SHORT_(-1), UAR_USHORT_(-1),
-          UAR_INT_(-1), UAR_UINT_(-1), UAR_LONG_(-1), UAR_ULONG_(-1),
-          UAR_LONGLONG_(-1), UAR_ULONGLONG_(-1), UAR_FLOAT_(-1),
-          UAR_DOUBLE_(-1), UAR_CFLOAT_(-1), UAR_CDOUBLE_(-1),
-          UAR_TYPE_SENTINEL_(-1), UAR_HALF_(-1), UAR_INT8_(-1), UAR_UINT8_(-1),
-          UAR_INT16_(-1), UAR_UINT16_(-1), UAR_INT32_(-1), UAR_UINT32_(-1),
-          UAR_INT64_(-1), UAR_UINT64_(-1), default_sycl_queue_{},
-          default_usm_memory_{}, default_usm_ndarray_{}, as_usm_memory_{}
+          PySyclProgramType_(nullptr), PySyclKernelType_(nullptr),
+          SyclDevice_GetDeviceRef_(nullptr), SyclDevice_Make_(nullptr),
+          SyclContext_GetContextRef_(nullptr), SyclContext_Make_(nullptr),
+          SyclEvent_GetEventRef_(nullptr), SyclEvent_Make_(nullptr),
+          SyclQueue_GetQueueRef_(nullptr), SyclQueue_Make_(nullptr),
+          Memory_GetUsmPointer_(nullptr), Memory_GetOpaquePointer_(nullptr),
+          Memory_GetContextRef_(nullptr), Memory_GetQueueRef_(nullptr),
+          Memory_GetNumBytes_(nullptr), Memory_Make_(nullptr),
+          SyclKernel_GetKernelRef_(nullptr), SyclKernel_Make_(nullptr),
+          SyclProgram_GetKernelBundleRef_(nullptr), SyclProgram_Make_(nullptr),
+          default_sycl_queue_{}, default_usm_memory_{}, as_usm_memory_{}
 
     {
         // Import Cython-generated C-API for dpctl
         // This imports python modules and initializes
         // static variables such as function pointers for C-API,
         // e.g. SyclDevice_GetDeviceRef, etc.
-        // pointers to Python types, i.e. PySyclDeviceType, etc.
-        // and exported constants, i.e. USM_ARRAY_C_CONTIGUOUS, etc.
+        // and pointers to Python types, i.e. PySyclDeviceType, etc.
         import_dpctl();
 
         // Python type objects for classes implemented by dpctl
@@ -277,7 +213,6 @@ class dpctl_capi
         this->PyMemoryUSMDeviceType_ = &PyMemoryUSMDeviceType;
         this->PyMemoryUSMSharedType_ = &PyMemoryUSMSharedType;
         this->PyMemoryUSMHostType_ = &PyMemoryUSMHostType;
-        this->PyUSMArrayType_ = &PyUSMArrayType;
         this->PySyclProgramType_ = &PySyclProgramType;
         this->PySyclKernelType_ = &PySyclKernelType;
 
@@ -311,67 +246,8 @@ class dpctl_capi
         this->SyclProgram_GetKernelBundleRef_ = SyclProgram_GetKernelBundleRef;
         this->SyclProgram_Make_ = SyclProgram_Make;
 
-        // dpctl.tensor.usm_ndarray API
-        this->UsmNDArray_GetData_ = UsmNDArray_GetData;
-        this->UsmNDArray_GetNDim_ = UsmNDArray_GetNDim;
-        this->UsmNDArray_GetShape_ = UsmNDArray_GetShape;
-        this->UsmNDArray_GetStrides_ = UsmNDArray_GetStrides;
-        this->UsmNDArray_GetTypenum_ = UsmNDArray_GetTypenum;
-        this->UsmNDArray_GetElementSize_ = UsmNDArray_GetElementSize;
-        this->UsmNDArray_GetFlags_ = UsmNDArray_GetFlags;
-        this->UsmNDArray_GetQueueRef_ = UsmNDArray_GetQueueRef;
-        this->UsmNDArray_GetOffset_ = UsmNDArray_GetOffset;
-        this->UsmNDArray_GetUSMData_ = UsmNDArray_GetUSMData;
-        this->UsmNDArray_SetWritableFlag_ = UsmNDArray_SetWritableFlag;
-        this->UsmNDArray_MakeSimpleFromMemory_ =
-            UsmNDArray_MakeSimpleFromMemory;
-        this->UsmNDArray_MakeSimpleFromPtr_ = UsmNDArray_MakeSimpleFromPtr;
-        this->UsmNDArray_MakeFromPtr_ = UsmNDArray_MakeFromPtr;
-
-        // constants
-        this->USM_ARRAY_C_CONTIGUOUS_ = USM_ARRAY_C_CONTIGUOUS;
-        this->USM_ARRAY_F_CONTIGUOUS_ = USM_ARRAY_F_CONTIGUOUS;
-        this->USM_ARRAY_WRITABLE_ = USM_ARRAY_WRITABLE;
-        this->UAR_BOOL_ = UAR_BOOL;
-        this->UAR_BYTE_ = UAR_BYTE;
-        this->UAR_UBYTE_ = UAR_UBYTE;
-        this->UAR_SHORT_ = UAR_SHORT;
-        this->UAR_USHORT_ = UAR_USHORT;
-        this->UAR_INT_ = UAR_INT;
-        this->UAR_UINT_ = UAR_UINT;
-        this->UAR_LONG_ = UAR_LONG;
-        this->UAR_ULONG_ = UAR_ULONG;
-        this->UAR_LONGLONG_ = UAR_LONGLONG;
-        this->UAR_ULONGLONG_ = UAR_ULONGLONG;
-        this->UAR_FLOAT_ = UAR_FLOAT;
-        this->UAR_DOUBLE_ = UAR_DOUBLE;
-        this->UAR_CFLOAT_ = UAR_CFLOAT;
-        this->UAR_CDOUBLE_ = UAR_CDOUBLE;
-        this->UAR_TYPE_SENTINEL_ = UAR_TYPE_SENTINEL;
-        this->UAR_HALF_ = UAR_HALF;
-
-        // deduced disjoint types
-        this->UAR_INT8_ = UAR_BYTE;
-        this->UAR_UINT8_ = UAR_UBYTE;
-        this->UAR_INT16_ = UAR_SHORT;
-        this->UAR_UINT16_ = UAR_USHORT;
-        this->UAR_INT32_ =
-            platform_typeid_lookup<std::int32_t, long, int, short>(
-                UAR_LONG, UAR_INT, UAR_SHORT);
-        this->UAR_UINT32_ =
-            platform_typeid_lookup<std::uint32_t, unsigned long, unsigned int,
-                                   unsigned short>(UAR_ULONG, UAR_UINT,
-                                                   UAR_USHORT);
-        this->UAR_INT64_ =
-            platform_typeid_lookup<std::int64_t, long, long long, int>(
-                UAR_LONG, UAR_LONGLONG, UAR_INT);
-        this->UAR_UINT64_ =
-            platform_typeid_lookup<std::uint64_t, unsigned long,
-                                   unsigned long long, unsigned int>(
-                UAR_ULONG, UAR_ULONGLONG, UAR_UINT);
-
         // create shared pointers to python objects used in type-casters
-        // for dpctl::memory::usm_memory and dpctl::tensor::usm_ndarray
+        // for dpctl::memory::usm_memory
         sycl::queue q_{};
         PySyclQueueObject *py_q_tmp =
             SyclQueue_Make(reinterpret_cast<DPCTLSyclQueueRef>(&q_));
@@ -391,17 +267,6 @@ class dpctl_capi
             mem_kl(1, py::arg("queue") = py_sycl_queue);
         default_usm_memory_ = std::shared_ptr<py::object>(
             new py::object{py_default_usm_memory}, Deleter{});
-
-        py::module_ mod_usmarray =
-            py::module_::import("dpctl.tensor._usmarray");
-        auto tensor_kl = mod_usmarray.attr("usm_ndarray");
-
-        const py::object &py_default_usm_ndarray =
-            tensor_kl(py::tuple(), py::arg("dtype") = py::str("u1"),
-                      py::arg("buffer") = py_default_usm_memory);
-
-        default_usm_ndarray_ = std::shared_ptr<py::object>(
-            new py::object{py_default_usm_ndarray}, Deleter{});
     }
 
     dpctl_capi(dpctl_capi const &) = default;
@@ -879,342 +744,6 @@ class usm_memory : public py::object
 
 } // end namespace memory
 
-namespace tensor
-{
-
-inline std::vector<py::ssize_t>
-c_contiguous_strides(int nd,
-                     const py::ssize_t *shape,
-                     py::ssize_t element_size = 1)
-{
-    if (nd > 0) {
-        std::vector<py::ssize_t> c_strides(nd, element_size);
-        for (int ic = nd - 1; ic > 0;) {
-            py::ssize_t next_v = c_strides[ic] * shape[ic];
-            c_strides[--ic] = next_v;
-        }
-        return c_strides;
-    }
-    else {
-        return std::vector<py::ssize_t>();
-    }
-}
-
-inline std::vector<py::ssize_t>
-f_contiguous_strides(int nd,
-                     const py::ssize_t *shape,
-                     py::ssize_t element_size = 1)
-{
-    if (nd > 0) {
-        std::vector<py::ssize_t> f_strides(nd, element_size);
-        for (int i = 0; i < nd - 1;) {
-            py::ssize_t next_v = f_strides[i] * shape[i];
-            f_strides[++i] = next_v;
-        }
-        return f_strides;
-    }
-    else {
-        return std::vector<py::ssize_t>();
-    }
-}
-
-inline std::vector<py::ssize_t>
-c_contiguous_strides(const std::vector<py::ssize_t> &shape,
-                     py::ssize_t element_size = 1)
-{
-    return c_contiguous_strides(shape.size(), shape.data(), element_size);
-}
-
-inline std::vector<py::ssize_t>
-f_contiguous_strides(const std::vector<py::ssize_t> &shape,
-                     py::ssize_t element_size = 1)
-{
-    return f_contiguous_strides(shape.size(), shape.data(), element_size);
-}
-
-class usm_ndarray : public py::object
-{
-public:
-    PYBIND11_OBJECT(usm_ndarray, py::object, [](PyObject *o) -> bool {
-        return PyObject_TypeCheck(
-                   o, ::dpctl::detail::dpctl_capi::get().PyUSMArrayType_) != 0;
-    })
-
-    usm_ndarray()
-        : py::object(
-              ::dpctl::detail::dpctl_capi::get().default_usm_ndarray_pyobj(),
-              borrowed_t{})
-    {
-        if (!m_ptr)
-            throw py::error_already_set();
-    }
-
-    char *get_data() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetData_(raw_ar);
-    }
-
-    template <typename T> T *get_data() const
-    {
-        return reinterpret_cast<T *>(get_data());
-    }
-
-    int get_ndim() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetNDim_(raw_ar);
-    }
-
-    const py::ssize_t *get_shape_raw() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetShape_(raw_ar);
-    }
-
-    std::vector<py::ssize_t> get_shape_vector() const
-    {
-        auto raw_sh = get_shape_raw();
-        auto nd = get_ndim();
-
-        std::vector<py::ssize_t> shape_vector(raw_sh, raw_sh + nd);
-        return shape_vector;
-    }
-
-    py::ssize_t get_shape(int i) const
-    {
-        auto shape_ptr = get_shape_raw();
-        return shape_ptr[i];
-    }
-
-    const py::ssize_t *get_strides_raw() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetStrides_(raw_ar);
-    }
-
-    std::vector<py::ssize_t> get_strides_vector() const
-    {
-        auto raw_st = get_strides_raw();
-        auto nd = get_ndim();
-
-        if (raw_st == nullptr) {
-            auto is_c_contig = is_c_contiguous();
-            auto is_f_contig = is_f_contiguous();
-            auto raw_sh = get_shape_raw();
-            if (is_c_contig) {
-                const auto &contig_strides = c_contiguous_strides(nd, raw_sh);
-                return contig_strides;
-            }
-            else if (is_f_contig) {
-                const auto &contig_strides = f_contiguous_strides(nd, raw_sh);
-                return contig_strides;
-            }
-            else {
-                throw std::runtime_error("Invalid array encountered when "
-                                         "building strides");
-            }
-        }
-        else {
-            std::vector<py::ssize_t> st_vec(raw_st, raw_st + nd);
-            return st_vec;
-        }
-    }
-
-    py::ssize_t get_size() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        int ndim = api.UsmNDArray_GetNDim_(raw_ar);
-        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
-
-        py::ssize_t nelems = 1;
-        for (int i = 0; i < ndim; ++i) {
-            nelems *= shape[i];
-        }
-
-        assert(nelems >= 0);
-        return nelems;
-    }
-
-    std::pair<py::ssize_t, py::ssize_t> get_minmax_offsets() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        int nd = api.UsmNDArray_GetNDim_(raw_ar);
-        const py::ssize_t *shape = api.UsmNDArray_GetShape_(raw_ar);
-        const py::ssize_t *strides = api.UsmNDArray_GetStrides_(raw_ar);
-
-        py::ssize_t offset_min = 0;
-        py::ssize_t offset_max = 0;
-        if (strides == nullptr) {
-            py::ssize_t stride(1);
-            for (int i = 0; i < nd; ++i) {
-                offset_max += stride * (shape[i] - 1);
-                stride *= shape[i];
-            }
-        }
-        else {
-            for (int i = 0; i < nd; ++i) {
-                py::ssize_t delta = strides[i] * (shape[i] - 1);
-                if (strides[i] > 0) {
-                    offset_max += delta;
-                }
-                else {
-                    offset_min += delta;
-                }
-            }
-        }
-        return std::make_pair(offset_min, offset_max);
-    }
-
-    sycl::queue get_queue() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
-        return *(reinterpret_cast<sycl::queue *>(QRef));
-    }
-
-    sycl::device get_device() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        DPCTLSyclQueueRef QRef = api.UsmNDArray_GetQueueRef_(raw_ar);
-        return reinterpret_cast<sycl::queue *>(QRef)->get_device();
-    }
-
-    int get_typenum() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetTypenum_(raw_ar);
-    }
-
-    int get_flags() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetFlags_(raw_ar);
-    }
-
-    int get_elemsize() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return api.UsmNDArray_GetElementSize_(raw_ar);
-    }
-
-    bool is_c_contiguous() const
-    {
-        int flags = get_flags();
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return static_cast<bool>(flags & api.USM_ARRAY_C_CONTIGUOUS_);
-    }
-
-    bool is_f_contiguous() const
-    {
-        int flags = get_flags();
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return static_cast<bool>(flags & api.USM_ARRAY_F_CONTIGUOUS_);
-    }
-
-    bool is_writable() const
-    {
-        int flags = get_flags();
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        return static_cast<bool>(flags & api.USM_ARRAY_WRITABLE_);
-    }
-
-    /*! @brief Get usm_data property of array */
-    py::object get_usm_data() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        // UsmNDArray_GetUSMData_ gives a new reference
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
-
-        // pass reference ownership to py::object
-        return py::reinterpret_steal<py::object>(usm_data);
-    }
-
-    bool is_managed_by_smart_ptr() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
-
-        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
-            Py_DECREF(usm_data);
-            return false;
-        }
-
-        Py_MemoryObject *mem_obj =
-            reinterpret_cast<Py_MemoryObject *>(usm_data);
-        const void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
-
-        Py_DECREF(usm_data);
-        return bool(opaque_ptr);
-    }
-
-    const std::shared_ptr<void> &get_smart_ptr_owner() const
-    {
-        PyUSMArrayObject *raw_ar = usm_array_ptr();
-
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-
-        PyObject *usm_data = api.UsmNDArray_GetUSMData_(raw_ar);
-
-        if (!PyObject_TypeCheck(usm_data, api.Py_MemoryType_)) {
-            Py_DECREF(usm_data);
-            throw std::runtime_error(
-                "usm_ndarray object does not have Memory object "
-                "managing lifetime of USM allocation");
-        }
-
-        Py_MemoryObject *mem_obj =
-            reinterpret_cast<Py_MemoryObject *>(usm_data);
-        void *opaque_ptr = api.Memory_GetOpaquePointer_(mem_obj);
-        Py_DECREF(usm_data);
-
-        if (opaque_ptr) {
-            auto shptr_ptr =
-                reinterpret_cast<std::shared_ptr<void> *>(opaque_ptr);
-            return *shptr_ptr;
-        }
-        else {
-            throw std::runtime_error(
-                "Memory object underlying usm_ndarray does not have "
-                "smart pointer managing lifetime of USM allocation");
-        }
-    }
-
-private:
-    PyUSMArrayObject *usm_array_ptr() const
-    {
-        return reinterpret_cast<PyUSMArrayObject *>(m_ptr);
-    }
-};
-
-} // end namespace tensor
-
 namespace utils
 {
 
@@ -1231,12 +760,6 @@ struct ManagedMemory
                 py::cast<dpctl::memory::usm_memory>(h);
             return usm_memory_inst.is_managed_by_smart_ptr();
         }
-        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
-            const auto &usm_array_inst =
-                py::cast<dpctl::tensor::usm_ndarray>(h);
-            return usm_array_inst.is_managed_by_smart_ptr();
-        }
-
         return false;
     }
 
@@ -1247,12 +770,6 @@ struct ManagedMemory
                 py::cast<dpctl::memory::usm_memory>(h);
             return usm_memory_inst.get_smart_ptr_owner();
         }
-        else if (py::isinstance<dpctl::tensor::usm_ndarray>(h)) {
-            const auto &usm_array_inst =
-                py::cast<dpctl::tensor::usm_ndarray>(h);
-            return usm_array_inst.get_smart_ptr_owner();
-        }
-
         throw std::runtime_error(
             "Attempted extraction of shared_ptr on an unrecognized type");
     }
@@ -1343,21 +860,6 @@ bool queues_are_compatible(const sycl::queue &exec_q,
     return true;
 }
 
-/*! @brief Check if all allocation queues of usm_ndarays are the same as
-    the execution queue */
-template <std::size_t num>
-bool queues_are_compatible(const sycl::queue &exec_q,
-                           const ::dpctl::tensor::usm_ndarray (&arrs)[num])
-{
-    for (std::size_t i = 0; i < num; ++i) {
-
-        if (exec_q != arrs[i].get_queue()) {
-            return false;
-        }
-    }
-    return true;
-}
-
 } // end namespace utils
 
 } // end namespace dpctl
diff --git a/dpctl/apis/include/dpctl_capi.h b/dpctl/apis/include/dpctl_capi.h
index 73e70903e6..9cf245aef5 100644
--- a/dpctl/apis/include/dpctl_capi.h
+++ b/dpctl/apis/include/dpctl_capi.h
@@ -45,8 +45,6 @@
 #include "dpctl/_sycl_queue_api.h"
 #include "dpctl/memory/_memory.h"
 #include "dpctl/memory/_memory_api.h"
-#include "dpctl/tensor/_usmarray.h"
-#include "dpctl/tensor/_usmarray_api.h"
 #include "dpctl/program/_program.h"
 #include "dpctl/program/_program_api.h"
 
@@ -68,7 +66,6 @@ static inline void import_dpctl(void)
     import_dpctl___sycl_event();
     import_dpctl___sycl_queue();
     import_dpctl__memory___memory();
-    import_dpctl__tensor___usmarray();
     import_dpctl__program___program();
     return;
 }
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
deleted file mode 100644
index b5a66b88f6..0000000000
--- a/dpctl/tensor/CMakeLists.txt
+++ /dev/null
@@ -1,316 +0,0 @@
-file(GLOB _cython_sources *.pyx)
-foreach(_cy_file ${_cython_sources})
-    get_filename_component(_trgt ${_cy_file} NAME_WLE)
-    build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/tensor" RELATIVE_PATH "..")
-    target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
-    target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
-endforeach()
-
-# TODO: do we need to write this dependencies explicitly? Does it even work this
-#   way?
-add_custom_target(_usmarray_deps SOURCES
-    ${CMAKE_CURRENT_SOURCE_DIR}/_slicing.pxi
-    ${CMAKE_CURRENT_SOURCE_DIR}/_types.pxi
-    ${CMAKE_CURRENT_SOURCE_DIR}/_stride_utils.pxi
-)
-add_dependencies(_usmarray _usmarray_deps)
-
-if(WIN32)
-    if (${CMAKE_VERSION} VERSION_LESS "3.23")
-        # this is a work-around for target_link_options inserting option after -link option, cause
-        # linker to ignore it.
-        set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel")
-    endif()
-endif()
-
-set(_elementwise_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/angle.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp
-)
-set(_reduction_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp
-)
-set(_sorting_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp
-)
-set(_static_lib_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp
-)
-set(_tensor_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
-)
-set(_tensor_elementwise_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp
-    ${_elementwise_sources}
-)
-set(_tensor_reductions_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp
-    ${_reduction_sources}
-)
-set(_tensor_sorting_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp
-    ${_sorting_sources}
-)
-set(_linalg_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linalg_functions/dot.cpp
-)
-set(_tensor_linalg_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_linalg.cpp
-    ${_linalg_sources}
-)
-set(_accumulator_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp
-)
-set(_tensor_accumulation_impl_sources
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp
-    ${_accumulator_sources}
-)
-
-set(_static_lib_trgt simplify_iteration_space)
-
-add_library(${_static_lib_trgt} STATIC ${_static_lib_sources})
-target_include_directories(${_static_lib_trgt} PRIVATE
-  ${Python_INCLUDE_DIRS} ${DPCTL_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
-)
-target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES})
-set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-
-set(_py_trgts)
-
-set(python_module_name _tensor_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(python_module_name _tensor_elementwise_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(python_module_name _tensor_reductions_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(python_module_name _tensor_sorting_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(python_module_name _tensor_linalg_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_linalg_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_linalg_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(python_module_name _tensor_accumulation_impl)
-pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources})
-add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources})
-target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt})
-list(APPEND _py_trgts ${python_module_name})
-
-set(_clang_prefix "")
-if (WIN32)
-  set(_clang_prefix "/clang:")
-endif()
-
-set(_no_fast_math_sources
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp
-)
-list(APPEND _no_fast_math_sources
-     ${_elementwise_sources}
-     ${_reduction_sources}
-     ${_sorting_sources}
-     ${_linalg_sources}
-     ${_accumulator_sources}
-)
-
-foreach(_src_fn ${_no_fast_math_sources})
-  get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS)
-  set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math")
-  set_source_files_properties(
-     ${_src_fn}
-     PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}"
-  )
-endforeach()
-
-set(_compiler_definitions "")
-
-foreach(_src_fn ${_elementwise_sources})
-  get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS)
-  if(${_cmpl_options_defs})
-     set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}")
-  else()
-     set(_combined_options_defs "${_compiler_definitions}")
-  endif()
-  set_source_files_properties(
-     ${_src_fn}
-     PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}"
-  )
-endforeach()
-
-set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
-foreach(python_module_name ${_py_trgts})
-    target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
-    target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
-    if (DPCTL_OFFLOAD_COMPRESS)
-        target_link_options(${python_module_name} PRIVATE --offload-compress)
-    endif()
-
-    target_include_directories(${python_module_name}
-        PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
-        ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
-    )
-    target_link_options(${python_module_name} PRIVATE ${_linker_options})
-    if(DPCTL_GENERATE_COVERAGE)
-        if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
-            target_compile_options(${python_module_name}
-                PRIVATE -fprofile-instr-generate -fcoverage-mapping
-            )
-        endif()
-        target_link_options(${python_module_name}
-            PRIVATE -fprofile-instr-generate -fcoverage-mapping
-        )
-    endif()
-    if(_dpctl_sycl_targets)
-        # make fat binary
-        target_compile_options(
-            ${python_module_name}
-            PRIVATE
-            ${_dpctl_sycl_target_compile_options}
-        )
-        target_link_options(
-            ${python_module_name}
-            PRIVATE
-            ${_dpctl_sycl_target_link_options}
-        )
-    endif()
-    # TODO: update source so they refernece individual libraries instead of
-    #   dpctl4pybind11.hpp. It will allow to simplify dependency tree
-    target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
-    if (DPCTL_WITH_REDIST)
-        set_target_properties(
-            ${python_module_name}
-            PROPERTIES
-                INSTALL_RPATH "$ORIGIN/../../../.."
-        )
-    endif()
-    install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
-endforeach()
diff --git a/dpctl/tensor/__init__.pxd b/dpctl/tensor/__init__.pxd
deleted file mode 100644
index 332516028b..0000000000
--- a/dpctl/tensor/__init__.pxd
+++ /dev/null
@@ -1,24 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-""" This file declares the extension types and functions for the Cython API
-    implemented in _usmarray.pyx file.
-"""
-
-# distutils: language = c++
-# cython: language_level=3
-
-from dpctl.tensor._usmarray cimport *
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
deleted file mode 100644
index 517d8b989f..0000000000
--- a/dpctl/tensor/__init__.py
+++ /dev/null
@@ -1,428 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-**Data Parallel Tensor** provides an N-dimensional array container
-backed by typed USM allocations and implements operations to
-create and manipulate such arrays, as well as perform operations
-on arrays in conformance with Python Array API standard.
-
-[ArrayAPI] https://data-apis.org/array-api
-"""
-
-# import for deprecation warning
-import warnings as _warnings
-
-from dpctl.tensor._copy_utils import asnumpy, astype, copy, from_numpy, to_numpy
-from dpctl.tensor._ctors import (
-    arange,
-    asarray,
-    empty,
-    empty_like,
-    eye,
-    full,
-    full_like,
-    linspace,
-    meshgrid,
-    ones,
-    ones_like,
-    tril,
-    triu,
-    zeros,
-    zeros_like,
-)
-from dpctl.tensor._data_types import (
-    bool,
-    complex64,
-    complex128,
-    dtype,
-    float16,
-    float32,
-    float64,
-    int8,
-    int16,
-    int32,
-    int64,
-    uint8,
-    uint16,
-    uint32,
-    uint64,
-)
-from dpctl.tensor._device import Device
-from dpctl.tensor._dldevice_conversions import (
-    dldevice_to_sycl_device,
-    sycl_device_to_dldevice,
-)
-from dpctl.tensor._dlpack import from_dlpack
-from dpctl.tensor._indexing_functions import (
-    extract,
-    nonzero,
-    place,
-    put,
-    put_along_axis,
-    take,
-    take_along_axis,
-)
-from dpctl.tensor._linear_algebra_functions import (
-    matmul,
-    matrix_transpose,
-    tensordot,
-    vecdot,
-)
-from dpctl.tensor._manipulation_functions import (
-    broadcast_arrays,
-    broadcast_to,
-    concat,
-    expand_dims,
-    flip,
-    moveaxis,
-    permute_dims,
-    repeat,
-    roll,
-    squeeze,
-    stack,
-    swapaxes,
-    tile,
-    unstack,
-)
-from dpctl.tensor._print import (
-    get_print_options,
-    print_options,
-    set_print_options,
-    usm_ndarray_repr,
-    usm_ndarray_str,
-)
-from dpctl.tensor._reshape import reshape
-from dpctl.tensor._search_functions import where
-from dpctl.tensor._statistical_functions import mean, std, var
-from dpctl.tensor._usmarray import DLDeviceType, usm_ndarray
-from dpctl.tensor._utility_functions import all, any, diff
-
-from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum
-from ._array_api import __array_api_version__, __array_namespace_info__
-from ._clip import clip
-from ._constants import e, inf, nan, newaxis, pi
-from ._elementwise_funcs import (
-    abs,
-    acos,
-    acosh,
-    add,
-    angle,
-    asin,
-    asinh,
-    atan,
-    atan2,
-    atanh,
-    bitwise_and,
-    bitwise_invert,
-    bitwise_left_shift,
-    bitwise_or,
-    bitwise_right_shift,
-    bitwise_xor,
-    cbrt,
-    ceil,
-    conj,
-    copysign,
-    cos,
-    cosh,
-    divide,
-    equal,
-    exp,
-    exp2,
-    expm1,
-    floor,
-    floor_divide,
-    greater,
-    greater_equal,
-    hypot,
-    imag,
-    isfinite,
-    isinf,
-    isnan,
-    less,
-    less_equal,
-    log,
-    log1p,
-    log2,
-    log10,
-    logaddexp,
-    logical_and,
-    logical_not,
-    logical_or,
-    logical_xor,
-    maximum,
-    minimum,
-    multiply,
-    negative,
-    nextafter,
-    not_equal,
-    positive,
-    pow,
-    proj,
-    real,
-    reciprocal,
-    remainder,
-    round,
-    rsqrt,
-    sign,
-    signbit,
-    sin,
-    sinh,
-    sqrt,
-    square,
-    subtract,
-    tan,
-    tanh,
-    trunc,
-)
-from ._reduction import (
-    argmax,
-    argmin,
-    count_nonzero,
-    logsumexp,
-    max,
-    min,
-    prod,
-    reduce_hypot,
-    sum,
-)
-from ._searchsorted import searchsorted
-from ._set_functions import (
-    isin,
-    unique_all,
-    unique_counts,
-    unique_inverse,
-    unique_values,
-)
-from ._sorting import argsort, sort, top_k
-from ._testing import allclose
-from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type
-
-# deprecation warning for the dpctl.tensor module
-_warnings.warn(
-    "dpctl.tensor is deprecated since dpctl 0.21.1 and will be removed in a "
-    "future release. The functionality will be moved to separate package, dpnp "
-    "(see: https://github.com/IntelPython/dpnp). After that, use "
-    "'import dpnp.tensor' instead.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-__all__ = [
-    "Device",
-    "usm_ndarray",
-    "arange",
-    "asarray",
-    "astype",
-    "copy",
-    "empty",
-    "zeros",
-    "ones",
-    "full",
-    "eye",
-    "linspace",
-    "empty_like",
-    "zeros_like",
-    "ones_like",
-    "full_like",
-    "flip",
-    "reshape",
-    "roll",
-    "concat",
-    "stack",
-    "broadcast_arrays",
-    "broadcast_to",
-    "expand_dims",
-    "permute_dims",
-    "squeeze",
-    "take",
-    "put",
-    "extract",
-    "place",
-    "nonzero",
-    "from_numpy",
-    "to_numpy",
-    "asnumpy",
-    "from_dlpack",
-    "tril",
-    "triu",
-    "where",
-    "matrix_transpose",
-    "all",
-    "any",
-    "dtype",
-    "isdtype",
-    "bool",
-    "int8",
-    "uint8",
-    "int16",
-    "uint16",
-    "int32",
-    "uint32",
-    "int64",
-    "uint64",
-    "float16",
-    "float32",
-    "float64",
-    "complex64",
-    "complex128",
-    "iinfo",
-    "finfo",
-    "unstack",
-    "moveaxis",
-    "swapaxes",
-    "can_cast",
-    "result_type",
-    "meshgrid",
-    "get_print_options",
-    "set_print_options",
-    "print_options",
-    "usm_ndarray_repr",
-    "usm_ndarray_str",
-    "newaxis",
-    "e",
-    "pi",
-    "nan",
-    "inf",
-    "abs",
-    "acos",
-    "acosh",
-    "add",
-    "asin",
-    "asinh",
-    "atan",
-    "atan2",
-    "atanh",
-    "bitwise_and",
-    "bitwise_invert",
-    "bitwise_left_shift",
-    "bitwise_or",
-    "bitwise_right_shift",
-    "bitwise_xor",
-    "ceil",
-    "conj",
-    "cos",
-    "cosh",
-    "divide",
-    "equal",
-    "exp",
-    "expm1",
-    "floor",
-    "floor_divide",
-    "greater",
-    "greater_equal",
-    "hypot",
-    "imag",
-    "isfinite",
-    "isinf",
-    "isnan",
-    "less",
-    "less_equal",
-    "log",
-    "logical_and",
-    "logical_not",
-    "logical_or",
-    "logical_xor",
-    "log1p",
-    "log2",
-    "log10",
-    "maximum",
-    "minimum",
-    "multiply",
-    "negative",
-    "not_equal",
-    "positive",
-    "pow",
-    "logaddexp",
-    "proj",
-    "real",
-    "remainder",
-    "round",
-    "sign",
-    "signbit",
-    "sin",
-    "sinh",
-    "sqrt",
-    "square",
-    "subtract",
-    "not_equal",
-    "floor_divide",
-    "sum",
-    "tan",
-    "tanh",
-    "trunc",
-    "allclose",
-    "repeat",
-    "tile",
-    "max",
-    "min",
-    "argmax",
-    "argmin",
-    "prod",
-    "cbrt",
-    "exp2",
-    "copysign",
-    "rsqrt",
-    "clip",
-    "logsumexp",
-    "reduce_hypot",
-    "mean",
-    "std",
-    "var",
-    "__array_api_version__",
-    "__array_namespace_info__",
-    "reciprocal",
-    "angle",
-    "sort",
-    "argsort",
-    "unique_all",
-    "unique_counts",
-    "unique_inverse",
-    "unique_values",
-    "matmul",
-    "tensordot",
-    "vecdot",
-    "searchsorted",
-    "cumulative_logsumexp",
-    "cumulative_prod",
-    "cumulative_sum",
-    "nextafter",
-    "diff",
-    "count_nonzero",
-    "DLDeviceType",
-    "take_along_axis",
-    "put_along_axis",
-    "top_k",
-    "dldevice_to_sycl_device",
-    "sycl_device_to_dldevice",
-    "isin",
-]
-
-
-def __getattr__(name: str):  # pragma: no cover
-    # per-attribute access deprecation notices per PEP 562
-    if name in __all__:
-        _warnings.warn(
-            f"dpctl.tensor.{name} is deprecated; dpctl.tensor is deprecated "
-            "since dpctl 0.21.1 and will be removed in a future release. The "
-            "functionality will be moved to separate package, dpnp (see: "
-            "https://github.com/IntelPython/dpnp). After that, use 'import "
-            "dpnp.tensor' instead.",
-            DeprecationWarning,
-            stacklevel=2,
-        )
-        return globals()[name]
-    raise AttributeError(f"module 'dpctl.tensor' has no attribute '{name}'")
diff --git a/dpctl/tensor/_accumulation.py b/dpctl/tensor/_accumulation.py
deleted file mode 100644
index 1006d222b9..0000000000
--- a/dpctl/tensor/_accumulation.py
+++ /dev/null
@@ -1,454 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_accumulation_impl as tai
-import dpctl.tensor._tensor_impl as ti
-from dpctl.tensor._type_utils import (
-    _default_accumulation_dtype,
-    _default_accumulation_dtype_fp_types,
-    _to_device_supported_dtype,
-)
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._numpy_helper import normalize_axis_index
-
-
-def _accumulate_common(
-    x,
-    axis,
-    dtype,
-    include_initial,
-    out,
-    _accumulate_fn,
-    _accumulate_include_initial_fn,
-    _dtype_supported,
-    _default_accumulation_type_fn,
-):
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    appended_axis = False
-    if x.ndim == 0:
-        x = x[dpt.newaxis]
-        appended_axis = True
-    nd = x.ndim
-    if axis is None:
-        if nd > 1:
-            raise ValueError(
-                "`axis` cannot be `None` for array of dimension `{}`".format(nd)
-            )
-        axis = 0
-    else:
-        axis = normalize_axis_index(axis, nd, "axis")
-    sh = x.shape
-    res_sh = (
-        sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh
-    )
-    a1 = axis + 1
-    if a1 == nd:
-        perm = list(range(nd))
-        arr = x
-    else:
-        perm = [i for i in range(nd) if i != axis] + [
-            axis,
-        ]
-        arr = dpt.permute_dims(x, perm)
-    q = x.sycl_queue
-    inp_dt = x.dtype
-    res_usm_type = x.usm_type
-    if dtype is None:
-        res_dt = _default_accumulation_type_fn(inp_dt, q)
-    else:
-        res_dt = dpt.dtype(dtype)
-        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
-
-    # checking now avoids unnecessary allocations
-    implemented_types = _dtype_supported(inp_dt, res_dt)
-    if dtype is None and not implemented_types:
-        raise RuntimeError(
-            "Automatically determined accumulation data type does not "
-            "have direct implementation"
-        )
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-        out_sh = out.shape
-        # append an axis to `out` if scalar
-        if appended_axis and not include_initial:
-            out = out[dpt.newaxis, ...]
-            orig_out = out
-            final_res_sh = res_sh[1:]
-        else:
-            final_res_sh = res_sh
-        if not out_sh == final_res_sh:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {final_res_sh}, got {out_sh}"
-            )
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, " f"got {out.dtype}"
-            )
-        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-        # permute out array dims if necessary
-        if a1 != nd:
-            out = dpt.permute_dims(out, perm)
-            orig_out = out
-        if ti._array_overlap(x, out) and implemented_types:
-            out = dpt.empty_like(out)
-    else:
-        out = dpt.empty(
-            res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        if a1 != nd:
-            out = dpt.permute_dims(out, perm)
-
-    _manager = SequentialOrderManager[q]
-    depends = _manager.submitted_events
-    if implemented_types:
-        if not include_initial:
-            ht_e, acc_ev = _accumulate_fn(
-                src=arr,
-                trailing_dims_to_accumulate=1,
-                dst=out,
-                sycl_queue=q,
-                depends=depends,
-            )
-        else:
-            ht_e, acc_ev = _accumulate_include_initial_fn(
-                src=arr, dst=out, sycl_queue=q, depends=depends
-            )
-        _manager.add_event_pair(ht_e, acc_ev)
-        if not (orig_out is None or out is orig_out):
-            # Copy the out data from temporary buffer to original memory
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev]
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            out = orig_out
-    else:
-        if _dtype_supported(res_dt, res_dt):
-            tmp = dpt.empty(
-                arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=arr, dst=tmp, sycl_queue=q, depends=depends
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            if not include_initial:
-                ht_e, acc_ev = _accumulate_fn(
-                    src=tmp,
-                    trailing_dims_to_accumulate=1,
-                    dst=out,
-                    sycl_queue=q,
-                    depends=[cpy_e],
-                )
-            else:
-                ht_e, acc_ev = _accumulate_include_initial_fn(
-                    src=tmp,
-                    dst=out,
-                    sycl_queue=q,
-                    depends=[cpy_e],
-                )
-            _manager.add_event_pair(ht_e, acc_ev)
-        else:
-            buf_dt = _default_accumulation_type_fn(inp_dt, q)
-            tmp = dpt.empty(
-                arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=arr, dst=tmp, sycl_queue=q, depends=depends
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            tmp_res = dpt.empty(
-                res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            if a1 != nd:
-                tmp_res = dpt.permute_dims(tmp_res, perm)
-            if not include_initial:
-                ht_e, acc_ev = _accumulate_fn(
-                    src=tmp,
-                    trailing_dims_to_accumulate=1,
-                    dst=tmp_res,
-                    sycl_queue=q,
-                    depends=[cpy_e],
-                )
-            else:
-                ht_e, acc_ev = _accumulate_include_initial_fn(
-                    src=tmp,
-                    dst=tmp_res,
-                    sycl_queue=q,
-                    depends=[cpy_e],
-                )
-            _manager.add_event_pair(ht_e, acc_ev)
-            ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev]
-            )
-            _manager.add_event_pair(ht_e_cpy2, cpy_e2)
-
-    if appended_axis:
-        out = dpt.squeeze(out)
-    if a1 != nd:
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt.permute_dims(out, inv_perm)
-
-    return out
-
-
-def cumulative_sum(
-    x, /, *, axis=None, dtype=None, include_initial=False, out=None
-):
-    """
-    cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False,
-                   out=None)
-
-    Calculates the cumulative sum of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which cumulative sum must be computed.
-            If `None`, the sum is computed over the entire array.
-            If `x` is a one-dimensional array, providing an `axis` is optional;
-            however, if `x` has more than one dimension, providing an `axis`
-            is required.
-            Default: `None`.
-        dtype (Optional[dtype]):
-            data type of the returned array. If `None`, the default data
-            type is inferred from the "kind" of the input array data type.
-
-                * If `x` has a real- or complex-valued floating-point data
-                  type, the returned array will have the same data type as
-                  `x`.
-                * If `x` has signed integral data type, the returned array
-                  will have the default signed integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has unsigned integral data type, the returned array
-                  will have the default unsigned integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has a boolean data type, the returned array will
-                  have the default signed integral type for the device
-                  where input array `x` is allocated.
-
-            If the data type (either specified or resolved) differs from the
-            data type of `x`, the input array elements are cast to the
-            specified data type before computing the cumulative sum.
-            Default: `None`.
-        include_initial (bool):
-            boolean indicating whether to include the initial value (i.e., the
-            additive identity, zero) as the first value along the provided axis
-            in the output. Default: `False`.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of `out` must match the expected shape and the
-            expected data type of the result or (if provided) `dtype`.
-            If `None` then a new array is returned. Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            an array containing cumulative sums. The returned array has the data
-            type as described in the `dtype` parameter description above.
-
-            The returned array shape is determined as follows:
-
-                * If `include_initial` is `False`, the returned array will
-                  have the same shape as `x`
-                * If `include_initial` is `True`, the returned array will
-                  have the same shape as `x` except the axis along which the
-                  cumulative sum is calculated, which will have size `N+1`
-
-            where `N` is the size of the axis the cumulative sums are computed
-            along.
-    """
-    return _accumulate_common(
-        x,
-        axis,
-        dtype,
-        include_initial,
-        out,
-        tai._cumsum_over_axis,
-        tai._cumsum_final_axis_include_initial,
-        tai._cumsum_dtype_supported,
-        _default_accumulation_dtype,
-    )
-
-
-def cumulative_prod(
-    x, /, *, axis=None, dtype=None, include_initial=False, out=None
-):
-    """
-    cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False,
-                   out=None)
-
-    Calculates the cumulative product of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which cumulative product must be computed.
-            If `None`, the product is computed over the entire array.
-            If `x` is a one-dimensional array, providing an `axis` is optional;
-            however, if `x` has more than one dimension, providing an `axis`
-            is required.
-            Default: `None`.
-        dtype (Optional[dtype]):
-            data type of the returned array. If `None`, the default data
-            type is inferred from the "kind" of the input array data type.
-
-                * If `x` has a real- or complex-valued floating-point data
-                  type, the returned array will have the same data type as
-                  `x`.
-                * If `x` has signed integral data type, the returned array
-                  will have the default signed integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has unsigned integral data type, the returned array
-                  will have the default unsigned integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has a boolean data type, the returned array will
-                  have the default signed integral type for the device
-                  where input array `x` is allocated.
-
-            If the data type (either specified or resolved) differs from the
-            data type of `x`, the input array elements are cast to the
-            specified data type before computing the cumulative product.
-            Default: `None`.
-        include_initial (bool):
-            boolean indicating whether to include the initial value (i.e., the
-            additive identity, zero) as the first value along the provided
-            axis in the output. Default: `False`.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of `out` must match the expected shape and the
-            expected data type of the result or (if provided) `dtype`.
-            If `None` then a new array is returned. Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            an array containing cumulative products. The returned array has
-            the data type as described in the `dtype` parameter description
-            above.
-
-            The returned array shape is determined as follows:
-
-                * If `include_initial` is `False`, the returned array will
-                  have the same shape as `x`
-                * If `include_initial` is `True`, the returned array will
-                  have the same shape as `x` except the axis along which the
-                  cumulative product is calculated, which will have size `N+1`
-
-            where `N` is the size of the axis the cumulative products are
-            computed along.
-    """
-    return _accumulate_common(
-        x,
-        axis,
-        dtype,
-        include_initial,
-        out,
-        tai._cumprod_over_axis,
-        tai._cumprod_final_axis_include_initial,
-        tai._cumprod_dtype_supported,
-        _default_accumulation_dtype,
-    )
-
-
-def cumulative_logsumexp(
-    x, /, *, axis=None, dtype=None, include_initial=False, out=None
-):
-    """
-    cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False,
-                   out=None)
-
-    Calculates the cumulative logsmumexp of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which cumulative logsumexp must be computed.
-            If `None`, the logsumexp is computed over the entire array.
-            If `x` is a one-dimensional array, providing an `axis` is optional;
-            however, if `x` has more than one dimension, providing an `axis`
-            is required.
-            Default: `None`.
-        dtype (Optional[dtype]):
-            data type of the returned array. If `None`, the default data
-            type is inferred from the "kind" of the input array data type.
-
-                * If `x` has a real- or complex-valued floating-point data
-                  type, the returned array will have the same data type as
-                  `x`.
-                * If `x` has signed integral data type, the returned array
-                  will have the default signed integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has unsigned integral data type, the returned array
-                  will have the default unsigned integral type for the device
-                  where input array `x` is allocated.
-                * If `x` has a boolean data type, the returned array will
-                  have the default signed integral type for the device
-                  where input array `x` is allocated.
-
-            If the data type (either specified or resolved) differs from the
-            data type of `x`, the input array elements are cast to the
-            specified data type before computing the cumulative logsumexp.
-            Default: `None`.
-        include_initial (bool):
-            boolean indicating whether to include the initial value (i.e., the
-            additive identity, zero) as the first value along the provided axis
-            in the output. Default: `False`.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of `out` must match the expected shape and the
-            expected data type of the result or (if provided) `dtype`.
-            If `None` then a new array is returned. Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            an array containing cumulative logsumexp results. The returned
-            array has the data type as described in the `dtype` parameter
-            description above.
-
-            The returned array shape is determined as follows:
-
-                * If `include_initial` is `False`, the returned array will
-                  have the same shape as `x`
-                * If `include_initial` is `True`, the returned array will
-                  have the same shape as `x` except the axis along which the
-                  cumulative logsumexp is calculated, which will have size
-                  `N+1`
-    """
-    return _accumulate_common(
-        x,
-        axis,
-        dtype,
-        include_initial,
-        out,
-        tai._cumlogsumexp_over_axis,
-        tai._cumlogsumexp_final_axis_include_initial,
-        tai._cumlogsumexp_dtype_supported,
-        _default_accumulation_dtype_fp_types,
-    )
diff --git a/dpctl/tensor/_array_api.py b/dpctl/tensor/_array_api.py
deleted file mode 100644
index ad798d4327..0000000000
--- a/dpctl/tensor/_array_api.py
+++ /dev/null
@@ -1,242 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._tensor_impl import (
-    default_device_complex_type,
-    default_device_fp_type,
-    default_device_index_type,
-    default_device_int_type,
-)
-
-
-def _isdtype_impl(dtype, kind):
-    if isinstance(kind, str):
-        if kind == "bool":
-            return dtype.kind == "b"
-        elif kind == "signed integer":
-            return dtype.kind == "i"
-        elif kind == "unsigned integer":
-            return dtype.kind == "u"
-        elif kind == "integral":
-            return dtype.kind in "iu"
-        elif kind == "real floating":
-            return dtype.kind == "f"
-        elif kind == "complex floating":
-            return dtype.kind == "c"
-        elif kind == "numeric":
-            return dtype.kind in "iufc"
-        else:
-            raise ValueError(f"Unrecognized data type kind: {kind}")
-
-    elif isinstance(kind, tuple):
-        return any(_isdtype_impl(dtype, k) for k in kind)
-    else:
-        raise TypeError(f"Unsupported type for dtype kind: {type(kind)}")
-
-
-def _get_device_impl(d):
-    if d is None:
-        return dpctl.select_default_device()
-    elif isinstance(d, dpctl.SyclDevice):
-        return d
-    elif isinstance(d, (dpt.Device, dpctl.SyclQueue)):
-        return d.sycl_device
-    else:
-        try:
-            return dpctl.SyclDevice(d)
-        except TypeError:
-            raise TypeError(f"Unsupported type for device argument: {type(d)}")
-
-
-__array_api_version__ = "2024.12"
-
-
-class Info:
-    """
-    namespace returned by ``__array_namespace_info__()``
-    """
-
-    def __init__(self):
-        self._capabilities = {
-            "boolean indexing": True,
-            "data-dependent shapes": True,
-            "max dimensions": None,
-        }
-        self._all_dtypes = {
-            "bool": dpt.bool,
-            "float32": dpt.float32,
-            "float64": dpt.float64,
-            "complex64": dpt.complex64,
-            "complex128": dpt.complex128,
-            "int8": dpt.int8,
-            "int16": dpt.int16,
-            "int32": dpt.int32,
-            "int64": dpt.int64,
-            "uint8": dpt.uint8,
-            "uint16": dpt.uint16,
-            "uint32": dpt.uint32,
-            "uint64": dpt.uint64,
-        }
-
-    def capabilities(self):
-        """
-        capabilities()
-
-        Returns a dictionary of ``dpctl``'s capabilities.
-
-        The dictionary contains the following keys:
-            ``"boolean indexing"``:
-                boolean indicating ``dpctl``'s support of boolean indexing.
-                Value: ``True``
-            ``"data-dependent shapes"``:
-                boolean indicating ``dpctl``'s support of data-dependent shapes.
-                Value: ``True``
-            ``max dimensions``:
-                integer indication the maximum array dimension supported by ``dpctl``.
-                Value: ``None``
-
-        Returns:
-            dict:
-                dictionary of ``dpctl``'s capabilities
-        """
-        return self._capabilities.copy()
-
-    def default_device(self):
-        """
-        default_device()
-
-        Returns the default SYCL device.
-        """
-        return dpctl.select_default_device()
-
-    def default_dtypes(self, *, device=None):
-        """
-        default_dtypes(*, device=None)
-
-        Returns a dictionary of default data types for ``device``.
-
-        Args:
-            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
-                array API concept of device used in getting default data types.
-                ``device`` can be ``None`` (in which case the default device
-                is used), an instance of :class:`dpctl.SyclDevice`, an instance
-                of :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
-                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
-                a filter selector string.
-                Default: ``None``.
-
-        Returns:
-            dict:
-                a dictionary of default data types for ``device``:
-
-                    - ``"real floating"``: dtype
-                    - ``"complex floating"``: dtype
-                    - ``"integral"``: dtype
-                    - ``"indexing"``: dtype
-        """
-        device = _get_device_impl(device)
-        return {
-            "real floating": dpt.dtype(default_device_fp_type(device)),
-            "complex floating": dpt.dtype(default_device_complex_type(device)),
-            "integral": dpt.dtype(default_device_int_type(device)),
-            "indexing": dpt.dtype(default_device_index_type(device)),
-        }
-
-    def dtypes(self, *, device=None, kind=None):
-        """
-        dtypes(*, device=None, kind=None)
-
-        Returns a dictionary of all Array API data types of a specified
-        ``kind`` supported by ``device``.
-
-        This dictionary only includes data types supported by the
-        `Python Array API <https://data-apis.org/array-api/latest/>`_
-        specification.
-
-        Args:
-            device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
-                array API concept of device used in getting default data types.
-                ``device`` can be ``None`` (in which case the default device is
-                used), an instance of :class:`dpctl.SyclDevice`, an instance of
-                :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
-                object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
-                a filter selector string.
-                Default: ``None``.
-
-            kind (Optional[str, Tuple[str, ...]]):
-                data type kind.
-
-                - if ``kind`` is ``None``, returns a dictionary of all data
-                  types supported by `device`
-                - if ``kind`` is a string, returns a dictionary containing the
-                  data types belonging to the data type kind specified.
-
-                  Supports:
-
-                  * ``"bool"``
-                  * ``"signed integer"``
-                  * ``"unsigned integer"``
-                  * ``"integral"``
-                  * ``"real floating"``
-                  * ``"complex floating"``
-                  * ``"numeric"``
-
-                - if ``kind`` is a tuple, the tuple represents a union of
-                  ``kind`` strings, and returns a dictionary containing data
-                  types corresponding to the-specified union.
-
-                Default: ``None``.
-
-        Returns:
-            dict:
-                a dictionary of the supported data types of the specified
-                ``kind``
-        """
-        device = _get_device_impl(device)
-        _fp64 = device.has_aspect_fp64
-        if kind is None:
-            return {
-                key: val
-                for key, val in self._all_dtypes.items()
-                if _fp64 or (key != "float64" and key != "complex128")
-            }
-        else:
-            return {
-                key: val
-                for key, val in self._all_dtypes.items()
-                if (_fp64 or (key != "float64" and key != "complex128"))
-                and _isdtype_impl(val, kind)
-            }
-
-    def devices(self):
-        """
-        devices()
-
-        Returns a list of supported devices.
-        """
-        return dpctl.get_devices()
-
-
-def __array_namespace_info__():
-    """
-    __array_namespace_info__()
-
-    Returns a namespace with Array API namespace inspection utilities.
-
-    """
-    return Info()
diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py
deleted file mode 100644
index 250f116927..0000000000
--- a/dpctl/tensor/_clip.py
+++ /dev/null
@@ -1,763 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as tei
-import dpctl.tensor._tensor_impl as ti
-from dpctl.tensor._copy_utils import (
-    _empty_like_orderK,
-    _empty_like_pair_orderK,
-    _empty_like_triple_orderK,
-)
-from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._scalar_utils import (
-    _get_dtype,
-    _get_queue_usm_type,
-    _get_shape,
-    _validate_dtype,
-)
-from ._type_utils import (
-    _resolve_one_strong_one_weak_types,
-    _resolve_one_strong_two_weak_types,
-)
-
-
-def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
-    "Checks if both types `arg1_dtype` and `arg2_dtype` can be"
-    "cast to `res_dtype` according to the rule `safe`"
-    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
-        return None, None, res_dtype
-
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
-        arg2_dtype, res_dtype, _fp16, _fp64
-    ):
-        # prevent unnecessary casting
-        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
-        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
-        return ret_buf1_dt, ret_buf2_dt, res_dtype
-    else:
-        return None, None, None
-
-
-def _clip_none(x, val, out, order, _binary_fn):
-    q1, x_usm_type = x.sycl_queue, x.usm_type
-    q2, val_usm_type = _get_queue_usm_type(val)
-    if q2 is None:
-        exec_q = q1
-        res_usm_type = x_usm_type
-    else:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2))
-        if exec_q is None:
-            raise ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        res_usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                x_usm_type,
-                val_usm_type,
-            )
-        )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-    x_shape = x.shape
-    val_shape = _get_shape(val)
-    if not isinstance(val_shape, (tuple, list)):
-        raise TypeError(
-            "Shape of arguments can not be inferred. "
-            "Arguments are expected to be "
-            "lists, tuples, or both"
-        )
-    try:
-        res_shape = _broadcast_shape_impl(
-            [
-                x_shape,
-                val_shape,
-            ]
-        )
-    except ValueError:
-        raise ValueError(
-            "operands could not be broadcast together with shapes "
-            f"{x_shape} and {val_shape}"
-        )
-    sycl_dev = exec_q.sycl_device
-    x_dtype = x.dtype
-    val_dtype = _get_dtype(val, sycl_dev)
-    if not _validate_dtype(val_dtype):
-        raise ValueError("Operands have unsupported data types")
-
-    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
-
-    res_dt = x.dtype
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
-        raise ValueError(
-            f"function 'clip' does not support input types "
-            f"({x_dtype}, {val_dtype}), "
-            "and the inputs could not be safely coerced to any "
-            "supported types according to the casting rule ''safe''."
-        )
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-
-        if out.shape != res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {res_shape}, got {out.shape}"
-            )
-
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, got {out.dtype}"
-            )
-
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-
-        if ti._array_overlap(x, out):
-            if not ti._same_logical_tensors(x, out):
-                out = dpt.empty_like(out)
-
-        if isinstance(val, dpt.usm_ndarray):
-            if (
-                ti._array_overlap(val, out)
-                and not ti._same_logical_tensors(val, out)
-                and val_dtype == res_dt
-            ):
-                out = dpt.empty_like(out)
-
-    if isinstance(val, dpt.usm_ndarray):
-        val_ary = val
-    else:
-        val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
-
-    if order == "A":
-        order = (
-            "F"
-            if all(
-                arr.flags.f_contiguous
-                for arr in (
-                    x,
-                    val_ary,
-                )
-            )
-            else "C"
-        )
-    if val_dtype == res_dt:
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    x, val_ary, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-        if x_shape != res_shape:
-            x = dpt.broadcast_to(x, res_shape)
-        if val_ary.shape != res_shape:
-            val_ary = dpt.broadcast_to(val_ary, res_shape)
-        _manager = SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
-        ht_binary_ev, binary_ev = _binary_fn(
-            src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_binary_ev, binary_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[binary_ev],
-            )
-            _manager.add_event_pair(ht_copy_out_ev, copy_ev)
-            out = orig_out
-        return out
-    else:
-        if order == "K":
-            buf = _empty_like_orderK(val_ary, res_dt)
-        else:
-            buf = dpt.empty_like(val_ary, dtype=res_dt, order=order)
-        _manager = SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    x, buf, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-
-        if x_shape != res_shape:
-            x = dpt.broadcast_to(x, res_shape)
-        buf = dpt.broadcast_to(buf, res_shape)
-        ht_binary_ev, binary_ev = _binary_fn(
-            src1=x,
-            src2=buf,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_binary_ev, binary_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[binary_ev],
-            )
-            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-            out = orig_out
-        return out
-
-
-def clip(x, /, min=None, max=None, out=None, order="K"):
-    """clip(x, min=None, max=None, out=None, order="K")
-
-    Clips to the range [`min_i`, `max_i`] for each element `x_i`
-    in `x`.
-
-    Args:
-        x (usm_ndarray): Array containing elements to clip.
-            Must be compatible with `min` and `max` according
-            to broadcasting rules.
-        min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
-            Array containing minimum values.
-            Must be compatible with `x` and `max` according
-            to broadcasting rules.
-        max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional):
-            Array containing maximum values.
-            Must be compatible with `x` and `min` according
-            to broadcasting rules.
-        out ({None, usm_ndarray}, optional):
-            Output array to populate.
-            Array must have the correct shape and the expected data type.
-        order ("C","F","A","K", optional):
-            Memory layout of the newly output array, if parameter `out` is
-            `None`.
-            Default: "K".
-
-    Returns:
-        usm_ndarray:
-            An array with elements clipped to the range [`min`, `max`].
-            The returned array has the same data type as `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
-            f"{type(x)}"
-        )
-    if order not in ["K", "C", "F", "A"]:
-        order = "K"
-    if x.dtype.kind in "iu":
-        if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min:
-            min = None
-        if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max:
-            max = None
-    if min is None and max is None:
-        exec_q = x.sycl_queue
-        orig_out = out
-        if out is not None:
-            if not isinstance(out, dpt.usm_ndarray):
-                raise TypeError(
-                    "output array must be of usm_ndarray type, got "
-                    f"{type(out)}"
-                )
-
-            if not out.flags.writable:
-                raise ValueError("provided `out` array is read-only")
-
-            if out.shape != x.shape:
-                raise ValueError(
-                    "The shape of input and output arrays are "
-                    f"inconsistent. Expected output shape is {x.shape}, "
-                    f"got {out.shape}"
-                )
-
-            if x.dtype != out.dtype:
-                raise ValueError(
-                    f"Output array of type {x.dtype} is needed, "
-                    f"got {out.dtype}"
-                )
-
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
-                    "Input and output allocation queues are not compatible"
-                )
-
-            if ti._array_overlap(x, out):
-                if not ti._same_logical_tensors(x, out):
-                    out = dpt.empty_like(out)
-                else:
-                    return out
-        else:
-            if order == "K":
-                out = _empty_like_orderK(x, x.dtype)
-            else:
-                out = dpt.empty_like(x, order=order)
-
-        _manager = SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x, dst=out, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_copy_ev, cpy_ev)
-            out = orig_out
-        return out
-    elif max is None:
-        return _clip_none(x, min, out, order, tei._maximum)
-    elif min is None:
-        return _clip_none(x, max, out, order, tei._minimum)
-    else:
-        q1, x_usm_type = x.sycl_queue, x.usm_type
-        q2, min_usm_type = _get_queue_usm_type(min)
-        q3, max_usm_type = _get_queue_usm_type(max)
-        if q2 is None and q3 is None:
-            exec_q = q1
-            res_usm_type = x_usm_type
-        elif q3 is None:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    min_usm_type,
-                )
-            )
-        elif q2 is None:
-            exec_q = dpctl.utils.get_execution_queue((q1, q3))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    max_usm_type,
-                )
-            )
-        else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    min_usm_type,
-                    max_usm_type,
-                )
-            )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-        x_shape = x.shape
-        min_shape = _get_shape(min)
-        max_shape = _get_shape(max)
-        if not all(
-            isinstance(s, (tuple, list))
-            for s in (
-                min_shape,
-                max_shape,
-            )
-        ):
-            raise TypeError(
-                "Shape of arguments can not be inferred. "
-                "Arguments are expected to be "
-                "lists, tuples, or both"
-            )
-        try:
-            res_shape = _broadcast_shape_impl(
-                [
-                    x_shape,
-                    min_shape,
-                    max_shape,
-                ]
-            )
-        except ValueError:
-            raise ValueError(
-                "operands could not be broadcast together with shapes "
-                f"{x_shape}, {min_shape}, and {max_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        x_dtype = x.dtype
-        min_dtype = _get_dtype(min, sycl_dev)
-        max_dtype = _get_dtype(max, sycl_dev)
-        if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)):
-            raise ValueError("Operands have unsupported data types")
-
-        min_dtype, max_dtype = _resolve_one_strong_two_weak_types(
-            x_dtype, min_dtype, max_dtype, sycl_dev
-        )
-
-        buf1_dt, buf2_dt, res_dt = _check_clip_dtypes(
-            x_dtype,
-            min_dtype,
-            max_dtype,
-            sycl_dev,
-        )
-
-        if res_dt is None:
-            raise ValueError(
-                f"function '{clip}' does not support input types "
-                f"({x_dtype}, {min_dtype}, {max_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
-
-        orig_out = out
-        if out is not None:
-            if not isinstance(out, dpt.usm_ndarray):
-                raise TypeError(
-                    "output array must be of usm_ndarray type, got "
-                    f"{type(out)}"
-                )
-
-            if not out.flags.writable:
-                raise ValueError("provided `out` array is read-only")
-
-            if out.shape != res_shape:
-                raise ValueError(
-                    "The shape of input and output arrays are "
-                    f"inconsistent. Expected output shape is {res_shape}, "
-                    f"got {out.shape}"
-                )
-
-            if res_dt != out.dtype:
-                raise ValueError(
-                    f"Output array of type {res_dt} is needed, "
-                    f"got {out.dtype}"
-                )
-
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
-                    "Input and output allocation queues are not compatible"
-                )
-
-            if ti._array_overlap(x, out):
-                if not ti._same_logical_tensors(x, out):
-                    out = dpt.empty_like(out)
-
-            if isinstance(min, dpt.usm_ndarray):
-                if (
-                    ti._array_overlap(min, out)
-                    and not ti._same_logical_tensors(min, out)
-                    and buf1_dt is None
-                ):
-                    out = dpt.empty_like(out)
-
-            if isinstance(max, dpt.usm_ndarray):
-                if (
-                    ti._array_overlap(max, out)
-                    and not ti._same_logical_tensors(max, out)
-                    and buf2_dt is None
-                ):
-                    out = dpt.empty_like(out)
-
-        if isinstance(min, dpt.usm_ndarray):
-            a_min = min
-        else:
-            a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
-        if isinstance(max, dpt.usm_ndarray):
-            a_max = max
-        else:
-            a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
-
-        if order == "A":
-            order = (
-                "F"
-                if all(
-                    arr.flags.f_contiguous
-                    for arr in (
-                        x,
-                        a_min,
-                        a_max,
-                    )
-                )
-                else "C"
-            )
-        if buf1_dt is None and buf2_dt is None:
-            if out is None:
-                if order == "K":
-                    out = _empty_like_triple_orderK(
-                        x,
-                        a_min,
-                        a_max,
-                        res_dt,
-                        res_shape,
-                        res_usm_type,
-                        exec_q,
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-            if x_shape != res_shape:
-                x = dpt.broadcast_to(x, res_shape)
-            if a_min.shape != res_shape:
-                a_min = dpt.broadcast_to(a_min, res_shape)
-            if a_max.shape != res_shape:
-                a_max = dpt.broadcast_to(a_max, res_shape)
-            _manager = SequentialOrderManager[exec_q]
-            dep_ev = _manager.submitted_events
-            ht_binary_ev, binary_ev = ti._clip(
-                src=x,
-                min=a_min,
-                max=a_max,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=dep_ev,
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-
-        elif buf1_dt is None:
-            if order == "K":
-                buf2 = _empty_like_orderK(a_max, buf2_dt)
-            else:
-                buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
-            _manager = SequentialOrderManager[exec_q]
-            dep_ev = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-            if out is None:
-                if order == "K":
-                    out = _empty_like_triple_orderK(
-                        x,
-                        a_min,
-                        buf2,
-                        res_dt,
-                        res_shape,
-                        res_usm_type,
-                        exec_q,
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-
-            x = dpt.broadcast_to(x, res_shape)
-            if a_min.shape != res_shape:
-                a_min = dpt.broadcast_to(a_min, res_shape)
-            buf2 = dpt.broadcast_to(buf2, res_shape)
-            ht_binary_ev, binary_ev = ti._clip(
-                src=x,
-                min=a_min,
-                max=buf2,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-
-        elif buf2_dt is None:
-            if order == "K":
-                buf1 = _empty_like_orderK(a_min, buf1_dt)
-            else:
-                buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
-            _manager = SequentialOrderManager[exec_q]
-            dep_ev = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_ev
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-            if out is None:
-                if order == "K":
-                    out = _empty_like_triple_orderK(
-                        x,
-                        buf1,
-                        a_max,
-                        res_dt,
-                        res_shape,
-                        res_usm_type,
-                        exec_q,
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-
-            x = dpt.broadcast_to(x, res_shape)
-            buf1 = dpt.broadcast_to(buf1, res_shape)
-            if a_max.shape != res_shape:
-                a_max = dpt.broadcast_to(a_max, res_shape)
-            ht_binary_ev, binary_ev = ti._clip(
-                src=x,
-                min=buf1,
-                max=a_max,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-
-        if order == "K":
-            if (
-                x.flags.c_contiguous
-                and a_min.flags.c_contiguous
-                and a_max.flags.c_contiguous
-            ):
-                order = "C"
-            elif (
-                x.flags.f_contiguous
-                and a_min.flags.f_contiguous
-                and a_max.flags.f_contiguous
-            ):
-                order = "F"
-        if order == "K":
-            buf1 = _empty_like_orderK(a_min, buf1_dt)
-        else:
-            buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
-
-        _manager = SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
-        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-        if order == "K":
-            buf2 = _empty_like_orderK(a_max, buf2_dt)
-        else:
-            buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
-        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_triple_orderK(
-                    x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-
-        x = dpt.broadcast_to(x, res_shape)
-        buf1 = dpt.broadcast_to(buf1, res_shape)
-        buf2 = dpt.broadcast_to(buf2, res_shape)
-        ht_, clip_ev = ti._clip(
-            src=x,
-            min=buf1,
-            max=buf2,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy1_ev, copy2_ev],
-        )
-        _manager.add_event_pair(ht_, clip_ev)
-        return out
diff --git a/dpctl/tensor/_constants.py b/dpctl/tensor/_constants.py
deleted file mode 100644
index 7401b2ccff..0000000000
--- a/dpctl/tensor/_constants.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-
-newaxis = None
-
-pi = np.pi
-e = np.e
-nan = np.nan
-inf = np.inf
diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py
deleted file mode 100644
index 88fd1acdb7..0000000000
--- a/dpctl/tensor/_copy_utils.py
+++ /dev/null
@@ -1,1147 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import builtins
-import operator
-from numbers import Integral
-
-import numpy as np
-
-import dpctl
-import dpctl.memory as dpm
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
-from dpctl.tensor._type_utils import _dtype_supported_by_device_impl
-
-from ._numpy_helper import normalize_axis_index
-
-__doc__ = (
-    "Implementation module for copy- and cast- operations on "
-    ":class:`dpctl.tensor.usm_ndarray`."
-)
-
-int32_t_max = 1 + np.iinfo(np.int32).max
-
-
-def _copy_to_numpy(ary):
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}")
-    if ary.size == 0:
-        # no data needs to be copied for zero sized array
-        return np.ndarray(ary.shape, dtype=ary.dtype)
-    nb = ary.usm_data.nbytes
-    q = ary.sycl_queue
-    hh = dpm.MemoryUSMHost(nb, queue=q)
-    h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype)
-    itsz = ary.itemsize
-    strides_bytes = tuple(si * itsz for si in ary.strides)
-    offset = ary._element_offset * itsz
-    # ensure that content of ary.usm_data is final
-    q.wait()
-    hh.copy_from_device(ary.usm_data)
-    return np.ndarray(
-        ary.shape,
-        dtype=ary.dtype,
-        buffer=h,
-        strides=strides_bytes,
-        offset=offset,
-    )
-
-
-def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
-    "Copies numpy array `np_ary` into a new usm_ndarray"
-    # This may perform a copy to meet stated requirements
-    Xnp = np.require(np_ary, requirements=["A", "E"])
-    alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None)
-    dt = Xnp.dtype
-    if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False:
-        Xusm_dtype = (
-            dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64")
-        )
-    else:
-        Xusm_dtype = dt
-    Xusm = dpt.empty(
-        Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
-    )
-    _copy_from_numpy_into(Xusm, Xnp)
-    return Xusm
-
-
-def _copy_from_numpy_into(dst, np_ary):
-    "Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray"
-    if not isinstance(np_ary, np.ndarray):
-        raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}")
-    if not isinstance(dst, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(dst)}")
-    if np_ary.flags["OWNDATA"]:
-        Xnp = np_ary
-    else:
-        # Determine base of input array
-        base = np_ary.base
-        while isinstance(base, np.ndarray):
-            base = base.base
-        if isinstance(base, dpm._memory._Memory):
-            # we must perform a copy, since subsequent
-            # _copy_numpy_ndarray_into_usm_ndarray is implemented using
-            # sycl::buffer, and using USM-pointers with sycl::buffer
-            # results is undefined behavior
-            Xnp = np_ary.copy()
-        else:
-            Xnp = np_ary
-    src_ary = np.broadcast_to(Xnp, dst.shape)
-    copy_q = dst.sycl_queue
-    if copy_q.sycl_device.has_aspect_fp64 is False:
-        src_ary_dt_c = src_ary.dtype.char
-        if src_ary_dt_c == "d":
-            src_ary = src_ary.astype(np.float32)
-        elif src_ary_dt_c == "D":
-            src_ary = src_ary.astype(np.complex64)
-    _manager = dpctl.utils.SequentialOrderManager[copy_q]
-    dep_ev = _manager.submitted_events
-    # synchronizing call
-    ti._copy_numpy_ndarray_into_usm_ndarray(
-        src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev
-    )
-
-
-def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None):
-    """
-    from_numpy(arg, device=None, usm_type="device", sycl_queue=None)
-
-    Creates :class:`dpctl.tensor.usm_ndarray` from instance of
-    :class:`numpy.ndarray`.
-
-    Args:
-        arg:
-            Input convertible to :class:`numpy.ndarray`
-        device (object): array API specification of device where the
-            output array is created. Device can be specified by
-            a filter selector string, an instance of
-            :class:`dpctl.SyclDevice`, an instance of
-            :class:`dpctl.SyclQueue`, or an instance of
-            :class:`dpctl.tensor.Device`. If the value is ``None``,
-            returned array is created on the default-selected device.
-            Default: ``None``
-        usm_type (str): The requested USM allocation type for the
-            output array. Recognized values are ``"device"``,
-            ``"shared"``, or ``"host"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            A SYCL queue that determines output array allocation device
-            as well as execution placement of data movement operations.
-            The ``device`` and ``sycl_queue`` arguments
-            are equivalent. Only one of them should be specified. If both
-            are provided, they must be consistent and result in using the
-            same execution queue. Default: ``None``
-
-    The returned array has the same shape, and the same data type kind.
-    If the device does not support the data type of input array, a
-    closest support data type of the same kind may be returned, e.g.
-    input array of type ``float16`` may be upcast to ``float32`` if the
-    target device does not support 16-bit floating point type.
-    """
-    q = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q)
-
-
-def to_numpy(usm_ary, /):
-    """
-    to_numpy(usm_ary)
-
-    Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
-    into :class:`numpy.ndarray` instance of the same shape and same data type.
-
-    Args:
-        usm_ary (usm_ndarray):
-            Input array
-    Returns:
-        :class:`numpy.ndarray`:
-            An instance of :class:`numpy.ndarray` populated with content of
-            ``usm_ary``
-    """
-    return _copy_to_numpy(usm_ary)
-
-
-def asnumpy(usm_ary):
-    """
-    asnumpy(usm_ary)
-
-    Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary``
-    into :class:`numpy.ndarray` instance of the same shape and same data
-    type.
-
-    Args:
-        usm_ary (usm_ndarray):
-            Input array
-    Returns:
-        :class:`numpy.ndarray`:
-            An instance of :class:`numpy.ndarray` populated with content
-            of ``usm_ary``
-    """
-    return _copy_to_numpy(usm_ary)
-
-
-class Dummy:
-    """
-    Helper class with specified ``__sycl_usm_array_interface__`` attribute
-    """
-
-    def __init__(self, iface):
-        self.__sycl_usm_array_interface__ = iface
-
-
-def _copy_overlapping(dst, src):
-    """Assumes src and dst have the same shape."""
-    q = normalize_queue_device(sycl_queue=dst.sycl_queue)
-    tmp = dpt.usm_ndarray(
-        src.shape,
-        dtype=src.dtype,
-        buffer="device",
-        order="C",
-        buffer_ctor_kwargs={"queue": q},
-    )
-    _manager = dpctl.utils.SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=src, dst=tmp, sycl_queue=q, depends=dep_evs
-    )
-    _manager.add_event_pair(hcp1, cp1)
-    hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=tmp, dst=dst, sycl_queue=q, depends=[cp1]
-    )
-    _manager.add_event_pair(hcp2, cp2)
-
-
-def _copy_same_shape(dst, src):
-    """Assumes src and dst have the same shape."""
-    # check that memory regions do not overlap
-    if ti._array_overlap(dst, src):
-        if src._pointer == dst._pointer and (
-            src is dst
-            or (src.strides == dst.strides and src.dtype == dst.dtype)
-        ):
-            return
-        _copy_overlapping(src=src, dst=dst)
-        return
-
-    copy_q = dst.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[copy_q]
-    dep_evs = _manager.submitted_events
-    hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs
-    )
-    _manager.add_event_pair(hev, cpy_ev)
-
-
-if hasattr(np, "broadcast_shapes"):
-
-    def _broadcast_shapes(sh1, sh2):
-        return np.broadcast_shapes(sh1, sh2)
-
-else:
-
-    def _broadcast_shapes(sh1, sh2):
-        # use arrays with zero strides, whose memory footprint
-        # is independent of the number of array elements
-        return np.broadcast(
-            np.empty(sh1, dtype=[]),
-            np.empty(sh2, dtype=[]),
-        ).shape
-
-
-def _broadcast_strides(X_shape, X_strides, res_ndim):
-    """
-    Broadcasts strides to match the given dimensions;
-    returns tuple type strides.
-    """
-    out_strides = [0] * res_ndim
-    X_shape_len = len(X_shape)
-    str_dim = -X_shape_len
-    for i in range(X_shape_len):
-        shape_value = X_shape[i]
-        if not shape_value == 1:
-            out_strides[str_dim] = X_strides[i]
-        str_dim += 1
-
-    return tuple(out_strides)
-
-
-def _copy_from_usm_ndarray_to_usm_ndarray(dst, src):
-    if any(
-        not isinstance(arg, dpt.usm_ndarray)
-        for arg in (
-            dst,
-            src,
-        )
-    ):
-        raise TypeError(
-            "Both types are expected to be dpctl.tensor.usm_ndarray, "
-            f"got {type(dst)} and {type(src)}."
-        )
-
-    if dst.ndim == src.ndim and dst.shape == src.shape:
-        _copy_same_shape(dst, src)
-        return
-
-    try:
-        common_shape = _broadcast_shapes(dst.shape, src.shape)
-    except ValueError as exc:
-        raise ValueError("Shapes of two arrays are not compatible") from exc
-
-    if dst.size < src.size and dst.size < np.prod(common_shape):
-        raise ValueError("Destination is smaller ")
-
-    if len(common_shape) > dst.ndim:
-        ones_count = len(common_shape) - dst.ndim
-        for k in range(ones_count):
-            if common_shape[k] != 1:
-                raise ValueError
-        common_shape = common_shape[ones_count:]
-
-    if src.ndim < len(common_shape):
-        new_src_strides = _broadcast_strides(
-            src.shape, src.strides, len(common_shape)
-        )
-        src_same_shape = dpt.usm_ndarray(
-            common_shape,
-            dtype=src.dtype,
-            buffer=src,
-            strides=new_src_strides,
-            offset=src._element_offset,
-        )
-    elif src.ndim == len(common_shape):
-        new_src_strides = _broadcast_strides(
-            src.shape, src.strides, len(common_shape)
-        )
-        src_same_shape = dpt.usm_ndarray(
-            common_shape,
-            dtype=src.dtype,
-            buffer=src,
-            strides=new_src_strides,
-            offset=src._element_offset,
-        )
-    else:
-        # since broadcasting succeeded, src.ndim is greater because of
-        # leading sequence of ones, so we trim it
-        n = len(common_shape)
-        new_src_strides = _broadcast_strides(
-            src.shape[-n:], src.strides[-n:], n
-        )
-        src_same_shape = dpt.usm_ndarray(
-            common_shape,
-            dtype=src.dtype,
-            buffer=src.usm_data,
-            strides=new_src_strides,
-            offset=src._element_offset,
-        )
-
-    _copy_same_shape(dst, src_same_shape)
-
-
-def _make_empty_like_orderK(x, dt, usm_type, dev):
-    """
-    Returns empty array with shape and strides like `x`, with dtype `dt`,
-    USM type `usm_type`, on device `dev`.
-    """
-    st = list(x.strides)
-    perm = sorted(
-        range(x.ndim),
-        key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0,
-        reverse=True,
-    )
-    inv_perm = sorted(range(x.ndim), key=lambda i: perm[i])
-    sh = x.shape
-    sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
-    if min(st) < 0:
-        st_sorted = [st[i] for i in perm]
-        sl = tuple(
-            (
-                slice(None, None, -1)
-                if st_sorted[i] < 0
-                else slice(None, None, None)
-            )
-            for i in range(x.ndim)
-        )
-        R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
-
-
-def _empty_like_orderK(x, dt, usm_type=None, dev=None):
-    """
-    Returns empty array like `x`, using order='K'
-
-    For an array `x` that was obtained by permutation of a contiguous
-    array the returned array will have the same shape and the same
-    strides as `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(x)}")
-    if usm_type is None:
-        usm_type = x.usm_type
-    if dev is None:
-        dev = x.device
-    fl = x.flags
-    if fl["C"] or x.size <= 1:
-        return dpt.empty_like(
-            x, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    elif fl["F"]:
-        return dpt.empty_like(
-            x, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    return _make_empty_like_orderK(x, dt, usm_type, dev)
-
-
-def _from_numpy_empty_like_orderK(x, dt, usm_type, dev):
-    """
-    Returns empty usm_ndarray like NumPy array `x`, using order='K'
-
-    For an array `x` that was obtained by permutation of a contiguous
-    array the returned array will have the same shape and the same
-    strides as `x`.
-    """
-    if not isinstance(x, np.ndarray):
-        raise TypeError(f"Expected numpy.ndarray, got {type(x)}")
-    fl = x.flags
-    if fl["C"] or x.size <= 1:
-        return dpt.empty(
-            x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    elif fl["F"]:
-        return dpt.empty(
-            x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    return _make_empty_like_orderK(x, dt, usm_type, dev)
-
-
-def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
-    if not isinstance(X1, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
-    if not isinstance(X2, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
-    nd1 = X1.ndim
-    nd2 = X2.ndim
-    if nd1 > nd2 and X1.shape == res_shape:
-        return _empty_like_orderK(X1, dt, usm_type, dev)
-    elif nd1 < nd2 and X2.shape == res_shape:
-        return _empty_like_orderK(X2, dt, usm_type, dev)
-    fl1 = X1.flags
-    fl2 = X2.flags
-    if fl1["C"] or fl2["C"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    if fl1["F"] and fl2["F"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    st1 = list(X1.strides)
-    st2 = list(X2.strides)
-    max_ndim = max(nd1, nd2)
-    st1 += [0] * (max_ndim - len(st1))
-    st2 += [0] * (max_ndim - len(st2))
-    sh1 = list(X1.shape) + [0] * (max_ndim - nd1)
-    sh2 = list(X2.shape) + [0] * (max_ndim - nd2)
-    perm = sorted(
-        range(max_ndim),
-        key=lambda d: (
-            builtins.abs(st1[d]) if sh1[d] > 1 else 0,
-            builtins.abs(st2[d]) if sh2[d] > 1 else 0,
-        ),
-        reverse=True,
-    )
-    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
-    st1_sorted = [st1[i] for i in perm]
-    st2_sorted = [st2[i] for i in perm]
-    sh = res_shape
-    sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
-    if max(min(st1_sorted), min(st2_sorted)) < 0:
-        sl = tuple(
-            (
-                slice(None, None, -1)
-                if (st1_sorted[i] < 0 and st2_sorted[i] < 0)
-                else slice(None, None, None)
-            )
-            for i in range(nd1)
-        )
-        R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
-
-
-def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
-    if not isinstance(X1, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X1)}")
-    if not isinstance(X2, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X2)}")
-    if not isinstance(X3, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray, got {type(X3)}")
-    nd1 = X1.ndim
-    nd2 = X2.ndim
-    nd3 = X3.ndim
-    if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3:
-        return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev)
-    elif (
-        X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1
-    ):
-        return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev)
-    elif (
-        X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2
-    ):
-        return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev)
-    fl1 = X1.flags
-    fl2 = X2.flags
-    fl3 = X3.flags
-    if fl1["C"] or fl2["C"] or fl3["C"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
-        )
-    if fl1["F"] and fl2["F"] and fl3["F"]:
-        return dpt.empty(
-            res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
-        )
-    st1 = list(X1.strides)
-    st2 = list(X2.strides)
-    st3 = list(X3.strides)
-    max_ndim = max(nd1, nd2, nd3)
-    st1 += [0] * (max_ndim - len(st1))
-    st2 += [0] * (max_ndim - len(st2))
-    st3 += [0] * (max_ndim - len(st3))
-    sh1 = list(X1.shape) + [0] * (max_ndim - nd1)
-    sh2 = list(X2.shape) + [0] * (max_ndim - nd2)
-    sh3 = list(X3.shape) + [0] * (max_ndim - nd3)
-    perm = sorted(
-        range(max_ndim),
-        key=lambda d: (
-            builtins.abs(st1[d]) if sh1[d] > 1 else 0,
-            builtins.abs(st2[d]) if sh2[d] > 1 else 0,
-            builtins.abs(st3[d]) if sh3[d] > 1 else 0,
-        ),
-        reverse=True,
-    )
-    inv_perm = sorted(range(max_ndim), key=lambda i: perm[i])
-    st1_sorted = [st1[i] for i in perm]
-    st2_sorted = [st2[i] for i in perm]
-    st3_sorted = [st3[i] for i in perm]
-    sh = res_shape
-    sh_sorted = tuple(sh[i] for i in perm)
-    R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
-    if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0:
-        sl = tuple(
-            (
-                slice(None, None, -1)
-                if (
-                    st1_sorted[i] < 0
-                    and st2_sorted[i] < 0
-                    and st3_sorted[i] < 0
-                )
-                else slice(None, None, None)
-            )
-            for i in range(nd1)
-        )
-        R = R[sl]
-    return dpt.permute_dims(R, inv_perm)
-
-
-def copy(usm_ary, /, *, order="K"):
-    """copy(ary, order="K")
-
-    Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`.
-
-    Args:
-        ary (usm_ndarray):
-            Input array
-        order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional):
-            Controls the memory layout of the output array
-    Returns:
-        usm_ndarray:
-            A copy of the input array.
-
-    Memory layout of the copy is controlled by ``order`` keyword,
-    following NumPy's conventions. The ``order`` keywords can be
-    one of the following:
-
-    .. list-table::
-
-        * - ``"C"``
-          - C-contiguous memory layout
-        * - ``"F"``
-          - Fortran-contiguous memory layout
-        * - ``"A"``
-          - Fortran-contiguous if the input array is also Fortran-contiguous,
-            otherwise C-contiguous
-        * - ``"K"``
-          - match the layout of ``usm_ary`` as closely as possible.
-
-    """
-    if len(order) == 0 or order[0] not in "KkAaCcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
-        )
-    order = order[0].upper()
-    if not isinstance(usm_ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
-        )
-    copy_order = "C"
-    if order == "C":
-        pass
-    elif order == "F":
-        copy_order = order
-    elif order == "A":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    elif order == "K":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    else:
-        raise ValueError(
-            "Unrecognized value of the order keyword. "
-            "Recognized values are 'A', 'C', 'F', or 'K'"
-        )
-    if order == "K":
-        R = _empty_like_orderK(usm_ary, usm_ary.dtype)
-    else:
-        R = dpt.usm_ndarray(
-            usm_ary.shape,
-            dtype=usm_ary.dtype,
-            buffer=usm_ary.usm_type,
-            order=copy_order,
-            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
-        )
-    _copy_same_shape(R, usm_ary)
-    return R
-
-
-def astype(
-    usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None
-):
-    """ astype(array, new_dtype, order="K", casting="unsafe", \
-            copy=True, device=None)
-
-    Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a
-    specified type.
-
-    Args:
-        array (usm_ndarray):
-            An input array.
-        new_dtype (dtype):
-            The data type of the resulting array. If `None`, gives default
-            floating point type supported by device where the resulting array
-            will be located.
-        order ({"C", "F", "A", "K"}, optional):
-            Controls memory layout of the resulting array if a copy
-            is returned.
-        casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional):
-            Controls what kind of data casting may occur. Please see
-            :meth:`numpy.ndarray.astype` for description of casting modes.
-        copy (bool, optional):
-            By default, `astype` always returns a newly allocated array.
-            If this keyword is set to `False`, a view of the input array
-            may be returned when possible.
-        device (object): array API specification of device where the
-            output array is created. Device can be specified by
-            a filter selector string, an instance of
-            :class:`dpctl.SyclDevice`, an instance of
-            :class:`dpctl.SyclQueue`, or an instance of
-            :class:`dpctl.tensor.Device`. If the value is `None`,
-            returned array is created on the same device as `array`.
-            Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            An array with requested data type.
-
-    A view can be returned, if possible, when `copy=False` is used.
-    """
-    if not isinstance(usm_ary, dpt.usm_ndarray):
-        return TypeError(
-            f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}"
-        )
-    if len(order) == 0 or order[0] not in "KkAaCcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
-        )
-    order = order[0].upper()
-    ary_dtype = usm_ary.dtype
-    if device is not None:
-        if not isinstance(device, dpctl.SyclQueue):
-            if isinstance(device, dpt.Device):
-                device = device.sycl_queue
-            else:
-                device = dpt.Device.create_device(device).sycl_queue
-        d = device.sycl_device
-        target_dtype = _get_dtype(newdtype, device)
-        if not _dtype_supported_by_device_impl(
-            target_dtype, d.has_aspect_fp16, d.has_aspect_fp64
-        ):
-            raise ValueError(
-                f"Requested dtype '{target_dtype}' is not supported by the "
-                "target device"
-            )
-        usm_ary = usm_ary.to_device(device)
-    else:
-        target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
-
-    if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
-        raise TypeError(
-            f"Can not cast from {ary_dtype} to {newdtype} "
-            f"according to rule {casting}."
-        )
-    c_contig = usm_ary.flags.c_contiguous
-    f_contig = usm_ary.flags.f_contiguous
-    needs_copy = copy or not ary_dtype == target_dtype
-    if not needs_copy and (order != "K"):
-        # ensure that order="F" for C-contig input triggers copy,
-        # and order="C" for F-contig input triggers copy too.
-        # 1D arrays which are both C- and F- contig should not
-        # force copying for neither order="F", nor order="C", see gh-1926
-        needs_copy = (
-            c_contig and not f_contig and order not in ["A", "C"]
-        ) or (not c_contig and f_contig and order not in ["A", "F"])
-    if not needs_copy:
-        return usm_ary
-    copy_order = "C"
-    if order == "C":
-        pass
-    elif order == "F":
-        copy_order = order
-    elif order == "A":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    elif order == "K":
-        if usm_ary.flags.f_contiguous:
-            copy_order = "F"
-    else:
-        raise ValueError(
-            "Unrecognized value of the order keyword. "
-            "Recognized values are 'A', 'C', 'F', or 'K'"
-        )
-    if order == "K":
-        R = _empty_like_orderK(usm_ary, target_dtype)
-    else:
-        R = dpt.usm_ndarray(
-            usm_ary.shape,
-            dtype=target_dtype,
-            buffer=usm_ary.usm_type,
-            order=copy_order,
-            buffer_ctor_kwargs={"queue": usm_ary.sycl_queue},
-        )
-    _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary)
-    return R
-
-
-def _extract_impl(ary, ary_mask, axis=0):
-    """Extract elements of ary by applying mask starting from slot
-    dimension axis"""
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
-        )
-    if isinstance(ary_mask, dpt.usm_ndarray):
-        dst_usm_type = dpctl.utils.get_coerced_usm_type(
-            (ary.usm_type, ary_mask.usm_type)
-        )
-        exec_q = dpctl.utils.get_execution_queue(
-            (ary.sycl_queue, ary_mask.sycl_queue)
-        )
-        if exec_q is None:
-            raise dpctl.utils.ExecutionPlacementError(
-                "arrays have different associated queues. "
-                "Use `y.to_device(x.device)` to migrate."
-            )
-    elif isinstance(ary_mask, np.ndarray):
-        dst_usm_type = ary.usm_type
-        exec_q = ary.sycl_queue
-        ary_mask = dpt.asarray(
-            ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
-        )
-    else:
-        raise TypeError(
-            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
-            f"{type(ary_mask)}"
-        )
-    ary_nd = ary.ndim
-    pp = normalize_axis_index(operator.index(axis), ary_nd)
-    mask_nd = ary_mask.ndim
-    if pp < 0 or pp + mask_nd > ary_nd:
-        raise ValueError(
-            "Parameter p is inconsistent with input array dimensions"
-        )
-    mask_nelems = ary_mask.size
-    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
-    exec_q = cumsum.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    mask_count = ti.mask_positions(
-        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
-    )
-    dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
-    dst = dpt.empty(
-        dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
-    )
-    if dst.size == 0:
-        return dst
-    hev, ev = ti._extract(
-        src=ary,
-        cumsum=cumsum,
-        axis_start=pp,
-        axis_end=pp + mask_nd,
-        dst=dst,
-        sycl_queue=exec_q,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(hev, ev)
-    return dst
-
-
-def _nonzero_impl(ary):
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
-        )
-    exec_q = ary.sycl_queue
-    usm_type = ary.usm_type
-    mask_nelems = ary.size
-    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt.empty(
-        mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
-    )
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    mask_count = ti.mask_positions(
-        ary, cumsum, sycl_queue=exec_q, depends=dep_evs
-    )
-    indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
-    indexes = dpt.empty(
-        (ary.ndim, mask_count),
-        dtype=indexes_dt,
-        usm_type=usm_type,
-        sycl_queue=exec_q,
-        order="C",
-    )
-    hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q)
-    res = tuple(indexes[i, :] for i in range(ary.ndim))
-    _manager.add_event_pair(hev, nz_ev)
-    return res
-
-
-def _get_indices_queue_usm_type(inds, queue, usm_type):
-    """
-    Utility for validating indices are NumPy ndarray or usm_ndarray of integral
-    dtype or Python integers. At least one must be an array.
-
-    For each array, the queue and usm type are appended to `queue_list` and
-    `usm_type_list`, respectively.
-    """
-    queues = [queue]
-    usm_types = [usm_type]
-    any_array = False
-    for ind in inds:
-        if isinstance(ind, (np.ndarray, dpt.usm_ndarray)):
-            any_array = True
-            if ind.dtype.kind not in "ui":
-                raise IndexError(
-                    "arrays used as indices must be of integer (or boolean) "
-                    "type"
-                )
-            if isinstance(ind, dpt.usm_ndarray):
-                queues.append(ind.sycl_queue)
-                usm_types.append(ind.usm_type)
-        elif not isinstance(ind, Integral):
-            raise TypeError(
-                "all elements of `ind` expected to be usm_ndarrays, "
-                f"NumPy arrays, or integers, found {type(ind)}"
-            )
-    if not any_array:
-        raise TypeError(
-            "at least one element of `inds` expected to be an array"
-        )
-    usm_type = dpctl.utils.get_coerced_usm_type(usm_types)
-    q = dpctl.utils.get_execution_queue(queues)
-    return q, usm_type
-
-
-def _prepare_indices_arrays(inds, q, usm_type):
-    """
-    Utility taking a mix of usm_ndarray and possibly Python int scalar indices,
-    a queue (assumed to be common to arrays in inds), and a usm type.
-
-    Python scalar integers are promoted to arrays on the provided queue and
-    with the provided usm type. All arrays are then promoted to a common
-    integral type (if possible) before being broadcast to a common shape.
-    """
-    # scalar integers -> arrays
-    inds = tuple(
-        map(
-            lambda ind: (
-                ind
-                if isinstance(ind, dpt.usm_ndarray)
-                else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q)
-            ),
-            inds,
-        )
-    )
-
-    # promote to a common integral type if possible
-    ind_dt = dpt.result_type(*inds)
-    if ind_dt.kind not in "ui":
-        raise ValueError(
-            "cannot safely promote indices to an integer data type"
-        )
-    inds = tuple(
-        map(
-            lambda ind: (
-                ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt)
-            ),
-            inds,
-        )
-    )
-
-    # broadcast
-    inds = dpt.broadcast_arrays(*inds)
-
-    return inds
-
-
-def _take_multi_index(ary, inds, p, mode=0):
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
-        )
-    ary_nd = ary.ndim
-    p = normalize_axis_index(operator.index(p), ary_nd)
-    mode = operator.index(mode)
-    if mode not in [0, 1]:
-        raise ValueError(
-            "Invalid value for mode keyword, only 0 or 1 is supported"
-        )
-    if not isinstance(inds, (list, tuple)):
-        inds = (inds,)
-
-    exec_q, res_usm_type = _get_indices_queue_usm_type(
-        inds, ary.sycl_queue, ary.usm_type
-    )
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
-        )
-
-    inds = _prepare_indices_arrays(inds, exec_q, res_usm_type)
-
-    ind0 = inds[0]
-    ary_sh = ary.shape
-    p_end = p + len(inds)
-    if 0 in ary_sh[p:p_end] and ind0.size != 0:
-        raise IndexError("cannot take non-empty indices from an empty axis")
-    res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
-    res = dpt.empty(
-        res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
-    )
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_ev = _manager.submitted_events
-    hev, take_ev = ti._take(
-        src=ary,
-        ind=inds,
-        dst=res,
-        axis_start=p,
-        mode=mode,
-        sycl_queue=exec_q,
-        depends=dep_ev,
-    )
-    _manager.add_event_pair(hev, take_ev)
-    return res
-
-
-def _place_impl(ary, ary_mask, vals, axis=0):
-    """Extract elements of ary by applying mask starting from slot
-    dimension axis"""
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
-        )
-    if isinstance(ary_mask, dpt.usm_ndarray):
-        exec_q = dpctl.utils.get_execution_queue(
-            (
-                ary.sycl_queue,
-                ary_mask.sycl_queue,
-            )
-        )
-        coerced_usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                ary.usm_type,
-                ary_mask.usm_type,
-            )
-        )
-        if exec_q is None:
-            raise dpctl.utils.ExecutionPlacementError(
-                "arrays have different associated queues. "
-                "Use `y.to_device(x.device)` to migrate."
-            )
-    elif isinstance(ary_mask, np.ndarray):
-        exec_q = ary.sycl_queue
-        coerced_usm_type = ary.usm_type
-        ary_mask = dpt.asarray(
-            ary_mask, usm_type=coerced_usm_type, sycl_queue=exec_q
-        )
-    else:
-        raise TypeError(
-            "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
-            f"{type(ary_mask)}"
-        )
-    if exec_q is not None:
-        if not isinstance(vals, dpt.usm_ndarray):
-            vals = dpt.asarray(
-                vals,
-                dtype=ary.dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        else:
-            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
-            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    coerced_usm_type,
-                    vals.usm_type,
-                )
-            )
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
-            "arrays have different associated queues. "
-            "Use `Y.to_device(X.device)` to migrate."
-        )
-    ary_nd = ary.ndim
-    pp = normalize_axis_index(operator.index(axis), ary_nd)
-    mask_nd = ary_mask.ndim
-    if pp < 0 or pp + mask_nd > ary_nd:
-        raise ValueError(
-            "Parameter p is inconsistent with input array dimensions"
-        )
-    mask_nelems = ary_mask.size
-    cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
-    cumsum = dpt.empty(
-        mask_nelems,
-        dtype=cumsum_dt,
-        usm_type=coerced_usm_type,
-        device=ary_mask.device,
-    )
-    exec_q = cumsum.sycl_queue
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_ev = _manager.submitted_events
-    mask_count = ti.mask_positions(
-        ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
-    )
-    expected_vals_shape = (
-        ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
-    )
-    if vals.dtype == ary.dtype:
-        rhs = vals
-    else:
-        rhs = dpt.astype(vals, ary.dtype)
-    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
-    if mask_nelems == 0:
-        return
-    dep_ev = _manager.submitted_events
-    hev, pl_ev = ti._place(
-        dst=ary,
-        cumsum=cumsum,
-        axis_start=pp,
-        axis_end=pp + mask_nd,
-        rhs=rhs,
-        sycl_queue=exec_q,
-        depends=dep_ev,
-    )
-    _manager.add_event_pair(hev, pl_ev)
-    return
-
-
-def _put_multi_index(ary, inds, p, vals, mode=0):
-    if not isinstance(ary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
-        )
-    ary_nd = ary.ndim
-    p = normalize_axis_index(operator.index(p), ary_nd)
-    mode = operator.index(mode)
-    if mode not in [0, 1]:
-        raise ValueError(
-            "Invalid value for mode keyword, only 0 or 1 is supported"
-        )
-    if not isinstance(inds, (list, tuple)):
-        inds = (inds,)
-
-    exec_q, coerced_usm_type = _get_indices_queue_usm_type(
-        inds, ary.sycl_queue, ary.usm_type
-    )
-
-    if exec_q is not None:
-        if not isinstance(vals, dpt.usm_ndarray):
-            vals = dpt.asarray(
-                vals,
-                dtype=ary.dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        else:
-            exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
-            coerced_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    coerced_usm_type,
-                    vals.usm_type,
-                )
-            )
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
-            "Can not automatically determine where to allocate the "
-            "result or performance execution. "
-            "Use `usm_ndarray.to_device` method to migrate data to "
-            "be associated with the same queue."
-        )
-
-    inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type)
-
-    ind0 = inds[0]
-    ary_sh = ary.shape
-    p_end = p + len(inds)
-    if 0 in ary_sh[p:p_end] and ind0.size != 0:
-        raise IndexError(
-            "cannot put into non-empty indices along an empty axis"
-        )
-    expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
-    if vals.dtype == ary.dtype:
-        rhs = vals
-    else:
-        rhs = dpt.astype(vals, ary.dtype)
-    rhs = dpt.broadcast_to(rhs, expected_vals_shape)
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_ev = _manager.submitted_events
-    hev, put_ev = ti._put(
-        dst=ary,
-        ind=inds,
-        val=rhs,
-        axis_start=p,
-        mode=mode,
-        sycl_queue=exec_q,
-        depends=dep_ev,
-    )
-    _manager.add_event_pair(hev, put_ev)
-    return
diff --git a/dpctl/tensor/_ctors.py b/dpctl/tensor/_ctors.py
deleted file mode 100644
index 89588a5ce2..0000000000
--- a/dpctl/tensor/_ctors.py
+++ /dev/null
@@ -1,1959 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import operator
-from numbers import Number
-
-import numpy as np
-
-import dpctl
-import dpctl.memory as dpm
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils
-from dpctl.tensor._copy_utils import (
-    _empty_like_orderK,
-    _from_numpy_empty_like_orderK,
-)
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol
-
-__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`"
-
-_empty_tuple = tuple()
-_host_set = frozenset([None])
-
-
-def _array_info_dispatch(obj):
-    if isinstance(obj, dpt.usm_ndarray):
-        return obj.shape, obj.dtype, frozenset([obj.sycl_queue])
-    if isinstance(obj, np.ndarray):
-        return obj.shape, obj.dtype, _host_set
-    if isinstance(obj, range):
-        return (len(obj),), int, _host_set
-    if isinstance(obj, bool):
-        return _empty_tuple, bool, _host_set
-    if isinstance(obj, float):
-        return _empty_tuple, float, _host_set
-    if isinstance(obj, int):
-        return _empty_tuple, int, _host_set
-    if isinstance(obj, complex):
-        return _empty_tuple, complex, _host_set
-    if isinstance(
-        obj,
-        (
-            list,
-            tuple,
-        ),
-    ):
-        return _array_info_sequence(obj)
-    if _is_object_with_buffer_protocol(obj):
-        np_obj = np.array(obj)
-        return np_obj.shape, np_obj.dtype, _host_set
-    if hasattr(obj, "__usm_ndarray__"):
-        usm_ar = getattr(obj, "__usm_ndarray__")
-        if isinstance(usm_ar, dpt.usm_ndarray):
-            return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
-    if hasattr(obj, "__sycl_usm_array_interface__"):
-        usm_ar = _usm_ndarray_from_suai(obj)
-        return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
-    raise ValueError(type(obj))
-
-
-def _array_info_sequence(li):
-    if not isinstance(li, (list, tuple, range)):
-        raise TypeError(f"Expected list, tuple, or range, got {type(li)}")
-    n = len(li)
-    dim = None
-    dt = None
-    device = frozenset()
-    for el in li:
-        el_dim, el_dt, el_dev = _array_info_dispatch(el)
-        if dim is None:
-            dim = el_dim
-            dt = np.promote_types(el_dt, el_dt)
-            device = device.union(el_dev)
-        elif el_dim == dim:
-            dt = np.promote_types(dt, el_dt)
-            device = device.union(el_dev)
-        else:
-            raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}")
-    if dim is None:
-        dim = tuple()
-        dt = float
-        device = _host_set
-    return (n,) + dim, dt, device
-
-
-def _asarray_from_usm_ndarray(
-    usm_ndary,
-    dtype=None,
-    copy=None,
-    usm_type=None,
-    sycl_queue=None,
-    order="K",
-):
-    if not isinstance(usm_ndary, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ndary)}"
-        )
-    if usm_type is None:
-        usm_type = usm_ndary.usm_type
-    if sycl_queue is not None:
-        exec_q = dpctl.utils.get_execution_queue(
-            [usm_ndary.sycl_queue, sycl_queue]
-        )
-        copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q)
-    else:
-        copy_q = usm_ndary.sycl_queue
-    if dtype is None:
-        dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q)
-    # Conditions for zero copy:
-    can_zero_copy = copy is not True
-    #    dtype is unchanged
-    can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype
-    #    USM allocation type is unchanged
-    can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type
-    #    sycl_queue is unchanged
-    can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue
-    #    order is unchanged
-    c_contig = usm_ndary.flags.c_contiguous
-    f_contig = usm_ndary.flags.f_contiguous
-    fc_contig = usm_ndary.flags.forc
-    if can_zero_copy:
-        if order == "C" and c_contig:
-            pass
-        elif order == "F" and f_contig:
-            pass
-        elif order == "A" and fc_contig:
-            pass
-        elif order == "K":
-            pass
-        else:
-            can_zero_copy = False
-    if copy is False and can_zero_copy is False:
-        raise ValueError("asarray(..., copy=False) is not possible")
-    if can_zero_copy:
-        return usm_ndary
-    if order == "A":
-        order = "F" if f_contig and not c_contig else "C"
-    if order == "K" and fc_contig:
-        order = "C" if c_contig else "F"
-    if order == "K":
-        _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
-        res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q)
-    else:
-        _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
-        res = dpt.usm_ndarray(
-            usm_ndary.shape,
-            dtype=dtype,
-            buffer=usm_type,
-            order=order,
-            buffer_ctor_kwargs={"queue": copy_q},
-        )
-    eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q])
-    if eq is not None:
-        _manager = dpctl.utils.SequentialOrderManager[eq]
-        dep_evs = _manager.submitted_events
-        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, cpy_ev)
-    else:
-        tmp = dpt.asnumpy(usm_ndary)
-        res[...] = tmp
-    return res
-
-
-def _map_to_device_dtype(dt, q):
-    dtc = dt.char
-    if dtc == "?" or np.issubdtype(dt, np.integer):
-        return dt
-    d = q.sycl_device
-    if np.issubdtype(dt, np.floating):
-        if dtc == "f":
-            return dt
-        if dtc == "d" and d.has_aspect_fp64:
-            return dt
-        if dtc == "e" and d.has_aspect_fp16:
-            return dt
-        return dpt.dtype("f4")
-    if np.issubdtype(dt, np.complexfloating):
-        if dtc == "F":
-            return dt
-        if dtc == "D" and d.has_aspect_fp64:
-            return dt
-        return dpt.dtype("c8")
-    raise RuntimeError(f"Unrecognized data type '{dt}' encountered.")
-
-
-def _usm_ndarray_from_suai(obj):
-    sua_iface = getattr(obj, "__sycl_usm_array_interface__")
-    membuf = dpm.as_usm_memory(obj)
-    ary = dpt.usm_ndarray(
-        sua_iface["shape"],
-        dtype=sua_iface["typestr"],
-        buffer=membuf,
-        strides=sua_iface.get("strides", None),
-    )
-    _data_field = sua_iface["data"]
-    if isinstance(_data_field, tuple) and len(_data_field) > 1:
-        ro_field = _data_field[1]
-    else:
-        ro_field = False
-    if ro_field:
-        ary.flags["W"] = False
-    return ary
-
-
-def _asarray_from_numpy_ndarray(
-    ary, dtype=None, usm_type=None, sycl_queue=None, order="K"
-):
-    if not isinstance(ary, np.ndarray):
-        raise TypeError(f"Expected numpy.ndarray, got {type(ary)}")
-    if usm_type is None:
-        usm_type = "device"
-    copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue)
-    if ary.dtype.char not in "?bBhHiIlLqQefdFD":
-        raise TypeError(
-            f"Numpy array of data type {ary.dtype} is not supported. "
-            "Please convert the input to an array with numeric data type."
-        )
-    if dtype is None:
-        # deduce device-representable output data type
-        dtype = _map_to_device_dtype(ary.dtype, copy_q)
-    _ensure_native_dtype_device_support(dtype, copy_q.sycl_device)
-    f_contig = ary.flags["F"]
-    c_contig = ary.flags["C"]
-    fc_contig = f_contig or c_contig
-    if order == "A":
-        order = "F" if f_contig and not c_contig else "C"
-    if order == "K" and fc_contig:
-        order = "C" if c_contig else "F"
-    if order == "K":
-        # new USM allocation
-        res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q)
-    else:
-        res = dpt.usm_ndarray(
-            ary.shape,
-            dtype=dtype,
-            buffer=usm_type,
-            order=order,
-            buffer_ctor_kwargs={"queue": copy_q},
-        )
-    res[...] = ary
-    return res
-
-
-def _ensure_native_dtype_device_support(dtype, dev) -> None:
-    """Check that dtype is natively supported by device.
-
-    Arg:
-        dtype:
-            Elemental data-type
-        dev (:class:`dpctl.SyclDevice`):
-            The device about which the query is being made.
-    Returns:
-        None
-    Raise:
-        ValueError:
-            if device does not natively support this `dtype`.
-    """
-    if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64:
-        raise ValueError(
-            f"Device {dev.name} does not provide native support "
-            "for double-precision floating point type."
-        )
-    if (
-        dtype
-        in [
-            dpt.float16,
-        ]
-        and not dev.has_aspect_fp16
-    ):
-        raise ValueError(
-            f"Device {dev.name} does not provide native support "
-            "for half-precision floating point type."
-        )
-
-
-def _usm_types_walker(o, usm_types_list):
-    if isinstance(o, dpt.usm_ndarray):
-        usm_types_list.append(o.usm_type)
-        return
-    if hasattr(o, "__usm_ndarray__"):
-        usm_arr = getattr(o, "__usm_ndarray__")
-        if isinstance(usm_arr, dpt.usm_ndarray):
-            usm_types_list.append(usm_arr.usm_type)
-            return
-    if hasattr(o, "__sycl_usm_array_interface__"):
-        usm_ar = _usm_ndarray_from_suai(o)
-        usm_types_list.append(usm_ar.usm_type)
-        return
-    if _is_object_with_buffer_protocol(o):
-        return
-    if isinstance(o, (int, bool, float, complex)):
-        return
-    if isinstance(o, (list, tuple, range)):
-        for el in o:
-            _usm_types_walker(el, usm_types_list)
-        return
-    raise TypeError
-
-
-def _device_copy_walker(seq_o, res, _manager):
-    if isinstance(seq_o, dpt.usm_ndarray):
-        exec_q = res.sycl_queue
-        deps = _manager.submitted_events
-        ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=seq_o, dst=res, sycl_queue=exec_q, depends=deps
-        )
-        _manager.add_event_pair(ht_ev, cpy_ev)
-        return
-    if hasattr(seq_o, "__usm_ndarray__"):
-        usm_arr = getattr(seq_o, "__usm_ndarray__")
-        if isinstance(usm_arr, dpt.usm_ndarray):
-            _device_copy_walker(usm_arr, res, _manager)
-            return
-    if hasattr(seq_o, "__sycl_usm_array_interface__"):
-        usm_ar = _usm_ndarray_from_suai(seq_o)
-        exec_q = res.sycl_queue
-        deps = _manager.submitted_events
-        ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps
-        )
-        _manager.add_event_pair(ht_ev, cpy_ev)
-        return
-    if isinstance(seq_o, (list, tuple)):
-        for i, el in enumerate(seq_o):
-            _device_copy_walker(el, res[i], _manager)
-        return
-    raise TypeError
-
-
-def _copy_through_host_walker(seq_o, usm_res):
-    if isinstance(seq_o, dpt.usm_ndarray):
-        if (
-            dpctl.utils.get_execution_queue(
-                (
-                    usm_res.sycl_queue,
-                    seq_o.sycl_queue,
-                )
-            )
-            is None
-        ):
-            usm_res[...] = dpt.asnumpy(seq_o).copy()
-            return
-        else:
-            usm_res[...] = seq_o
-    if hasattr(seq_o, "__usm_ndarray__"):
-        usm_arr = getattr(seq_o, "__usm_ndarray__")
-        if isinstance(usm_arr, dpt.usm_ndarray):
-            _copy_through_host_walker(usm_arr, usm_res)
-            return
-    if hasattr(seq_o, "__sycl_usm_array_interface__"):
-        usm_ar = _usm_ndarray_from_suai(seq_o)
-        if (
-            dpctl.utils.get_execution_queue(
-                (
-                    usm_res.sycl_queue,
-                    usm_ar.sycl_queue,
-                )
-            )
-            is None
-        ):
-            usm_res[...] = dpt.asnumpy(usm_ar).copy()
-        else:
-            usm_res[...] = usm_ar
-        return
-    if _is_object_with_buffer_protocol(seq_o):
-        np_ar = np.asarray(seq_o)
-        usm_res[...] = np_ar
-        return
-    if isinstance(seq_o, (list, tuple)):
-        for i, el in enumerate(seq_o):
-            _copy_through_host_walker(el, usm_res[i])
-        return
-    usm_res[...] = np.asarray(seq_o)
-
-
-def _asarray_from_seq(
-    seq_obj,
-    seq_shape,
-    seq_dt,
-    alloc_q,
-    exec_q,
-    dtype=None,
-    usm_type=None,
-    order="C",
-):
-    "`seq_obj` is a sequence"
-    if usm_type is None:
-        usm_types_in_seq = []
-        _usm_types_walker(seq_obj, usm_types_in_seq)
-        usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq)
-    dpctl.utils.validate_usm_type(usm_type)
-    if dtype is None:
-        dtype = _map_to_device_dtype(seq_dt, alloc_q)
-    else:
-        _mapped_dt = _map_to_device_dtype(dtype, alloc_q)
-        if _mapped_dt != dtype:
-            raise ValueError(
-                f"Device {alloc_q.sycl_device} "
-                f"does not support {dtype} natively."
-            )
-        dtype = _mapped_dt
-    if order in "KA":
-        order = "C"
-    if isinstance(exec_q, dpctl.SyclQueue):
-        res = dpt.empty(
-            seq_shape,
-            dtype=dtype,
-            usm_type=usm_type,
-            sycl_queue=alloc_q,
-            order=order,
-        )
-        _manager = dpctl.utils.SequentialOrderManager[exec_q]
-        _device_copy_walker(seq_obj, res, _manager)
-        return res
-    else:
-        res = dpt.empty(
-            seq_shape,
-            dtype=dtype,
-            usm_type=usm_type,
-            sycl_queue=alloc_q,
-            order=order,
-        )
-        _copy_through_host_walker(seq_obj, res)
-        return res
-
-
-def _asarray_from_seq_single_device(
-    obj,
-    seq_shape,
-    seq_dt,
-    seq_dev,
-    dtype=None,
-    usm_type=None,
-    sycl_queue=None,
-    order="C",
-):
-    if sycl_queue is None:
-        exec_q = seq_dev
-        alloc_q = seq_dev
-    else:
-        exec_q = dpctl.utils.get_execution_queue(
-            (
-                sycl_queue,
-                seq_dev,
-            )
-        )
-        alloc_q = sycl_queue
-    return _asarray_from_seq(
-        obj,
-        seq_shape,
-        seq_dt,
-        alloc_q,
-        exec_q,
-        dtype=dtype,
-        usm_type=usm_type,
-        order=order,
-    )
-
-
-def asarray(
-    obj,
-    /,
-    *,
-    dtype=None,
-    device=None,
-    copy=None,
-    usm_type=None,
-    sycl_queue=None,
-    order="K",
-):
-    """
-    Converts input object to :class:`dpctl.tensor.usm_ndarray`.
-
-    Args:
-        obj: Python object to convert. Can be an instance of
-            :class:`dpctl.tensor.usm_ndarray`,
-            an object representing SYCL USM allocation and implementing
-            ``__sycl_usm_array_interface__`` protocol, an instance
-            of :class:`numpy.ndarray`, an object supporting Python buffer
-            protocol, a Python scalar, or a (possibly nested) sequence of
-            Python scalars.
-        dtype (data type, optional):
-            output array data type. If ``dtype`` is
-            ``None``, the output array data type is inferred from data types in
-            ``obj``. Default: ``None``
-        copy (`bool`, optional):
-            boolean indicating whether or not to copy the
-            input. If ``True``, always creates a copy. If ``False``, the
-            need to copy raises :exc:`ValueError`. If ``None``, tries to reuse
-            existing memory allocations if possible, but allows to perform
-            a copy otherwise. Default: ``None``
-        order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional):
-            memory layout of the output array. Default: ``"K"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Array created from input object.
-    """
-    # 1. Check that copy is a valid keyword
-    if copy not in [None, True, False]:
-        raise TypeError(
-            "Recognized copy keyword values should be True, False, or None"
-        )
-    # 2. Check that dtype is None, or a valid dtype
-    if dtype is not None:
-        dtype = dpt.dtype(dtype)
-    # 3. Validate order
-    if not isinstance(order, str):
-        raise TypeError(
-            f"Expected order keyword to be of type str, got {type(order)}"
-        )
-    if len(order) == 0 or order[0] not in "KkAaCcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'."
-        )
-    order = order[0].upper()
-    # 4. Check that usm_type is None, or a valid value
-    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
-    # 5. Normalize device/sycl_queue [keep it None if was None]
-    if device is not None or sycl_queue is not None:
-        sycl_queue = normalize_queue_device(
-            sycl_queue=sycl_queue, device=device
-        )
-
-    # handle instance(obj, usm_ndarray)
-    if isinstance(obj, dpt.usm_ndarray):
-        return _asarray_from_usm_ndarray(
-            obj,
-            dtype=dtype,
-            copy=copy,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-            order=order,
-        )
-    if hasattr(obj, "__usm_ndarray__"):
-        usm_arr = getattr(obj, "__usm_ndarray__")
-        if isinstance(usm_arr, dpt.usm_ndarray):
-            return _asarray_from_usm_ndarray(
-                usm_arr,
-                dtype=dtype,
-                copy=copy,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-                order=order,
-            )
-    if hasattr(obj, "__sycl_usm_array_interface__"):
-        ary = _usm_ndarray_from_suai(obj)
-        return _asarray_from_usm_ndarray(
-            ary,
-            dtype=dtype,
-            copy=copy,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-            order=order,
-        )
-    if isinstance(obj, np.ndarray):
-        if copy is False:
-            raise ValueError(
-                "Converting numpy.ndarray to usm_ndarray requires a copy"
-            )
-        return _asarray_from_numpy_ndarray(
-            obj,
-            dtype=dtype,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-            order=order,
-        )
-    if _is_object_with_buffer_protocol(obj):
-        if copy is False:
-            raise ValueError(
-                f"Converting {type(obj)} to usm_ndarray requires a copy"
-            )
-        return _asarray_from_numpy_ndarray(
-            np.array(obj),
-            dtype=dtype,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-            order=order,
-        )
-    if isinstance(obj, (list, tuple, range)):
-        if copy is False:
-            raise ValueError(
-                "Converting Python sequence to usm_ndarray requires a copy"
-            )
-        seq_shape, seq_dt, devs = _array_info_sequence(obj)
-        if devs == _host_set:
-            return _asarray_from_numpy_ndarray(
-                np.asarray(obj, dtype=dtype, order=order),
-                dtype=dtype,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-                order=order,
-            )
-        elif len(devs) == 1:
-            seq_dev = list(devs)[0]
-            return _asarray_from_seq_single_device(
-                obj,
-                seq_shape,
-                seq_dt,
-                seq_dev,
-                dtype=dtype,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-                order=order,
-            )
-        elif len(devs) > 1:
-            devs = [dev for dev in devs if dev is not None]
-            if sycl_queue is None:
-                if len(devs) == 1:
-                    alloc_q = devs[0]
-                else:
-                    raise dpctl.utils.ExecutionPlacementError(
-                        "Please specify `device` or `sycl_queue` keyword "
-                        "argument to determine where to allocate the "
-                        "resulting array."
-                    )
-            else:
-                alloc_q = sycl_queue
-            return _asarray_from_seq(
-                obj,
-                seq_shape,
-                seq_dt,
-                alloc_q,
-                #  force copying via host
-                None,
-                dtype=dtype,
-                usm_type=usm_type,
-                order=order,
-            )
-    if copy is False:
-        raise ValueError(
-            f"Converting {type(obj)} to usm_ndarray requires a copy"
-        )
-    # obj is a scalar, create 0d array
-    return _asarray_from_numpy_ndarray(
-        np.asarray(obj, dtype=dtype),
-        dtype=dtype,
-        usm_type=usm_type,
-        sycl_queue=sycl_queue,
-        order="C",
-    )
-
-
-def empty(
-    shape,
-    *,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type="device",
-    sycl_queue=None,
-):
-    """
-    Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized
-    USM allocation.
-
-    Args:
-        shape (Tuple[int], int):
-            Dimensions of the array to be created.
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. The ``None`` value creates an
-            array of floating point data type. Default: ``None``
-        order (``"C"``, or ``F"``):
-            memory layout for the array. Default: ``"C"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Created empty array.
-    """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
-        )
-    order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = _get_dtype(dtype, sycl_queue)
-    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-    res = dpt.usm_ndarray(
-        shape,
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    return res
-
-
-def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False):
-    "Deduce arange type from sequence spec"
-    nd, seq_dt, d = _array_info_sequence(args)
-    if d != _host_set or nd != (len(args),):
-        raise ValueError(err_msg)
-    dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt)
-    if np.issubdtype(dt, np.integer):
-        return tuple(int(v) for v in args), dt
-    if np.issubdtype(dt, np.floating):
-        return tuple(float(v) for v in args), dt
-    if np.issubdtype(dt, np.complexfloating):
-        return tuple(complex(v) for v in args), dt
-    if allow_bool and dt.char == "?":
-        return tuple(bool(v) for v in args), dt
-    raise ValueError(f"Data type {dt} is not supported")
-
-
-def _round_for_arange(tmp):
-    k = int(tmp)
-    if k >= 0 and float(k) < tmp:
-        tmp = tmp + 1
-    return tmp
-
-
-def _get_arange_length(start, stop, step):
-    "Compute length of arange sequence"
-    span = stop - start
-    if hasattr(step, "__float__") and hasattr(span, "__float__"):
-        return _round_for_arange(span / step)
-    tmp = span / step
-    if hasattr(tmp, "__complex__"):
-        tmp = complex(tmp)
-        tmp = tmp.real
-    else:
-        tmp = float(tmp)
-    return _round_for_arange(tmp)
-
-
-def _to_scalar(obj, sc_ty):
-    """A way to convert object to NumPy scalar type.
-    Raises OverflowError if obj can not be represented
-    using the requested scalar type.
-    """
-    zd_arr = np.asarray(obj, dtype=sc_ty)
-    return zd_arr[tuple()]
-
-
-def arange(
-    start,
-    /,
-    stop=None,
-    step=1,
-    *,
-    dtype=None,
-    device=None,
-    usm_type="device",
-    sycl_queue=None,
-):
-    """
-    Returns evenly spaced values within the half-open interval [start, stop)
-    as a one-dimensional array.
-
-    Args:
-        start:
-            Starting point of the interval
-        stop:
-            Ending point of the interval. Default: ``None``
-        step: Increment of the returned sequence. Default: ``1``
-        dtype: Output array data type. Default: ``None``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Array populated with evenly spaced values.
-    """
-    if stop is None:
-        stop = start
-        start = 0
-    if step is None:
-        step = 1
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    is_bool = False
-    if dtype:
-        is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool)
-    _, dt = _coerce_and_infer_dt(
-        start,
-        stop,
-        step,
-        dt=dpt.int8 if is_bool else dtype,
-        sycl_queue=sycl_queue,
-        err_msg="start, stop, and step must be Python scalars",
-        allow_bool=False,
-    )
-    try:
-        tmp = _get_arange_length(start, stop, step)
-        sh = max(int(tmp), 0)
-    except TypeError:
-        sh = 0
-    if is_bool and sh > 2:
-        raise ValueError("no fill-function for boolean data type")
-    res = dpt.usm_ndarray(
-        (sh,),
-        dtype=dt,
-        buffer=usm_type,
-        order="C",
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    sc_ty = dt.type
-    _first = _to_scalar(start, sc_ty)
-    if sh > 1:
-        _second = _to_scalar(start + step, sc_ty)
-        if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]:
-            int64_ty = dpt.int64.type
-            _step = int64_ty(_second) - int64_ty(_first)
-        else:
-            _step = _second - _first
-        _step = sc_ty(_step)
-    else:
-        _step = sc_ty(1)
-    _start = _first
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    # populating newly allocated array, no task dependencies
-    hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue)
-    _manager.add_event_pair(hev, lin_ev)
-    if is_bool:
-        res_out = dpt.usm_ndarray(
-            (sh,),
-            dtype=dpt.bool,
-            buffer=usm_type,
-            order="C",
-            buffer_ctor_kwargs={"queue": sycl_queue},
-        )
-        hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev]
-        )
-        _manager.add_event_pair(hev_cpy, cpy_ev)
-        return res_out
-    return res
-
-
-def zeros(
-    shape,
-    *,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type="device",
-    sycl_queue=None,
-):
-    """
-    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
-    shape and filled with zeros.
-
-    Args:
-        shape (Tuple[int], int):
-            Dimensions of the array to be created.
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. Default: ``None``
-        order ("C", or "F"):
-            memory layout for the array. Default: ``"C"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Constructed array initialized with zeros.
-    """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
-        )
-    order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = _get_dtype(dtype, sycl_queue)
-    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-    res = dpt.usm_ndarray(
-        shape,
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    # populating new allocation, no dependent events
-    hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue)
-    _manager.add_event_pair(hev, zeros_ev)
-
-    return res
-
-
-def ones(
-    shape,
-    *,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type="device",
-    sycl_queue=None,
-):
-    """ ones(shape, dtype=None, order="C", \
-             device=None, usm_type="device", sycl_queue=None)
-
-    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
-    shape and filled with ones.
-
-    Args:
-        shape (Tuple[int], int):
-            Dimensions of the array to be created.
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. Default: ``None``
-        order ("C", or "F"): memory layout for the array. Default: ``"C"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Created array initialized with ones.
-    """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
-        )
-    order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = _get_dtype(dtype, sycl_queue)
-    res = dpt.usm_ndarray(
-        shape,
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    # populating new allocation, no dependent events
-    hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
-    _manager.add_event_pair(hev, full_ev)
-    return res
-
-
-def _cast_fill_val(fill_val, dt):
-    """
-    Casts the Python scalar `fill_val` to another Python type coercible to the
-    requested data type `dt`, if necessary.
-    """
-    val_type = type(fill_val)
-    if val_type in [float, complex] and np.issubdtype(dt, np.integer):
-        return int(fill_val.real)
-    elif val_type is complex and np.issubdtype(dt, np.floating):
-        return fill_val.real
-    elif val_type is int and np.issubdtype(dt, np.integer):
-        return _to_scalar(fill_val, dt)
-    else:
-        return fill_val
-
-
-def _validate_fill_value(fill_val):
-    """
-    Validates that `fill_val` is a numeric or boolean scalar.
-    """
-    # TODO: verify if `np.True_` and `np.False_` should be instances of
-    # Number in NumPy, like other NumPy scalars and like Python bools
-    # check for `np.bool_` separately as NumPy<2 has no `np.bool`
-    if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_):
-        raise TypeError(
-            f"array cannot be filled with scalar of type {type(fill_val)}"
-        )
-
-
-def full(
-    shape,
-    fill_value,
-    *,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
-):
-    """
-    Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified
-    shape and filled with `fill_value`.
-
-    Args:
-        shape (tuple):
-            Dimensions of the array to be created.
-        fill_value (int,float,complex,usm_ndarray):
-            fill value
-        dtype (optional): data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. Default: ``None``
-        order ("C", or "F"):
-            memory layout for the array. Default: ``"C"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            New array initialized with given value.
-    """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
-        )
-    order = order[0].upper()
-    dpctl.utils.validate_usm_type(usm_type, allow_none=True)
-
-    if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
-        if (
-            isinstance(fill_value, dpt.usm_ndarray)
-            and sycl_queue is None
-            and device is None
-        ):
-            sycl_queue = fill_value.sycl_queue
-        else:
-            sycl_queue = normalize_queue_device(
-                sycl_queue=sycl_queue, device=device
-            )
-        X = dpt.asarray(
-            fill_value,
-            dtype=dtype,
-            order=order,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
-        return dpt.copy(dpt.broadcast_to(X, shape), order=order)
-    else:
-        _validate_fill_value(fill_value)
-
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    usm_type = usm_type if usm_type is not None else "device"
-    dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
-    res = dpt.usm_ndarray(
-        shape,
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    fill_value = _cast_fill_val(fill_value, dtype)
-
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    # populating new allocation, no dependent events
-    hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
-    _manager.add_event_pair(hev, full_ev)
-    return res
-
-
-def _normalize_order(order, arr):
-    """
-    Utility function for processing the `order` keyword of array-like
-    constructors, which support `"K"` and `"A"` orders.
-    """
-    arr_flags = arr.flags
-    f_contig = arr_flags["F"]
-    c_contig = arr_flags["C"]
-    if order == "A":
-        order = "F" if f_contig and not c_contig else "C"
-    if order == "K" and (f_contig or c_contig):
-        order = "C" if c_contig else "F"
-    return order
-
-
-def empty_like(
-    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
-):
-    """
-    Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the
-    same `shape` as the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            Input array from which to derive the output array shape.
-        dtype (optional):
-            data type of the array. Can be a typestring,
-            a :class:`numpy.dtype` object, NumPy char string,
-            or a NumPy scalar type. Default: ``None``
-        order ("C", "F", "A", or "K"):
-            memory layout for the array. Default: ``"K"``
-        device (optional): array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            Created empty array with uninitialized memory.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
-    if (
-        not isinstance(order, str)
-        or len(order) == 0
-        or order[0] not in "CcFfAaKk"
-    ):
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
-        )
-    order = order[0].upper()
-    if dtype is None:
-        dtype = x.dtype
-    if usm_type is None:
-        usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    if device is None and sycl_queue is None:
-        device = x.device
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = dpt.dtype(dtype)
-    order = _normalize_order(order, x)
-    if order == "K":
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        return _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-    else:
-        shape = x.shape
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        res = dpt.usm_ndarray(
-            shape,
-            dtype=dtype,
-            buffer=usm_type,
-            order=order,
-            buffer_ctor_kwargs={"queue": sycl_queue},
-        )
-        return res
-
-
-def zeros_like(
-    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
-):
-    """
-    Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation
-    initialized with zeros.
-
-    Args:
-        x (usm_ndarray):
-            Input array from which to derive the shape of the
-            output array.
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
-            NumPy scalar type. If `None`, output array has the same data
-            type as the input array. Default: ``None``
-        order ("C", or "F"):
-            memory layout for the array. Default: ``"C"``
-        device (optional):
-            array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            New array initialized with zeros.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
-    if (
-        not isinstance(order, str)
-        or len(order) == 0
-        or order[0] not in "CcFfAaKk"
-    ):
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
-        )
-    order = order[0].upper()
-    if dtype is None:
-        dtype = x.dtype
-    if usm_type is None:
-        usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    if device is None and sycl_queue is None:
-        device = x.device
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = dpt.dtype(dtype)
-    order = _normalize_order(order, x)
-    if order == "K":
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-        # populating new allocation, no dependent events
-        hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue)
-        _manager.add_event_pair(hev, full_ev)
-        return res
-    else:
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        sh = x.shape
-        return zeros(
-            sh,
-            dtype=dtype,
-            order=order,
-            device=device,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
-
-
-def ones_like(
-    x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None
-):
-    """
-    Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and
-    having the same `shape` as the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            Input array from which to derive the output array shape
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string,
-            or a NumPy scalar type. Default: `None`
-        order ("C", "F", "A", or "K"):
-            memory layout for the array. Default: ``"C"``
-        device (optional):
-            array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            New array initialized with ones.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
-    if (
-        not isinstance(order, str)
-        or len(order) == 0
-        or order[0] not in "CcFfAaKk"
-    ):
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
-        )
-    order = order[0].upper()
-    if dtype is None:
-        dtype = x.dtype
-    if usm_type is None:
-        usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    if device is None and sycl_queue is None:
-        device = x.device
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = dpt.dtype(dtype)
-    order = _normalize_order(order, x)
-    if order == "K":
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-        # populating new allocation, no dependent events
-        hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue)
-        _manager.add_event_pair(hev, full_ev)
-        return res
-    else:
-        sh = x.shape
-        return ones(
-            sh,
-            dtype=dtype,
-            order=order,
-            device=device,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
-
-
-def full_like(
-    x,
-    /,
-    fill_value,
-    *,
-    dtype=None,
-    order="K",
-    device=None,
-    usm_type=None,
-    sycl_queue=None,
-):
-    """ full_like(x, fill_value, dtype=None, order="K", \
-                  device=None, usm_type=None, sycl_queue=None)
-
-    Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value`
-    and having the same `shape` as the input array `x`.
-
-    Args:
-        x (usm_ndarray): Input array from which to derive the output array
-            shape.
-        fill_value: the value to fill output array with
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string, or a
-            NumPy scalar type. If ``dtype`` is ``None``, the output array data
-            type is inferred from ``x``. Default: ``None``
-        order ("C", "F", "A", or "K"):
-            memory layout for the array. Default: ``"K"``
-        device (optional):
-            array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            New array initialized with given value.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.")
-    if (
-        not isinstance(order, str)
-        or len(order) == 0
-        or order[0] not in "CcFfAaKk"
-    ):
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'."
-        )
-    order = order[0].upper()
-    if dtype is None:
-        dtype = x.dtype
-    if usm_type is None:
-        usm_type = x.usm_type
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    if device is None and sycl_queue is None:
-        device = x.device
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    sh = x.shape
-    dtype = dpt.dtype(dtype)
-    order = _normalize_order(order, x)
-    if order == "K":
-        _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-        if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
-            X = dpt.asarray(
-                fill_value,
-                dtype=dtype,
-                order=order,
-                usm_type=usm_type,
-                sycl_queue=sycl_queue,
-            )
-            X = dpt.broadcast_to(X, sh)
-            res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-            _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-            # order copy after tasks populating X
-            dep_evs = _manager.submitted_events
-            hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs
-            )
-            _manager.add_event_pair(hev, copy_ev)
-            return res
-        else:
-            _validate_fill_value(fill_value)
-
-        dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value))
-        res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
-        fill_value = _cast_fill_val(fill_value, dtype)
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-        # populating new allocation, no dependent events
-        hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue)
-        _manager.add_event_pair(hev, full_ev)
-        return res
-    else:
-        return full(
-            sh,
-            fill_value,
-            dtype=dtype,
-            order=order,
-            device=device,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
-
-
-def linspace(
-    start,
-    stop,
-    /,
-    num,
-    *,
-    dtype=None,
-    device=None,
-    endpoint=True,
-    sycl_queue=None,
-    usm_type="device",
-):
-    """
-    linspace(start, stop, num, dtype=None, device=None, endpoint=True, \
-        sycl_queue=None, usm_type="device")
-
-    Returns :class:`dpctl.tensor.usm_ndarray` array populated with
-    evenly spaced numbers of specified interval.
-
-    Args:
-        start:
-            the start of the interval.
-        stop:
-            the end of the interval. If the ``endpoint`` is ``False``, the
-            function generates ``num+1`` evenly spaced points starting
-            with ``start`` and ending with ``stop`` and exclude the
-            ``stop`` from the returned array such that the returned array
-            consists of evenly spaced numbers over the half-open interval
-            ``[start, stop)``. If ``endpoint`` is ``True``, the output
-            array consists of evenly spaced numbers over the closed
-            interval ``[start, stop]``. Default: ``True``
-        num (int):
-            number of samples. Must be a non-negative integer; otherwise,
-            the function raises ``ValueError`` exception.
-        dtype:
-            output array data type. Should be a floating data type.
-            If ``dtype`` is ``None``, the output array must be the default
-            floating point data type for target device.
-            Default: ``None``
-        device (optional):
-            array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-        endpoint: boolean indicating whether to include ``stop`` in the
-            interval. Default: ``True``
-
-    Returns:
-        usm_ndarray:
-            Array populated with evenly spaced numbers in the requested
-            interval.
-    """
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    if endpoint not in [True, False]:
-        raise TypeError("endpoint keyword argument must be of boolean type")
-    num = operator.index(num)
-    if num < 0:
-        raise ValueError("Number of points must be non-negative")
-    _, dt = _coerce_and_infer_dt(
-        start,
-        stop,
-        dt=dtype,
-        sycl_queue=sycl_queue,
-        err_msg="start and stop must be Python scalars.",
-        allow_bool=True,
-    )
-    int_dt = None
-    if np.issubdtype(dt, np.integer):
-        if dtype is not None:
-            int_dt = dt
-        dt = ti.default_device_fp_type(sycl_queue)
-        dt = dpt.dtype(dt)
-        start = float(start)
-        stop = float(stop)
-    res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
-    _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-    hev, la_ev = ti._linspace_affine(
-        start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue
-    )
-    _manager.add_event_pair(hev, la_ev)
-    return res if int_dt is None else dpt.astype(res, int_dt)
-
-
-def eye(
-    n_rows,
-    n_cols=None,
-    /,
-    *,
-    k=0,
-    dtype=None,
-    order="C",
-    device=None,
-    usm_type="device",
-    sycl_queue=None,
-):
-    """
-    eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \
-        device=None, usm_type="device", sycl_queue=None)
-
-    Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th
-    diagonal.
-
-    Args:
-        n_rows (int):
-            number of rows in the output array.
-        n_cols (int, optional):
-            number of columns in the output array. If ``None``,
-            ``n_cols = n_rows``. Default: ``None``
-        k (int):
-            index of the diagonal, with ``0`` as the main diagonal.
-            A positive value of ``k`` is a superdiagonal, a negative value
-            is a subdiagonal.
-            Raises :exc:`TypeError` if ``k`` is not an integer.
-            Default: ``0``
-        dtype (optional):
-            data type of the array. Can be typestring,
-            a :class:`numpy.dtype` object, :mod:`numpy` char string, or
-            a NumPy scalar type. Default: ``None``
-        order ("C" or "F"):
-            memory layout for the array. Default: ``"C"``
-        device (optional):
-            array API concept of device where the output array
-            is created. ``device`` can be ``None``, a oneAPI filter selector
-            string, an instance of :class:`dpctl.SyclDevice` corresponding to
-            a non-partitioned SYCL device, an instance of
-            :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object
-            returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            Default: ``None``
-        usm_type (``"device"``, ``"shared"``, ``"host"``, optional):
-            The type of SYCL USM allocation for the output array.
-            Default: ``"device"``
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            The SYCL queue to use
-            for output array allocation and copying. ``sycl_queue`` and
-            ``device`` are complementary arguments, i.e. use one or another.
-            If both are specified, a :exc:`TypeError` is raised unless both
-            imply the same underlying SYCL queue to be used. If both are
-            ``None``, a cached queue targeting default-selected device is
-            used for allocation and population. Default: ``None``
-
-    Returns:
-        usm_ndarray:
-            A diagonal matrix.
-    """
-    if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf":
-        raise ValueError(
-            "Unrecognized order keyword value, expecting 'F' or 'C'."
-        )
-    order = order[0].upper()
-    n_rows = operator.index(n_rows)
-    n_cols = n_rows if n_cols is None else operator.index(n_cols)
-    k = operator.index(k)
-    if k >= n_cols or -k >= n_rows:
-        return dpt.zeros(
-            (n_rows, n_cols),
-            dtype=dtype,
-            order=order,
-            device=device,
-            usm_type=usm_type,
-            sycl_queue=sycl_queue,
-        )
-    dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-    sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device)
-    dtype = _get_dtype(dtype, sycl_queue)
-    _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
-    res = dpt.usm_ndarray(
-        (n_rows, n_cols),
-        dtype=dtype,
-        buffer=usm_type,
-        order=order,
-        buffer_ctor_kwargs={"queue": sycl_queue},
-    )
-    if n_rows != 0 and n_cols != 0:
-        _manager = dpctl.utils.SequentialOrderManager[sycl_queue]
-        hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue)
-        _manager.add_event_pair(hev, eye_ev)
-    return res
-
-
-def tril(x, /, *, k=0):
-    """
-    Returns the lower triangular part of a matrix (or a stack of matrices)
-    ``x``.
-
-    The lower triangular part of the matrix is defined as the elements on and
-    below the specified diagonal ``k``.
-
-    Args:
-        x (usm_ndarray):
-            Input array
-        k (int, optional):
-            Specifies the diagonal above which to set
-            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
-            If ``k < 0``, the diagonal is below the main diagonal.
-            If ``k > 0``, the diagonal is above the main diagonal.
-            Default: ``0``
-
-    Returns:
-        usm_ndarray:
-            A lower-triangular array or a stack of lower-triangular arrays.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected argument of type dpctl.tensor.usm_ndarray, "
-            f"got {type(x)}."
-        )
-
-    k = operator.index(k)
-
-    order = "F" if (x.flags.f_contiguous) else "C"
-
-    shape = x.shape
-    nd = x.ndim
-    if nd < 2:
-        raise ValueError("Array dimensions less than 2.")
-
-    q = x.sycl_queue
-    if k >= shape[nd - 1] - 1:
-        res = dpt.empty(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-        _manager = dpctl.utils.SequentialOrderManager[q]
-        dep_evs = _manager.submitted_events
-        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x, dst=res, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, cpy_ev)
-    elif k < -shape[nd - 2]:
-        res = dpt.zeros(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-    else:
-        res = dpt.empty(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-        _manager = dpctl.utils.SequentialOrderManager[q]
-        dep_evs = _manager.submitted_events
-        hev, tril_ev = ti._tril(
-            src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, tril_ev)
-
-    return res
-
-
-def triu(x, /, *, k=0):
-    """
-    Returns the upper triangular part of a matrix (or a stack of matrices)
-    ``x``.
-
-    The upper triangular part of the matrix is defined as the elements on and
-    above the specified diagonal ``k``.
-
-    Args:
-        x (usm_ndarray):
-            Input array
-        k (int, optional):
-            Specifies the diagonal below which to set
-            elements to zero. If ``k = 0``, the diagonal is the main diagonal.
-            If ``k < 0``, the diagonal is below the main diagonal.
-            If ``k > 0``, the diagonal is above the main diagonal.
-            Default: ``0``
-
-    Returns:
-        usm_ndarray:
-            An upper-triangular array or a stack of upper-triangular arrays.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected argument of type dpctl.tensor.usm_ndarray, "
-            f"got {type(x)}."
-        )
-
-    k = operator.index(k)
-
-    order = "F" if (x.flags.f_contiguous) else "C"
-
-    shape = x.shape
-    nd = x.ndim
-    if nd < 2:
-        raise ValueError("Array dimensions less than 2.")
-
-    q = x.sycl_queue
-    if k > shape[nd - 1]:
-        res = dpt.zeros(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-    elif k <= -shape[nd - 2] + 1:
-        res = dpt.empty(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-        _manager = dpctl.utils.SequentialOrderManager[q]
-        dep_evs = _manager.submitted_events
-        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x, dst=res, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, cpy_ev)
-    else:
-        res = dpt.empty(
-            x.shape,
-            dtype=x.dtype,
-            order=order,
-            usm_type=x.usm_type,
-            sycl_queue=q,
-        )
-        _manager = dpctl.utils.SequentialOrderManager[q]
-        dep_evs = _manager.submitted_events
-        hev, triu_ev = ti._triu(
-            src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, triu_ev)
-
-    return res
-
-
-def meshgrid(*arrays, indexing="xy"):
-    """
-    Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices
-    from vectors.
-
-    Args:
-        arrays (usm_ndarray):
-            an arbitrary number of one-dimensional arrays
-            representing grid coordinates. Each array should have the same
-            numeric data type.
-        indexing (``"xy"``, or ``"ij"``):
-            Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output.
-            If provided zero or one one-dimensional vector(s) (i.e., the
-            zero- and one-dimensional cases, respectively), the ``indexing``
-            keyword has no effect and should be ignored. Default: ``"xy"``
-
-    Returns:
-        List[array]:
-            list of ``N`` arrays, where ``N`` is the number of
-            provided one-dimensional input arrays. Each returned array must
-            have rank ``N``.
-            For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ...
-            The cartesian indexing results in arrays of shape
-            ``(N1, N0, N2, ...)``, while the
-            matrix indexing results in arrays of shape
-            ``(N0, N1, N2, ...)``.
-            Default: ``"xy"``.
-
-    Raises:
-        ValueError: If vectors are not of the same data type, or are not
-            one-dimensional.
-
-    """
-    ref_dt = None
-    ref_unset = True
-    for array in arrays:
-        if not isinstance(array, dpt.usm_ndarray):
-            raise TypeError(
-                f"Expected instance of dpt.usm_ndarray, got {type(array)}."
-            )
-        if array.ndim != 1:
-            raise ValueError("All arrays must be one-dimensional.")
-        if ref_unset:
-            ref_unset = False
-            ref_dt = array.dtype
-        else:
-            if not ref_dt == array.dtype:
-                raise ValueError(
-                    "All arrays must be of the same numeric data type."
-                )
-    if indexing not in ["xy", "ij"]:
-        raise ValueError(
-            "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'"
-        )
-    n = len(arrays)
-    if n == 0:
-        return []
-
-    sh = (-1,) + (1,) * (n - 1)
-
-    res = []
-    if n > 1 and indexing == "xy":
-        res.append(dpt.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
-        res.append(dpt.reshape(arrays[1], sh, copy=True))
-        arrays, sh = arrays[2:], sh[-2:] + sh[:-2]
-
-    for array in arrays:
-        res.append(dpt.reshape(array, sh, copy=True))
-        sh = sh[-1:] + sh[:-1]
-
-    output = dpt.broadcast_arrays(*res)
-
-    return output
diff --git a/dpctl/tensor/_data_types.py b/dpctl/tensor/_data_types.py
deleted file mode 100644
index 99a533fac8..0000000000
--- a/dpctl/tensor/_data_types.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from numpy import bool_ as np_bool_
-from numpy import complexfloating as np_complexfloating
-from numpy import dtype
-from numpy import floating as np_floating
-from numpy import integer as np_integer
-from numpy import issubdtype as np_issubdtype
-
-from dpctl.tensor._tensor_impl import (
-    default_device_bool_type as ti_default_device_bool_type,
-)
-from dpctl.tensor._tensor_impl import (
-    default_device_complex_type as ti_default_device_complex_type,
-)
-from dpctl.tensor._tensor_impl import (
-    default_device_fp_type as ti_default_device_fp_type,
-)
-from dpctl.tensor._tensor_impl import (
-    default_device_int_type as ti_default_device_int_type,
-)
-
-bool = dtype("bool")
-int8 = dtype("int8")
-int16 = dtype("int16")
-int32 = dtype("int32")
-int64 = dtype("int64")
-uint8 = dtype("uint8")
-uint16 = dtype("uint16")
-uint32 = dtype("uint32")
-uint64 = dtype("uint64")
-float16 = dtype("float16")
-float32 = dtype("float32")
-float64 = dtype("float64")
-complex64 = dtype("complex64")
-complex128 = dtype("complex128")
-
-
-def _get_dtype(inp_dt, sycl_obj, ref_type=None):
-    """
-    Type inference utility to construct data type
-    object with defaults based on reference type.
-
-    _get_dtype is used by dpctl.tensor.asarray
-    to infer data type of the output array from the
-    input sequence.
-    """
-    if inp_dt is None:
-        if ref_type in [None, float] or np_issubdtype(ref_type, np_floating):
-            fp_dt = ti_default_device_fp_type(sycl_obj)
-            return dtype(fp_dt)
-        if ref_type in [bool, np_bool_]:
-            bool_dt = ti_default_device_bool_type(sycl_obj)
-            return dtype(bool_dt)
-        if ref_type is int or np_issubdtype(ref_type, np_integer):
-            int_dt = ti_default_device_int_type(sycl_obj)
-            return dtype(int_dt)
-        if ref_type is complex or np_issubdtype(ref_type, np_complexfloating):
-            cfp_dt = ti_default_device_complex_type(sycl_obj)
-            return dtype(cfp_dt)
-        raise TypeError(f"Reference type {ref_type} not recognized.")
-    return dtype(inp_dt)
-
-
-__all__ = [
-    "dtype",
-    "_get_dtype",
-    "bool",
-    "int8",
-    "uint8",
-    "int16",
-    "uint16",
-    "int32",
-    "uint32",
-    "int64",
-    "uint64",
-    "float16",
-    "float32",
-    "float64",
-    "complex64",
-    "complex128",
-]
diff --git a/dpctl/tensor/_device.py b/dpctl/tensor/_device.py
deleted file mode 100644
index e4250a8df3..0000000000
--- a/dpctl/tensor/_device.py
+++ /dev/null
@@ -1,189 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import dpctl
-from dpctl._sycl_device_factory import _cached_default_device
-from dpctl._sycl_queue_manager import get_device_cached_queue
-
-__doc__ = "Implementation of array API mandated Device class"
-
-
-class Device:
-    """
-    An object representing Data-API concept of device.
-
-    This is a wrapper around :class:`dpctl.SyclQueue` with custom
-    formatting. The class does not have public constructor,
-    but a class method :meth:`dpctl.tensor.Device.create_device` to construct
-    it from `device` keyword argument in Array-API functions.
-
-    Instance can be queried for ``sycl_queue``, ``sycl_context``,
-    or ``sycl_device``.
-    """
-
-    __device_queue_map__ = {}
-    sycl_queue_ = None
-
-    def __new__(cls, *args, **kwargs):
-        raise TypeError("No public constructor")
-
-    @classmethod
-    def create_device(cls, device=None):
-        """Device.create_device(device=None)
-
-        Creates instance of Device from argument.
-
-        Args:
-            device:
-                Device specification, i.e. `None`, :class:`.Device`,
-                :class:`dpctl.SyclQueue`, or a :class:`dpctl.SyclDevice`
-                corresponding to a root SYCL device.
-        Raises:
-            ValueError: if an instance of :class:`dpctl.SycDevice` corresponding
-                        to a sub-device was specified as the argument
-            SyclQueueCreationError: if :class:`dpctl.SyclQueue` could not be
-                                    created from the argument
-        """
-        dev = device
-        obj = super().__new__(cls)
-        if isinstance(dev, Device):
-            obj.sycl_queue_ = dev.sycl_queue
-        elif isinstance(dev, dpctl.SyclQueue):
-            obj.sycl_queue_ = dev
-        elif isinstance(dev, dpctl.SyclDevice):
-            par = dev.parent_device
-            if par is None:
-                obj.sycl_queue_ = get_device_cached_queue(dev)
-            else:
-                raise ValueError(
-                    f"Using non-root device {dev} to specify offloading "
-                    "target is ambiguous. Please use dpctl.SyclQueue "
-                    "targeting this device"
-                )
-        else:
-            if dev is None:
-                _dev = _cached_default_device()
-            else:
-                _dev = dpctl.SyclDevice(dev)
-            obj.sycl_queue_ = get_device_cached_queue(_dev)
-        return obj
-
-    @property
-    def sycl_queue(self):
-        """
-        :class:`dpctl.SyclQueue` used to offload to this :class:`.Device`.
-        """
-        return self.sycl_queue_
-
-    @property
-    def sycl_context(self):
-        """
-        :class:`dpctl.SyclContext` associated with this :class:`.Device`.
-        """
-        return self.sycl_queue_.sycl_context
-
-    @property
-    def sycl_device(self):
-        """
-        :class:`dpctl.SyclDevice` targeted by this :class:`.Device`.
-        """
-        return self.sycl_queue_.sycl_device
-
-    def __repr__(self):
-        try:
-            sd = self.sycl_device
-        except AttributeError as exc:
-            raise ValueError(
-                f"Instance of {self.__class__} is not initialized"
-            ) from exc
-        try:
-            fs = sd.filter_string
-            return f"Device({fs})"
-        except TypeError:
-            # This is a sub-device
-            return repr(self.sycl_queue)
-
-    def print_device_info(self):
-        "Outputs information about targeted SYCL device"
-        self.sycl_device.print_device_info()
-
-    def wait(self):
-        """
-        Call ``wait`` method of the underlying ``sycl_queue``.
-        """
-        self.sycl_queue_.wait()
-
-    def __eq__(self, other):
-        """Equality comparison based on underlying ``sycl_queue``."""
-        if isinstance(other, Device):
-            return self.sycl_queue.__eq__(other.sycl_queue)
-        elif isinstance(other, dpctl.SyclQueue):
-            return self.sycl_queue.__eq__(other)
-        return False
-
-    def __hash__(self):
-        """Compute object's hash value."""
-        return self.sycl_queue.__hash__()
-
-
-def normalize_queue_device(sycl_queue=None, device=None):
-    """normalize_queue_device(sycl_queue=None, device=None)
-
-    Utility to process exclusive keyword arguments 'device'
-    and 'sycl_queue' in functions of `dpctl.tensor`.
-
-    Args:
-        sycl_queue (:class:`dpctl.SyclQueue`, optional):
-            explicitly indicates where USM allocation is done
-            and the population code (if any) is executed.
-            Value `None` is interpreted as get the SYCL queue
-            from `device` keyword, or use default queue.
-            Default: None
-        device (string, :class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue,
-            :class:`dpctl.tensor.Device`, optional):
-            array-API keyword indicating non-partitioned SYCL device
-            where array is allocated.
-
-    Returns
-        :class:`dpctl.SyclQueue` object implied by either of provided
-        keywords. If both are None, `dpctl.SyclQueue()` is returned.
-        If both are specified and imply the same queue, `sycl_queue`
-        is returned.
-
-    Raises:
-        TypeError: if argument is not of the expected type, or keywords
-            imply incompatible queues.
-    """
-    q = sycl_queue
-    d = device
-    if q is None:
-        d = Device.create_device(d)
-        return d.sycl_queue
-    if not isinstance(q, dpctl.SyclQueue):
-        raise TypeError(f"Expected dpctl.SyclQueue, got {type(q)}")
-    if d is None:
-        return q
-    d = Device.create_device(d)
-    qq = dpctl.utils.get_execution_queue(
-        (
-            q,
-            d.sycl_queue,
-        )
-    )
-    if qq is None:
-        raise TypeError(
-            "sycl_queue and device keywords can not be both specified"
-        )
-    return qq
diff --git a/dpctl/tensor/_dldevice_conversions.py b/dpctl/tensor/_dldevice_conversions.py
deleted file mode 100644
index b96e127633..0000000000
--- a/dpctl/tensor/_dldevice_conversions.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from .._sycl_device import SyclDevice
-from ._usmarray import DLDeviceType
-
-
-def dldevice_to_sycl_device(dl_dev: tuple):
-    if isinstance(dl_dev, tuple):
-        if len(dl_dev) != 2:
-            raise ValueError("dldevice tuple must have length 2")
-    else:
-        raise TypeError(
-            f"dl_dev is expected to be a 2-tuple, got " f"{type(dl_dev)}"
-        )
-    if dl_dev[0] != DLDeviceType.kDLOneAPI:
-        raise ValueError("dldevice type must be kDLOneAPI")
-    return SyclDevice(str(dl_dev[1]))
-
-
-def sycl_device_to_dldevice(dev: SyclDevice):
-    if not isinstance(dev, SyclDevice):
-        raise TypeError(
-            "dev is expected to be a SyclDevice, got " f"{type(dev)}"
-        )
-    return (DLDeviceType.kDLOneAPI, dev.get_device_id())
diff --git a/dpctl/tensor/_dlpack.pxd b/dpctl/tensor/_dlpack.pxd
deleted file mode 100644
index f44db7c05c..0000000000
--- a/dpctl/tensor/_dlpack.pxd
+++ /dev/null
@@ -1,61 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-# cython: linetrace=True
-
-cdef extern from "numpy/npy_no_deprecated_api.h":
-    pass
-from numpy cimport ndarray
-
-from .._sycl_device cimport SyclDevice
-from ._usmarray cimport usm_ndarray
-
-
-cdef extern from "dlpack/dlpack.h" nogil:
-    int device_CPU "kDLCPU"
-    int device_CUDA "kDLCUDA"
-    int device_CUDAHost "kDLCUDAHost"
-    int device_CUDAManaged "kDLCUDAManaged"
-    int device_DLROCM "kDLROCM"
-    int device_ROCMHost "kDLROCMHost"
-    int device_OpenCL "kDLOpenCL"
-    int device_Vulkan "kDLVulkan"
-    int device_Metal "kDLMetal"
-    int device_VPI "kDLVPI"
-    int device_OneAPI "kDLOneAPI"
-    int device_WebGPU "kDLWebGPU"
-    int device_Hexagon "kDLHexagon"
-    int device_MAIA "kDLMAIA"
-    int device_Trn "kDLTrn"
-
-cpdef object to_dlpack_capsule(usm_ndarray array) except +
-cpdef object to_dlpack_versioned_capsule(
-    usm_ndarray array, bint copied
-) except +
-cpdef object numpy_to_dlpack_versioned_capsule(
-    ndarray array, bint copied
-) except +
-cpdef object from_dlpack_capsule(object dltensor) except +
-
-cdef class DLPackCreationError(Exception):
-    """
-    A DLPackCreateError exception is raised when constructing
-    DLPack capsule from `usm_ndarray` based on a USM allocation
-    on a partitioned SYCL device.
-    """
-    pass
diff --git a/dpctl/tensor/_dlpack.pyx b/dpctl/tensor/_dlpack.pyx
deleted file mode 100644
index c92dd0b1fa..0000000000
--- a/dpctl/tensor/_dlpack.pyx
+++ /dev/null
@@ -1,1233 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-# cython: linetrace=True
-
-cdef extern from "numpy/npy_no_deprecated_api.h":
-    pass
-
-cimport cpython
-from libc cimport stdlib
-from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
-from numpy cimport ndarray
-
-cimport dpctl as c_dpctl
-cimport dpctl.memory as c_dpmem
-from dpctl._sycl_queue_manager cimport get_device_cached_queue
-
-from .._backend cimport (
-    DPCTLDevice_Delete,
-    DPCTLDevice_GetParentDevice,
-    DPCTLSyclDeviceRef,
-    DPCTLSyclUSMRef,
-)
-from ._usmarray cimport (
-    USM_ARRAY_C_CONTIGUOUS,
-    USM_ARRAY_F_CONTIGUOUS,
-    USM_ARRAY_WRITABLE,
-    usm_ndarray,
-)
-
-import ctypes
-
-import numpy as np
-
-import dpctl
-import dpctl.memory as dpmem
-
-from ._device import Device
-
-
-cdef extern from "dlpack/dlpack.h" nogil:
-    cdef int DLPACK_MAJOR_VERSION
-
-    cdef int DLPACK_MINOR_VERSION
-
-    cdef int DLPACK_FLAG_BITMASK_READ_ONLY
-
-    cdef int DLPACK_FLAG_BITMASK_IS_COPIED
-
-    ctypedef struct DLPackVersion:
-        uint32_t major
-        uint32_t minor
-
-    cdef enum DLDeviceType:
-        kDLCPU
-        kDLCUDA
-        kDLCUDAHost
-        kDLCUDAManaged
-        kDLROCM
-        kDLROCMHost
-        kDLOpenCL
-        kDLVulkan
-        kDLMetal
-        kDLVPI
-        kDLOneAPI
-        kDLWebGPU
-        kDLHexagon
-        kDLMAIA
-        kDLTrn
-
-    ctypedef struct DLDevice:
-        DLDeviceType device_type
-        int device_id
-
-    cdef enum DLDataTypeCode:
-        kDLInt
-        kDLUInt
-        kDLFloat
-        kDLBfloat
-        kDLComplex
-        kDLBool
-        kDLFloat8_e3m4
-        kDLFloat8_e4m3
-        kDLFloat8_e4m3b11fnuz
-        kDLFloat8_e4m3fn
-        kDLFloat8_e4m3fnuz
-        kDLFloat8_e5m2
-        kDLFloat8_e5m2fnuz
-        kDLFloat8_e8m0fnu
-        kDLFloat6_e2m3fn
-        kDLFloat6_e3m2fn
-        kDLFloat4_e2m1fn
-
-    ctypedef struct DLDataType:
-        uint8_t code
-        uint8_t bits
-        uint16_t lanes
-
-    ctypedef struct DLTensor:
-        void *data
-        DLDevice device
-        int ndim
-        DLDataType dtype
-        int64_t *shape
-        int64_t *strides
-        uint64_t byte_offset
-
-    ctypedef struct DLManagedTensor:
-        DLTensor dl_tensor
-        void *manager_ctx
-        void (*deleter)(DLManagedTensor *)  # noqa: E211
-
-    ctypedef struct DLManagedTensorVersioned:
-        DLPackVersion version
-        void *manager_ctx
-        void (*deleter)(DLManagedTensorVersioned *)  # noqa: E211
-        uint64_t flags
-        DLTensor dl_tensor
-
-
-def get_build_dlpack_version():
-    """
-    Returns a tuple of integers representing the `major` and `minor`
-    version of DLPack :module:`dpctl.tensor` was built with.
-    This tuple can be passed as the `max_version` argument to
-    `__dlpack__` to guarantee module:`dpctl.tensor` can properly
-    consume capsule.
-
-    Returns:
-        Tuple[int, int]
-            A tuple of integers representing the `major` and `minor`
-            version of DLPack used to build :module:`dpctl.tensor`.
-    """
-    return (DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)
-
-
-cdef void _pycapsule_deleter(object dlt_capsule) noexcept:
-    cdef DLManagedTensor *dlm_tensor = NULL
-    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor"):
-        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
-            dlt_capsule, "dltensor")
-        dlm_tensor.deleter(dlm_tensor)
-
-
-cdef void _managed_tensor_deleter(
-    DLManagedTensor *dlm_tensor
-) noexcept with gil:
-    if dlm_tensor is not NULL:
-        # we only delete shape, because we make single allocation to
-        # acommodate both shape and strides if strides are needed
-        stdlib.free(dlm_tensor.dl_tensor.shape)
-        cpython.Py_DECREF(<object>dlm_tensor.manager_ctx)
-        dlm_tensor.manager_ctx = NULL
-        stdlib.free(dlm_tensor)
-
-
-cdef void _pycapsule_versioned_deleter(object dlt_capsule) noexcept:
-    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
-    if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor_versioned"):
-        dlmv_tensor = <DLManagedTensorVersioned*>cpython.PyCapsule_GetPointer(
-            dlt_capsule, "dltensor_versioned")
-        dlmv_tensor.deleter(dlmv_tensor)
-
-
-cdef void _managed_tensor_versioned_deleter(
-    DLManagedTensorVersioned *dlmv_tensor
-) noexcept with gil:
-    if dlmv_tensor is not NULL:
-        # we only delete shape, because we make single allocation to
-        # acommodate both shape and strides if strides are needed
-        stdlib.free(dlmv_tensor.dl_tensor.shape)
-        cpython.Py_DECREF(<object>dlmv_tensor.manager_ctx)
-        dlmv_tensor.manager_ctx = NULL
-        stdlib.free(dlmv_tensor)
-
-
-cdef object _get_default_context(c_dpctl.SyclDevice dev):
-    try:
-        default_context = dev.sycl_platform.default_context
-    except RuntimeError:
-        # RT does not support default_context
-        default_context = None
-
-    return default_context
-
-cdef int get_array_dlpack_device_id(
-    usm_ndarray usm_ary
-) except -1:
-    """Finds ordinal number of the parent of device where array
-    was allocated.
-    """
-    cdef c_dpctl.SyclQueue ary_sycl_queue
-    cdef c_dpctl.SyclDevice ary_sycl_device
-    cdef DPCTLSyclDeviceRef pDRef = NULL
-    cdef int device_id = -1
-
-    ary_sycl_queue = usm_ary.get_sycl_queue()
-    ary_sycl_device = ary_sycl_queue.get_sycl_device()
-
-    default_context = _get_default_context(ary_sycl_device)
-    if default_context is None:
-        # check that ary_sycl_device is a non-partitioned device
-        pDRef = DPCTLDevice_GetParentDevice(ary_sycl_device.get_device_ref())
-        if pDRef is not NULL:
-            DPCTLDevice_Delete(pDRef)
-            raise DLPackCreationError(
-                "to_dlpack_capsule: DLPack can only export arrays allocated "
-                "on non-partitioned SYCL devices on platforms where "
-                "default_context oneAPI extension is not supported."
-            )
-    else:
-        if not usm_ary.sycl_context == default_context:
-            raise DLPackCreationError(
-                "to_dlpack_capsule: DLPack can only export arrays based on USM "
-                "allocations bound to a default platform SYCL context"
-            )
-    device_id = ary_sycl_device.get_device_id()
-
-    if device_id < 0:
-        raise DLPackCreationError(
-            "get_array_dlpack_device_id: failed to determine device_id"
-        )
-
-    return device_id
-
-
-cpdef to_dlpack_capsule(usm_ndarray usm_ary):
-    """
-    to_dlpack_capsule(usm_ary)
-
-    Constructs named Python capsule object referencing
-    instance of ``DLManagedTensor`` from
-    :class:`dpctl.tensor.usm_ndarray` instance.
-
-    Args:
-        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
-    Returns:
-        A new capsule with name ``"dltensor"`` that contains
-        a pointer to ``DLManagedTensor`` struct.
-    Raises:
-        DLPackCreationError: when array can be represented as
-            DLPack tensor. This may happen when array was allocated
-            on a partitioned sycl device, or its USM allocation is
-            not bound to the platform default SYCL context.
-        MemoryError: when host allocation to needed for ``DLManagedTensor``
-            did not succeed.
-        ValueError: when array elements data type could not be represented
-            in ``DLManagedTensor``.
-    """
-    cdef DLManagedTensor *dlm_tensor = NULL
-    cdef DLTensor *dl_tensor = NULL
-    cdef int nd = usm_ary.get_ndim()
-    cdef char *data_ptr = usm_ary.get_data()
-    cdef Py_ssize_t *shape_ptr = NULL
-    cdef Py_ssize_t *strides_ptr = NULL
-    cdef int64_t *shape_strides_ptr = NULL
-    cdef int i = 0
-    cdef int device_id = -1
-    cdef int flags = 0
-    cdef Py_ssize_t element_offset = 0
-    cdef Py_ssize_t byte_offset = 0
-    cdef Py_ssize_t si = 1
-
-    ary_base = usm_ary.get_base()
-
-    device_id = get_array_dlpack_device_id(usm_ary)
-
-    dlm_tensor = <DLManagedTensor *> stdlib.malloc(
-        sizeof(DLManagedTensor))
-    if dlm_tensor is NULL:
-        raise MemoryError(
-            "to_dlpack_capsule: Could not allocate memory for DLManagedTensor"
-        )
-    if nd > 0:
-        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
-        if shape_strides_ptr is NULL:
-            stdlib.free(dlm_tensor)
-            raise MemoryError(
-                "to_dlpack_capsule: Could not allocate memory for shape/strides"
-            )
-        shape_ptr = usm_ary.get_shape()
-        for i in range(nd):
-            shape_strides_ptr[i] = shape_ptr[i]
-        strides_ptr = usm_ary.get_strides()
-        flags = usm_ary.flags_
-        if strides_ptr:
-            for i in range(nd):
-                shape_strides_ptr[nd + i] = strides_ptr[i]
-        else:
-            if flags & USM_ARRAY_C_CONTIGUOUS:
-                si = 1
-                for i in range(nd - 1, -1, -1):
-                    shape_strides_ptr[nd + i] = si
-                    si = si * shape_ptr[i]
-            elif flags & USM_ARRAY_F_CONTIGUOUS:
-                si = 1
-                for i in range(0, nd):
-                    shape_strides_ptr[nd + i] = si
-                    si = si * shape_ptr[i]
-            else:
-                stdlib.free(shape_strides_ptr)
-                stdlib.free(dlm_tensor)
-                raise BufferError(
-                    "to_dlpack_capsule: Invalid array encountered "
-                    "when building strides"
-                )
-
-            strides_ptr = <Py_ssize_t *>&shape_strides_ptr[nd]
-
-    ary_dt = usm_ary.dtype
-    ary_dtk = ary_dt.kind
-    element_offset = usm_ary.get_offset()
-    byte_offset = element_offset * (<Py_ssize_t>ary_dt.itemsize)
-
-    dl_tensor = &dlm_tensor.dl_tensor
-    dl_tensor.data = <void*>(data_ptr - byte_offset)
-    dl_tensor.ndim = nd
-    dl_tensor.byte_offset = <uint64_t>byte_offset
-    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
-    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
-    dl_tensor.device.device_type = kDLOneAPI
-    dl_tensor.device.device_id = device_id
-    dl_tensor.dtype.lanes = <uint16_t>1
-    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
-    if (ary_dtk == "b"):
-        dl_tensor.dtype.code = <uint8_t>kDLBool
-    elif (ary_dtk == "u"):
-        dl_tensor.dtype.code = <uint8_t>kDLUInt
-    elif (ary_dtk == "i"):
-        dl_tensor.dtype.code = <uint8_t>kDLInt
-    elif (ary_dtk == "f"):
-        dl_tensor.dtype.code = <uint8_t>kDLFloat
-    elif (ary_dtk == "c"):
-        dl_tensor.dtype.code = <uint8_t>kDLComplex
-    else:
-        stdlib.free(shape_strides_ptr)
-        stdlib.free(dlm_tensor)
-        raise ValueError("Unrecognized array data type")
-
-    dlm_tensor.manager_ctx = <void*>ary_base
-    cpython.Py_INCREF(ary_base)
-    dlm_tensor.deleter = _managed_tensor_deleter
-
-    return cpython.PyCapsule_New(dlm_tensor, "dltensor", _pycapsule_deleter)
-
-
-cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied):
-    """
-    to_dlpack_versioned_capsule(usm_ary, copied)
-
-    Constructs named Python capsule object referencing
-    instance of ``DLManagedTensorVersioned`` from
-    :class:`dpctl.tensor.usm_ndarray` instance.
-
-    Args:
-        usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
-        copied: A bint representing whether the data was previously
-            copied in order to set the flags with the is-copied
-            bitmask.
-    Returns:
-        A new capsule with name ``"dltensor_versioned"`` that
-        contains a pointer to ``DLManagedTensorVersioned`` struct.
-    Raises:
-        DLPackCreationError: when array can be represented as
-            DLPack tensor. This may happen when array was allocated
-            on a partitioned sycl device, or its USM allocation is
-            not bound to the platform default SYCL context.
-        MemoryError: when host allocation to needed for
-            ``DLManagedTensorVersioned`` did not succeed.
-        ValueError: when array elements data type could not be represented
-            in ``DLManagedTensorVersioned``.
-    """
-    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
-    cdef DLTensor *dl_tensor = NULL
-    cdef uint32_t dlmv_flags = 0
-    cdef int nd = usm_ary.get_ndim()
-    cdef char *data_ptr = usm_ary.get_data()
-    cdef Py_ssize_t *shape_ptr = NULL
-    cdef Py_ssize_t *strides_ptr = NULL
-    cdef int64_t *shape_strides_ptr = NULL
-    cdef int i = 0
-    cdef int device_id = -1
-    cdef int flags = 0
-    cdef Py_ssize_t element_offset = 0
-    cdef Py_ssize_t byte_offset = 0
-    cdef Py_ssize_t si = 1
-
-    ary_base = usm_ary.get_base()
-
-    # Find ordinal number of the parent device
-    device_id = get_array_dlpack_device_id(usm_ary)
-
-    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
-        sizeof(DLManagedTensorVersioned))
-    if dlmv_tensor is NULL:
-        raise MemoryError(
-            "to_dlpack_versioned_capsule: Could not allocate memory "
-            "for DLManagedTensorVersioned"
-        )
-    if nd > 0:
-        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
-        if shape_strides_ptr is NULL:
-            stdlib.free(dlmv_tensor)
-            raise MemoryError(
-                "to_dlpack_versioned_capsule: Could not allocate memory "
-                "for shape/strides"
-            )
-        # this can be a separate function for handling shapes and strides
-        shape_ptr = usm_ary.get_shape()
-        for i in range(nd):
-            shape_strides_ptr[i] = shape_ptr[i]
-        strides_ptr = usm_ary.get_strides()
-        flags = usm_ary.flags_
-        if strides_ptr:
-            for i in range(nd):
-                shape_strides_ptr[nd + i] = strides_ptr[i]
-        else:
-            if flags & USM_ARRAY_C_CONTIGUOUS:
-                si = 1
-                for i in range(nd - 1, -1, -1):
-                    shape_strides_ptr[nd + i] = si
-                    si = si * shape_ptr[i]
-            elif flags & USM_ARRAY_F_CONTIGUOUS:
-                si = 1
-                for i in range(0, nd):
-                    shape_strides_ptr[nd + i] = si
-                    si = si * shape_ptr[i]
-            else:
-                stdlib.free(shape_strides_ptr)
-                stdlib.free(dlmv_tensor)
-                raise BufferError(
-                    "to_dlpack_versioned_capsule: Invalid array encountered "
-                    "when building strides"
-                )
-
-            strides_ptr = <Py_ssize_t *>&shape_strides_ptr[nd]
-
-    # this can all be a function for building the dl_tensor
-    # object (separate from dlm/dlmv)
-    ary_dt = usm_ary.dtype
-    ary_dtk = ary_dt.kind
-    element_offset = usm_ary.get_offset()
-    byte_offset = element_offset * (<Py_ssize_t>ary_dt.itemsize)
-
-    dl_tensor = &dlmv_tensor.dl_tensor
-    dl_tensor.data = <void*>(data_ptr - byte_offset)
-    dl_tensor.ndim = nd
-    dl_tensor.byte_offset = <uint64_t>byte_offset
-    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
-    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
-    dl_tensor.device.device_type = kDLOneAPI
-    dl_tensor.device.device_id = device_id
-    dl_tensor.dtype.lanes = <uint16_t>1
-    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
-    if (ary_dtk == "b"):
-        dl_tensor.dtype.code = <uint8_t>kDLBool
-    elif (ary_dtk == "u"):
-        dl_tensor.dtype.code = <uint8_t>kDLUInt
-    elif (ary_dtk == "i"):
-        dl_tensor.dtype.code = <uint8_t>kDLInt
-    elif (ary_dtk == "f"):
-        dl_tensor.dtype.code = <uint8_t>kDLFloat
-    elif (ary_dtk == "c"):
-        dl_tensor.dtype.code = <uint8_t>kDLComplex
-    else:
-        stdlib.free(shape_strides_ptr)
-        stdlib.free(dlmv_tensor)
-        raise ValueError("Unrecognized array data type")
-
-    # set flags down here
-    if copied:
-        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
-    if not (flags & USM_ARRAY_WRITABLE):
-        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
-    dlmv_tensor.flags = dlmv_flags
-
-    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
-    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
-
-    dlmv_tensor.manager_ctx = <void*>ary_base
-    cpython.Py_INCREF(ary_base)
-    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
-
-    return cpython.PyCapsule_New(
-        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
-    )
-
-
-cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied):
-    """
-    to_dlpack_versioned_capsule(npy_ary, copied)
-
-    Constructs named Python capsule object referencing
-    instance of ``DLManagedTensorVersioned`` from
-    :class:`numpy.ndarray` instance.
-
-    Args:
-        npy_ary: An instance of :class:`numpy.ndarray`
-        copied: A bint representing whether the data was previously
-            copied in order to set the flags with the is-copied
-            bitmask.
-    Returns:
-        A new capsule with name ``"dltensor_versioned"`` that
-        contains a pointer to ``DLManagedTensorVersioned`` struct.
-    Raises:
-        DLPackCreationError: when array can be represented as
-            DLPack tensor.
-        MemoryError: when host allocation to needed for
-            ``DLManagedTensorVersioned`` did not succeed.
-        ValueError: when array elements data type could not be represented
-            in ``DLManagedTensorVersioned``.
-    """
-    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
-    cdef DLTensor *dl_tensor = NULL
-    cdef uint32_t dlmv_flags = 0
-    cdef int nd = npy_ary.ndim
-    cdef int64_t *shape_strides_ptr = NULL
-    cdef int i = 0
-    cdef Py_ssize_t byte_offset = 0
-    cdef int itemsize = npy_ary.itemsize
-
-    dlmv_tensor = <DLManagedTensorVersioned *> stdlib.malloc(
-        sizeof(DLManagedTensorVersioned))
-    if dlmv_tensor is NULL:
-        raise MemoryError(
-            "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
-            "for DLManagedTensorVersioned"
-        )
-
-    shape = npy_ary.ctypes.shape_as(ctypes.c_int64)
-    strides = npy_ary.ctypes.strides_as(ctypes.c_int64)
-    if nd > 0:
-        if npy_ary.size != 1:
-            for i in range(nd):
-                if shape[i] != 1 and strides[i] % itemsize != 0:
-                    stdlib.free(dlmv_tensor)
-                    raise BufferError(
-                        "numpy_to_dlpack_versioned_capsule: DLPack cannot "
-                        "encode an array if strides are not a multiple of "
-                        "itemsize"
-                    )
-        shape_strides_ptr = <int64_t *>stdlib.malloc((sizeof(int64_t) * 2) * nd)
-        if shape_strides_ptr is NULL:
-            stdlib.free(dlmv_tensor)
-            raise MemoryError(
-                "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
-                "for shape/strides"
-            )
-        for i in range(nd):
-            shape_strides_ptr[i] = shape[i]
-            shape_strides_ptr[nd + i] = strides[i] // itemsize
-
-    writable_flag = npy_ary.flags["W"]
-
-    ary_dt = npy_ary.dtype
-    ary_dtk = ary_dt.kind
-
-    dl_tensor = &dlmv_tensor.dl_tensor
-    dl_tensor.data = <void *> npy_ary.data
-    dl_tensor.ndim = nd
-    dl_tensor.byte_offset = <uint64_t>byte_offset
-    dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
-    dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
-    dl_tensor.device.device_type = kDLCPU
-    dl_tensor.device.device_id = 0
-    dl_tensor.dtype.lanes = <uint16_t>1
-    dl_tensor.dtype.bits = <uint8_t>(ary_dt.itemsize * 8)
-    if (ary_dtk == "b"):
-        dl_tensor.dtype.code = <uint8_t>kDLBool
-    elif (ary_dtk == "u"):
-        dl_tensor.dtype.code = <uint8_t>kDLUInt
-    elif (ary_dtk == "i"):
-        dl_tensor.dtype.code = <uint8_t>kDLInt
-    elif (ary_dtk == "f" and ary_dt.itemsize <= 8):
-        dl_tensor.dtype.code = <uint8_t>kDLFloat
-    elif (ary_dtk == "c" and ary_dt.itemsize <= 16):
-        dl_tensor.dtype.code = <uint8_t>kDLComplex
-    else:
-        stdlib.free(shape_strides_ptr)
-        stdlib.free(dlmv_tensor)
-        raise ValueError("Unrecognized array data type")
-
-    # set flags down here
-    if copied:
-        dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
-    if not writable_flag:
-        dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
-    dlmv_tensor.flags = dlmv_flags
-
-    dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
-    dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
-
-    dlmv_tensor.manager_ctx = <void*>npy_ary
-    cpython.Py_INCREF(npy_ary)
-    dlmv_tensor.deleter = _managed_tensor_versioned_deleter
-
-    return cpython.PyCapsule_New(
-        dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
-    )
-
-
-cdef class _DLManagedTensorOwner:
-    """
-    Helper class managing the lifetime of the DLManagedTensor struct
-    transferred from a 'dlpack' capsule.
-    """
-    cdef DLManagedTensor * dlm_tensor
-
-    def __cinit__(self):
-        self.dlm_tensor = NULL
-
-    def __dealloc__(self):
-        if self.dlm_tensor:
-            self.dlm_tensor.deleter(self.dlm_tensor)
-            self.dlm_tensor = NULL
-
-    @staticmethod
-    cdef _DLManagedTensorOwner _create(DLManagedTensor *dlm_tensor_src):
-        cdef _DLManagedTensorOwner res
-        res = _DLManagedTensorOwner.__new__(_DLManagedTensorOwner)
-        res.dlm_tensor = dlm_tensor_src
-        return res
-
-
-cdef class _DLManagedTensorVersionedOwner:
-    """
-    Helper class managing the lifetime of the DLManagedTensorVersioned
-    struct transferred from a 'dlpack_versioned' capsule.
-    """
-    cdef DLManagedTensorVersioned * dlmv_tensor
-
-    def __cinit__(self):
-        self.dlmv_tensor = NULL
-
-    def __dealloc__(self):
-        if self.dlmv_tensor:
-            self.dlmv_tensor.deleter(self.dlmv_tensor)
-            self.dlmv_tensor = NULL
-
-    @staticmethod
-    cdef _DLManagedTensorVersionedOwner _create(
-        DLManagedTensorVersioned *dlmv_tensor_src
-    ):
-        cdef _DLManagedTensorVersionedOwner res
-        res = _DLManagedTensorVersionedOwner.__new__(
-            _DLManagedTensorVersionedOwner
-        )
-        res.dlmv_tensor = dlmv_tensor_src
-        return res
-
-
-cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag):
-    """Constructs a NumPy `__array_interface__` dictionary from a DLTensor."""
-    cdef int itemsize = 0
-
-    if dlt.dtype.lanes != 1:
-        raise BufferError(
-            "Can not import DLPack tensor with lanes != 1"
-        )
-    itemsize = dlt.dtype.bits // 8
-    shape = list()
-    if (dlt.strides is NULL):
-        strides = None
-        for dim in range(dlt.ndim):
-            shape.append(dlt.shape[dim])
-    else:
-        strides = list()
-        for dim in range(dlt.ndim):
-            shape.append(dlt.shape[dim])
-            # convert to byte-strides
-            strides.append(dlt.strides[dim] * itemsize)
-        strides = tuple(strides)
-    shape = tuple(shape)
-    if (dlt.dtype.code == kDLUInt):
-        ary_dt = "u" + str(itemsize)
-    elif (dlt.dtype.code == kDLInt):
-        ary_dt = "i" + str(itemsize)
-    elif (dlt.dtype.code == kDLFloat):
-        ary_dt = "f" + str(itemsize)
-    elif (dlt.dtype.code == kDLComplex):
-        ary_dt = "c" + str(itemsize)
-    elif (dlt.dtype.code == kDLBool):
-        ary_dt = "b" + str(itemsize)
-    else:
-        raise BufferError(
-            "Can not import DLPack tensor with type code {}.".format(
-                <object>dlt.dtype.code
-            )
-        )
-    typestr = "|" + ary_dt
-    return dict(
-        version=3,
-        shape=shape,
-        strides=strides,
-        data=(<size_t> dlt.data, True if ro_flag else False),
-        offset=dlt.byte_offset,
-        typestr=typestr,
-    )
-
-
-class _numpy_array_interface_wrapper:
-    """
-    Class that wraps a Python capsule and dictionary for consumption by NumPy.
-
-    Implementation taken from
-    https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py
-
-    Args:
-        array_interface:
-            A dictionary describing the underlying memory. Formatted
-            to match `numpy.ndarray.__array_interface__`.
-
-        pycapsule:
-            A Python capsule wrapping the dlpack tensor that will be
-            converted to numpy.
-    """
-
-    def __init__(self, array_interface, memory_owner) -> None:
-        self.__array_interface__ = array_interface
-        self._memory_owner = memory_owner
-
-
-cdef bint _is_kdlcpu_device(DLDevice *dev):
-    "Check if DLTensor.DLDevice denotes (kDLCPU, 0)"
-    return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0)
-
-
-cpdef object from_dlpack_capsule(object py_caps):
-    """
-    from_dlpack_capsule(py_caps)
-
-    Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from
-    named Python capsule object referencing instance of ``DLManagedTensor``
-    without copy. The instance forms a view in the memory of the tensor.
-
-    Args:
-        caps:
-            Python capsule with name ``"dltensor"`` expected to reference
-            an instance of ``DLManagedTensor`` struct.
-    Returns:
-        Instance of :class:`dpctl.tensor.usm_ndarray` with a view into
-        memory of the tensor. Capsule is renamed to ``"used_dltensor"``
-        upon success.
-    Raises:
-        TypeError:
-            if argument is not a ``"dltensor"`` capsule.
-        ValueError:
-            if argument is ``"used_dltensor"`` capsule
-        BufferError:
-            if the USM pointer is not bound to the reconstructed
-            sycl context, or the DLPack's device_type is not supported
-            by :mod:`dpctl`.
-    """
-    cdef DLManagedTensorVersioned *dlmv_tensor = NULL
-    cdef DLManagedTensor *dlm_tensor = NULL
-    cdef DLTensor *dl_tensor = NULL
-    cdef int versioned = 0
-    cdef int readonly = 0
-    cdef bytes usm_type
-    cdef size_t sz = 1
-    cdef size_t alloc_sz = 1
-    cdef int i
-    cdef int device_id = -1
-    cdef int element_bytesize = 0
-    cdef Py_ssize_t offset_min = 0
-    cdef Py_ssize_t offset_max = 0
-    cdef char *mem_ptr = NULL
-    cdef Py_ssize_t mem_ptr_delta = 0
-    cdef Py_ssize_t element_offset = 0
-    cdef int64_t stride_i = -1
-    cdef int64_t shape_i = -1
-
-    if cpython.PyCapsule_IsValid(py_caps, "dltensor"):
-        dlm_tensor = <DLManagedTensor*>cpython.PyCapsule_GetPointer(
-                py_caps, "dltensor")
-        dl_tensor = &dlm_tensor.dl_tensor
-    elif cpython.PyCapsule_IsValid(py_caps, "dltensor_versioned"):
-        dlmv_tensor = <DLManagedTensorVersioned*>cpython.PyCapsule_GetPointer(
-                py_caps, "dltensor_versioned")
-        if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION:
-            raise BufferError(
-                "Can not import DLPack tensor with major version "
-                f"greater than {DLPACK_MAJOR_VERSION}"
-            )
-        versioned = 1
-        readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0
-        dl_tensor = &dlmv_tensor.dl_tensor
-    elif (
-        cpython.PyCapsule_IsValid(py_caps, "used_dltensor")
-        or cpython.PyCapsule_IsValid(py_caps, "used_dltensor_versioned")
-    ):
-        raise ValueError(
-            "A DLPack tensor object can not be consumed multiple times"
-        )
-    else:
-        raise TypeError(
-            "`from_dlpack_capsule` expects a Python 'dltensor' capsule"
-        )
-
-    # Verify that we can work with this device
-    if dl_tensor.device.device_type == kDLOneAPI:
-        device_id = dl_tensor.device.device_id
-        root_device = dpctl.SyclDevice(str(<int>device_id))
-        try:
-            default_context = root_device.sycl_platform.default_context
-        except RuntimeError:
-            default_context = get_device_cached_queue(root_device).sycl_context
-        if dl_tensor.data is NULL:
-            usm_type = b"device"
-            q = get_device_cached_queue((default_context, root_device,))
-        else:
-            usm_type = c_dpmem._Memory.get_pointer_type(
-                <DPCTLSyclUSMRef> dl_tensor.data,
-                <c_dpctl.SyclContext>default_context)
-            if usm_type == b"unknown":
-                raise BufferError(
-                    "Data pointer in DLPack is not bound to default sycl "
-                    f"context of device '{device_id}', translated to "
-                    f"{root_device.filter_string}"
-                )
-            alloc_device = c_dpmem._Memory.get_pointer_device(
-                <DPCTLSyclUSMRef> dl_tensor.data,
-                <c_dpctl.SyclContext>default_context
-            )
-            q = get_device_cached_queue((default_context, alloc_device,))
-        if dl_tensor.dtype.bits % 8:
-            raise BufferError(
-                "Can not import DLPack tensor whose element's "
-                "bitsize is not a multiple of 8"
-            )
-        if dl_tensor.dtype.lanes != 1:
-            raise BufferError(
-                "Can not import DLPack tensor with lanes != 1"
-            )
-        if dl_tensor.ndim > 0:
-            offset_min = 0
-            offset_max = 0
-            for i in range(dl_tensor.ndim):
-                stride_i = dl_tensor.strides[i]
-                shape_i = dl_tensor.shape[i]
-                if shape_i > 1:
-                    shape_i -= 1
-                    if stride_i > 0:
-                        offset_max = offset_max + stride_i * shape_i
-                    else:
-                        offset_min = offset_min + stride_i * shape_i
-            sz = offset_max - offset_min + 1
-        if sz == 0:
-            sz = 1
-
-        element_bytesize = (dl_tensor.dtype.bits // 8)
-        sz = sz * element_bytesize
-        element_offset = dl_tensor.byte_offset // element_bytesize
-
-        # transfer ownership
-        if not versioned:
-            dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
-            cpython.PyCapsule_SetName(py_caps, "used_dltensor")
-        else:
-            dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
-            cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
-
-        if dl_tensor.data is NULL:
-            usm_mem = dpmem.MemoryUSMDevice(sz, q)
-        else:
-            mem_ptr_delta = dl_tensor.byte_offset - (
-                element_offset * element_bytesize
-            )
-            mem_ptr = <char *>dl_tensor.data
-            alloc_sz = dl_tensor.byte_offset + <uint64_t>(
-                (offset_max + 1) * element_bytesize)
-            tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-                <DPCTLSyclUSMRef> mem_ptr,
-                max(alloc_sz, <uint64_t>element_bytesize),
-                (<c_dpctl.SyclQueue>q).get_queue_ref(),
-                memory_owner=dlmv_holder if versioned else dlm_holder
-            )
-            if mem_ptr_delta == 0:
-                usm_mem = tmp
-            else:
-                alloc_sz = dl_tensor.byte_offset + <uint64_t>(
-                    (offset_max * element_bytesize + mem_ptr_delta))
-                usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-                    <DPCTLSyclUSMRef> (
-                        mem_ptr + (element_bytesize - mem_ptr_delta)
-                    ),
-                    max(alloc_sz, <uint64_t>element_bytesize),
-                    (<c_dpctl.SyclQueue>q).get_queue_ref(),
-                    memory_owner=tmp
-                )
-
-        py_shape = list()
-        if (dl_tensor.shape is not NULL):
-            for i in range(dl_tensor.ndim):
-                py_shape.append(dl_tensor.shape[i])
-        if (dl_tensor.strides is not NULL):
-            py_strides = list()
-            for i in range(dl_tensor.ndim):
-                py_strides.append(dl_tensor.strides[i])
-        else:
-            py_strides = None
-        if (dl_tensor.dtype.code == kDLUInt):
-            ary_dt = np.dtype("u" + str(element_bytesize))
-        elif (dl_tensor.dtype.code == kDLInt):
-            ary_dt = np.dtype("i" + str(element_bytesize))
-        elif (dl_tensor.dtype.code == kDLFloat):
-            ary_dt = np.dtype("f" + str(element_bytesize))
-        elif (dl_tensor.dtype.code == kDLComplex):
-            ary_dt = np.dtype("c" + str(element_bytesize))
-        elif (dl_tensor.dtype.code == kDLBool):
-            ary_dt = np.dtype("?")
-        else:
-            raise BufferError(
-                "Can not import DLPack tensor with type code {}.".format(
-                    <object>dl_tensor.dtype.code
-                )
-            )
-        res_ary = usm_ndarray(
-            py_shape,
-            dtype=ary_dt,
-            buffer=usm_mem,
-            strides=py_strides,
-            offset=element_offset
-        )
-        if readonly:
-            res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE)
-        return res_ary
-    elif _is_kdlcpu_device(&dl_tensor.device):
-        ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly)
-        if not versioned:
-            dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
-            cpython.PyCapsule_SetName(py_caps, "used_dltensor")
-            return np.ctypeslib.as_array(
-                _numpy_array_interface_wrapper(ary_iface, dlm_holder)
-            )
-        else:
-            dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
-            cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
-            return np.ctypeslib.as_array(
-                _numpy_array_interface_wrapper(ary_iface, dlmv_holder)
-            )
-    else:
-        raise BufferError(
-            "The DLPack tensor resides on unsupported device."
-        )
-
-cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev : Device):
-    q = dev.sycl_queue
-    np_ary = np.asarray(host_blob)
-    dt = np_ary.dtype
-    if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
-        Xusm_dtype = (
-            "float32" if dt.char == "d" else "complex64"
-        )
-    else:
-        Xusm_dtype = dt
-    usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
-    usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
-    usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
-    return usm_ary
-
-
-# only cdef to make it private
-cdef object _create_device(object device, object dl_device):
-    if isinstance(device, Device):
-        return device
-    elif isinstance(device, dpctl.SyclDevice):
-        return Device.create_device(device)
-    else:
-        root_device = dpctl.SyclDevice(str(<int>dl_device[1]))
-        return Device.create_device(root_device)
-
-
-def from_dlpack(x, /, *, device=None, copy=None):
-    """from_dlpack(x, /, *, device=None, copy=None)
-
-    Constructs :class:`dpctl.tensor.usm_ndarray` or :class:`numpy.ndarray`
-    instance from a Python object ``x`` that implements ``__dlpack__`` protocol.
-
-    Args:
-        x (object):
-            A Python object representing an array that supports
-            ``__dlpack__`` protocol.
-        device (
-            Optional[str, :class:`dpctl.SyclDevice`,
-            :class:`dpctl.SyclQueue`,
-            :class:`dpctl.tensor.Device`,
-            tuple([:class:`enum.IntEnum`, int])])
-        ):
-            Device where the output array is to be placed. ``device`` keyword
-            values can be:
-
-            * ``None``
-                The data remains on the same device.
-            * oneAPI filter selector string
-                SYCL device selected by :ref:`filter selector string
-                <filter_selector_string>`.
-            * :class:`dpctl.SyclDevice`
-                explicit SYCL device that must correspond to
-                a non-partitioned SYCL device.
-            * :class:`dpctl.SyclQueue`
-                implies SYCL device targeted by the SYCL queue.
-            * :class:`dpctl.tensor.Device`
-                implies SYCL device `device.sycl_queue`. The `Device` object
-                is obtained via :attr:`dpctl.tensor.usm_ndarray.device`.
-            * ``(device_type, device_id)``
-               2-tuple matching the format of the output of the
-               ``__dlpack_device__`` method: an integer enumerator representing
-               the device type followed by an integer representing the index of
-               the device. The only supported :class:`dpctl.tensor.DLDeviceType`
-               device types are ``"kDLCPU"`` and ``"kDLOneAPI"``.
-
-            Default: ``None``.
-
-        copy (bool, optional)
-            Boolean indicating whether or not to copy the input.
-
-            * If ``copy`` is ``True``, the input will always be
-              copied.
-            * If ``False``, a ``BufferError`` will be raised if a
-              copy is deemed necessary.
-            * If ``None``, a copy will be made only if deemed
-              necessary, otherwise, the existing memory buffer will
-              be reused.
-
-            Default: ``None``.
-
-    Returns:
-        Alternative[usm_ndarray, numpy.ndarray]:
-            An array containing the data in ``x``. When ``copy`` is
-            ``None`` or ``False``, this may be a view into the original
-            memory.
-
-            The type of the returned object
-            depends on where the data backing up input object ``x`` resides.
-            If it resides in a USM allocation on a SYCL device, the
-            type :class:`dpctl.tensor.usm_ndarray` is returned, otherwise if it
-            resides on ``"kDLCPU"`` device the type is :class:`numpy.ndarray`,
-            and otherwise an exception is raised.
-
-            .. note::
-
-                If the return type is :class:`dpctl.tensor.usm_ndarray`, the
-                associated SYCL queue is derived from the ``device`` keyword.
-                When ``device`` keyword value has type :class:`dpctl.SyclQueue`,
-                the explicit queue instance is used, when ``device`` keyword
-                value has type :class:`dpctl.tensor.Device`, the
-                ``device.sycl_queue`` is used. In all other cases, the cached
-                SYCL queue corresponding to the implied SYCL device is used.
-
-    Raises:
-        TypeError:
-            if ``x`` does not implement ``__dlpack__`` method
-        ValueError:
-            if data of the input object resides on an unsupported device
-
-    See https://dmlc.github.io/dlpack/latest/ for more details.
-
-    :Example:
-
-        .. code-block:: python
-
-            import dpctl
-            import dpctl.tensor as dpt
-
-            class Container:
-                "Helper class implementing `__dlpack__` protocol"
-                def __init__(self, array):
-                    self._array = array
-
-                def __dlpack__(self, stream=None):
-                    return self._array.__dlpack__(stream=stream)
-
-                def __dlpack_device__(self):
-                    return self._array.__dlpack_device__()
-
-            C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
-            # create usm_ndarray view
-            X = dpt.from_dlpack(C)
-            # migrate content of the container to device of type kDLCPU
-            Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
-
-    """
-    dlpack_attr = getattr(x, "__dlpack__", None)
-    dlpack_dev_attr = getattr(x, "__dlpack_device__", None)
-    if not callable(dlpack_attr) or not callable(dlpack_dev_attr):
-        raise TypeError(
-            f"The argument of type {type(x)} does not implement "
-            "`__dlpack__` and `__dlpack_device__` methods."
-        )
-    # device is converted to a dlpack_device if necessary
-    dl_device = None
-    if device:
-        if isinstance(device, tuple):
-            dl_device = device
-            if len(dl_device) != 2:
-                raise ValueError(
-                    "Argument `device` specified as a tuple must have length 2"
-                )
-        else:
-            if not isinstance(device, dpctl.SyclDevice):
-                device = Device.create_device(device)
-                d = device.sycl_device
-            else:
-                d = device
-            dl_device = (device_OneAPI, d.get_device_id())
-    if dl_device is not None:
-        if (dl_device[0] not in [device_OneAPI, device_CPU]):
-            raise ValueError(
-                f"Argument `device`={device} is not supported."
-            )
-    got_type_error = False
-    got_buffer_error = False
-    got_other_error = False
-    saved_exception = None
-    # First DLPack version supporting dl_device, and copy
-    requested_ver = (1, 0)
-    cpu_dev = (device_CPU, 0)
-    try:
-        # setting max_version to minimal version that supports
-        # dl_device/copy keywords
-        dlpack_capsule = dlpack_attr(
-            max_version=requested_ver,
-            dl_device=dl_device,
-            copy=copy
-        )
-    except TypeError:
-        # exporter does not support max_version keyword
-        got_type_error = True
-    except (BufferError, NotImplementedError, ValueError) as e:
-        # Either dl_device, or copy cannot be satisfied
-        got_buffer_error = True
-        saved_exception = e
-    except Exception as e:
-        got_other_error = True
-        saved_exception = e
-    else:
-        # execution did not raise exceptions
-        return from_dlpack_capsule(dlpack_capsule)
-    finally:
-        if got_type_error:
-            # max_version/dl_device, copy keywords are not supported
-            # by __dlpack__
-            x_dldev = dlpack_dev_attr()
-            if (dl_device is None) or (dl_device == x_dldev):
-                dlpack_capsule = dlpack_attr()
-                return from_dlpack_capsule(dlpack_capsule)
-            # must copy via host
-            if copy is False:
-                raise BufferError(
-                    "Importing data via DLPack requires copying, but "
-                    "copy=False was provided"
-                )
-            # when max_version/dl_device/copy are not supported
-            # we can only support importing to OneAPI devices
-            # from host, or from another oneAPI device
-            is_supported_x_dldev = (
-                x_dldev == cpu_dev or
-                (x_dldev[0] == device_OneAPI)
-            )
-            is_supported_dl_device = (
-                dl_device == cpu_dev or
-                dl_device[0] == device_OneAPI
-            )
-            if is_supported_x_dldev and is_supported_dl_device:
-                dlpack_capsule = dlpack_attr()
-                blob = from_dlpack_capsule(dlpack_capsule)
-            else:
-                raise BufferError(
-                    f"Can not import to requested device {dl_device}"
-                )
-            dev = _create_device(device, dl_device)
-            if x_dldev == cpu_dev and dl_device == cpu_dev:
-                # both source and destination are CPU
-                return blob
-            elif x_dldev == cpu_dev:
-                # source is CPU, destination is oneAPI
-                return _to_usm_ary_from_host_blob(blob, dev)
-            elif dl_device == cpu_dev:
-                # source is oneAPI, destination is CPU
-                cpu_caps = blob.__dlpack__(
-                    max_version=get_build_dlpack_version(),
-                    dl_device=cpu_dev
-                )
-                return from_dlpack_capsule(cpu_caps)
-            else:
-                import dpctl.tensor as dpt
-                return dpt.asarray(blob, device=dev)
-        elif got_buffer_error:
-            # we are here, because dlpack_attr could not deal with requested
-            # dl_device, or copying was required
-            if copy is False:
-                raise BufferError(
-                    "Importing data via DLPack requires copying, but "
-                    "copy=False was provided"
-                )
-            if dl_device is None:
-                raise saved_exception
-            # must copy via host
-            if dl_device[0] != device_OneAPI:
-                raise BufferError(
-                    f"Can not import to requested device {dl_device}"
-                )
-            x_dldev = dlpack_dev_attr()
-            if x_dldev == cpu_dev:
-                dlpack_capsule = dlpack_attr()
-                host_blob = from_dlpack_capsule(dlpack_capsule)
-            else:
-                dlpack_capsule = dlpack_attr(
-                    max_version=requested_ver,
-                    dl_device=cpu_dev,
-                    copy=copy
-                )
-                host_blob = from_dlpack_capsule(dlpack_capsule)
-            dev = _create_device(device, dl_device)
-            return _to_usm_ary_from_host_blob(host_blob, dev)
-        elif got_other_error:
-            raise saved_exception
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
deleted file mode 100644
index 491ef75c56..0000000000
--- a/dpctl/tensor/_elementwise_common.py
+++ /dev/null
@@ -1,990 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
-from ._scalar_utils import (
-    _get_dtype,
-    _get_queue_usm_type,
-    _get_shape,
-    _validate_dtype,
-)
-from ._type_utils import (
-    _acceptance_fn_default_binary,
-    _acceptance_fn_default_unary,
-    _all_data_types,
-    _find_buf_dtype,
-    _find_buf_dtype2,
-    _find_buf_dtype_in_place_op,
-    _resolve_weak_types,
-)
-
-
-class UnaryElementwiseFunc:
-    """
-    Class that implements unary element-wise functions.
-
-    Args:
-        name (str):
-            Name of the unary function
-        result_type_resovler_fn (callable):
-            Function that takes dtype of the input and
-            returns the dtype of the result if the
-            implementation functions supports it, or
-            returns `None` otherwise.
-        unary_dp_impl_fn (callable):
-            Data-parallel implementation function with signature
-            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
-             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
-            where the `src` is the argument array, `dst` is the
-            array to be populated with function values, effectively
-            evaluating `dst = func(src)`.
-            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
-            The first event corresponds to data-management host tasks,
-            including lifetime management of argument Python objects to ensure
-            that their associated USM allocation is not freed before offloaded
-            computational tasks complete execution, while the second event
-            corresponds to computational tasks associated with function
-            evaluation.
-        acceptance_fn (callable, optional):
-            Function to influence type promotion behavior of this unary
-            function. The function takes 4 arguments:
-                arg_dtype - Data type of the first argument
-                buf_dtype - Data type the argument would be cast to
-                res_dtype - Data type of the output array with function values
-                sycl_dev - The :class:`dpctl.SyclDevice` where the function
-                    evaluation is carried out.
-            The function is invoked when the argument of the unary function
-            requires casting, e.g. the argument of `dpctl.tensor.log` is an
-            array with integral data type.
-        docs (str):
-            Documentation string for the unary function.
-    """
-
-    def __init__(
-        self,
-        name,
-        result_type_resolver_fn,
-        unary_dp_impl_fn,
-        docs,
-        acceptance_fn=None,
-    ):
-        self.__name__ = "UnaryElementwiseFunc"
-        self.name_ = name
-        self.result_type_resolver_fn_ = result_type_resolver_fn
-        self.types_ = None
-        self.unary_fn_ = unary_dp_impl_fn
-        self.__doc__ = docs
-        if callable(acceptance_fn):
-            self.acceptance_fn_ = acceptance_fn
-        else:
-            self.acceptance_fn_ = _acceptance_fn_default_unary
-
-    def __str__(self):
-        return f"<{self.__name__} '{self.name_}'>"
-
-    def __repr__(self):
-        return f"<{self.__name__} '{self.name_}'>"
-
-    def get_implementation_function(self):
-        """Returns the implementation function for
-        this elementwise unary function.
-
-        """
-        return self.unary_fn_
-
-    def get_type_result_resolver_function(self):
-        """Returns the type resolver function for this
-        elementwise unary function.
-        """
-        return self.result_type_resolver_fn_
-
-    def get_type_promotion_path_acceptance_function(self):
-        """Returns the acceptance function for this
-        elementwise binary function.
-
-        Acceptance function influences the type promotion
-        behavior of this unary function.
-        The function takes 4 arguments:
-            arg_dtype - Data type of the first argument
-            buf_dtype - Data type the argument would be cast to
-            res_dtype - Data type of the output array with function values
-            sycl_dev - The :class:`dpctl.SyclDevice` where the function
-                evaluation is carried out.
-        The function is invoked when the argument of the unary function
-        requires casting, e.g. the argument of `dpctl.tensor.log` is an
-        array with integral data type.
-        """
-        return self.acceptance_fn_
-
-    @property
-    def nin(self):
-        """
-        Returns the number of arguments treated as inputs.
-        """
-        return 1
-
-    @property
-    def nout(self):
-        """
-        Returns the number of arguments treated as outputs.
-        """
-        return 1
-
-    @property
-    def types(self):
-        """Returns information about types supported by
-        implementation function, using NumPy's character
-        encoding for data types, e.g.
-
-        :Example:
-            .. code-block:: python
-
-                dpctl.tensor.sin.types
-                # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D']
-        """
-        types = self.types_
-        if not types:
-            types = []
-            for dt1 in _all_data_types(True, True):
-                dt2 = self.result_type_resolver_fn_(dt1)
-                if dt2:
-                    types.append(f"{dt1.char}->{dt2.char}")
-            self.types_ = types
-        return types
-
-    def __call__(self, x, /, *, out=None, order="K"):
-        if not isinstance(x, dpt.usm_ndarray):
-            raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-        if order not in ["C", "F", "K", "A"]:
-            order = "K"
-        buf_dt, res_dt = _find_buf_dtype(
-            x.dtype,
-            self.result_type_resolver_fn_,
-            x.sycl_device,
-            acceptance_fn=self.acceptance_fn_,
-        )
-        if res_dt is None:
-            raise ValueError(
-                f"function '{self.name_}' does not support input type "
-                f"({x.dtype}), "
-                "and the input could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
-
-        orig_out = out
-        if out is not None:
-            if not isinstance(out, dpt.usm_ndarray):
-                raise TypeError(
-                    f"output array must be of usm_ndarray type, got {type(out)}"
-                )
-
-            if not out.flags.writable:
-                raise ValueError("provided `out` array is read-only")
-
-            if out.shape != x.shape:
-                raise ValueError(
-                    "The shape of input and output arrays are inconsistent. "
-                    f"Expected output shape is {x.shape}, got {out.shape}"
-                )
-
-            if res_dt != out.dtype:
-                raise ValueError(
-                    f"Output array of type {res_dt} is needed, "
-                    f"got {out.dtype}"
-                )
-
-            if (
-                buf_dt is None
-                and ti._array_overlap(x, out)
-                and not ti._same_logical_tensors(x, out)
-            ):
-                # Allocate a temporary buffer to avoid memory overlapping.
-                # Note if `buf_dt` is not None, a temporary copy of `x` will be
-                # created, so the array overlap check isn't needed.
-                out = dpt.empty_like(out)
-
-            if (
-                dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
-                    "Input and output allocation queues are not compatible"
-                )
-
-        exec_q = x.sycl_queue
-        _manager = SequentialOrderManager[exec_q]
-        if buf_dt is None:
-            if out is None:
-                if order == "K":
-                    out = _empty_like_orderK(x, res_dt)
-                else:
-                    if order == "A":
-                        order = "F" if x.flags.f_contiguous else "C"
-                    out = dpt.empty_like(x, dtype=res_dt, order=order)
-
-            dep_evs = _manager.submitted_events
-            ht_unary_ev, unary_ev = self.unary_fn_(
-                x, out, sycl_queue=exec_q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_unary_ev, unary_ev)
-
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
-                )
-                _manager.add_event_pair(ht_copy_ev, cpy_ev)
-                out = orig_out
-
-            return out
-
-        if order == "K":
-            buf = _empty_like_orderK(x, buf_dt)
-        else:
-            if order == "A":
-                order = "F" if x.flags.f_contiguous else "C"
-            buf = dpt.empty_like(x, dtype=buf_dt, order=order)
-
-        dep_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_orderK(buf, res_dt)
-            else:
-                out = dpt.empty_like(buf, dtype=res_dt, order=order)
-
-        ht, uf_ev = self.unary_fn_(
-            buf, out, sycl_queue=exec_q, depends=[copy_ev]
-        )
-        _manager.add_event_pair(ht, uf_ev)
-
-        return out
-
-
-class BinaryElementwiseFunc:
-    """
-    Class that implements binary element-wise functions.
-
-    Args:
-        name (str):
-            Name of the unary function
-        result_type_resovle_fn (callable):
-            Function that takes dtypes of the input and
-            returns the dtype of the result if the
-            implementation functions supports it, or
-            returns `None` otherwise.
-        binary_dp_impl_fn (callable):
-            Data-parallel implementation function with signature
-            `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray,
-             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
-            where the `src1` and `src2` are the argument arrays, `dst` is the
-            array to be populated with function values,
-            i.e. `dst=func(src1, src2)`.
-            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
-            The first event corresponds to data-management host tasks,
-            including lifetime management of argument Python objects to ensure
-            that their associated USM allocation is not freed before offloaded
-            computational tasks complete execution, while the second event
-            corresponds to computational tasks associated with function
-            evaluation.
-        docs (str):
-            Documentation string for the unary function.
-        binary_inplace_fn (callable, optional):
-            Data-parallel implementation function with signature
-            `impl_fn(src: usm_ndarray, dst: usm_ndarray,
-             sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])`
-            where the `src` is the argument array, `dst` is the
-            array to be populated with function values,
-            i.e. `dst=func(dst, src)`.
-            The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s.
-            The first event corresponds to data-management host tasks,
-            including async lifetime management of Python arguments,
-            while the second event corresponds to computational tasks
-            associated with function evaluation.
-        acceptance_fn (callable, optional):
-            Function to influence type promotion behavior of this binary
-            function. The function takes 6 arguments:
-                arg1_dtype - Data type of the first argument
-                arg2_dtype - Data type of the second argument
-                ret_buf1_dtype - Data type the first argument would be cast to
-                ret_buf2_dtype - Data type the second argument would be cast to
-                res_dtype - Data type of the output array with function values
-                sycl_dev - The :class:`dpctl.SyclDevice` where the function
-                    evaluation is carried out.
-            The function is only called when both arguments of the binary
-            function require casting, e.g. both arguments of
-            `dpctl.tensor.logaddexp` are arrays with integral data type.
-    """
-
-    def __init__(
-        self,
-        name,
-        result_type_resolver_fn,
-        binary_dp_impl_fn,
-        docs,
-        binary_inplace_fn=None,
-        acceptance_fn=None,
-        weak_type_resolver=None,
-    ):
-        self.__name__ = "BinaryElementwiseFunc"
-        self.name_ = name
-        self.result_type_resolver_fn_ = result_type_resolver_fn
-        self.types_ = None
-        self.binary_fn_ = binary_dp_impl_fn
-        self.binary_inplace_fn_ = binary_inplace_fn
-        self.__doc__ = docs
-        if callable(acceptance_fn):
-            self.acceptance_fn_ = acceptance_fn
-        else:
-            self.acceptance_fn_ = _acceptance_fn_default_binary
-        if callable(weak_type_resolver):
-            self.weak_type_resolver_ = weak_type_resolver
-        else:
-            self.weak_type_resolver_ = _resolve_weak_types
-
-    def __str__(self):
-        return f"<{self.__name__} '{self.name_}'>"
-
-    def __repr__(self):
-        return f"<{self.__name__} '{self.name_}'>"
-
-    def get_implementation_function(self):
-        """Returns the out-of-place implementation
-        function for this elementwise binary function.
-
-        """
-        return self.binary_fn_
-
-    def get_implementation_inplace_function(self):
-        """Returns the in-place implementation
-        function for this elementwise binary function.
-
-        """
-        return self.binary_inplace_fn_
-
-    def get_type_result_resolver_function(self):
-        """Returns the type resolver function for this
-        elementwise binary function.
-        """
-        return self.result_type_resolver_fn_
-
-    def get_type_promotion_path_acceptance_function(self):
-        """Returns the acceptance function for this
-        elementwise binary function.
-
-        Acceptance function influences the type promotion
-        behavior of this binary function.
-        The function takes 6 arguments:
-            arg1_dtype - Data type of the first argument
-            arg2_dtype - Data type of the second argument
-            ret_buf1_dtype - Data type the first argument would be cast to
-            ret_buf2_dtype - Data type the second argument would be cast to
-            res_dtype - Data type of the output array with function values
-            sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation
-                is carried out.
-
-        The acceptance function is only invoked if both input arrays must be
-        cast to intermediary data types, as would happen during call of
-        `dpctl.tensor.hypot` with both arrays being of integral data type.
-        """
-        return self.acceptance_fn_
-
-    def get_array_dtype_scalar_type_resolver_function(self):
-        """Returns the function which determines how to treat
-        Python scalar types for this elementwise binary function.
-
-        Resolver influences what type the scalar will be
-        treated as prior to type promotion behavior.
-        The function takes 3 arguments:
-
-        Args:
-            o1_dtype (object, dtype):
-                A class representing a Python scalar type or a ``dtype``
-            o2_dtype (object, dtype):
-                A class representing a Python scalar type or a ``dtype``
-            sycl_dev (:class:`dpctl.SyclDevice`):
-                Device on which function evaluation is carried out.
-
-        One of ``o1_dtype`` and ``o2_dtype`` must be a ``dtype`` instance.
-        """
-        return self.weak_type_resolver_
-
-    @property
-    def nin(self):
-        """
-        Returns the number of arguments treated as inputs.
-        """
-        return 2
-
-    @property
-    def nout(self):
-        """
-        Returns the number of arguments treated as outputs.
-        """
-        return 1
-
-    @property
-    def types(self):
-        """Returns information about types supported by
-        implementation function, using NumPy's character
-        encoding for data types, e.g.
-
-        :Example:
-            .. code-block:: python
-
-                dpctl.tensor.divide.types
-                # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D',
-                #    'Ff->F', 'FF->F', 'Dd->D', 'DD->D']
-        """
-        types = self.types_
-        if not types:
-            types = []
-            _all_dtypes = _all_data_types(True, True)
-            for dt1 in _all_dtypes:
-                for dt2 in _all_dtypes:
-                    dt3 = self.result_type_resolver_fn_(dt1, dt2)
-                    if dt3:
-                        types.append(f"{dt1.char}{dt2.char}->{dt3.char}")
-            self.types_ = types
-        return types
-
-    def __call__(self, o1, o2, /, *, out=None, order="K"):
-        if order not in ["K", "C", "F", "A"]:
-            order = "K"
-        q1, o1_usm_type = _get_queue_usm_type(o1)
-        q2, o2_usm_type = _get_queue_usm_type(o2)
-        if q1 is None and q2 is None:
-            raise ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments. "
-                "One of the arguments must represent USM allocation and "
-                "expose `__sycl_usm_array_interface__` property"
-            )
-        if q1 is None:
-            exec_q = q2
-            res_usm_type = o2_usm_type
-        elif q2 is None:
-            exec_q = q1
-            res_usm_type = o1_usm_type
-        else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    o1_usm_type,
-                    o2_usm_type,
-                )
-            )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-        o1_shape = _get_shape(o1)
-        o2_shape = _get_shape(o2)
-        if not all(
-            isinstance(s, (tuple, list))
-            for s in (
-                o1_shape,
-                o2_shape,
-            )
-        ):
-            raise TypeError(
-                "Shape of arguments can not be inferred. "
-                "Arguments are expected to be "
-                "lists, tuples, or both"
-            )
-        try:
-            res_shape = _broadcast_shape_impl(
-                [
-                    o1_shape,
-                    o2_shape,
-                ]
-            )
-        except ValueError:
-            raise ValueError(
-                "operands could not be broadcast together with shapes "
-                f"{o1_shape} and {o2_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        o1_dtype = _get_dtype(o1, sycl_dev)
-        o2_dtype = _get_dtype(o2, sycl_dev)
-        if not all(_validate_dtype(o) for o in (o1_dtype, o2_dtype)):
-            raise ValueError("Operands have unsupported data types")
-
-        o1_dtype, o2_dtype = self.weak_type_resolver_(
-            o1_dtype, o2_dtype, sycl_dev
-        )
-
-        buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
-            o1_dtype,
-            o2_dtype,
-            self.result_type_resolver_fn_,
-            sycl_dev,
-            acceptance_fn=self.acceptance_fn_,
-        )
-
-        if res_dt is None:
-            raise ValueError(
-                f"function '{self.name_}' does not support input types "
-                f"({o1_dtype}, {o2_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
-
-        orig_out = out
-        _manager = SequentialOrderManager[exec_q]
-        if out is not None:
-            if not isinstance(out, dpt.usm_ndarray):
-                raise TypeError(
-                    f"output array must be of usm_ndarray type, got {type(out)}"
-                )
-
-            if not out.flags.writable:
-                raise ValueError("provided `out` array is read-only")
-
-            if out.shape != res_shape:
-                raise ValueError(
-                    "The shape of input and output arrays are inconsistent. "
-                    f"Expected output shape is {res_shape}, got {out.shape}"
-                )
-
-            if res_dt != out.dtype:
-                raise ValueError(
-                    f"Output array of type {res_dt} is needed, "
-                    f"got {out.dtype}"
-                )
-
-            if (
-                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
-                is None
-            ):
-                raise ExecutionPlacementError(
-                    "Input and output allocation queues are not compatible"
-                )
-
-            if isinstance(o1, dpt.usm_ndarray):
-                if ti._array_overlap(o1, out) and buf1_dt is None:
-                    if not ti._same_logical_tensors(o1, out):
-                        out = dpt.empty_like(out)
-                    elif self.binary_inplace_fn_ is not None:
-                        # if there is a dedicated in-place kernel
-                        # it can be called here, otherwise continues
-                        if isinstance(o2, dpt.usm_ndarray):
-                            src2 = o2
-                            if (
-                                ti._array_overlap(o2, out)
-                                and not ti._same_logical_tensors(o2, out)
-                                and buf2_dt is None
-                            ):
-                                buf2_dt = o2_dtype
-                        else:
-                            src2 = dpt.asarray(
-                                o2, dtype=o2_dtype, sycl_queue=exec_q
-                            )
-                        if buf2_dt is None:
-                            if src2.shape != res_shape:
-                                src2 = dpt.broadcast_to(src2, res_shape)
-                            dep_evs = _manager.submitted_events
-                            ht_, comp_ev = self.binary_inplace_fn_(
-                                lhs=o1,
-                                rhs=src2,
-                                sycl_queue=exec_q,
-                                depends=dep_evs,
-                            )
-                            _manager.add_event_pair(ht_, comp_ev)
-                        else:
-                            buf2 = dpt.empty_like(src2, dtype=buf2_dt)
-                            dep_evs = _manager.submitted_events
-                            (
-                                ht_copy_ev,
-                                copy_ev,
-                            ) = ti._copy_usm_ndarray_into_usm_ndarray(
-                                src=src2,
-                                dst=buf2,
-                                sycl_queue=exec_q,
-                                depends=dep_evs,
-                            )
-                            _manager.add_event_pair(ht_copy_ev, copy_ev)
-
-                            buf2 = dpt.broadcast_to(buf2, res_shape)
-                            ht_, bf_ev = self.binary_inplace_fn_(
-                                lhs=o1,
-                                rhs=buf2,
-                                sycl_queue=exec_q,
-                                depends=[copy_ev],
-                            )
-                            _manager.add_event_pair(ht_, bf_ev)
-
-                        return out
-
-            if isinstance(o2, dpt.usm_ndarray):
-                if (
-                    ti._array_overlap(o2, out)
-                    and not ti._same_logical_tensors(o2, out)
-                    and buf2_dt is None
-                ):
-                    # should not reach if out is reallocated
-                    # after being checked against o1
-                    out = dpt.empty_like(out)
-
-        if isinstance(o1, dpt.usm_ndarray):
-            src1 = o1
-        else:
-            src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
-        if isinstance(o2, dpt.usm_ndarray):
-            src2 = o2
-        else:
-            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
-
-        if order == "A":
-            order = (
-                "F"
-                if all(
-                    arr.flags.f_contiguous
-                    for arr in (
-                        src1,
-                        src2,
-                    )
-                )
-                else "C"
-            )
-
-        if buf1_dt is None and buf2_dt is None:
-            if out is None:
-                if order == "K":
-                    out = _empty_like_pair_orderK(
-                        src1, src2, res_dt, res_shape, res_usm_type, exec_q
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-            if src1.shape != res_shape:
-                src1 = dpt.broadcast_to(src1, res_shape)
-            if src2.shape != res_shape:
-                src2 = dpt.broadcast_to(src2, res_shape)
-            deps_ev = _manager.submitted_events
-            ht_binary_ev, binary_ev = self.binary_fn_(
-                src1=src1,
-                src2=src2,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=deps_ev,
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-        elif buf1_dt is None:
-            if order == "K":
-                buf2 = _empty_like_orderK(src2, buf2_dt)
-            else:
-                buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
-            dep_evs = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-            if out is None:
-                if order == "K":
-                    out = _empty_like_pair_orderK(
-                        src1, buf2, res_dt, res_shape, res_usm_type, exec_q
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-
-            if src1.shape != res_shape:
-                src1 = dpt.broadcast_to(src1, res_shape)
-            buf2 = dpt.broadcast_to(buf2, res_shape)
-            ht_binary_ev, binary_ev = self.binary_fn_(
-                src1=src1,
-                src2=buf2,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-        elif buf2_dt is None:
-            if order == "K":
-                buf1 = _empty_like_orderK(src1, buf1_dt)
-            else:
-                buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
-            dep_evs = _manager.submitted_events
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-            if out is None:
-                if order == "K":
-                    out = _empty_like_pair_orderK(
-                        buf1, src2, res_dt, res_shape, res_usm_type, exec_q
-                    )
-                else:
-                    out = dpt.empty(
-                        res_shape,
-                        dtype=res_dt,
-                        usm_type=res_usm_type,
-                        sycl_queue=exec_q,
-                        order=order,
-                    )
-
-            buf1 = dpt.broadcast_to(buf1, res_shape)
-            if src2.shape != res_shape:
-                src2 = dpt.broadcast_to(src2, res_shape)
-            ht_binary_ev, binary_ev = self.binary_fn_(
-                src1=buf1,
-                src2=src2,
-                dst=out,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_binary_ev, binary_ev)
-            if not (orig_out is None or orig_out is out):
-                # Copy the out data from temporary buffer to original memory
-                ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=out,
-                    dst=orig_out,
-                    sycl_queue=exec_q,
-                    depends=[binary_ev],
-                )
-                _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-                out = orig_out
-            return out
-
-        if order == "K":
-            if src1.flags.c_contiguous and src2.flags.c_contiguous:
-                order = "C"
-            elif src1.flags.f_contiguous and src2.flags.f_contiguous:
-                order = "F"
-        if order == "K":
-            buf1 = _empty_like_orderK(src1, buf1_dt)
-        else:
-            buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
-        dep_evs = _manager.submitted_events
-        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-        if order == "K":
-            buf2 = _empty_like_orderK(src2, buf2_dt)
-        else:
-            buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
-        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-
-        buf1 = dpt.broadcast_to(buf1, res_shape)
-        buf2 = dpt.broadcast_to(buf2, res_shape)
-        ht_, bf_ev = self.binary_fn_(
-            src1=buf1,
-            src2=buf2,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy1_ev, copy2_ev],
-        )
-        _manager.add_event_pair(ht_, bf_ev)
-        return out
-
-    def _inplace_op(self, o1, o2):
-        if self.binary_inplace_fn_ is None:
-            raise ValueError(
-                "binary function does not have a dedicated in-place "
-                "implementation"
-            )
-        if not isinstance(o1, dpt.usm_ndarray):
-            raise TypeError(
-                "Expected first argument to be "
-                f"dpctl.tensor.usm_ndarray, got {type(o1)}"
-            )
-        if not o1.flags.writable:
-            raise ValueError("provided left-hand side array is read-only")
-        q1, o1_usm_type = o1.sycl_queue, o1.usm_type
-        q2, o2_usm_type = _get_queue_usm_type(o2)
-        if q2 is None:
-            exec_q = q1
-            res_usm_type = o1_usm_type
-        else:
-            exec_q = dpctl.utils.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            res_usm_type = dpctl.utils.get_coerced_usm_type(
-                (
-                    o1_usm_type,
-                    o2_usm_type,
-                )
-            )
-        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-        o1_shape = o1.shape
-        o2_shape = _get_shape(o2)
-        if not isinstance(o2_shape, (tuple, list)):
-            raise TypeError(
-                "Shape of second argument can not be inferred. "
-                "Expected list or tuple."
-            )
-        try:
-            res_shape = _broadcast_shape_impl(
-                [
-                    o1_shape,
-                    o2_shape,
-                ]
-            )
-        except ValueError:
-            raise ValueError(
-                "operands could not be broadcast together with shapes "
-                f"{o1_shape} and {o2_shape}"
-            )
-
-        if res_shape != o1_shape:
-            raise ValueError(
-                "The shape of the non-broadcastable left-hand "
-                f"side {o1_shape} is inconsistent with the "
-                f"broadcast shape {res_shape}."
-            )
-
-        sycl_dev = exec_q.sycl_device
-        o1_dtype = o1.dtype
-        o2_dtype = _get_dtype(o2, sycl_dev)
-        if not _validate_dtype(o2_dtype):
-            raise ValueError("Operand has an unsupported data type")
-
-        o1_dtype, o2_dtype = self.weak_type_resolver_(
-            o1_dtype, o2_dtype, sycl_dev
-        )
-
-        buf_dt, res_dt = _find_buf_dtype_in_place_op(
-            o1_dtype,
-            o2_dtype,
-            self.result_type_resolver_fn_,
-            sycl_dev,
-        )
-
-        if res_dt is None:
-            raise ValueError(
-                f"function '{self.name_}' does not support input types "
-                f"({o1_dtype}, {o2_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule "
-                "''same_kind''."
-            )
-
-        if res_dt != o1_dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, " f"got {o1_dtype}"
-            )
-
-        _manager = SequentialOrderManager[exec_q]
-        if isinstance(o2, dpt.usm_ndarray):
-            src2 = o2
-            if (
-                ti._array_overlap(o2, o1)
-                and not ti._same_logical_tensors(o2, o1)
-                and buf_dt is None
-            ):
-                buf_dt = o2_dtype
-        else:
-            src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
-        if buf_dt is None:
-            if src2.shape != res_shape:
-                src2 = dpt.broadcast_to(src2, res_shape)
-            dep_evs = _manager.submitted_events
-            ht_, comp_ev = self.binary_inplace_fn_(
-                lhs=o1,
-                rhs=src2,
-                sycl_queue=exec_q,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_, comp_ev)
-        else:
-            buf = dpt.empty_like(src2, dtype=buf_dt)
-            dep_evs = _manager.submitted_events
-            (
-                ht_copy_ev,
-                copy_ev,
-            ) = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=src2,
-                dst=buf,
-                sycl_queue=exec_q,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-
-            buf = dpt.broadcast_to(buf, res_shape)
-            ht_, bf_ev = self.binary_inplace_fn_(
-                lhs=o1,
-                rhs=buf,
-                sycl_queue=exec_q,
-                depends=[copy_ev],
-            )
-            _manager.add_event_pair(ht_, bf_ev)
-
-        return o1
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
deleted file mode 100644
index 46c44417d4..0000000000
--- a/dpctl/tensor/_elementwise_funcs.py
+++ /dev/null
@@ -1,2268 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl.tensor._tensor_elementwise_impl as ti
-
-from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc
-from ._type_utils import (
-    _acceptance_fn_divide,
-    _acceptance_fn_negative,
-    _acceptance_fn_reciprocal,
-    _acceptance_fn_subtract,
-    _resolve_weak_types_all_py_ints,
-)
-
-# U01: ==== ABS    (x)
-_abs_docstring_ = r"""
-abs(x, /, \*, out=None, order='K')
-
-Calculates the absolute value for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array,
-        if parameter `out` is ``None``.
-        Default: `"K"`.
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise absolute values.
-        For complex input, the absolute value is its magnitude.
-        If `x` has a real-valued data type, the returned array has the
-        same data type as `x`. If `x` has a complex floating-point data type,
-        the returned array has a real-valued floating-point data type whose
-        precision matches the precision of `x`.
-"""
-
-abs = UnaryElementwiseFunc("abs", ti._abs_result_type, ti._abs, _abs_docstring_)
-del _abs_docstring_
-
-# U02: ==== ACOS   (x)
-_acos_docstring = r"""
-acos(x, /, \*, out=None, order='K')
-
-Computes inverse cosine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise inverse cosine, in radians
-        and in the closed interval :math:`[0, \pi]`. The data type of the
-        returned array is determined by the Type Promotion Rules.
-"""
-
-acos = UnaryElementwiseFunc(
-    "acos", ti._acos_result_type, ti._acos, _acos_docstring
-)
-del _acos_docstring
-
-# U03: ===== ACOSH (x)
-_acosh_docstring = r"""
-acosh(x, /, \*, out=None, order='K')
-
-Computes inverse hyperbolic cosine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise inverse hyperbolic cosine, in
-        radians and in the half-closed interval :math:`[0, \infty)`. The data
-        type of the returned array is determined by the Type Promotion Rules.
-"""
-
-acosh = UnaryElementwiseFunc(
-    "acosh", ti._acosh_result_type, ti._acosh, _acosh_docstring
-)
-del _acosh_docstring
-
-# B01: ===== ADD   (x1, x2)
-
-_add_docstring_ = r"""
-add(x1, x2, /, \*, out=None, order='K')
-
-Calculates the sum for each element `x1_i` of the input array `x1` with
-the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise sums. The data type of the
-        returned array is determined by the Type Promotion Rules.
-"""
-add = BinaryElementwiseFunc(
-    "add",
-    ti._add_result_type,
-    ti._add,
-    _add_docstring_,
-    binary_inplace_fn=ti._add_inplace,
-)
-del _add_docstring_
-
-# U04: ===== ASIN  (x)
-_asin_docstring = r"""
-asin(x, /, \*, out=None, order='K')
-
-Computes inverse sine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise inverse sine, in radians
-        and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-asin = UnaryElementwiseFunc(
-    "asin", ti._asin_result_type, ti._asin, _asin_docstring
-)
-del _asin_docstring
-
-# U05: ===== ASINH (x)
-_asinh_docstring = r"""
-asinh(x, /, \*, out=None, order='K')
-
-Computes inverse hyperbolic sine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise inverse hyperbolic sine, in
-        radians. The data type of the returned array is determined by
-        the Type Promotion Rules.
-"""
-
-asinh = UnaryElementwiseFunc(
-    "asinh", ti._asinh_result_type, ti._asinh, _asinh_docstring
-)
-del _asinh_docstring
-
-# U06: ===== ATAN  (x)
-_atan_docstring = r"""
-atan(x, /, \*, out=None, order='K')
-
-Computes inverse tangent for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise inverse tangent, in radians
-        and in the closed interval :math:`[-\pi/2, \pi/2]`. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-atan = UnaryElementwiseFunc(
-    "atan", ti._atan_result_type, ti._atan, _atan_docstring
-)
-del _atan_docstring
-
-# B02: ===== ATAN2 (x1, x2)
-_atan2_docstring_ = r"""
-atan2(x1, x2, /, \*, out=None, order='K')
-
-Calculates the inverse tangent of the quotient `x1_i/x2_i` for each element
-`x1_i` of the input array `x1` with the respective element `x2_i` of the
-input array `x2`. Each element-wise result is expressed in radians.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued floating-point
-        data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued
-        floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the inverse tangent of the quotient `x1`/`x2`.
-        The returned array must have a real-valued floating-point data type
-        determined by Type Promotion Rules.
-"""
-
-atan2 = BinaryElementwiseFunc(
-    "atan2", ti._atan2_result_type, ti._atan2, _atan2_docstring_
-)
-del _atan2_docstring_
-
-# U07: ===== ATANH (x)
-_atanh_docstring = r"""
-atanh(x, /, \*, out=None, order='K')
-
-Computes hyperbolic inverse tangent for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise hyperbolic inverse tangent, in
-        radians. The data type of the returned array is determined by
-        the Type Promotion Rules.
-"""
-
-atanh = UnaryElementwiseFunc(
-    "atanh", ti._atanh_result_type, ti._atanh, _atanh_docstring
-)
-del _atanh_docstring
-
-# B03: ===== BITWISE_AND           (x1, x2)
-_bitwise_and_docstring_ = r"""
-bitwise_and(x1, x2, /, \*, out=None, order='K')
-
-Computes the bitwise AND of the underlying binary representation of each
-element `x1_i` of the input array `x1` with the respective element `x2_i`
-of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have integer or boolean data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have integer or boolean data
-        type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-bitwise_and = BinaryElementwiseFunc(
-    "bitwise_and",
-    ti._bitwise_and_result_type,
-    ti._bitwise_and,
-    _bitwise_and_docstring_,
-    binary_inplace_fn=ti._bitwise_and_inplace,
-)
-del _bitwise_and_docstring_
-
-# B04: ===== BITWISE_LEFT_SHIFT    (x1, x2)
-_bitwise_left_shift_docstring_ = r"""
-bitwise_left_shift(x1, x2, /, \*, out=None, order='K')
-
-Shifts the bits of each element `x1_i` of the input array x1 to the left by
-appending `x2_i` (i.e., the respective element in the input array `x2`) zeros to
-the right of `x1_i`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have integer data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have integer data type.
-        Each element must be greater than or equal to 0.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-bitwise_left_shift = BinaryElementwiseFunc(
-    "bitwise_left_shift",
-    ti._bitwise_left_shift_result_type,
-    ti._bitwise_left_shift,
-    _bitwise_left_shift_docstring_,
-    binary_inplace_fn=ti._bitwise_left_shift_inplace,
-)
-del _bitwise_left_shift_docstring_
-
-
-# U08: ===== BITWISE_INVERT        (x)
-_bitwise_invert_docstring = r"""
-bitwise_invert(x, /, \*, out=None, order='K')
-
-Inverts (flips) each bit for each element `x_i` of the input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have integer or boolean data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results.
-        The data type of the returned array is same as the data type of the
-        input array.
-"""
-
-bitwise_invert = UnaryElementwiseFunc(
-    "bitwise_invert",
-    ti._bitwise_invert_result_type,
-    ti._bitwise_invert,
-    _bitwise_invert_docstring,
-)
-del _bitwise_invert_docstring
-
-# B05: ===== BITWISE_OR            (x1, x2)
-_bitwise_or_docstring_ = r"""
-bitwise_or(x1, x2, /, \*, out=None, order='K')
-
-Computes the bitwise OR of the underlying binary representation of each
-element `x1_i` of the input array `x1` with the respective element `x2_i`
-of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have integer or boolean data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have integer or boolean data
-        type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-bitwise_or = BinaryElementwiseFunc(
-    "bitwise_or",
-    ti._bitwise_or_result_type,
-    ti._bitwise_or,
-    _bitwise_or_docstring_,
-    binary_inplace_fn=ti._bitwise_or_inplace,
-)
-del _bitwise_or_docstring_
-
-# B06: ===== BITWISE_RIGHT_SHIFT   (x1, x2)
-_bitwise_right_shift_docstring_ = r"""
-bitwise_right_shift(x1, x2, /, \*, out=None, order='K')
-
-Shifts the bits of each element `x1_i` of the input array `x1` to the right
-according to the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have integer data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have integer data type.
-        Each element must be greater than or equal to 0.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-bitwise_right_shift = BinaryElementwiseFunc(
-    "bitwise_right_shift",
-    ti._bitwise_right_shift_result_type,
-    ti._bitwise_right_shift,
-    _bitwise_right_shift_docstring_,
-    binary_inplace_fn=ti._bitwise_right_shift_inplace,
-)
-del _bitwise_right_shift_docstring_
-
-
-# B07: ===== BITWISE_XOR           (x1, x2)
-_bitwise_xor_docstring_ = r"""
-bitwise_xor(x1, x2, /, \*, out=None, order='K')
-
-Computes the bitwise XOR of the underlying binary representation of each
-element `x1_i` of the input array `x1` with the respective element `x2_i`
-of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have integer or boolean data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have integer or boolean data
-        type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-bitwise_xor = BinaryElementwiseFunc(
-    "bitwise_xor",
-    ti._bitwise_xor_result_type,
-    ti._bitwise_xor,
-    _bitwise_xor_docstring_,
-    binary_inplace_fn=ti._bitwise_xor_inplace,
-)
-del _bitwise_xor_docstring_
-
-
-# U09: ==== CEIL          (x)
-_ceil_docstring = r"""
-ceil(x, /, \*, out=None, order='K')
-
-Returns the ceiling for each element `x_i` for input array `x`.
-
-The ceil of `x_i` is the smallest integer `n`, such that `n >= x_i`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a boolean or real-valued data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise ceiling.
-"""
-
-ceil = UnaryElementwiseFunc(
-    "ceil", ti._ceil_result_type, ti._ceil, _ceil_docstring
-)
-del _ceil_docstring
-
-# U10: ==== CONJ          (x)
-_conj_docstring = r"""
-conj(x, /, \*, out=None, order='K')
-
-Computes conjugate of each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise conjugate values.
-"""
-
-conj = UnaryElementwiseFunc(
-    "conj", ti._conj_result_type, ti._conj, _conj_docstring
-)
-del _conj_docstring
-
-# U11: ==== COS           (x)
-_cos_docstring = r"""
-cos(x, /, \*, out=None, order='K')
-
-Computes cosine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise cosine. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-cos = UnaryElementwiseFunc("cos", ti._cos_result_type, ti._cos, _cos_docstring)
-del _cos_docstring
-
-# U12: ==== COSH          (x)
-_cosh_docstring = r"""
-cosh(x, /, \*, out=None, order='K')
-
-Computes hyperbolic cosine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise hyperbolic cosine. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-cosh = UnaryElementwiseFunc(
-    "cosh", ti._cosh_result_type, ti._cosh, _cosh_docstring
-)
-del _cosh_docstring
-
-# B08: ==== DIVIDE        (x1, x2)
-_divide_docstring_ = r"""
-divide(x1, x2, /, \*, out=None, order='K')
-
-Calculates the ratio for each element `x1_i` of the input array `x1` with
-the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a floating-point data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise division. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-divide = BinaryElementwiseFunc(
-    "divide",
-    ti._divide_result_type,
-    ti._divide,
-    _divide_docstring_,
-    binary_inplace_fn=ti._divide_inplace,
-    acceptance_fn=_acceptance_fn_divide,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _divide_docstring_
-
-# B09: ==== EQUAL         (x1, x2)
-_equal_docstring_ = r"""
-equal(x1, x2, /, \*, out=None, order='K')
-
-Calculates equality test results for each element `x1_i` of the input array `x1`
-with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise equality comparison.
-        The returned array has a data type of `bool`.
-"""
-
-equal = BinaryElementwiseFunc(
-    "equal",
-    ti._equal_result_type,
-    ti._equal,
-    _equal_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _equal_docstring_
-
-# U13: ==== EXP           (x)
-_exp_docstring = r"""
-exp(x, /, \*, out=None, order='K')
-
-Computes the exponential for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise exponential of `x`.
-        The data type of the returned array is determined by
-        the Type Promotion Rules.
-"""
-
-exp = UnaryElementwiseFunc("exp", ti._exp_result_type, ti._exp, _exp_docstring)
-del _exp_docstring
-
-# U14: ==== EXPM1         (x)
-_expm1_docstring = r"""
-expm1(x, /, \*, out=None, order='K')
-
-Computes the exponential minus 1 for each element `x_i` of input array `x`.
-
-This function calculates `exp(x) - 1.0` more accurately for small values of `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise `exp(x) - 1` results.
-        The data type of the returned array is determined by the Type
-        Promotion Rules.
-"""
-
-expm1 = UnaryElementwiseFunc(
-    "expm1", ti._expm1_result_type, ti._expm1, _expm1_docstring
-)
-del _expm1_docstring
-
-# U15: ==== FLOOR         (x)
-_floor_docstring = r"""
-floor(x, /, \*, out=None, order='K')
-
-Returns the floor for each element `x_i` for input array `x`.
-
-The floor of `x_i` is the largest integer `n`, such that `n <= x_i`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a boolean or real-valued data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise floor.
-"""
-
-floor = UnaryElementwiseFunc(
-    "floor", ti._floor_result_type, ti._floor, _floor_docstring
-)
-del _floor_docstring
-
-# B10: ==== FLOOR_DIVIDE  (x1, x2)
-_floor_divide_docstring_ = r"""
-floor_divide(x1, x2, /, \*, out=None, order='K')
-
-Calculates the ratio for each element `x1_i` of the input array `x1` with
-the respective element `x2_i` of the input array `x2` to the greatest
-integer-value number that is not greater than the division result.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise floor of division.
-        The data type of the returned array is determined by the Type
-        Promotion Rules.
-"""
-
-floor_divide = BinaryElementwiseFunc(
-    "floor_divide",
-    ti._floor_divide_result_type,
-    ti._floor_divide,
-    _floor_divide_docstring_,
-    binary_inplace_fn=ti._floor_divide_inplace,
-)
-del _floor_divide_docstring_
-
-# B11: ==== GREATER       (x1, x2)
-_greater_docstring_ = r"""
-greater(x1, x2, /, \*, out=None, order='K')
-
-Computes the greater-than test results for each element `x1_i` of
-the input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise greater-than comparison.
-        The returned array has a data type of `bool`.
-"""
-
-greater = BinaryElementwiseFunc(
-    "greater",
-    ti._greater_result_type,
-    ti._greater,
-    _greater_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _greater_docstring_
-
-# B12: ==== GREATER_EQUAL (x1, x2)
-_greater_equal_docstring_ = r"""
-greater_equal(x1, x2, /, \*, out=None, order='K')
-
-Computes the greater-than or equal-to test results for each element `x1_i` of
-the input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise greater-than or equal-to
-        comparison.
-        The returned array has a data type of `bool`.
-"""
-
-greater_equal = BinaryElementwiseFunc(
-    "greater_equal",
-    ti._greater_equal_result_type,
-    ti._greater_equal,
-    _greater_equal_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _greater_equal_docstring_
-
-# U16: ==== IMAG        (x)
-_imag_docstring = r"""
-imag(x, /, \*, out=None, order='K')
-
-Computes imaginary part of each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise imaginary component of input.
-        If the input is a real-valued data type, the returned array has
-        the same data type. If the input is a complex floating-point
-        data type, the returned array has a floating-point data type
-        with the same floating-point precision as complex input.
-"""
-
-imag = UnaryElementwiseFunc(
-    "imag", ti._imag_result_type, ti._imag, _imag_docstring
-)
-del _imag_docstring
-
-# U17: ==== ISFINITE    (x)
-_isfinite_docstring_ = r"""
-isfinite(x, /, \*, out=None, order='K')
-
-Test if each element of input array is a finite number.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array which is True where `x` is not positive infinity,
-        negative infinity, or NaN, False otherwise.
-        The data type of the returned array is `bool`.
-"""
-
-isfinite = UnaryElementwiseFunc(
-    "isfinite", ti._isfinite_result_type, ti._isfinite, _isfinite_docstring_
-)
-del _isfinite_docstring_
-
-# U18: ==== ISINF       (x)
-_isinf_docstring_ = r"""
-isinf(x, /, \*, out=None, order='K')
-
-Test if each element of input array is an infinity.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array which is True where `x` is positive or negative infinity,
-        False otherwise. The data type of the returned array is `bool`.
-"""
-
-isinf = UnaryElementwiseFunc(
-    "isinf", ti._isinf_result_type, ti._isinf, _isinf_docstring_
-)
-del _isinf_docstring_
-
-# U19: ==== ISNAN       (x)
-_isnan_docstring_ = r"""
-isnan(x, /, \*, out=None, order='K')
-
-Test if each element of an input array is a NaN.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array which is True where x is NaN, False otherwise.
-        The data type of the returned array is `bool`.
-"""
-
-isnan = UnaryElementwiseFunc(
-    "isnan", ti._isnan_result_type, ti._isnan, _isnan_docstring_
-)
-del _isnan_docstring_
-
-# B13: ==== LESS        (x1, x2)
-_less_docstring_ = r"""
-less(x1, x2, /, \*, out=None, order='K')
-
-Computes the less-than test results for each element `x1_i` of
-the input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise less-than comparison.
-        The returned array has a data type of `bool`.
-"""
-
-less = BinaryElementwiseFunc(
-    "less",
-    ti._less_result_type,
-    ti._less,
-    _less_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _less_docstring_
-
-
-# B14: ==== LESS_EQUAL  (x1, x2)
-_less_equal_docstring_ = r"""
-less_equal(x1, x2, /, \*, out=None, order='K')
-
-Computes the less-than or equal-to test results for each element `x1_i` of
-the input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise less-than or equal-to
-        comparison. The returned array has a data type of `bool`.
-"""
-
-less_equal = BinaryElementwiseFunc(
-    "less_equal",
-    ti._less_equal_result_type,
-    ti._less_equal,
-    _less_equal_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _less_equal_docstring_
-
-# U20: ==== LOG         (x)
-_log_docstring = r"""
-log(x, /, \*, out=None, order='K')
-
-Computes the natural logarithm for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise natural logarithm values.
-        The data type of the returned array is determined by the Type
-        Promotion Rules.
-"""
-
-log = UnaryElementwiseFunc("log", ti._log_result_type, ti._log, _log_docstring)
-del _log_docstring
-
-# U21: ==== LOG1P       (x)
-_log1p_docstring = r"""
-log1p(x, /, \*, out=None, order='K')
-
-Computes the natural logarithm of (1 + `x`) for each element `x_i` of input
-array `x`.
-
-This function calculates `log(1 + x)` more accurately for small values of `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise `log(1 + x)` results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-log1p = UnaryElementwiseFunc(
-    "log1p", ti._log1p_result_type, ti._log1p, _log1p_docstring
-)
-del _log1p_docstring
-
-# U22: ==== LOG2        (x)
-_log2_docstring_ = r"""
-log2(x, /, \*, out=None, order='K')
-
-Computes the base-2 logarithm for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise base-2 logarithm of `x`.
-        The data type of the returned array is determined by the
-        Type Promotion Rules.
-"""
-
-log2 = UnaryElementwiseFunc(
-    "log2", ti._log2_result_type, ti._log2, _log2_docstring_
-)
-del _log2_docstring_
-
-# U23: ==== LOG10       (x)
-_log10_docstring_ = r"""
-log10(x, /, \*, out=None, order='K')
-
-Computes the base-10 logarithm for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: `"K"`.
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise base-10 logarithm of `x`.
-        The data type of the returned array is determined by the
-        Type Promotion Rules.
-"""
-
-log10 = UnaryElementwiseFunc(
-    "log10", ti._log10_result_type, ti._log10, _log10_docstring_
-)
-del _log10_docstring_
-
-# B15: ==== LOGADDEXP   (x1, x2)
-_logaddexp_docstring_ = r"""
-logaddexp(x1, x2, /, \*, out=None, order='K')
-
-Calculates the natural logarithm of the sum of exponentials for each element
-`x1_i` of the input array `x1` with the respective element `x2_i` of the input
-array `x2`.
-
-This function calculates `log(exp(x1) + exp(x2))` more accurately for small
-values of `x`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued floating-point data
-        type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued floating-point
-        data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-logaddexp = BinaryElementwiseFunc(
-    "logaddexp", ti._logaddexp_result_type, ti._logaddexp, _logaddexp_docstring_
-)
-del _logaddexp_docstring_
-
-# B16: ==== LOGICAL_AND (x1, x2)
-_logical_and_docstring_ = r"""
-logical_and(x1, x2, /, \*, out=None, order='K')
-
-Computes the logical AND for each element `x1_i` of the input array `x1` with
-the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise logical AND results.
-"""
-logical_and = BinaryElementwiseFunc(
-    "logical_and",
-    ti._logical_and_result_type,
-    ti._logical_and,
-    _logical_and_docstring_,
-)
-del _logical_and_docstring_
-
-# U24: ==== LOGICAL_NOT (x)
-_logical_not_docstring = r"""
-logical_not(x, /, \*, out=None, order='K')
-
-Computes the logical NOT for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise logical NOT results.
-"""
-
-logical_not = UnaryElementwiseFunc(
-    "logical_not",
-    ti._logical_not_result_type,
-    ti._logical_not,
-    _logical_not_docstring,
-)
-del _logical_not_docstring
-
-# B17: ==== LOGICAL_OR  (x1, x2)
-_logical_or_docstring_ = r"""
-logical_or(x1, x2, /, \*, out=None, order='K')
-
-Computes the logical OR for each element `x1_i` of the input array `x1`
-with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise logical OR results.
-"""
-logical_or = BinaryElementwiseFunc(
-    "logical_or",
-    ti._logical_or_result_type,
-    ti._logical_or,
-    _logical_or_docstring_,
-)
-del _logical_or_docstring_
-
-# B18: ==== LOGICAL_XOR (x1, x2)
-_logical_xor_docstring_ = r"""
-logical_xor(x1, x2, /, \*, out=None, order='K')
-
-Computes the logical XOR for each element `x1_i` of the input array `x1`
-with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise logical XOR results.
-"""
-logical_xor = BinaryElementwiseFunc(
-    "logical_xor",
-    ti._logical_xor_result_type,
-    ti._logical_xor,
-    _logical_xor_docstring_,
-)
-del _logical_xor_docstring_
-
-# B26: ==== MAXIMUM    (x1, x2)
-_maximum_docstring_ = r"""
-maximum(x1, x2, /, \*, out=None, order='K')
-
-Compares two input arrays `x1` and `x2` and returns a new array containing the
-element-wise maxima.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise maxima. The data type of
-        the returned array is determined by the Type Promotion Rules.
-"""
-maximum = BinaryElementwiseFunc(
-    "maximum",
-    ti._maximum_result_type,
-    ti._maximum,
-    _maximum_docstring_,
-)
-del _maximum_docstring_
-
-# B27: ==== MINIMUM    (x1, x2)
-_minimum_docstring_ = r"""
-minimum(x1, x2, /, \*, out=None, order='K')
-
-Compares two input arrays `x1` and `x2` and returns a new array containing the
-element-wise minima.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise minima. The data type of
-        the returned array is determined by the Type Promotion Rules.
-"""
-minimum = BinaryElementwiseFunc(
-    "minimum",
-    ti._minimum_result_type,
-    ti._minimum,
-    _minimum_docstring_,
-)
-del _minimum_docstring_
-
-# B19: ==== MULTIPLY    (x1, x2)
-_multiply_docstring_ = r"""
-multiply(x1, x2, /, \*, out=None, order='K')
-
-Calculates the product for each element `x1_i` of the input array `x1` with the
-respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array. May have any data type.
-    x2 (usm_ndarray):
-        Second input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise products. The data type of
-        the returned array is determined by the Type Promotion Rules.
-"""
-multiply = BinaryElementwiseFunc(
-    "multiply",
-    ti._multiply_result_type,
-    ti._multiply,
-    _multiply_docstring_,
-    binary_inplace_fn=ti._multiply_inplace,
-)
-del _multiply_docstring_
-
-# U25: ==== NEGATIVE    (x)
-_negative_docstring_ = r"""
-negative(x, /, \*, out=None, order='K')
-
-Computes the numerical negative for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a numeric data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the negative of `x`.
-"""
-
-negative = UnaryElementwiseFunc(
-    "negative",
-    ti._negative_result_type,
-    ti._negative,
-    _negative_docstring_,
-    acceptance_fn=_acceptance_fn_negative,
-)
-del _negative_docstring_
-
-# B28: ==== NEXTAFTER    (x1, x2)
-_nextafter_docstring_ = r"""
-nextafter(x1, x2, /, \*, out=None, order='K')
-
-Calculates the next floating-point value after element `x1_i` of the input
-array `x1` toward the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued floating-point data
-        type.
-    x2 (usm_ndarray):
-        Second input array, expected to have a real-valued floating-point data
-        type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise next representable values of `x1`
-        in the direction of `x2`. The data type of the returned array is
-        determined by the Type Promotion Rules.
-"""
-nextafter = BinaryElementwiseFunc(
-    "nextafter",
-    ti._nextafter_result_type,
-    ti._nextafter,
-    _nextafter_docstring_,
-)
-del _nextafter_docstring_
-
-# B20: ==== NOT_EQUAL   (x1, x2)
-_not_equal_docstring_ = r"""
-not_equal(x1, x2, /, \*, out=None, order='K')
-
-Calculates inequality test results for each element `x1_i` of the
-input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array.
-    x2 (usm_ndarray):
-        Second input array.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise inequality comparison.
-        The returned array has a data type of `bool`.
-"""
-
-not_equal = BinaryElementwiseFunc(
-    "not_equal",
-    ti._not_equal_result_type,
-    ti._not_equal,
-    _not_equal_docstring_,
-    weak_type_resolver=_resolve_weak_types_all_py_ints,
-)
-del _not_equal_docstring_
-
-# U26: ==== POSITIVE    (x)
-_positive_docstring_ = r"""
-positive(x, /, \*, out=None, order='K')
-
-Computes the numerical positive for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a numeric data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the positive of `x`.
-"""
-
-positive = UnaryElementwiseFunc(
-    "positive", ti._positive_result_type, ti._positive, _positive_docstring_
-)
-del _positive_docstring_
-
-# B21: ==== POW         (x1, x2)
-_pow_docstring_ = r"""
-pow(x1, x2, /, \*, out=None, order='K')
-
-Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array
-`x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a numeric data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a numeric data type.
-    out (usm_ndarray):
-        Output array to populate. Array must have the correct
-        shape and the expected data type.
-    order ("C","F","A","K", optional): memory layout of the new
-        output array, if parameter `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the bases in `x1` raised to the exponents in `x2`
-        element-wise. The data type of the returned array is determined by the
-        Type Promotion Rules.
-"""
-pow = BinaryElementwiseFunc(
-    "pow",
-    ti._pow_result_type,
-    ti._pow,
-    _pow_docstring_,
-    binary_inplace_fn=ti._pow_inplace,
-)
-del _pow_docstring_
-
-# U40: ==== PROJ        (x)
-_proj_docstring = r"""
-proj(x, /, \*, out=None, order='K')
-
-Computes projection of each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a complex data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise projection.
-"""
-
-proj = UnaryElementwiseFunc(
-    "proj", ti._proj_result_type, ti._proj, _proj_docstring
-)
-del _proj_docstring
-
-# U27: ==== REAL        (x)
-_real_docstring = r"""
-real(x, /, \*, out=None, order='K')
-
-Computes real part of each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise real component of input.
-        If the input is a real-valued data type, the returned array has
-        the same data type. If the input is a complex floating-point
-        data type, the returned array has a floating-point data type
-        with the same floating-point precision as complex input.
-"""
-
-real = UnaryElementwiseFunc(
-    "real", ti._real_result_type, ti._real, _real_docstring
-)
-del _real_docstring
-
-# B22: ==== REMAINDER   (x1, x2)
-_remainder_docstring_ = r"""
-remainder(x1, x2, /, \*, out=None, order='K')
-
-Calculates the remainder of division for each element `x1_i` of the input array
-`x1` with the respective element `x2_i` of the input array `x2`.
-
-This function is equivalent to the Python modulus operator.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise remainders. Each remainder has the
-        same sign as respective element `x2_i`. The data type of the returned
-        array is determined by the Type Promotion Rules.
-"""
-remainder = BinaryElementwiseFunc(
-    "remainder",
-    ti._remainder_result_type,
-    ti._remainder,
-    _remainder_docstring_,
-    binary_inplace_fn=ti._remainder_inplace,
-)
-del _remainder_docstring_
-
-# U28: ==== ROUND       (x)
-_round_docstring = r"""
-round(x, /, \*, out=None, order='K')
-
-Rounds each element `x_i` of the input array `x` to
-the nearest integer-valued number.
-
-When two integers are equally close to `x_i`, the result is the nearest even
-integer to `x_i`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a numeric data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise rounded values.
-"""
-
-round = UnaryElementwiseFunc(
-    "round", ti._round_result_type, ti._round, _round_docstring
-)
-del _round_docstring
-
-# U29: ==== SIGN        (x)
-_sign_docstring = r"""
-sign(x, /, \*, out=None, order='K')
-
-Computes an indication of the sign of each element `x_i` of input array `x`
-using the signum function.
-
-The signum function returns `-1` if `x_i` is less than `0`,
-`0` if `x_i` is equal to `0`, and `1` if `x_i` is greater than `0`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a numeric data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise result of the signum function. The
-        data type of the returned array is determined by the Type Promotion
-        Rules.
-"""
-
-sign = UnaryElementwiseFunc(
-    "sign", ti._sign_result_type, ti._sign, _sign_docstring
-)
-del _sign_docstring
-
-# U41: ==== SIGNBIT        (x)
-_signbit_docstring = r"""
-signbit(x, /, \*, out=None, order='K')
-
-Computes an indication of whether the sign bit of each element `x_i` of
-input array `x` is set.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a real-valued floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise signbit results. The returned array
-        must have a data type of `bool`.
-"""
-
-signbit = UnaryElementwiseFunc(
-    "signbit", ti._signbit_result_type, ti._signbit, _signbit_docstring
-)
-del _signbit_docstring
-
-# U30: ==== SIN         (x)
-_sin_docstring = r"""
-sin(x, /, \*, out=None, order='K')
-
-Computes sine for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a real-valued floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise sine. The data type of the
-        returned array is determined by the Type Promotion Rules.
-"""
-
-sin = UnaryElementwiseFunc("sin", ti._sin_result_type, ti._sin, _sin_docstring)
-del _sin_docstring
-
-# U31: ==== SINH        (x)
-_sinh_docstring = r"""
-sinh(x, /, \*, out=None, order='K')
-
-Computes hyperbolic sine for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise hyperbolic sine. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-sinh = UnaryElementwiseFunc(
-    "sinh", ti._sinh_result_type, ti._sinh, _sinh_docstring
-)
-del _sinh_docstring
-
-# U32: ==== SQUARE      (x)
-_square_docstring_ = r"""
-square(x, /, \*, out=None, order='K')
-
-Squares each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array. May have any data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise squares of `x`. The data type of
-        the returned array is determined by the Type Promotion Rules.
-"""
-
-square = UnaryElementwiseFunc(
-    "square", ti._square_result_type, ti._square, _square_docstring_
-)
-del _square_docstring_
-
-# U33: ==== SQRT        (x)
-_sqrt_docstring_ = r"""
-sqrt(x, /, \*, out=None, order='K')
-
-Computes the positive square-root for each element `x_i` of input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise positive square-roots of `x`. The
-        data type of the returned array is determined by the Type Promotion
-        Rules.
-"""
-
-sqrt = UnaryElementwiseFunc(
-    "sqrt", ti._sqrt_result_type, ti._sqrt, _sqrt_docstring_
-)
-del _sqrt_docstring_
-
-# B23: ==== SUBTRACT    (x1, x2)
-_subtract_docstring_ = r"""
-subtract(x1, x2, /, \*, out=None, order='K')
-
-Calculates the difference between each element `x1_i` of the input
-array `x1` and the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a numeric data type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a numeric data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise differences. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-subtract = BinaryElementwiseFunc(
-    "subtract",
-    ti._subtract_result_type,
-    ti._subtract,
-    _subtract_docstring_,
-    binary_inplace_fn=ti._subtract_inplace,
-    acceptance_fn=_acceptance_fn_subtract,
-)
-del _subtract_docstring_
-
-# U34: ==== TAN         (x)
-_tan_docstring = r"""
-tan(x, /, \*, out=None, order='K')
-
-Computes tangent for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise tangent. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-tan = UnaryElementwiseFunc("tan", ti._tan_result_type, ti._tan, _tan_docstring)
-del _tan_docstring
-
-# U35: ==== TANH        (x)
-_tanh_docstring = r"""
-tanh(x, /, \*, out=None, order='K')
-
-Computes hyperbolic tangent for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise hyperbolic tangent. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-tanh = UnaryElementwiseFunc(
-    "tanh", ti._tanh_result_type, ti._tanh, _tanh_docstring
-)
-del _tanh_docstring
-
-# U36: ==== TRUNC       (x)
-_trunc_docstring = r"""
-trunc(x, /, \*, out=None, order='K')
-
-Returns the truncated value for each element `x_i` for input array `x`.
-
-The truncated value of the scalar `x` is the nearest integer i which is
-closer to zero than `x` is. In short, the fractional part of the
-signed number `x` is discarded.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a boolean or real-valued data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the result of element-wise division. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-trunc = UnaryElementwiseFunc(
-    "trunc", ti._trunc_result_type, ti._trunc, _trunc_docstring
-)
-del _trunc_docstring
-
-
-# B24: ==== HYPOT        (x1, x2)
-_hypot_docstring_ = r"""
-hypot(x1, x2, /, \*, out=None, order='K')
-
-Computes the square root of the sum of squares for each element `x1_i` of the
-input array `x1` with the respective element `x2_i` of the input array `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued floating-point data
-        type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued floating-point
-        data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array must have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise hypotenuse. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-
-hypot = BinaryElementwiseFunc(
-    "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_
-)
-del _hypot_docstring_
-
-
-# U37: ==== CBRT        (x)
-_cbrt_docstring_ = r"""
-cbrt(x, /, \*, out=None, order='K')
-
-Computes the cube-root for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a real-valued floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise cube-root.
-        The data type of the returned array is determined by
-        the Type Promotion Rules.
-"""
-
-cbrt = UnaryElementwiseFunc(
-    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
-)
-del _cbrt_docstring_
-
-
-# U38: ==== EXP2        (x)
-_exp2_docstring_ = r"""
-exp2(x, /, \*, out=None, order='K')
-
-Computes the base-2 exponential for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise base-2 exponentials.
-        The data type of the returned array is determined by
-        the Type Promotion Rules.
-"""
-
-exp2 = UnaryElementwiseFunc(
-    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
-)
-del _exp2_docstring_
-
-
-# B25: ==== COPYSIGN    (x1, x2)
-_copysign_docstring_ = r"""
-copysign(x1, x2, /, \*, out=None, order='K')
-
-Composes a floating-point value with the magnitude of `x1_i` and the sign of
-`x2_i` for each element of input arrays `x1` and `x2`.
-
-Args:
-    x1 (usm_ndarray):
-        First input array, expected to have a real-valued floating-point data
-        type.
-    x2 (usm_ndarray):
-        Second input array, also expected to have a real-valued floating-point
-        data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise results. The data type
-        of the returned array is determined by the Type Promotion Rules.
-"""
-copysign = BinaryElementwiseFunc(
-    "copysign",
-    ti._copysign_result_type,
-    ti._copysign,
-    _copysign_docstring_,
-)
-del _copysign_docstring_
-
-
-# U39: ==== RSQRT        (x)
-_rsqrt_docstring_ = r"""
-rsqrt(x, /, \*, out=None, order='K')
-
-Computes the reciprocal square-root for each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a real-valued floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise reciprocal square-root.
-        The returned array has a floating-point data type determined by
-        the Type Promotion Rules.
-"""
-
-rsqrt = UnaryElementwiseFunc(
-    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
-)
-del _rsqrt_docstring_
-
-
-# U42: ==== RECIPROCAL        (x)
-_reciprocal_docstring = r"""
-reciprocal(x, /, \*, out=None, order='K')
-
-Computes the reciprocal of each element `x_i` for input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise reciprocals.
-        The returned array has a floating-point data type determined
-        by the Type Promotion Rules.
-"""
-
-reciprocal = UnaryElementwiseFunc(
-    "reciprocal",
-    ti._reciprocal_result_type,
-    ti._reciprocal,
-    _reciprocal_docstring,
-    acceptance_fn=_acceptance_fn_reciprocal,
-)
-del _reciprocal_docstring
-
-
-# U43: ==== ANGLE        (x)
-_angle_docstring = r"""
-angle(x, /, \*, out=None, order='K')
-
-Computes the phase angle (also called the argument) of each element `x_i` for
-input array `x`.
-
-Args:
-    x (usm_ndarray):
-        Input array, expected to have a complex floating-point data type.
-    out (Union[usm_ndarray, None], optional):
-        Output array to populate.
-        Array have the correct shape and the expected data type.
-    order ("C","F","A","K", optional):
-        Memory layout of the new output array, if parameter
-        `out` is ``None``.
-        Default: "K".
-
-Returns:
-    usm_ndarray:
-        An array containing the element-wise phase angles.
-        The returned array has a floating-point data type determined
-        by the Type Promotion Rules.
-"""
-
-angle = UnaryElementwiseFunc(
-    "angle",
-    ti._angle_result_type,
-    ti._angle,
-    _angle_docstring,
-)
-del _angle_docstring
-
-del ti
diff --git a/dpctl/tensor/_flags.pyx b/dpctl/tensor/_flags.pyx
deleted file mode 100644
index 052687838c..0000000000
--- a/dpctl/tensor/_flags.pyx
+++ /dev/null
@@ -1,163 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-# cython: linetrace=True
-
-from libcpp cimport bool as cpp_bool
-
-from dpctl.tensor._usmarray cimport (
-    USM_ARRAY_C_CONTIGUOUS,
-    USM_ARRAY_F_CONTIGUOUS,
-    USM_ARRAY_WRITABLE,
-    usm_ndarray,
-)
-
-
-cdef cpp_bool _check_bit(int flag, int mask):
-    return (flag & mask) == mask
-
-
-cdef class Flags:
-    """
-    Helper class to query the flags of a :class:`dpctl.tensor.usm_ndarray`
-    instance, which describe how the instance interfaces with its underlying
-    memory.
-    """
-    cdef int flags_
-    cdef usm_ndarray arr_
-
-    def __cinit__(self, usm_ndarray arr, int flags):
-        self.arr_ = arr
-        self.flags_ = flags
-
-    @property
-    def flags(self):
-        """
-        Integer representation of the memory layout flags of
-        :class:`dpctl.tensor.usm_ndarray` instance.
-        """
-        return self.flags_
-
-    @property
-    def c_contiguous(self):
-        """
-        True if the memory layout of the
-        :class:`dpctl.tensor.usm_ndarray` instance is C-contiguous.
-        """
-        return _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
-
-    @property
-    def f_contiguous(self):
-        """
-        True if the memory layout of the
-        :class:`dpctl.tensor.usm_ndarray` instance is F-contiguous.
-        """
-        return _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
-
-    @property
-    def writable(self):
-        """
-        True if :class:`dpctl.tensor.usm_ndarray` instance is writable.
-        """
-        return _check_bit(self.flags_, USM_ARRAY_WRITABLE)
-
-    @writable.setter
-    def writable(self, new_val):
-        if not isinstance(new_val, bool):
-            raise TypeError("Expecting a boolean value")
-        self.arr_._set_writable_flag(new_val)
-
-    @property
-    def fc(self):
-        """
-        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
-        instance is C-contiguous and F-contiguous.
-        """
-        return (
-           _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
-           and _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
-        )
-
-    @property
-    def forc(self):
-        """
-        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
-        instance is C-contiguous or F-contiguous.
-        """
-        return (
-           _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
-           or _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
-        )
-
-    @property
-    def fnc(self):
-        """
-        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
-        instance is F-contiguous and not C-contiguous.
-        """
-        return (
-           _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
-           and not _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
-        )
-
-    @property
-    def contiguous(self):
-        """
-        True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
-        instance is C-contiguous and F-contiguous.
-        Equivalent to `forc.`
-        """
-        return self.forc
-
-    def __getitem__(self, name):
-        if name in ["C_CONTIGUOUS", "C"]:
-            return self.c_contiguous
-        elif name in ["F_CONTIGUOUS", "F"]:
-            return self.f_contiguous
-        elif name in ["WRITABLE", "W"]:
-            return self.writable
-        elif name == "FC":
-            return self.fc
-        elif name == "FNC":
-            return self.fnc
-        elif name in ["FORC", "CONTIGUOUS"]:
-            return self.forc
-
-    def __setitem__(self, name, val):
-        if name in ["WRITABLE", "W"]:
-            self.writable = val
-        else:
-            raise ValueError(
-                "Only writable ('W' or 'WRITABLE') flag can be set"
-            )
-
-    def __repr__(self):
-        out = []
-        for name in "C_CONTIGUOUS", "F_CONTIGUOUS", "WRITABLE":
-            out.append("  {} : {}".format(name, self[name]))
-        return "\n".join(out)
-
-    def __eq__(self, other):
-        cdef Flags other_
-        if isinstance(other, self.__class__):
-            other_ = <Flags>other
-            return self.flags_ == other_.flags_
-        elif isinstance(other, int):
-            return self.flags_ == <int>other
-        else:
-            return False
diff --git a/dpctl/tensor/_indexing_functions.py b/dpctl/tensor/_indexing_functions.py
deleted file mode 100644
index 4f04a6094c..0000000000
--- a/dpctl/tensor/_indexing_functions.py
+++ /dev/null
@@ -1,625 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import operator
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils
-
-from ._copy_utils import (
-    _extract_impl,
-    _nonzero_impl,
-    _put_multi_index,
-    _take_multi_index,
-)
-from ._numpy_helper import normalize_axis_index
-
-
-def _get_indexing_mode(name):
-    modes = {"wrap": 0, "clip": 1}
-    try:
-        return modes[name]
-    except KeyError:
-        raise ValueError(
-            "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name)
-        )
-
-
-def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
-    """take(x, indices, axis=None, out=None, mode="wrap")
-
-    Takes elements from an array along a given axis at given indices.
-
-    Args:
-        x (usm_ndarray):
-            The array that elements will be taken from.
-        indices (usm_ndarray):
-            One-dimensional array of indices.
-        axis (int, optional):
-            The axis along which the values will be selected.
-            If ``x`` is one-dimensional, this argument is optional.
-            Default: ``None``.
-        out (Optional[usm_ndarray]):
-            Output array to populate. Array must have the correct
-            shape and the expected data type.
-        mode (str, optional):
-            How out-of-bounds indices will be handled. Possible values
-            are:
-
-            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
-              negative indices.
-            - ``"clip"``: clips indices to (``0 <= i < n``).
-
-            Default: ``"wrap"``.
-
-    Returns:
-       usm_ndarray:
-          Array with shape
-          ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]``
-          filled with elements from ``x``.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
-        )
-
-    if not isinstance(indices, dpt.usm_ndarray):
-        raise TypeError(
-            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
-                type(indices)
-            )
-        )
-    if indices.dtype.kind not in "ui":
-        raise IndexError(
-            "`indices` expected integer data type, got `{}`".format(
-                indices.dtype
-            )
-        )
-    if indices.ndim != 1:
-        raise ValueError(
-            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
-        )
-    exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue])
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        [x.usm_type, indices.usm_type]
-    )
-
-    mode = _get_indexing_mode(mode)
-
-    x_ndim = x.ndim
-    if axis is None:
-        if x_ndim > 1:
-            raise ValueError(
-                "`axis` cannot be `None` for array of dimension `{}`".format(
-                    x_ndim
-                )
-            )
-        axis = 0
-
-    if x_ndim > 0:
-        axis = normalize_axis_index(operator.index(axis), x_ndim)
-        x_sh = x.shape
-        if x_sh[axis] == 0 and indices.size != 0:
-            raise IndexError("cannot take non-empty indices from an empty axis")
-        res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
-    else:
-        if axis != 0:
-            raise ValueError("`axis` must be 0 for an array of dimension 0.")
-        res_shape = indices.shape
-
-    dt = x.dtype
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-
-        if out.shape != res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {res_shape}, got {out.shape}"
-            )
-        if dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {dt} is needed, got {out.dtype}"
-            )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise dpctl.utils.ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-        if ti._array_overlap(x, out):
-            out = dpt.empty_like(out)
-    else:
-        out = dpt.empty(
-            res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
-        )
-
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    deps_ev = _manager.submitted_events
-    hev, take_ev = ti._take(
-        x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(hev, take_ev)
-
-    if not (orig_out is None or out is orig_out):
-        # Copy the out data from temporary buffer to original memory
-        ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev]
-        )
-        _manager.add_event_pair(ht_e_cpy, cpy_ev)
-        out = orig_out
-
-    return out
-
-
-def put(x, indices, vals, /, *, axis=None, mode="wrap"):
-    """put(x, indices, vals, axis=None, mode="wrap")
-
-    Puts values into an array along a given axis at given indices.
-
-    Args:
-        x (usm_ndarray):
-            The array the values will be put into.
-        indices (usm_ndarray):
-            One-dimensional array of indices.
-        vals (usm_ndarray):
-            Array of values to be put into ``x``.
-            Must be broadcastable to the result shape
-            ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``.
-        axis (int, optional):
-            The axis along which the values will be placed.
-            If ``x`` is one-dimensional, this argument is optional.
-            Default: ``None``.
-        mode (str, optional):
-            How out-of-bounds indices will be handled. Possible values
-            are:
-
-            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
-              negative indices.
-            - ``"clip"``: clips indices to (``0 <= i < n``).
-
-            Default: ``"wrap"``.
-
-    .. note::
-
-        If input array ``indices`` contains duplicates, a race condition
-        occurs, and the value written into corresponding positions in ``x``
-        may vary from run to run. Preserving sequential semantics in handing
-        the duplicates to achieve deterministic behavior requires additional
-        work, e.g.
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor as dpt
-
-                def put_vec_duplicates(vec, ind, vals):
-                    "Put values into vec, handling possible duplicates in ind"
-                    assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1
-
-                    # find positions of last occurences of each
-                    # unique index
-                    ind_flipped = dpt.flip(ind)
-                    ind_uniq = dpt.unique_all(ind_flipped).indices
-                    has_dups = len(ind) != len(ind_uniq)
-
-                    if has_dups:
-                        ind_uniq = dpt.subtract(vec.size - 1, ind_uniq)
-                        ind = dpt.take(ind, ind_uniq)
-                        vals = dpt.take(vals, ind_uniq)
-
-                    dpt.put(vec, ind, vals)
-
-                n = 512
-                ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1)))
-                x = dpt.zeros(ind.size, dtype="int32")
-                vals = dpt.arange(ind.size, dtype=x.dtype)
-
-                # Values corresponding to last positions of
-                # duplicate indices are written into the vector x
-                put_vec_duplicates(x, ind, vals)
-
-                parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype))
-                expected = dpt.concat(parts)
-                assert dpt.all(x == expected)
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
-        )
-    if not isinstance(indices, dpt.usm_ndarray):
-        raise TypeError(
-            "`indices` expected `dpt.usm_ndarray`, got `{}`.".format(
-                type(indices)
-            )
-        )
-    if isinstance(vals, dpt.usm_ndarray):
-        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
-        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
-    else:
-        queues_ = [x.sycl_queue, indices.sycl_queue]
-        usm_types_ = [x.usm_type, indices.usm_type]
-    if indices.ndim != 1:
-        raise ValueError(
-            "`indices` expected a 1D array, got `{}`".format(indices.ndim)
-        )
-    if indices.dtype.kind not in "ui":
-        raise IndexError(
-            "`indices` expected integer data type, got `{}`".format(
-                indices.dtype
-            )
-        )
-    exec_q = dpctl.utils.get_execution_queue(queues_)
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
-
-    mode = _get_indexing_mode(mode)
-
-    x_ndim = x.ndim
-    if axis is None:
-        if x_ndim > 1:
-            raise ValueError(
-                "`axis` cannot be `None` for array of dimension `{}`".format(
-                    x_ndim
-                )
-            )
-        axis = 0
-
-    if x_ndim > 0:
-        axis = normalize_axis_index(operator.index(axis), x_ndim)
-        x_sh = x.shape
-        if x_sh[axis] == 0 and indices.size != 0:
-            raise IndexError("cannot take non-empty indices from an empty axis")
-        val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :]
-    else:
-        if axis != 0:
-            raise ValueError("`axis` must be 0 for an array of dimension 0.")
-        val_shape = indices.shape
-
-    if not isinstance(vals, dpt.usm_ndarray):
-        vals = dpt.asarray(
-            vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
-        )
-    # choose to throw here for consistency with `place`
-    if vals.size == 0:
-        raise ValueError(
-            "cannot put into non-empty indices along an empty axis"
-        )
-    if vals.dtype == x.dtype:
-        rhs = vals
-    else:
-        rhs = dpt.astype(vals, x.dtype)
-    rhs = dpt.broadcast_to(rhs, val_shape)
-
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    deps_ev = _manager.submitted_events
-    hev, put_ev = ti._put(
-        x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(hev, put_ev)
-
-
-def extract(condition, arr):
-    """extract(condition, arr)
-
-    Returns the elements of an array that satisfies the condition.
-
-    If ``condition`` is boolean ``dpctl.tensor.extract`` is
-    equivalent to ``arr[condition]``.
-
-    Note that ``dpctl.tensor.place`` does the opposite of
-    ``dpctl.tensor.extract``.
-
-    Args:
-       conditions (usm_ndarray):
-            An array whose non-zero or ``True`` entries indicate the element
-            of ``arr`` to extract.
-
-       arr (usm_ndarray):
-            Input array of the same size as ``condition``.
-
-    Returns:
-        usm_ndarray:
-            Rank 1 array of values from ``arr`` where ``condition`` is
-            ``True``.
-    """
-    if not isinstance(condition, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
-        )
-    if not isinstance(arr, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
-        )
-    exec_q = dpctl.utils.get_execution_queue(
-        (
-            condition.sycl_queue,
-            arr.sycl_queue,
-        )
-    )
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    if condition.shape != arr.shape:
-        raise ValueError("Arrays are not of the same size")
-    return _extract_impl(arr, condition)
-
-
-def place(arr, mask, vals):
-    """place(arr, mask, vals)
-
-    Change elements of an array based on conditional and input values.
-
-    If ``mask`` is boolean ``dpctl.tensor.place`` is
-    equivalent to ``arr[condition] = vals``.
-
-    Args:
-        arr (usm_ndarray):
-            Array to put data into.
-        mask (usm_ndarray):
-            Boolean mask array. Must have the same size as ``arr``.
-        vals (usm_ndarray, sequence):
-            Values to put into ``arr``. Only the first N elements are
-            used, where N is the number of True values in ``mask``. If
-            ``vals`` is smaller than N, it will be repeated, and if
-            elements of ``arr`` are to be masked, this sequence must be
-            non-empty. Array ``vals`` must be one dimensional.
-    """
-    if not isinstance(arr, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
-        )
-    if not isinstance(mask, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}"
-        )
-    if not isinstance(vals, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}"
-        )
-    exec_q = dpctl.utils.get_execution_queue(
-        (
-            arr.sycl_queue,
-            mask.sycl_queue,
-            vals.sycl_queue,
-        )
-    )
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError
-    if arr.shape != mask.shape or vals.ndim != 1:
-        raise ValueError("Array sizes are not as required")
-    cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    deps_ev = _manager.submitted_events
-    nz_count = ti.mask_positions(
-        mask, cumsum, sycl_queue=exec_q, depends=deps_ev
-    )
-    if nz_count == 0:
-        return
-    if vals.size == 0:
-        raise ValueError("Cannot insert from an empty array!")
-    if vals.dtype == arr.dtype:
-        rhs = vals
-    else:
-        rhs = dpt.astype(vals, arr.dtype)
-    hev, pl_ev = ti._place(
-        dst=arr,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=mask.ndim,
-        rhs=rhs,
-        sycl_queue=exec_q,
-    )
-    _manager.add_event_pair(hev, pl_ev)
-
-
-def nonzero(arr):
-    """nonzero(arr)
-
-    Return the indices of non-zero elements.
-
-    Returns a tuple of usm_ndarrays, one for each dimension
-    of ``arr``, containing the indices of the non-zero elements
-    in that dimension. The values of ``arr`` are always tested in
-    row-major, C-style order.
-
-    Args:
-        arr (usm_ndarray):
-            Input array, which has non-zero array rank.
-
-    Returns:
-        Tuple[usm_ndarray, ...]:
-            Indices of non-zero array elements.
-    """
-    if not isinstance(arr, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}"
-        )
-    if arr.ndim == 0:
-        raise ValueError("Array of positive rank is expected")
-    return _nonzero_impl(arr)
-
-
-def _range(sh_i, i, nd, q, usm_t, dt):
-    ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
-    ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
-    return ind
-
-
-def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"):
-    """
-    Returns elements from an array at the one-dimensional indices specified
-    by ``indices`` along a provided ``axis``.
-
-    Args:
-        x (usm_ndarray):
-            input array. Must be compatible with ``indices``, except for the
-            axis (dimension) specified by ``axis``.
-        indices (usm_ndarray):
-            array indices. Must have the same rank (i.e., number of dimensions)
-            as ``x``.
-        axis: int
-            axis along which to select values. If ``axis`` is negative, the
-            function determines the axis along which to select values by
-            counting from the last dimension. Default: ``-1``.
-        mode (str, optional):
-            How out-of-bounds indices will be handled. Possible values
-            are:
-
-            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
-              negative indices.
-            - ``"clip"``: clips indices to (``0 <= i < n``).
-
-            Default: ``"wrap"``.
-
-    Returns:
-        usm_ndarray:
-            an array having the same data type as ``x``. The returned array has
-            the same rank (i.e., number of dimensions) as ``x`` and a shape
-            determined according to broadcasting rules, except for the axis
-            (dimension) specified by ``axis`` whose size must equal the size
-            of the corresponding axis (dimension) in ``indices``.
-
-    Note:
-        Treatment of the out-of-bound indices in ``indices`` array is controlled
-        by the value of ``mode`` keyword.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    if not isinstance(indices, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
-        )
-    x_nd = x.ndim
-    if x_nd != indices.ndim:
-        raise ValueError(
-            "Number of dimensions in the first and the second "
-            "argument arrays must be equal"
-        )
-    pp = normalize_axis_index(operator.index(axis), x_nd)
-    out_usm_type = dpctl.utils.get_coerced_usm_type(
-        (x.usm_type, indices.usm_type)
-    )
-    exec_q = dpctl.utils.get_execution_queue((x.sycl_queue, indices.sycl_queue))
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments. "
-        )
-    mode_i = _get_indexing_mode(mode)
-    indexes_dt = (
-        dpt.uint64
-        if indices.dtype == dpt.uint64
-        else ti.default_device_index_type(exec_q.sycl_device)
-    )
-    _ind = tuple(
-        (
-            indices
-            if i == pp
-            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
-        )
-        for i in range(x_nd)
-    )
-    return _take_multi_index(x, _ind, 0, mode=mode_i)
-
-
-def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"):
-    """
-    Puts elements into an array at the one-dimensional indices specified by
-    ``indices`` along a provided ``axis``.
-
-    Args:
-        x (usm_ndarray):
-            input array. Must be compatible with ``indices``, except for the
-            axis (dimension) specified by ``axis``.
-        indices (usm_ndarray):
-            array indices. Must have the same rank (i.e., number of dimensions)
-            as ``x``.
-        vals (usm_ndarray):
-            Array of values to be put into ``x``.
-            Must be broadcastable to the shape of ``indices``.
-        axis: int
-            axis along which to select values. If ``axis`` is negative, the
-            function determines the axis along which to select values by
-            counting from the last dimension. Default: ``-1``.
-        mode (str, optional):
-            How out-of-bounds indices will be handled. Possible values
-            are:
-
-            - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps
-              negative indices.
-            - ``"clip"``: clips indices to (``0 <= i < n``).
-
-            Default: ``"wrap"``.
-
-    .. note::
-
-        If input array ``indices`` contains duplicates, a race condition
-        occurs, and the value written into corresponding positions in ``x``
-        may vary from run to run. Preserving sequential semantics in handing
-        the duplicates to achieve deterministic behavior requires additional
-        work.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    if not isinstance(indices, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}"
-        )
-    x_nd = x.ndim
-    if x_nd != indices.ndim:
-        raise ValueError(
-            "Number of dimensions in the first and the second "
-            "argument arrays must be equal"
-        )
-    pp = normalize_axis_index(operator.index(axis), x_nd)
-    if isinstance(vals, dpt.usm_ndarray):
-        queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue]
-        usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type]
-    else:
-        queues_ = [x.sycl_queue, indices.sycl_queue]
-        usm_types_ = [x.usm_type, indices.usm_type]
-    exec_q = dpctl.utils.get_execution_queue(queues_)
-    if exec_q is None:
-        raise dpctl.utils.ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments. "
-        )
-    out_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_)
-    mode_i = _get_indexing_mode(mode)
-    indexes_dt = (
-        dpt.uint64
-        if indices.dtype == dpt.uint64
-        else ti.default_device_index_type(exec_q.sycl_device)
-    )
-    _ind = tuple(
-        (
-            indices
-            if i == pp
-            else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt)
-        )
-        for i in range(x_nd)
-    )
-    return _put_multi_index(x, _ind, 0, vals, mode=mode_i)
diff --git a/dpctl/tensor/_linear_algebra_functions.py b/dpctl/tensor/_linear_algebra_functions.py
deleted file mode 100644
index 8fd1c532ad..0000000000
--- a/dpctl/tensor/_linear_algebra_functions.py
+++ /dev/null
@@ -1,1003 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import operator
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as tei
-import dpctl.tensor._tensor_impl as ti
-import dpctl.tensor._tensor_linalg_impl as tli
-from dpctl.tensor._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
-from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
-from dpctl.tensor._type_utils import (
-    _acceptance_fn_default_binary,
-    _find_buf_dtype2,
-    _to_device_supported_dtype,
-)
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
-
-
-def matrix_transpose(x):
-    r"""matrix_transpose(x)
-
-    Transposes the innermost two dimensions of `x`, where `x` is a
-    2-dimensional matrix or a stack of 2-dimensional matrices.
-
-    To convert from a 1-dimensional array to a 2-dimensional column
-    vector, use x[:, dpt.newaxis].
-
-    Args:
-       x (usm_ndarray):
-          Input array with shape (..., m, n).
-
-    Returns:
-       usm_ndarray:
-          Array with shape (..., n, m).
-    """
-
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x))
-        )
-    if x.ndim < 2:
-        raise ValueError(
-            "dpctl.tensor.matrix_transpose requires array to have"
-            "at least 2 dimensions"
-        )
-
-    return x.mT
-
-
-def tensordot(x1, x2, axes=2):
-    r"""tensordot(x1, x2, axes=2)
-
-    Returns a tensor contraction of `x1` and `x2` over specific axes.
-
-    Args:
-        x1 (usm_ndarray):
-            first input array, expected to have numeric data type.
-        x2 (usm_ndarray):
-            second input array, expected to have numeric data type.
-            Corresponding contracted axes of `x1` and `x2` must be equal.
-        axes (Union[int, Tuple[Sequence[int], Sequence[int]]):
-            number of axes to contract or explicit sequences of axes for
-            `x1` and `x2`, respectively. If `axes` is an integer equal to `N`,
-            then the contraction is performed over last `N` axes of `x1` and
-            the first `N` axis of `x2` in order. The size of each corresponding
-            axis must match and must be non-negative.
-
-                * if `N` equals `0`, the result is the tensor outer product
-                * if `N` equals `1`, the result is the tensor dot product
-                * if `N` equals `2`, the result is the tensor double
-                  contraction (default).
-
-            If `axes` is a tuple of two sequences `(x1_axes, x2_axes)`, the
-            first sequence applies to `x1` and the second sequence applies
-            to `x2`. Both sequences must have equal length, and each axis
-            `x1_axes[i]` for `x1` must have the same size as the respective
-            axis `x2_axes[i]` for `x2`. Each sequence must consist of unique
-            integers that specify valid axes for each respective array.
-            For example, if `x1` has rank `N`, a valid axis must reside on the
-            half-open interval `[-N, N)`.
-    Returns:
-        usm_ndarray:
-            an array containing the tensor contraction whose shape consists of
-            the non-contracted axes of the first array `x1`, followed by the
-            non-contracted axes of the second array `x2`. The returned array
-            must have a data type determined by Type Promotion Rules.
-    """
-    if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
-    if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
-    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
-    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
-    if exec_q is None:
-        raise ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments."
-        )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            x1_usm_type,
-            x2_usm_type,
-        )
-    )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-    # handle axes and shapes validation
-    x1_nd = x1.ndim
-    x2_nd = x2.ndim
-    x1_shape = x1.shape
-    x2_shape = x2.shape
-    if isinstance(axes, int):
-        if axes < 0:
-            raise ValueError("`axes` integer is expected to be non-negative")
-        n_axes1 = axes
-        n_axes2 = axes
-        axes1 = normalize_axis_tuple(tuple(range(-axes, 0)), x1_nd)
-        axes2 = tuple(range(0, axes))
-    elif isinstance(axes, tuple):
-        if len(axes) != 2:
-            raise ValueError(
-                "`axes` tuple is expected to contain two sequences"
-            )
-        axes1 = tuple(axes[0])
-        axes2 = tuple(axes[1])
-        n_axes1 = len(axes1)
-        n_axes2 = len(axes2)
-    else:
-        raise TypeError("`axes` must be an integer or a tuple of sequences")
-    if n_axes1 != n_axes2:
-        raise ValueError(
-            "number of axes contracted must be the same for each array"
-        )
-    if n_axes1 == 0:
-        arr1 = x1[..., dpt.newaxis]
-        arr2 = x2[dpt.newaxis, ...]
-        n_axes1 = 1
-        n_axes2 = 1
-    else:
-        same_shapes = True
-        for i in range(n_axes1):
-            axis1 = axes1[i]
-            axis2 = axes2[i]
-            same_shapes = same_shapes and (x1_shape[axis1] == x2_shape[axis2])
-        if not same_shapes:
-            raise ValueError("shape mismatch in contracted `tensordot` axes")
-        axes1 = normalize_axis_tuple(axes1, x1_nd)
-        axes2 = normalize_axis_tuple(axes2, x2_nd)
-        perm1 = [i for i in range(x1_nd) if i not in axes1] + list(axes1)
-        perm2 = list(axes2) + [i for i in range(x2_nd) if i not in axes2]
-        arr1 = dpt.permute_dims(x1, perm1)
-        arr2 = dpt.permute_dims(x2, perm2)
-    arr1_outer_nd = arr1.ndim - n_axes1
-    arr2_outer_nd = arr2.ndim - n_axes2
-    res_shape = arr1.shape[:arr1_outer_nd] + arr2.shape[n_axes2:]
-    # type validation
-    sycl_dev = exec_q.sycl_device
-    x1_dtype = x1.dtype
-    x2_dtype = x2.dtype
-    buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
-        x1_dtype,
-        x2_dtype,
-        tli._dot_result_type,
-        sycl_dev,
-        acceptance_fn=_acceptance_fn_default_binary,
-    )
-    if res_dt is None:
-        raise TypeError(
-            "function 'tensordot' does not support input types "
-            f"({x1_dtype}, {x2_dtype}), "
-            "and the inputs could not be safely coerced to any "
-            "supported types according to the casting rule ''safe''."
-        )
-
-    _manager = SequentialOrderManager[exec_q]
-    if buf1_dt is None and buf2_dt is None:
-        out = dpt.empty(
-            res_shape,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        dep_evs = _manager.submitted_events
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=arr1,
-            x2=arr2,
-            batch_dims=0,
-            x1_outer_dims=arr1_outer_nd,
-            x2_outer_dims=arr2_outer_nd,
-            inner_dims=n_axes1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-
-        return out
-
-    elif buf1_dt is None:
-        buf2 = _empty_like_orderK(arr2, buf2_dt)
-
-        dep_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        out = dpt.empty(
-            res_shape,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=arr1,
-            x2=buf2,
-            batch_dims=0,
-            x1_outer_dims=arr1_outer_nd,
-            x2_outer_dims=arr2_outer_nd,
-            inner_dims=n_axes1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-
-        return out
-
-    elif buf2_dt is None:
-        buf1 = _empty_like_orderK(arr1, buf1_dt)
-        dep_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        out = dpt.empty(
-            res_shape,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=buf1,
-            x2=arr2,
-            batch_dims=0,
-            x1_outer_dims=arr1_outer_nd,
-            x2_outer_dims=arr2_outer_nd,
-            inner_dims=n_axes1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-
-        return out
-
-    buf1 = _empty_like_orderK(arr1, buf1_dt)
-    deps_ev = _manager.submitted_events
-    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=arr1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-    buf2 = _empty_like_orderK(arr2, buf2_dt)
-    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=arr2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-    out = dpt.empty(
-        res_shape,
-        dtype=res_dt,
-        usm_type=res_usm_type,
-        sycl_queue=exec_q,
-        order="C",
-    )
-    ht_, dot_ev = tli._dot(
-        x1=buf1,
-        x2=buf2,
-        batch_dims=0,
-        x1_outer_dims=arr1_outer_nd,
-        x2_outer_dims=arr2_outer_nd,
-        inner_dims=n_axes1,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=[copy1_ev, copy2_ev],
-    )
-    _manager.add_event_pair(ht_, dot_ev)
-
-    return out
-
-
-def vecdot(x1, x2, axis=-1):
-    r"""vecdot(x1, x2, axis=-1)
-
-    Computes the (vector) dot product of two arrays.
-
-    Args:
-        x1 (usm_ndarray):
-            first input array.
-        x2 (usm_ndarray):
-            second input array. Input arrays must have compatible
-            shapes along non-contract axes according to broadcasting
-            rules, and must have the same size along the contracted
-            axis. Input arrays should be of numeric type.
-        axis (Optional[int]):
-            axis over which to compute the dot product. The axis must
-            be an integer on the interval `[-N, -1]`, where `N` is
-            ``min(x1.ndim, x2.ndim)``. The axis along which dot product
-            is performed is counted backward from the last axes
-            (that is, `-1` refers to the last axis). By default,
-            dot product is computed over the last axis.
-            Default: `-1`.
-
-    Returns:
-        usm_ndarray:
-            if `x1` and `x2` are both one-dimensional arrays, a
-            zero-dimensional array containing the dot product value
-            is returned; otherwise, a non-zero-dimensional array containing
-            the dot products and having rank `N-1`, where `N` is the rank
-            of the shape of input arrays after broadcasting rules are applied
-            to non-contracted axes.
-    """
-    if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
-    if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
-    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
-    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
-    if exec_q is None:
-        raise ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments."
-        )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            x1_usm_type,
-            x2_usm_type,
-        )
-    )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-    # axis and shape validation
-    x1_nd = x1.ndim
-    x2_nd = x2.ndim
-    x1_shape = x1.shape
-    x2_shape = x2.shape
-    if axis >= 0:
-        raise ValueError("`axis` must be negative")
-    axis = operator.index(axis)
-    x1_axis = normalize_axis_index(axis, x1_nd)
-    x2_axis = normalize_axis_index(axis, x2_nd)
-    if x1_shape[x1_axis] != x2_shape[x2_axis]:
-        raise ValueError(
-            "given axis must have the same shape for `x1` and `x2`"
-        )
-    if x1_nd > x2_nd:
-        x2_shape = (1,) * (x1_nd - x2_nd) + x2_shape
-    elif x2_nd > x1_nd:
-        x1_shape = (1,) * (x2_nd - x1_nd) + x1_shape
-    try:
-        broadcast_sh = _broadcast_shape_impl(
-            [
-                x1_shape,
-                x2_shape,
-            ]
-        )
-    except ValueError:
-        raise ValueError("mismatch in `vecdot` dimensions")
-    broadcast_nd = len(broadcast_sh)
-    contracted_axis = normalize_axis_index(axis, broadcast_nd)
-    res_sh = tuple(
-        [broadcast_sh[i] for i in range(broadcast_nd) if i != contracted_axis]
-    )
-    # type validation
-    sycl_dev = exec_q.sycl_device
-    x1_dtype = x1.dtype
-    x2_dtype = x2.dtype
-    buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
-        x1_dtype,
-        x2_dtype,
-        tli._dot_result_type,
-        sycl_dev,
-        acceptance_fn=_acceptance_fn_default_binary,
-    )
-    if res_dt is None:
-        raise TypeError(
-            "function 'vecdot' does not support input types "
-            f"({x1_dtype}, {x2_dtype}), "
-            "and the inputs could not be safely coerced to any "
-            "supported types according to the casting rule ''safe''."
-        )
-
-    _manager = SequentialOrderManager[exec_q]
-    if buf1_dt is None and buf2_dt is None:
-        if x1.dtype.kind == "c":
-            x1_tmp = _empty_like_orderK(x1, x1.dtype)
-            dep_evs = _manager.submitted_events
-            ht_conj_ev, conj_ev = tei._conj(
-                src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_conj_ev, conj_ev)
-            x1 = x1_tmp
-        if x1.shape != broadcast_sh:
-            x1 = dpt.broadcast_to(x1, broadcast_sh)
-        if x2.shape != broadcast_sh:
-            x2 = dpt.broadcast_to(x2, broadcast_sh)
-        x1 = dpt.moveaxis(x1, contracted_axis, -1)
-        x2 = dpt.moveaxis(x2, contracted_axis, -1)
-        out = dpt.empty(
-            res_sh,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        dep_evs = _manager.submitted_events
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=x1,
-            x2=x2,
-            batch_dims=len(res_sh),
-            x1_outer_dims=0,
-            x2_outer_dims=0,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt.reshape(out, res_sh)
-
-    elif buf1_dt is None:
-        if x1.dtype.kind == "c":
-            x1_tmp = _empty_like_orderK(x1, x1.dtype)
-            deps_ev = _manager.submitted_events
-            ht_conj_ev, conj_e = tei._conj(
-                src=x1, dst=x1_tmp, sycl_queue=exec_q, depends=deps_ev
-            )
-            _manager.add_event_pair(ht_conj_ev, conj_e)
-            x1 = x1_tmp
-        buf2 = _empty_like_orderK(x2, buf2_dt)
-        deps_ev = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if x1.shape != broadcast_sh:
-            x1 = dpt.broadcast_to(x1, broadcast_sh)
-        if buf2.shape != broadcast_sh:
-            buf2 = dpt.broadcast_to(buf2, broadcast_sh)
-        x1 = dpt.moveaxis(x1, contracted_axis, -1)
-        buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
-        out = dpt.empty(
-            res_sh,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=x1,
-            x2=buf2,
-            batch_dims=len(res_sh),
-            x1_outer_dims=0,
-            x2_outer_dims=0,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt.reshape(out, res_sh)
-
-    elif buf2_dt is None:
-        buf1 = _empty_like_orderK(x1, buf1_dt)
-        deps_ev = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if buf1.dtype.kind == "c":
-            ht_conj_ev, conj_ev = tei._conj(
-                src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy_ev]
-            )
-            _manager.add_event_pair(ht_conj_ev, conj_ev)
-        if buf1.shape != broadcast_sh:
-            buf1 = dpt.broadcast_to(buf1, broadcast_sh)
-        if x2.shape != broadcast_sh:
-            x2 = dpt.broadcast_to(x2, broadcast_sh)
-        buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
-        x2 = dpt.moveaxis(x2, contracted_axis, -1)
-        out = dpt.empty(
-            res_sh,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=exec_q,
-            order="C",
-        )
-        deps_ev = _manager.submitted_events
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=buf1,
-            x2=x2,
-            batch_dims=len(res_sh),
-            x1_outer_dims=0,
-            x2_outer_dims=0,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=deps_ev,
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        return dpt.reshape(out, res_sh)
-
-    buf1 = _empty_like_orderK(x1, buf1_dt)
-    deps_ev = _manager.submitted_events
-    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-    if buf1.dtype.kind == "c":
-        ht_conj_ev, conj_ev = tei._conj(
-            src=buf1, dst=buf1, sycl_queue=exec_q, depends=[copy1_ev]
-        )
-        _manager.add_event_pair(ht_conj_ev, conj_ev)
-    buf2 = _empty_like_orderK(x2, buf2_dt)
-    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-    if buf1.shape != broadcast_sh:
-        buf1 = dpt.broadcast_to(buf1, broadcast_sh)
-    if buf2.shape != broadcast_sh:
-        buf2 = dpt.broadcast_to(buf2, broadcast_sh)
-    buf1 = dpt.moveaxis(buf1, contracted_axis, -1)
-    buf2 = dpt.moveaxis(buf2, contracted_axis, -1)
-    out = dpt.empty(
-        res_sh,
-        dtype=res_dt,
-        usm_type=res_usm_type,
-        sycl_queue=exec_q,
-        order="C",
-    )
-    deps_ev = _manager.submitted_events
-    ht_dot_ev, dot_ev = tli._dot(
-        x1=buf1,
-        x2=buf2,
-        batch_dims=len(res_sh),
-        x1_outer_dims=0,
-        x2_outer_dims=0,
-        inner_dims=1,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=deps_ev,
-    )
-    _manager.add_event_pair(ht_dot_ev, dot_ev)
-    return out
-
-
-def matmul(x1, x2, out=None, dtype=None, order="K"):
-    r"""matmul(x1, x2, out=None, order="K")
-
-    Computes the matrix product. Implements the same semantics
-    as the built-in operator `@`.
-
-    Args:
-        x1 (usm_ndarray):
-            first input array. Expected to have numeric data type, and
-            at least one dimension. If `x1` is one-dimensional having
-            shape `(M,)`, and `x2` has more than one dimension, `x1` is
-            effectively treated as a two-dimensional array with shape `(1, M)`,
-            although the prepended dimension is removed from the output array.
-            If `x1` has shape `(..., M, K)`, the innermost two dimensions form
-            matrices on which to perform matrix multiplication.
-        x2 (usm_ndarray):
-            second input array. Expected to have numeric data type, and
-            at least one dimension. If `x2` is one-dimensional having
-            shape `(N,)`, and `x1` has more than one dimension, `x2` is
-            effectively treated as a two-dimensional array with shape `(N, 1)`,
-            although the appended dimension is removed from the output array.
-            If `x2` has shape `(..., K, N)`, the innermost two dimensions form
-            matrices on which to perform matrix multiplication.
-        out (Optional[usm_ndarray]):
-            the array into which the result of the matrix product is written.
-            The data type of `out` must match the expected data type of the
-            result or (if provided) `dtype`.
-            If `None` then a new array is returned. Default: `None`.
-        dtype (Optional[dtype]):
-            data type of the returned array. If `None`, the data type of the
-            returned array is determined by the Type Promotion Rules.
-            Default: `None`.
-        order (["K", "C", "F", "A"]):
-            memory layout of the output array, if `out` is `None`, otherwise
-            the `order` parameter value is not used. Default: `K`.
-    Returns:
-        usm_ndarray:
-            * if both `x1` and `x2` are one-dimensional arrays with shape
-              `(N,)`, returned array is a zero-dimensional array containing
-              inner product as its only element.
-            * if `x1` is two-dimensional array with shape `(M, K)` and `x2` is
-              a two-dimensional array with shape `(K, N)`, returned array is a
-              two-dimensional array with shape `(M, N)` and contains the
-              conventional matrix product.
-            * if `x1` is a one-dimensional array with shape `(K,)` and `x2` is
-              an array with shape `(..., K, N)`, returned array contains the
-              conventional matrix product and has shape `(..., N)`.
-            * if `x1` is an array with shape `(..., M, K)` and `x2` is a
-              one-dimensional array with shape `(K,)`, returned array has shape
-              `(..., M)` and contains the conventional matrix product.
-            * if `x1` is a two-dimensional array with shape `(M, K)` and `x2`
-              is an array with shape `(..., K, N)`, returned array contains
-              conventional matrix product for each stacked matrix and has shape
-              `(..., M, N)`.
-            * if `x1` has shape `(..., M, K)` and `x2` is a two-dimensional
-              array with shape `(K, N)`, returned array contains conventional
-              matrix product for each stacked matrix and has shape
-              `(..., M, N)`.
-            * if both `x1` and `x2` have more than two dimensions, returned
-              array contains conventional matrix product for each stacked
-              matrix and has shape determined by broadcasting rules for
-              `x1.shape[:-2]` and `x2.shape[:-2]`.
-
-            The data type of the returned array is determined by the Type
-            Promotion Rules. If either `x1` or `x2` has a complex floating
-            point type, neither argument is complex conjugated or transposed.
-    """
-    if not isinstance(x1, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
-    if not isinstance(x2, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
-    if order not in ["K", "C", "F", "A"]:
-        order = "K"
-    q1, x1_usm_type = x1.sycl_queue, x1.usm_type
-    q2, x2_usm_type = x2.sycl_queue, x2.usm_type
-    exec_q = dpctl.utils.get_execution_queue((q1, q2))
-    if exec_q is None:
-        raise ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments."
-        )
-    res_usm_type = dpctl.utils.get_coerced_usm_type(
-        (
-            x1_usm_type,
-            x2_usm_type,
-        )
-    )
-    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
-
-    x1_nd = x1.ndim
-    x2_nd = x2.ndim
-    if x1_nd == 0 or x2_nd == 0:
-        raise ValueError("one or more operands to `matmul` is 0 dimensional")
-    x1_shape = x1.shape
-    x2_shape = x2.shape
-    appended_axes = []
-    if x1_nd == 1:
-        x1 = x1[dpt.newaxis, :]
-        x1_shape = x1.shape
-        appended_axes.append(-2)
-    if x2_nd == 1:
-        x2 = x2[:, dpt.newaxis]
-        x2_shape = x2.shape
-        appended_axes.append(-1)
-    if x1_shape[-1] != x2_shape[-2]:
-        raise ValueError("mismatch in `matmul` inner dimension")
-    x1_outer_sh = x1_shape[:-2]
-    x2_outer_sh = x2_shape[:-2]
-    try:
-        res_outer_sh = _broadcast_shape_impl(
-            [
-                x1_outer_sh,
-                x2_outer_sh,
-            ]
-        )
-    except ValueError:
-        raise ValueError("mismatch in `matmul` batching dimensions")
-    x1_broadcast_shape = res_outer_sh + x1_shape[-2:]
-    x2_broadcast_shape = res_outer_sh + x2_shape[-2:]
-    res_shape = res_outer_sh + x1_shape[-2:-1] + x2_shape[-1:]
-
-    sycl_dev = exec_q.sycl_device
-    x1_dtype = x1.dtype
-    x2_dtype = x2.dtype
-    if dtype is None:
-        buf1_dt, buf2_dt, res_dt = _find_buf_dtype2(
-            x1_dtype,
-            x2_dtype,
-            tli._dot_result_type,
-            sycl_dev,
-            acceptance_fn=_acceptance_fn_default_binary,
-        )
-        if res_dt is None:
-            raise ValueError(
-                "function 'matmul' does not support input types "
-                f"({x1_dtype}, {x2_dtype}), "
-                "and the inputs could not be safely coerced to any "
-                "supported types according to the casting rule ''safe''."
-            )
-    else:
-        res_dt = dpt.dtype(dtype)
-        res_dt = _to_device_supported_dtype(res_dt, sycl_dev)
-        buf1_dt, buf2_dt = None, None
-        if x1_dtype != res_dt:
-            if dpt.can_cast(x1_dtype, res_dt, casting="same_kind"):
-                buf1_dt = res_dt
-            else:
-                raise ValueError(
-                    r"`matmul` input `x1` cannot be cast from "
-                    f"{x1_dtype} to "
-                    f"requested type {res_dt} according to the casting rule "
-                    "''same_kind''."
-                )
-        if x2_dtype != res_dt:
-            if dpt.can_cast(x2_dtype, res_dt, casting="same_kind"):
-                buf2_dt = res_dt
-            else:
-                raise ValueError(
-                    r"`matmul` input `x2` cannot be cast from "
-                    f"{x2_dtype} to "
-                    f"requested type {res_dt} according to the casting rule "
-                    "''same_kind''."
-                )
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-
-        final_res_shape = tuple(
-            res_shape[i]
-            for i in range(-len(res_shape), 0)
-            if i not in appended_axes
-        )
-        if out.shape != final_res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {final_res_shape}, got {out.shape}"
-            )
-
-        if appended_axes:
-            out = dpt.expand_dims(out, axis=appended_axes)
-            orig_out = out
-
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, got {out.dtype}"
-            )
-
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-
-        if ti._array_overlap(x1, out) and buf1_dt is None:
-            out = dpt.empty_like(out)
-
-        if ti._array_overlap(x2, out) and buf2_dt is None:
-            # should not reach if out is reallocated
-            # after being checked against x1
-            out = dpt.empty_like(out)
-
-    if order == "A":
-        order = (
-            "F"
-            if all(
-                arr.flags.f_contiguous
-                for arr in (
-                    x1,
-                    x2,
-                )
-            )
-            else "C"
-        )
-
-    _manager = SequentialOrderManager[exec_q]
-    if buf1_dt is None and buf2_dt is None:
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    x1, x2, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-        if x1.shape != x1_broadcast_shape:
-            x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
-        if x2.shape != x2_broadcast_shape:
-            x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
-        deps_evs = _manager.submitted_events
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=x1,
-            x2=x2,
-            batch_dims=len(res_shape[:-2]),
-            x1_outer_dims=1,
-            x2_outer_dims=1,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=deps_evs,
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[dot_ev],
-            )
-            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-            out = orig_out
-        if appended_axes:
-            out = dpt.squeeze(out, tuple(appended_axes))
-        return out
-    elif buf1_dt is None:
-        if order == "K":
-            buf2 = _empty_like_orderK(x2, buf2_dt)
-        else:
-            buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
-        deps_evs = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_evs
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    x1, buf2, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-
-        if x1.shape != x1_broadcast_shape:
-            x1 = dpt.broadcast_to(x1, x1_broadcast_shape)
-        if buf2.shape != x2_broadcast_shape:
-            buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=x1,
-            x2=buf2,
-            batch_dims=len(res_shape[:-2]),
-            x1_outer_dims=1,
-            x2_outer_dims=1,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[dot_ev],
-            )
-            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-            out = orig_out
-        if appended_axes:
-            out = dpt.squeeze(out, tuple(appended_axes))
-        return out
-
-    elif buf2_dt is None:
-        if order == "K":
-            buf1 = _empty_like_orderK(x1, buf1_dt)
-        else:
-            buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
-        deps_ev = _manager.submitted_events
-        ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
-        )
-        _manager.add_event_pair(ht_copy_ev, copy_ev)
-        if out is None:
-            if order == "K":
-                out = _empty_like_pair_orderK(
-                    buf1, x2, res_dt, res_shape, res_usm_type, exec_q
-                )
-            else:
-                out = dpt.empty(
-                    res_shape,
-                    dtype=res_dt,
-                    usm_type=res_usm_type,
-                    sycl_queue=exec_q,
-                    order=order,
-                )
-
-        if buf1.shape != x1_broadcast_shape:
-            buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
-        if x2.shape != x2_broadcast_shape:
-            x2 = dpt.broadcast_to(x2, x2_broadcast_shape)
-        ht_dot_ev, dot_ev = tli._dot(
-            x1=buf1,
-            x2=x2,
-            batch_dims=len(res_shape[:-2]),
-            x1_outer_dims=1,
-            x2_outer_dims=1,
-            inner_dims=1,
-            dst=out,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_dot_ev, dot_ev)
-        if not (orig_out is None or orig_out is out):
-            # Copy the out data from temporary buffer to original memory
-            ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out,
-                dst=orig_out,
-                sycl_queue=exec_q,
-                depends=[dot_ev],
-            )
-            _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-            out = orig_out
-        if appended_axes:
-            out = dpt.squeeze(out, tuple(appended_axes))
-        return out
-
-    if order == "K":
-        if x1.flags.c_contiguous and x2.flags.c_contiguous:
-            order = "C"
-        elif x1.flags.f_contiguous and x2.flags.f_contiguous:
-            order = "F"
-    if order == "K":
-        buf1 = _empty_like_orderK(x1, buf1_dt)
-    else:
-        buf1 = dpt.empty_like(x1, dtype=buf1_dt, order=order)
-    deps_ev = _manager.submitted_events
-    ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=x1, dst=buf1, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-    if order == "K":
-        buf2 = _empty_like_orderK(x2, buf2_dt)
-    else:
-        buf2 = dpt.empty_like(x2, dtype=buf2_dt, order=order)
-    ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=x2, dst=buf2, sycl_queue=exec_q, depends=deps_ev
-    )
-    _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-    if out is None:
-        if order == "K":
-            out = _empty_like_pair_orderK(
-                buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
-            )
-        else:
-            out = dpt.empty(
-                res_shape,
-                dtype=res_dt,
-                usm_type=res_usm_type,
-                sycl_queue=exec_q,
-                order=order,
-            )
-
-    if buf1.shape != x1_broadcast_shape:
-        buf1 = dpt.broadcast_to(buf1, x1_broadcast_shape)
-    if buf2.shape != x2_broadcast_shape:
-        buf2 = dpt.broadcast_to(buf2, x2_broadcast_shape)
-    ht_, dot_ev = tli._dot(
-        x1=buf1,
-        x2=buf2,
-        batch_dims=len(res_shape[:-2]),
-        x1_outer_dims=1,
-        x2_outer_dims=1,
-        inner_dims=1,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=[copy1_ev, copy2_ev],
-    )
-    _manager.add_event_pair(ht_, dot_ev)
-    if appended_axes:
-        out = dpt.squeeze(out, tuple(appended_axes))
-    return out
diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py
deleted file mode 100644
index a1ab047b16..0000000000
--- a/dpctl/tensor/_manipulation_functions.py
+++ /dev/null
@@ -1,1070 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-
-import itertools
-import operator
-
-import numpy as np
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils as dputils
-
-from ._copy_utils import _broadcast_strides
-from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
-from ._type_utils import _supported_dtype, _to_device_supported_dtype
-
-__doc__ = (
-    "Implementation module for array manipulation "
-    "functions in :module:`dpctl.tensor`"
-)
-
-
-def _broadcast_shape_impl(shapes):
-    if len(set(shapes)) == 1:
-        return shapes[0]
-    mutable_shapes = False
-    nds = [len(s) for s in shapes]
-    biggest = max(nds)
-    sh_len = len(shapes)
-    for i in range(sh_len):
-        diff = biggest - nds[i]
-        if diff > 0:
-            ty = type(shapes[i])
-            shapes[i] = ty(
-                itertools.chain(itertools.repeat(1, diff), shapes[i])
-            )
-    common_shape = []
-    for axis in range(biggest):
-        lengths = [s[axis] for s in shapes]
-        unique = set(lengths + [1])
-        if len(unique) > 2:
-            raise ValueError(
-                "Shape mismatch: two or more arrays have "
-                f"incompatible dimensions on axis ({axis},)"
-            )
-        elif len(unique) == 2:
-            unique.remove(1)
-            new_length = unique.pop()
-            common_shape.append(new_length)
-            for i in range(sh_len):
-                if shapes[i][axis] == 1:
-                    if not mutable_shapes:
-                        shapes = [list(s) for s in shapes]
-                        mutable_shapes = True
-                    shapes[i][axis] = new_length
-        else:
-            common_shape.append(1)
-
-    return tuple(common_shape)
-
-
-def _broadcast_shapes(*args):
-    """
-    Broadcast the input shapes into a single shape;
-    returns tuple broadcasted shape.
-    """
-    array_shapes = [array.shape for array in args]
-    return _broadcast_shape_impl(array_shapes)
-
-
-def permute_dims(X, /, axes):
-    """permute_dims(x, axes)
-
-    Permute the axes (dimensions) of an array; returns the permuted
-    array as a view.
-
-    Args:
-        x (usm_ndarray): input array.
-        axes (Tuple[int, ...]): tuple containing permutation of
-           `(0,1,...,N-1)` where `N` is the number of axes (dimensions)
-           of `x`.
-    Returns:
-        usm_ndarray:
-            An array with permuted axes.
-            The returned array must has the same data type as `x`,
-            is created on the same device as `x` and has the same USM allocation
-            type as `x`.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-    axes = normalize_axis_tuple(axes, X.ndim, "axes")
-    if not X.ndim == len(axes):
-        raise ValueError(
-            "The length of the passed axes does not match "
-            "to the number of usm_ndarray dimensions."
-        )
-    newstrides = tuple(X.strides[i] for i in axes)
-    newshape = tuple(X.shape[i] for i in axes)
-    return dpt.usm_ndarray(
-        shape=newshape,
-        dtype=X.dtype,
-        buffer=X,
-        strides=newstrides,
-        offset=X._element_offset,
-    )
-
-
-def expand_dims(X, /, *, axis=0):
-    """expand_dims(x, axis)
-
-    Expands the shape of an array by inserting a new axis (dimension)
-    of size one at the position specified by axis.
-
-    Args:
-        x (usm_ndarray):
-            input array
-        axis (Union[int, Tuple[int]]):
-            axis position in the expanded axes (zero-based). If `x` has rank
-            (i.e, number of dimensions) `N`, a valid `axis` must reside
-            in the closed-interval `[-N-1, N]`. If provided a negative
-            `axis`, the `axis` position at which to insert a singleton
-            dimension is computed as `N + axis + 1`. Hence, if
-            provided `-1`, the resolved axis position is `N` (i.e.,
-            a singleton dimension must be appended to the input array `x`).
-            If provided `-N-1`, the resolved axis position is `0` (i.e., a
-            singleton dimension is prepended to the input array `x`).
-
-    Returns:
-        usm_ndarray:
-            Returns a view, if possible, and a copy otherwise with the number
-            of dimensions increased.
-            The expanded array has the same data type as the input array `x`.
-            The expanded array is located on the same device as the input
-            array, and has the same USM allocation type.
-
-    Raises:
-        IndexError: if `axis` value is invalid.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    if type(axis) not in (tuple, list):
-        axis = (axis,)
-
-    out_ndim = len(axis) + X.ndim
-    axis = normalize_axis_tuple(axis, out_ndim)
-
-    shape_it = iter(X.shape)
-    shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
-
-    return dpt.reshape(X, shape)
-
-
-def squeeze(X, /, axis=None):
-    """squeeze(x, axis)
-
-    Removes singleton dimensions (axes) from array `x`.
-
-    Args:
-        x (usm_ndarray): input array
-        axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze.
-
-    Returns:
-        usm_ndarray:
-            Output array is a view, if possible,
-            and a copy otherwise, but with all or a subset of the
-            dimensions of length 1 removed. Output has the same data
-            type as the input, is allocated on the same device as the
-            input and has the same USM allocation type as the input
-            array `x`.
-
-    Raises:
-        ValueError: if the specified axis has a size greater than one.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-    X_shape = X.shape
-    if axis is not None:
-        axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1)
-        new_shape = []
-        for i, x in enumerate(X_shape):
-            if i not in axis:
-                new_shape.append(x)
-            else:
-                if x != 1:
-                    raise ValueError(
-                        "Cannot select an axis to squeeze out "
-                        "which has size not equal to one."
-                    )
-        new_shape = tuple(new_shape)
-    else:
-        new_shape = tuple(axis for axis in X_shape if axis != 1)
-    if new_shape == X.shape:
-        return X
-    else:
-        return dpt.reshape(X, new_shape)
-
-
-def broadcast_to(X, /, shape):
-    """broadcast_to(x, shape)
-
-    Broadcast an array to a new `shape`; returns the broadcasted
-    :class:`dpctl.tensor.usm_ndarray` as a view.
-
-    Args:
-        x (usm_ndarray): input array
-        shape (Tuple[int,...]): array shape. The `shape` must be
-            compatible with `x` according to broadcasting rules.
-
-    Returns:
-        usm_ndarray:
-            An array with the specified `shape`.
-            The output array is a view of the input array, and
-            hence has the same data type, USM allocation type and
-            device attributes.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    # Use numpy.broadcast_to to check the validity of the input
-    # parameter 'shape'. Raise ValueError if 'X' is not compatible
-    # with 'shape' according to NumPy's broadcasting rules.
-    new_array = np.broadcast_to(
-        np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape
-    )
-    new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim)
-    return dpt.usm_ndarray(
-        shape=new_array.shape,
-        dtype=X.dtype,
-        buffer=X,
-        strides=new_sts,
-        offset=X._element_offset,
-    )
-
-
-def broadcast_arrays(*args):
-    """broadcast_arrays(*arrays)
-
-    Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against
-    one another.
-
-    Args:
-        arrays (usm_ndarray): an arbitrary number of arrays to be
-            broadcasted.
-
-    Returns:
-        List[usm_ndarray]:
-            A list of broadcasted arrays. Each array
-            must have the same shape. Each array must have the same `dtype`,
-            `device` and `usm_type` attributes as its corresponding input
-            array.
-    """
-    if len(args) == 0:
-        raise ValueError("`broadcast_arrays` requires at least one argument")
-    for X in args:
-        if not isinstance(X, dpt.usm_ndarray):
-            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    shape = _broadcast_shapes(*args)
-
-    if all(X.shape == shape for X in args):
-        return args
-
-    return [broadcast_to(X, shape) for X in args]
-
-
-def flip(X, /, *, axis=None):
-    """flip(x, axis)
-
-    Reverses the order of elements in an array `x` along the given `axis`.
-    The shape of the array is preserved, but the elements are reordered.
-
-    Args:
-        x (usm_ndarray): input array.
-        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along
-            which to flip.
-            If `axis` is `None`, all input array axes are flipped.
-            If `axis` is negative, the flipped axis is counted from the
-            last dimension. If provided more than one axis, only the specified
-            axes are flipped. Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            A view of `x` with the entries of `axis` reversed.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-    X_ndim = X.ndim
-    if axis is None:
-        indexer = (np.s_[::-1],) * X_ndim
-    else:
-        axis = normalize_axis_tuple(axis, X_ndim)
-        indexer = tuple(
-            np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim)
-        )
-    return X[indexer]
-
-
-def roll(x, /, shift, *, axis=None):
-    """
-    roll(x, shift, axis)
-
-    Rolls array elements along a specified axis.
-    Array elements that roll beyond the last position are re-introduced
-    at the first position. Array elements that roll beyond the first position
-    are re-introduced at the last position.
-
-    Args:
-        x (usm_ndarray): input array
-        shift (Union[int, Tuple[int,...]]): number of places by which the
-            elements are shifted. If `shift` is a tuple, then `axis` must be a
-            tuple of the same size, and each of the given axes must be shifted
-            by the corresponding element in `shift`. If `shift` is an `int`
-            and `axis` a tuple, then the same `shift` must be used for all
-            specified axes. If a `shift` is positive, then array elements is
-            shifted positively (toward larger indices) along the dimension of
-            `axis`.
-            If a `shift` is negative, then array elements must be shifted
-            negatively (toward smaller indices) along the dimension of `axis`.
-        axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which
-            elements to shift. If `axis` is `None`, the array is
-            flattened, shifted, and then restored to its original shape.
-            Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            An array having the same `dtype`, `usm_type` and
-            `device` attributes as `x` and whose elements are shifted relative
-            to `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
-    exec_q = x.sycl_queue
-    _manager = dputils.SequentialOrderManager[exec_q]
-    if axis is None:
-        shift = operator.index(shift)
-        res = dpt.empty(
-            x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
-        )
-        sz = operator.index(x.size)
-        shift = (shift % sz) if sz > 0 else 0
-        dep_evs = _manager.submitted_events
-        hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d(
-            src=x,
-            dst=res,
-            shift=shift,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(hev, roll_ev)
-        return res
-    axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True)
-    broadcasted = np.broadcast(shift, axis)
-    if broadcasted.ndim > 1:
-        raise ValueError("'shift' and 'axis' should be scalars or 1D sequences")
-    shifts = [
-        0,
-    ] * x.ndim
-    shape = x.shape
-    for sh, ax in broadcasted:
-        n_i = operator.index(shape[ax])
-        shifted = shifts[ax] + operator.index(sh)
-        shifts[ax] = (shifted % n_i) if n_i > 0 else 0
-    res = dpt.empty(
-        x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
-    )
-    dep_evs = _manager.submitted_events
-    ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd(
-        src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs
-    )
-    _manager.add_event_pair(ht_e, roll_ev)
-    return res
-
-
-def _arrays_validation(arrays, check_ndim=True):
-    n = len(arrays)
-    if n == 0:
-        raise TypeError("Missing 1 required positional argument: 'arrays'.")
-
-    if not isinstance(arrays, (list, tuple)):
-        raise TypeError(f"Expected tuple or list type, got {type(arrays)}.")
-
-    for X in arrays:
-        if not isinstance(X, dpt.usm_ndarray):
-            raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    exec_q = dputils.get_execution_queue([X.sycl_queue for X in arrays])
-    if exec_q is None:
-        raise ValueError("All the input arrays must have same sycl queue.")
-
-    res_usm_type = dputils.get_coerced_usm_type([X.usm_type for X in arrays])
-    if res_usm_type is None:
-        raise ValueError("All the input arrays must have usm_type.")
-
-    X0 = arrays[0]
-    _supported_dtype(Xi.dtype for Xi in arrays)
-
-    res_dtype = X0.dtype
-    dev = exec_q.sycl_device
-    for i in range(1, n):
-        res_dtype = np.promote_types(res_dtype, arrays[i])
-        res_dtype = _to_device_supported_dtype(res_dtype, dev)
-
-    if check_ndim:
-        for i in range(1, n):
-            if X0.ndim != arrays[i].ndim:
-                raise ValueError(
-                    "All the input arrays must have same number of dimensions, "
-                    f"but the array at index 0 has {X0.ndim} dimension(s) and "
-                    f"the array at index {i} has {arrays[i].ndim} dimension(s)."
-                )
-    return res_dtype, res_usm_type, exec_q
-
-
-def _check_same_shapes(X0_shape, axis, n, arrays):
-    for i in range(1, n):
-        Xi_shape = arrays[i].shape
-        for j, X0j in enumerate(X0_shape):
-            if X0j != Xi_shape[j] and j != axis:
-                raise ValueError(
-                    "All the input array dimensions for the concatenation "
-                    f"axis must match exactly, but along dimension {j}, the "
-                    f"array at index 0 has size {X0j} and the array "
-                    f"at index {i} has size {Xi_shape[j]}."
-                )
-
-
-def _concat_axis_None(arrays):
-    "Implementation of concat(arrays, axis=None)."
-    res_dtype, res_usm_type, exec_q = _arrays_validation(
-        arrays, check_ndim=False
-    )
-    res_shape = 0
-    for array in arrays:
-        res_shape += array.size
-    res = dpt.empty(
-        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
-    )
-
-    fill_start = 0
-    _manager = dputils.SequentialOrderManager[exec_q]
-    deps = _manager.submitted_events
-    for array in arrays:
-        fill_end = fill_start + array.size
-        if array.flags.c_contiguous:
-            hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=dpt.reshape(array, -1),
-                dst=res[fill_start:fill_end],
-                sycl_queue=exec_q,
-                depends=deps,
-            )
-            _manager.add_event_pair(hev, cpy_ev)
-        else:
-            src_ = array
-            # _copy_usm_ndarray_for_reshape requires src and dst to have
-            # the same data type
-            if not array.dtype == res_dtype:
-                src2_ = dpt.empty_like(src_, dtype=res_dtype)
-                ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                    src=src_, dst=src2_, sycl_queue=exec_q, depends=deps
-                )
-                _manager.add_event_pair(ht_copy_ev, cpy_ev)
-                hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape(
-                    src=src2_,
-                    dst=res[fill_start:fill_end],
-                    sycl_queue=exec_q,
-                    depends=[cpy_ev],
-                )
-                _manager.add_event_pair(hev, reshape_copy_ev)
-            else:
-                hev, cpy_ev = ti._copy_usm_ndarray_for_reshape(
-                    src=src_,
-                    dst=res[fill_start:fill_end],
-                    sycl_queue=exec_q,
-                    depends=deps,
-                )
-                _manager.add_event_pair(hev, cpy_ev)
-        fill_start = fill_end
-
-    return res
-
-
-def concat(arrays, /, *, axis=0):
-    """concat(arrays, axis)
-
-    Joins a sequence of arrays along an existing axis.
-
-    Args:
-        arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]):
-            input arrays to join. The arrays must have the same shape,
-            except in the dimension specified by `axis`.
-        axis (Optional[int]): axis along which the arrays will be joined.
-            If `axis` is `None`, arrays must be flattened before
-            concatenation. If `axis` is negative, it is understood as
-            being counted from the last dimension. Default: `0`.
-
-    Returns:
-        usm_ndarray:
-            An output array containing the concatenated
-            values. The output array data type is determined by Type
-            Promotion Rules of array API.
-
-    All input arrays must have the same device attribute. The output array
-    is allocated on that same device, and data movement operations are
-    scheduled on a queue underlying the device. The USM allocation type
-    of the output array is determined by USM allocation type promotion
-    rules.
-    """
-    if axis is None:
-        return _concat_axis_None(arrays)
-
-    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
-    n = len(arrays)
-    X0 = arrays[0]
-
-    axis = normalize_axis_index(axis, X0.ndim)
-    X0_shape = X0.shape
-    _check_same_shapes(X0_shape, axis, n, arrays)
-
-    res_shape_axis = 0
-    for X in arrays:
-        res_shape_axis = res_shape_axis + X.shape[axis]
-
-    res_shape = tuple(
-        X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim)
-    )
-
-    res = dpt.empty(
-        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
-    )
-
-    _manager = dputils.SequentialOrderManager[exec_q]
-    deps = _manager.submitted_events
-    fill_start = 0
-    for i in range(n):
-        fill_end = fill_start + arrays[i].shape[axis]
-        c_shapes_copy = tuple(
-            np.s_[fill_start:fill_end] if j == axis else np.s_[:]
-            for j in range(X0.ndim)
-        )
-        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arrays[i],
-            dst=res[c_shapes_copy],
-            sycl_queue=exec_q,
-            depends=deps,
-        )
-        _manager.add_event_pair(hev, cpy_ev)
-        fill_start = fill_end
-
-    return res
-
-
-def stack(arrays, /, *, axis=0):
-    """
-    stack(arrays, axis)
-
-    Joins a sequence of arrays along a new axis.
-
-    Args:
-        arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]):
-            input arrays to join. Each array must have the same shape.
-        axis (int): axis along which the arrays will be joined. Providing
-            an `axis` specified the index of the new axis in the dimensions
-            of the output array. A valid axis must be on the interval
-            `[-N, N)`, where `N` is the rank (number of dimensions) of `x`.
-            Default: `0`.
-
-    Returns:
-        usm_ndarray:
-            An output array having rank `N+1`, where `N` is
-            the rank (number of dimensions) of `x`. If the input arrays have
-            different data types, array API Type Promotion Rules apply.
-
-    Raises:
-        ValueError: if not all input arrays have the same shape
-        IndexError: if provided an `axis` outside of the required interval.
-    """
-    res_dtype, res_usm_type, exec_q = _arrays_validation(arrays)
-
-    n = len(arrays)
-    X0 = arrays[0]
-    res_ndim = X0.ndim + 1
-    axis = normalize_axis_index(axis, res_ndim)
-    X0_shape = X0.shape
-
-    for i in range(1, n):
-        if X0_shape != arrays[i].shape:
-            raise ValueError("All input arrays must have the same shape")
-
-    res_shape = tuple(
-        X0_shape[i - 1 * (i >= axis)] if i != axis else n
-        for i in range(res_ndim)
-    )
-
-    res = dpt.empty(
-        res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
-    )
-
-    _manager = dputils.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    for i in range(n):
-        c_shapes_copy = tuple(
-            i if j == axis else np.s_[:] for j in range(res_ndim)
-        )
-        _dst = res[c_shapes_copy]
-        hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, cpy_ev)
-
-    return res
-
-
-def unstack(X, /, *, axis=0):
-    """unstack(x, axis=0)
-
-    Splits an array in a sequence of arrays along the given axis.
-
-    Args:
-        x (usm_ndarray): input array
-
-        axis (int, optional): axis along which `x` is unstacked.
-            If `x` has rank (i.e, number of dimensions) `N`,
-            a valid `axis` must reside in the half-open interval `[-N, N)`.
-            Default: `0`.
-
-    Returns:
-        Tuple[usm_ndarray,...]:
-            Output sequence of arrays which are views into the input array.
-
-    Raises:
-        AxisError: if the `axis` value is invalid.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    axis = normalize_axis_index(axis, X.ndim)
-    Y = dpt.moveaxis(X, axis, 0)
-
-    return tuple(Y[i] for i in range(Y.shape[0]))
-
-
-def moveaxis(X, source, destination, /):
-    """moveaxis(x, source, destination)
-
-    Moves axes of an array to new positions.
-
-    Args:
-        x (usm_ndarray): input array
-
-        source (int or a sequence of int):
-            Original positions of the axes to move.
-            These must be unique. If `x` has rank (i.e., number of
-            dimensions) `N`, a valid `axis` must be in the
-            half-open interval `[-N, N)`.
-
-        destination (int or a sequence of int):
-            Destination positions for each of the original axes.
-            These must also be unique. If `x` has rank
-            (i.e., number of dimensions) `N`, a valid `axis` must be
-            in the half-open interval `[-N, N)`.
-
-    Returns:
-        usm_ndarray:
-            Array with moved axes.
-            The returned array must has the same data type as `x`,
-            is created on the same device as `x` and has the same
-            USM allocation type as `x`.
-
-    Raises:
-        AxisError: if `axis` value is invalid.
-        ValueError: if `src` and `dst` have not equal number of elements.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    source = normalize_axis_tuple(source, X.ndim, "source")
-    destination = normalize_axis_tuple(destination, X.ndim, "destination")
-
-    if len(source) != len(destination):
-        raise ValueError(
-            "`source` and `destination` arguments must have "
-            "the same number of elements"
-        )
-
-    ind = [n for n in range(X.ndim) if n not in source]
-
-    for src, dst in sorted(zip(destination, source)):
-        ind.insert(src, dst)
-
-    return dpt.permute_dims(X, tuple(ind))
-
-
-def swapaxes(X, axis1, axis2):
-    """swapaxes(x, axis1, axis2)
-
-    Interchanges two axes of an array.
-
-    Args:
-        x (usm_ndarray): input array
-
-        axis1 (int): First axis.
-            If `x` has rank (i.e., number of dimensions) `N`,
-            a valid `axis` must be in the half-open interval `[-N, N)`.
-
-        axis2 (int): Second axis.
-            If `x` has rank (i.e., number of dimensions) `N`,
-            a valid `axis` must be in the half-open interval `[-N, N)`.
-
-    Returns:
-        usm_ndarray:
-            Array with swapped axes.
-            The returned array must has the same data type as `x`,
-            is created on the same device as `x` and has the same USM
-            allocation type as `x`.
-
-    Raises:
-        AxisError: if `axis` value is invalid.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
-
-    axis1 = normalize_axis_index(axis1, X.ndim, "axis1")
-    axis2 = normalize_axis_index(axis2, X.ndim, "axis2")
-
-    ind = list(range(0, X.ndim))
-    ind[axis1] = axis2
-    ind[axis2] = axis1
-    return dpt.permute_dims(X, tuple(ind))
-
-
-def repeat(x, repeats, /, *, axis=None):
-    """repeat(x, repeats, axis=None)
-
-    Repeat elements of an array on a per-element basis.
-
-    Args:
-        x (usm_ndarray): input array
-
-        repeats (Union[int, Sequence[int, ...], usm_ndarray]):
-            The number of repetitions for each element.
-
-            `repeats` must be broadcast-compatible with `N` where `N` is
-            `prod(x.shape)` if `axis` is `None` and `x.shape[axis]`
-            otherwise.
-
-            If `repeats` is an array, it must have an integer data type.
-            Otherwise, `repeats` must be a Python integer or sequence of
-            Python integers (i.e., a tuple, list, or range).
-
-        axis (Optional[int]):
-            The axis along which to repeat values. If `axis` is `None`, the
-            function repeats elements of the flattened array. Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            output array with repeated elements.
-
-            If `axis` is `None`, the returned array is one-dimensional,
-            otherwise, it has the same shape as `x`, except for the axis along
-            which elements were repeated.
-
-            The returned array will have the same data type as `x`.
-            The returned array will be located on the same device as `x` and
-            have the same USM allocation type as `x`.
-
-    Raises:
-        AxisError: if `axis` value is invalid.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
-
-    x_ndim = x.ndim
-    x_shape = x.shape
-    if axis is not None:
-        axis = normalize_axis_index(operator.index(axis), x_ndim)
-        axis_size = x_shape[axis]
-    else:
-        axis_size = x.size
-
-    scalar = False
-    if isinstance(repeats, int):
-        if repeats < 0:
-            raise ValueError("`repeats` must be a positive integer")
-        usm_type = x.usm_type
-        exec_q = x.sycl_queue
-        scalar = True
-    elif isinstance(repeats, dpt.usm_ndarray):
-        if repeats.ndim > 1:
-            raise ValueError(
-                "`repeats` array must be 0- or 1-dimensional, got "
-                f"{repeats.ndim}"
-            )
-        exec_q = dpctl.utils.get_execution_queue(
-            (x.sycl_queue, repeats.sycl_queue)
-        )
-        if exec_q is None:
-            raise dputils.ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                x.usm_type,
-                repeats.usm_type,
-            )
-        )
-        dpctl.utils.validate_usm_type(usm_type, allow_none=False)
-        if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
-            raise TypeError(
-                f"'repeats' data type {repeats.dtype} cannot be cast to "
-                "'int64' according to the casting rule ''safe.''"
-            )
-        if repeats.size == 1:
-            scalar = True
-            # bring the single element to the host
-            if repeats.ndim == 0:
-                repeats = int(repeats)
-            else:
-                # Get the single element explicitly
-                # since non-0D arrays can not be converted to scalars
-                repeats = int(repeats[0])
-            if repeats < 0:
-                raise ValueError("`repeats` elements must be positive")
-        else:
-            if repeats.size != axis_size:
-                raise ValueError(
-                    "'repeats' array must be broadcastable to the size of "
-                    "the repeated axis"
-                )
-            if not dpt.all(repeats >= 0):
-                raise ValueError("'repeats' elements must be positive")
-
-    elif isinstance(repeats, (tuple, list, range)):
-        usm_type = x.usm_type
-        exec_q = x.sycl_queue
-
-        len_reps = len(repeats)
-        if len_reps == 1:
-            repeats = repeats[0]
-            if repeats < 0:
-                raise ValueError("`repeats` elements must be positive")
-            scalar = True
-        else:
-            if len_reps != axis_size:
-                raise ValueError(
-                    "`repeats` sequence must have the same length as the "
-                    "repeated axis"
-                )
-            repeats = dpt.asarray(
-                repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
-            )
-            if not dpt.all(repeats >= 0):
-                raise ValueError("`repeats` elements must be positive")
-    else:
-        raise TypeError(
-            "Expected int, sequence, or `usm_ndarray` for second argument,"
-            f"got {type(repeats)}"
-        )
-
-    _manager = dputils.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if scalar:
-        res_axis_size = repeats * axis_size
-        if axis is not None:
-            res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
-        else:
-            res_shape = (res_axis_size,)
-        res = dpt.empty(
-            res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
-        )
-        if res_axis_size > 0:
-            ht_rep_ev, rep_ev = ti._repeat_by_scalar(
-                src=x,
-                dst=res,
-                reps=repeats,
-                axis=axis,
-                sycl_queue=exec_q,
-                depends=dep_evs,
-            )
-            _manager.add_event_pair(ht_rep_ev, rep_ev)
-    else:
-        if repeats.dtype != dpt.int64:
-            rep_buf = dpt.empty(
-                repeats.shape,
-                dtype=dpt.int64,
-                usm_type=usm_type,
-                sycl_queue=exec_q,
-            )
-            ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_copy_ev, copy_ev)
-            cumsum = dpt.empty(
-                (axis_size,),
-                dtype=dpt.int64,
-                usm_type=usm_type,
-                sycl_queue=exec_q,
-            )
-            # _cumsum_1d synchronizes so `depends` ends here safely
-            res_axis_size = ti._cumsum_1d(
-                rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev]
-            )
-            if axis is not None:
-                res_shape = (
-                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
-                )
-            else:
-                res_shape = (res_axis_size,)
-            res = dpt.empty(
-                res_shape,
-                dtype=x.dtype,
-                usm_type=usm_type,
-                sycl_queue=exec_q,
-            )
-            if res_axis_size > 0:
-                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
-                    src=x,
-                    dst=res,
-                    reps=rep_buf,
-                    cumsum=cumsum,
-                    axis=axis,
-                    sycl_queue=exec_q,
-                )
-                _manager.add_event_pair(ht_rep_ev, rep_ev)
-        else:
-            cumsum = dpt.empty(
-                (axis_size,),
-                dtype=dpt.int64,
-                usm_type=usm_type,
-                sycl_queue=exec_q,
-            )
-            res_axis_size = ti._cumsum_1d(
-                repeats, cumsum, sycl_queue=exec_q, depends=dep_evs
-            )
-            if axis is not None:
-                res_shape = (
-                    x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
-                )
-            else:
-                res_shape = (res_axis_size,)
-            res = dpt.empty(
-                res_shape,
-                dtype=x.dtype,
-                usm_type=usm_type,
-                sycl_queue=exec_q,
-            )
-            if res_axis_size > 0:
-                ht_rep_ev, rep_ev = ti._repeat_by_sequence(
-                    src=x,
-                    dst=res,
-                    reps=repeats,
-                    cumsum=cumsum,
-                    axis=axis,
-                    sycl_queue=exec_q,
-                )
-                _manager.add_event_pair(ht_rep_ev, rep_ev)
-    return res
-
-
-def tile(x, repetitions, /):
-    """tile(x, repetitions)
-
-    Repeat an input array `x` along each axis a number of times given by
-    `repetitions`.
-
-    For `N` = len(`repetitions`) and `M` = len(`x.shape`):
-
-        * If `M < N`, `x` will have `N - M` new axes prepended to its shape
-        * If `M > N`, `repetitions` will have `M - N` ones prepended to it
-
-    Args:
-        x (usm_ndarray): input array
-
-        repetitions (Union[int, Tuple[int, ...]]):
-            The number of repetitions along each dimension of `x`.
-
-    Returns:
-        usm_ndarray:
-            tiled output array.
-
-            The returned array will have rank `max(M, N)`. If `S` is the
-            shape of `x` after prepending dimensions and `R` is
-            `repetitions` after prepending ones, then the shape of the
-            result will be `S[i] * R[i]` for each dimension `i`.
-
-            The returned array will have the same data type as `x`.
-            The returned array will be located on the same device as `x` and
-            have the same USM allocation type as `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected usm_ndarray type, got {type(x)}.")
-
-    if not isinstance(repetitions, tuple):
-        if isinstance(repetitions, int):
-            repetitions = (repetitions,)
-        else:
-            raise TypeError(
-                f"Expected tuple or integer type, got {type(repetitions)}."
-            )
-
-    rep_dims = len(repetitions)
-    x_dims = x.ndim
-    if rep_dims < x_dims:
-        repetitions = (x_dims - rep_dims) * (1,) + repetitions
-    elif x_dims < rep_dims:
-        x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
-    res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions))
-    # case of empty input
-    if x.size == 0:
-        return dpt.empty(
-            res_shape,
-            dtype=x.dtype,
-            usm_type=x.usm_type,
-            sycl_queue=x.sycl_queue,
-        )
-    in_sh = x.shape
-    if res_shape == in_sh:
-        return dpt.copy(x)
-    expanded_sh = []
-    broadcast_sh = []
-    out_sz = 1
-    for i in range(len(res_shape)):
-        out_sz *= res_shape[i]
-        reps, sh = repetitions[i], in_sh[i]
-        if reps == 1:
-            # dimension will be unchanged
-            broadcast_sh.append(sh)
-            expanded_sh.append(sh)
-        elif sh == 1:
-            # dimension will be broadcast
-            broadcast_sh.append(reps)
-            expanded_sh.append(sh)
-        else:
-            broadcast_sh.extend([reps, sh])
-            expanded_sh.extend([1, sh])
-    exec_q = x.sycl_queue
-    xdt = x.dtype
-    xut = x.usm_type
-    res = dpt.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
-    # no need to copy data for empty output
-    if out_sz > 0:
-        x = dpt.broadcast_to(
-            # this reshape should never copy
-            dpt.reshape(x, expanded_sh),
-            broadcast_sh,
-        )
-        # copy broadcast input into flat array
-        _manager = dputils.SequentialOrderManager[exec_q]
-        dep_evs = _manager.submitted_events
-        hev, cp_ev = ti._copy_usm_ndarray_for_reshape(
-            src=x, dst=res, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(hev, cp_ev)
-    return dpt.reshape(res, res_shape)
diff --git a/dpctl/tensor/_numpy_helper.py b/dpctl/tensor/_numpy_helper.py
deleted file mode 100644
index 1325faa3bf..0000000000
--- a/dpctl/tensor/_numpy_helper.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-
-_npver = np.lib.NumpyVersion(np.__version__)
-
-if _npver < "1.25.0":  # pragma: no cover
-    from numpy import AxisError
-else:
-    from numpy.exceptions import AxisError
-
-if _npver >= "2.0.0":
-    from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple
-else:  # pragma: no cover
-    from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple
-
-
-__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"]
diff --git a/dpctl/tensor/_print.py b/dpctl/tensor/_print.py
deleted file mode 100644
index 0156422083..0000000000
--- a/dpctl/tensor/_print.py
+++ /dev/null
@@ -1,491 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import contextlib
-import itertools
-import operator
-
-import numpy as np
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils
-
-__doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`."
-
-_print_options = {
-    "linewidth": 75,
-    "edgeitems": 3,
-    "threshold": 1000,
-    "precision": 8,
-    "floatmode": "maxprec",
-    "suppress": False,
-    "nanstr": "nan",
-    "infstr": "inf",
-    "sign": "-",
-}
-
-
-def _move_to_next_line(string, s, line_width, prefix):
-    """
-    Move string to next line if it doesn't fit in the current line.
-    """
-    bottom_len = len(s) - (s.rfind("\n") + 1)
-    next_line = bottom_len + len(string) + 1 > line_width
-    string = ",\n" + " " * len(prefix) + string if next_line else ", " + string
-
-    return string
-
-
-def _options_dict(
-    linewidth=None,
-    edgeitems=None,
-    threshold=None,
-    precision=None,
-    floatmode=None,
-    suppress=None,
-    nanstr=None,
-    infstr=None,
-    sign=None,
-    numpy=False,
-):
-    if numpy:
-        numpy_options = np.get_printoptions()
-        options = {k: numpy_options[k] for k in _print_options.keys()}
-    else:
-        options = _print_options.copy()
-
-    if suppress:
-        options["suppress"] = True
-
-    local = dict(locals().items())
-    for int_arg in ["linewidth", "precision", "threshold", "edgeitems"]:
-        val = local[int_arg]
-        if val is not None:
-            options[int_arg] = operator.index(val)
-
-    for str_arg in ["nanstr", "infstr"]:
-        val = local[str_arg]
-        if val is not None:
-            if not isinstance(val, str):
-                raise TypeError(
-                    "`{}` ".format(str_arg) + "must be of `string` type."
-                )
-            options[str_arg] = val
-
-    signs = ["-", "+", " "]
-    if sign is not None:
-        if sign not in signs:
-            raise ValueError(
-                "`sign` must be one of"
-                + ", ".join("`{}`".format(s) for s in signs)
-            )
-        options["sign"] = sign
-
-    floatmodes = ["fixed", "unique", "maxprec", "maxprec_equal"]
-    if floatmode is not None:
-        if floatmode not in floatmodes:
-            raise ValueError(
-                "`floatmode` must be one of"
-                + ", ".join("`{}`".format(m) for m in floatmodes)
-            )
-        options["floatmode"] = floatmode
-
-    return options
-
-
-def set_print_options(
-    linewidth=None,
-    edgeitems=None,
-    threshold=None,
-    precision=None,
-    floatmode=None,
-    suppress=None,
-    nanstr=None,
-    infstr=None,
-    sign=None,
-    numpy=False,
-):
-    """
-    set_print_options(linewidth=None, edgeitems=None, threshold=None,
-                      precision=None, floatmode=None, suppress=None,
-                      nanstr=None, infstr=None, sign=None, numpy=False)
-
-    Set options for printing :class:`dpctl.tensor.usm_ndarray` class.
-
-    Args:
-        linewidth (int, optional):
-            Number of characters printed per line.
-            Raises `TypeError` if linewidth is not an integer.
-            Default: `75`.
-        edgeitems (int, optional):
-            Number of elements at the beginning and end
-            when the printed array is abbreviated.
-            Raises `TypeError` if edgeitems is not an integer.
-            Default: `3`.
-        threshold (int, optional):
-            Number of elements that triggers array abbreviation.
-            Raises `TypeError` if threshold is not an integer.
-            Default: `1000`.
-        precision (int or None, optional):
-            Number of digits printed for floating point numbers.
-            Raises `TypeError` if precision is not an integer.
-            Default: `8`.
-        floatmode (str, optional):
-            Controls how floating point numbers are interpreted.
-                `"fixed:`:
-                    Always prints exactly `precision` digits.
-                `"unique"`:
-                    Ignores precision, prints the number of
-                    digits necessary to uniquely specify each number.
-                `"maxprec"`:
-                    Prints `precision` digits or fewer,
-                    if fewer will uniquely represent a number.
-                `"maxprec_equal"`:
-                    Prints an equal number of digits
-                    for each number. This number is `precision` digits
-                    or fewer, if fewer will uniquely represent each number.
-            Raises `ValueError` if floatmode is not one of
-            `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
-            Default: "maxprec_equal"
-        suppress (bool, optional):
-            If `True,` numbers equal to zero in the current precision
-            will print as zero.
-            Default: `False`.
-        nanstr (str, optional):
-            String used to represent nan.
-            Raises `TypeError` if nanstr is not a string.
-            Default: `"nan"`.
-        infstr (str, optional):
-            String used to represent infinity.
-            Raises `TypeError` if infstr is not a string.
-            Default: `"inf"`.
-        sign (str, optional):
-            Controls the sign of floating point numbers.
-                `"-"`:
-                    Omit the sign of positive numbers.
-                `"+"`:
-                    Always print the sign of positive numbers.
-                `" "`:
-                    Always print a whitespace in place of the
-                    sign of positive numbers.
-            Raises `ValueError` if sign is not one of
-            `"-"`, `"+"`, or `" "`.
-            Default: `"-"`.
-        numpy (bool, optional): If `True,` then before other specified print
-            options are set, a dictionary of Numpy's print options
-            will be used to initialize dpctl's print options.
-            Default: "False"
-    """
-    options = _options_dict(
-        linewidth=linewidth,
-        edgeitems=edgeitems,
-        threshold=threshold,
-        precision=precision,
-        floatmode=floatmode,
-        suppress=suppress,
-        nanstr=nanstr,
-        infstr=infstr,
-        sign=sign,
-        numpy=numpy,
-    )
-    _print_options.update(options)
-
-
-def get_print_options():
-    """get_print_options()
-
-    Returns a copy of current options for printing
-    :class:`dpctl.tensor.usm_ndarray` class.
-
-    Returns:
-        dict: dictionary with array
-           printing option settings.
-
-    Options:
-        - "linewidth" : int, default 75
-        - "edgeitems" : int, default 3
-        - "threshold" : int, default 1000
-        - "precision" : int, default 8
-        - "floatmode" : str, default "maxprec_equal"
-        - "suppress" : bool, default False
-        - "nanstr" : str, default "nan"
-        - "infstr" : str, default "inf"
-        - "sign" : str, default "-"
-    """
-    return _print_options.copy()
-
-
-@contextlib.contextmanager
-def print_options(*args, **kwargs):
-    """
-    Context manager for print options.
-
-    Set print options for the scope of a `with` block.
-    `as` yields dictionary of print options.
-    """
-    options = dpt.get_print_options()
-    try:
-        dpt.set_print_options(*args, **kwargs)
-        yield dpt.get_print_options()
-    finally:
-        dpt.set_print_options(**options)
-
-
-def _nd_corners(arr_in, edge_items):
-    _shape = arr_in.shape
-    max_shape = 2 * edge_items + 1
-    if max(_shape) <= max_shape:
-        return dpt.asnumpy(arr_in)
-    res_shape = tuple(
-        max_shape if _shape[i] > max_shape else _shape[i]
-        for i in range(arr_in.ndim)
-    )
-
-    exec_q = arr_in.sycl_queue
-    arr_out = dpt.empty(
-        res_shape,
-        dtype=arr_in.dtype,
-        usm_type=arr_in.usm_type,
-        sycl_queue=exec_q,
-    )
-
-    blocks = []
-    for i in range(len(_shape)):
-        if _shape[i] > max_shape:
-            blocks.append(
-                (
-                    np.s_[:edge_items],
-                    np.s_[-edge_items:],
-                )
-            )
-        else:
-            blocks.append((np.s_[:],))
-
-    _manager = dpctl.utils.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    hev_list = []
-    for slc in itertools.product(*blocks):
-        hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr_in[slc],
-            dst=arr_out[slc],
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        hev_list.append(hev)
-
-    dpctl.SyclEvent.wait_for(hev_list)
-    return dpt.asnumpy(arr_out)
-
-
-def usm_ndarray_str(
-    x,
-    line_width=None,
-    edge_items=None,
-    threshold=None,
-    precision=None,
-    floatmode=None,
-    suppress=None,
-    sign=None,
-    numpy=False,
-    separator=" ",
-    prefix="",
-    suffix="",
-):
-    """
-    usm_ndarray_str(x, line_width=None, edgeitems=None, threshold=None,
-                    precision=None, floatmode=None, suppress=None,
-                    sign=None, numpy=False, separator=" ", prefix="",
-                    suffix="")
-
-    Returns a string representing the elements of a
-    :class:`dpctl.tensor.usm_ndarray`.
-
-    Args:
-        x (usm_ndarray):
-            Input array.
-        line_width (int, optional):
-            Number of characters printed per line.
-            Raises `TypeError` if line_width is not an integer.
-            Default: `75`.
-        edgeitems (int, optional):
-            Number of elements at the beginning and end
-            when the printed array is abbreviated.
-            Raises `TypeError` if edgeitems is not an integer.
-            Default: `3`.
-        threshold (int, optional):
-            Number of elements that triggers array abbreviation.
-            Raises `TypeError` if threshold is not an integer.
-            Default: `1000`.
-        precision (int or None, optional):
-            Number of digits printed for floating point numbers.
-            Raises `TypeError` if precision is not an integer.
-            Default: `8`.
-        floatmode (str, optional):
-            Controls how floating point numbers are interpreted.
-                `"fixed:`:
-                    Always prints exactly `precision` digits.
-                `"unique"`:
-                    Ignores precision, prints the number of
-                    digits necessary to uniquely specify each number.
-                `"maxprec"`:
-                    Prints `precision` digits or fewer,
-                    if fewer will uniquely represent a number.
-                `"maxprec_equal"`:
-                    Prints an equal number of digits for each number.
-                    This number is `precision` digits or fewer,
-                    if fewer will uniquely represent each number.
-            Raises `ValueError` if floatmode is not one of
-            `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
-            Default: "maxprec_equal"
-        suppress (bool, optional):
-            If `True,` numbers equal to zero in the current precision
-            will print as zero.
-            Default: `False`.
-        sign (str, optional):
-            Controls the sign of floating point numbers.
-                `"-"`:
-                    Omit the sign of positive numbers.
-                `"+"`:
-                    Always print the sign of positive numbers.
-                `" "`:
-                    Always print a whitespace in place of the
-                    sign of positive numbers.
-            Raises `ValueError` if sign is not one of
-            `"-"`, `"+"`, or `" "`.
-            Default: `"-"`.
-        numpy (bool, optional):
-            If `True,` then before other specified print
-            options are set, a dictionary of Numpy's print options
-            will be used to initialize dpctl's print options.
-            Default: "False"
-        separator (str, optional):
-            String inserted between elements of the array string.
-            Default: " "
-        prefix (str, optional):
-            String used to determine spacing to the left of the array string.
-            Default: ""
-        suffix (str, optional):
-            String that determines length of the last line of the array string.
-            Default: ""
-
-    Returns:
-        str: string representation of input array.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    options = get_print_options()
-    options.update(
-        _options_dict(
-            linewidth=line_width,
-            edgeitems=edge_items,
-            threshold=threshold,
-            precision=precision,
-            floatmode=floatmode,
-            suppress=suppress,
-            sign=sign,
-            numpy=numpy,
-        )
-    )
-
-    threshold = options["threshold"]
-    edge_items = options["edgeitems"]
-
-    if x.size > threshold:
-        data = _nd_corners(x, edge_items)
-        options["threshold"] = 0
-    else:
-        data = dpt.asnumpy(x)
-    with np.printoptions(**options):
-        s = np.array2string(
-            data, separator=separator, prefix=prefix, suffix=suffix
-        )
-    return s
-
-
-def usm_ndarray_repr(
-    x, line_width=None, precision=None, suppress=None, prefix="usm_ndarray"
-):
-    """
-    usm_ndarray_repr(x, line_width=None, precision=None,
-                     suppress=None, prefix="")
-
-    Returns a formatted string representing the elements
-    of a :class:`dpctl.tensor.usm_ndarray` and its data type,
-    if not a default type.
-
-    Args:
-        x (usm_ndarray): Input array.
-        line_width (int, optional): Number of characters printed per line.
-            Raises `TypeError` if line_width is not an integer.
-            Default: `75`.
-        precision (int or None, optional): Number of digits printed for
-            floating point numbers.
-            Raises `TypeError` if precision is not an integer.
-            Default: `8`.
-        suppress (bool, optional): If `True,` numbers equal to zero
-            in the current precision will print as zero.
-            Default: `False`.
-        prefix (str, optional): String inserted at the start of the array
-            string.
-            Default: ""
-
-    Returns:
-        str: formatted string representing the input array
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    if line_width is None:
-        line_width = _print_options["linewidth"]
-
-    show_dtype = x.dtype not in [
-        dpt.bool,
-        dpt.int64,
-        dpt.float64,
-        dpt.complex128,
-    ]
-
-    prefix = prefix + "("
-    suffix = ")"
-
-    s = usm_ndarray_str(
-        x,
-        line_width=line_width,
-        precision=precision,
-        suppress=suppress,
-        separator=", ",
-        prefix=prefix,
-        suffix=suffix,
-    )
-
-    if show_dtype or x.size == 0:
-        dtype_str = f"dtype={x.dtype.name}"
-        dtype_str = _move_to_next_line(dtype_str, s, line_width, prefix)
-    else:
-        dtype_str = ""
-
-    options = get_print_options()
-    threshold = options["threshold"]
-    if (x.size == 0 and x.shape != (0,)) or x.size > threshold:
-        shape_str = f"shape={x.shape}"
-        shape_str = _move_to_next_line(shape_str, s, line_width, prefix)
-    else:
-        shape_str = ""
-
-    return prefix + s + shape_str + dtype_str + suffix
diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py
deleted file mode 100644
index 51709a7439..0000000000
--- a/dpctl/tensor/_reduction.py
+++ /dev/null
@@ -1,818 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.tensor._tensor_reductions_impl as tri
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._numpy_helper import normalize_axis_tuple
-from ._type_utils import (
-    _default_accumulation_dtype,
-    _default_accumulation_dtype_fp_types,
-    _to_device_supported_dtype,
-)
-
-
-def _reduction_over_axis(
-    x,
-    axis,
-    dtype,
-    keepdims,
-    out,
-    _reduction_fn,
-    _dtype_supported,
-    _default_reduction_type_fn,
-):
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    nd = x.ndim
-    if axis is None:
-        axis = tuple(range(nd))
-        perm = list(axis)
-        arr = x
-    else:
-        if not isinstance(axis, (tuple, list)):
-            axis = (axis,)
-        axis = normalize_axis_tuple(axis, nd, "axis")
-        perm = [i for i in range(nd) if i not in axis] + list(axis)
-        arr = dpt.permute_dims(x, perm)
-    red_nd = len(axis)
-    res_shape = arr.shape[: nd - red_nd]
-    q = x.sycl_queue
-    inp_dt = x.dtype
-    if dtype is None:
-        res_dt = _default_reduction_type_fn(inp_dt, q)
-    else:
-        res_dt = dpt.dtype(dtype)
-        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
-
-    res_usm_type = x.usm_type
-
-    implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q)
-    if dtype is None and not implemented_types:
-        raise RuntimeError(
-            "Automatically determined reduction data type does not "
-            "have direct implementation"
-        )
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-        if not keepdims:
-            final_res_shape = res_shape
-        else:
-            inp_shape = x.shape
-            final_res_shape = tuple(
-                inp_shape[i] if i not in axis else 1 for i in range(nd)
-            )
-        if not out.shape == final_res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {final_res_shape}, got {out.shape}"
-            )
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, got {out.dtype}"
-            )
-        if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-        if keepdims:
-            out = dpt.squeeze(out, axis=axis)
-            orig_out = out
-        if ti._array_overlap(x, out) and implemented_types:
-            out = dpt.empty_like(out)
-    else:
-        out = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-
-    _manager = SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    if red_nd == 0:
-        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr, dst=out, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e_cpy, cpy_e)
-        if not (orig_out is None or orig_out is out):
-            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e]
-            )
-            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
-            out = orig_out
-        return out
-
-    if implemented_types:
-        ht_e, red_e = _reduction_fn(
-            src=arr,
-            trailing_dims_to_reduce=red_nd,
-            dst=out,
-            sycl_queue=q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_e, red_e)
-        if not (orig_out is None or orig_out is out):
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out, dst=orig_out, sycl_queue=q, depends=[red_e]
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            out = orig_out
-    else:
-        if _dtype_supported(res_dt, res_dt, res_usm_type, q):
-            tmp = dpt.empty(
-                arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            ht_e_red, red_ev = _reduction_fn(
-                src=tmp,
-                trailing_dims_to_reduce=red_nd,
-                dst=out,
-                sycl_queue=q,
-                depends=[cpy_e],
-            )
-            _manager.add_event_pair(ht_e_red, red_ev)
-        else:
-            buf_dt = _default_reduction_type_fn(inp_dt, q)
-            tmp = dpt.empty(
-                arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_e_cpy, cpy_e)
-            tmp_res = dpt.empty(
-                res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
-            )
-            ht_e_red, r_e = _reduction_fn(
-                src=tmp,
-                trailing_dims_to_reduce=red_nd,
-                dst=tmp_res,
-                sycl_queue=q,
-                depends=[cpy_e],
-            )
-            _manager.add_event_pair(ht_e_red, r_e)
-            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=tmp_res, dst=out, sycl_queue=q, depends=[r_e]
-            )
-            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
-
-    if keepdims:
-        res_shape = res_shape + (1,) * red_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
-    return out
-
-
-def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
-    """
-    Calculates the sum of elements in the input array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which sums must be computed. If a tuple
-            of unique integers, sums are computed over multiple axes.
-            If ``None``, the sum is computed over the entire array.
-            Default: ``None``.
-        dtype (Optional[dtype]):
-            data type of the returned array. If ``None``, the default data
-            type is inferred from the "kind" of the input array data type.
-
-            * If ``x`` has a real- or complex-valued floating-point data
-              type, the returned array will have the same data type as
-              ``x``.
-            * If ``x`` has signed integral data type, the returned array
-              will have the default signed integral type for the device
-              where input array ``x`` is allocated.
-            * If ``x`` has unsigned integral data type, the returned array
-              will have the default unsigned integral type for the device
-              where input array ``x`` is allocated.
-              array ``x`` is allocated.
-            * If ``x`` has a boolean data type, the returned array will
-              have the default signed integral type for the device
-              where input array ``x`` is allocated.
-
-            If the data type (either specified or resolved) differs from the
-            data type of ``x``, the input array elements are cast to the
-            specified data type before computing the sum.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result or (if provided) ``dtype``.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the sums. If the sum was computed over the
-            entire array, a zero-dimensional array is returned. The returned
-            array has the data type as described in the ``dtype`` parameter
-            description above.
-    """
-    return _reduction_over_axis(
-        x,
-        axis,
-        dtype,
-        keepdims,
-        out,
-        tri._sum_over_axis,
-        tri._sum_over_axis_dtype_supported,
-        _default_accumulation_dtype,
-    )
-
-
-def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
-    """
-    Calculates the product of elements in the input array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which products must be computed. If a tuple
-            of unique integers, products are computed over multiple axes.
-            If ``None``, the product is computed over the entire array.
-            Default: ``None``.
-        dtype (Optional[dtype]):
-            data type of the returned array. If ``None``, the default data
-            type is inferred from the "kind" of the input array data type.
-
-            * If ``x`` has a real- or complex-valued floating-point data
-              type, the returned array will have the same data type as
-              ``x``.
-            * If ``x`` has signed integral data type, the returned array
-              will have the default signed integral type for the device
-              where input array ``x`` is allocated.
-            * If ``x`` has unsigned integral data type, the returned array
-              will have the default unsigned integral type for the device
-              where input array ``x`` is allocated.
-            * If ``x`` has a boolean data type, the returned array will
-              have the default signed integral type for the device
-              where input array ``x`` is allocated.
-
-            If the data type (either specified or resolved) differs from the
-            data type of ``x``, the input array elements are cast to the
-            specified data type before computing the product.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result or (if provided) ``dtype``.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the products. If the product was computed over
-            the entire array, a zero-dimensional array is returned. The
-            returned array has the data type as described in the ``dtype``
-            parameter description above.
-    """
-    return _reduction_over_axis(
-        x,
-        axis,
-        dtype,
-        keepdims,
-        out,
-        tri._prod_over_axis,
-        tri._prod_over_axis_dtype_supported,
-        _default_accumulation_dtype,
-    )
-
-
-def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
-    """
-    Calculates the logarithm of the sum of exponentials of elements in the
-    input array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which values must be computed. If a tuple
-            of unique integers, values are computed over multiple axes.
-            If ``None``, the result is computed over the entire array.
-            Default: ``None``.
-        dtype (Optional[dtype]):
-            data type of the returned array. If ``None``, the default data
-            type is inferred from the "kind" of the input array data type.
-
-            * If ``x`` has a real-valued floating-point data type, the
-              returned array will have the same data type as ``x``.
-            * If ``x`` has a boolean or integral data type, the returned array
-              will have the default floating point data type for the device
-              where input array ``x`` is allocated.
-            * If ``x`` has a complex-valued floating-point data type,
-              an error is raised.
-
-            If the data type (either specified or resolved) differs from the
-            data type of ``x``, the input array elements are cast to the
-            specified data type before computing the result.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result or (if provided) ``dtype``.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the results. If the result was computed over
-            the entire array, a zero-dimensional array is returned.
-            The returned array has the data type as described in the
-            ``dtype`` parameter description above.
-    """
-    return _reduction_over_axis(
-        x,
-        axis,
-        dtype,
-        keepdims,
-        out,
-        tri._logsumexp_over_axis,
-        lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported(
-            inp_dt, res_dt
-        ),
-        _default_accumulation_dtype_fp_types,
-    )
-
-
-def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None):
-    """
-    Calculates the square root of the sum of squares of elements in the input
-    array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which values must be computed. If a tuple
-            of unique integers, values are computed over multiple axes.
-            If ``None``, the result is computed over the entire array.
-            Default: ``None``.
-        dtype (Optional[dtype]):
-            data type of the returned array. If ``None``, the default data
-            type is inferred from the "kind" of the input array data type.
-
-            * If ``x`` has a real-valued floating-point data type, the
-              returned array will have the same data type as ``x``.
-            * If ``x`` has a boolean or integral data type, the returned array
-              will have the default floating point data type for the device
-              where input array ``x`` is allocated.
-            * If ``x`` has a complex-valued floating-point data type,
-              an error is raised.
-
-            If the data type (either specified or resolved) differs from the
-            data type of ``x``, the input array elements are cast to the
-            specified data type before computing the result. Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result or (if provided) ``dtype``.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the results. If the result was computed over
-            the entire array, a zero-dimensional array is returned. The
-            returned array has the data type as described in the ``dtype``
-            parameter description above.
-    """
-    return _reduction_over_axis(
-        x,
-        axis,
-        dtype,
-        keepdims,
-        out,
-        tri._hypot_over_axis,
-        lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported(
-            inp_dt, res_dt
-        ),
-        _default_accumulation_dtype_fp_types,
-    )
-
-
-def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    nd = x.ndim
-    if axis is None:
-        axis = tuple(range(nd))
-        perm = list(axis)
-        x_tmp = x
-    else:
-        if not isinstance(axis, (tuple, list)):
-            axis = (axis,)
-        axis = normalize_axis_tuple(axis, nd, "axis")
-        perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt.permute_dims(x, perm)
-    red_nd = len(axis)
-    if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
-        raise ValueError("reduction cannot be performed over zero-size axes")
-    res_shape = x_tmp.shape[: nd - red_nd]
-    exec_q = x.sycl_queue
-    res_dt = x.dtype
-    res_usm_type = x.usm_type
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-        if not keepdims:
-            final_res_shape = res_shape
-        else:
-            inp_shape = x.shape
-            final_res_shape = tuple(
-                inp_shape[i] if i not in axis else 1 for i in range(nd)
-            )
-        if not out.shape == final_res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {final_res_shape}, got {out.shape}"
-            )
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, got {out.dtype}"
-            )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-        if keepdims:
-            out = dpt.squeeze(out, axis=axis)
-            orig_out = out
-        if ti._array_overlap(x, out):
-            out = dpt.empty_like(out)
-    else:
-        out = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
-        )
-
-    _manager = SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if red_nd == 0:
-        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e_cpy, cpy_e)
-        if not (orig_out is None or orig_out is out):
-            ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
-                src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e]
-            )
-            _manager.add_event_pair(ht_e_cpy2, cpy2_e)
-            out = orig_out
-        return out
-
-    hev, red_ev = _reduction_fn(
-        src=x_tmp,
-        trailing_dims_to_reduce=red_nd,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(hev, red_ev)
-    if not (orig_out is None or orig_out is out):
-        ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev]
-        )
-        _manager.add_event_pair(ht_e_cpy2, cpy2_e)
-        out = orig_out
-
-    if keepdims:
-        res_shape = res_shape + (1,) * red_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
-    return out
-
-
-def max(x, /, *, axis=None, keepdims=False, out=None):
-    """
-    Calculates the maximum value of the input array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which maxima must be computed. If a tuple
-            of unique integers, the maxima are computed over multiple axes.
-            If ``None``, the max is computed over the entire array.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the maxima. If the max was computed over the
-            entire array, a zero-dimensional array is returned. The returned
-            array has the same data type as ``x``.
-    """
-    return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis)
-
-
-def min(x, /, *, axis=None, keepdims=False, out=None):
-    """
-    Calculates the minimum value of the input array ``x``.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which minima must be computed. If a tuple
-            of unique integers, the minima are computed over multiple axes.
-            If ``None``, the min is computed over the entire array.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the minima. If the min was computed over the
-            entire array, a zero-dimensional array is returned. The returned
-            array has the same data type as ``x``.
-    """
-    return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis)
-
-
-def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    nd = x.ndim
-    if axis is None:
-        axis = tuple(range(nd))
-        perm = list(axis)
-        x_tmp = x
-    else:
-        if isinstance(axis, int):
-            axis = (axis,)
-        else:
-            raise TypeError(
-                f"'axis' argument expected to have type 'int' "
-                r"or be `None`, "
-                f"got type {type(axis)}"
-            )
-        axis = normalize_axis_tuple(axis, nd, "axis")
-        perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt.permute_dims(x, perm)
-    axis = normalize_axis_tuple(axis, nd, "axis")
-    red_nd = len(axis)
-    if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
-        raise ValueError("reduction cannot be performed over zero-size axes")
-    res_shape = x_tmp.shape[: nd - red_nd]
-    exec_q = x.sycl_queue
-    res_dt = ti.default_device_index_type(exec_q.sycl_device)
-    res_usm_type = x.usm_type
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                f"output array must be of usm_ndarray type, got {type(out)}"
-            )
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-        if not keepdims:
-            final_res_shape = res_shape
-        else:
-            inp_shape = x.shape
-            final_res_shape = tuple(
-                inp_shape[i] if i not in axis else 1 for i in range(nd)
-            )
-        if not out.shape == final_res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are inconsistent. "
-                f"Expected output shape is {final_res_shape}, got {out.shape}"
-            )
-        if res_dt != out.dtype:
-            raise ValueError(
-                f"Output array of type {res_dt} is needed, got {out.dtype}"
-            )
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-        if keepdims:
-            out = dpt.squeeze(out, axis=axis)
-            orig_out = out
-        if ti._array_overlap(x, out) and red_nd > 0:
-            out = dpt.empty_like(out)
-    else:
-        out = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
-        )
-
-    _manager = SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if red_nd == 0:
-        ht_e_fill, fill_ev = ti._full_usm_ndarray(
-            fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e_fill, fill_ev)
-        return out
-
-    hev, red_ev = _reduction_fn(
-        src=x_tmp,
-        trailing_dims_to_reduce=red_nd,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(hev, red_ev)
-    if not (orig_out is None or orig_out is out):
-        ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev]
-        )
-        _manager.add_event_pair(ht_e_cpy2, cpy2_e)
-        out = orig_out
-
-    if keepdims:
-        res_shape = res_shape + (1,) * red_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
-    return out
-
-
-def argmax(x, /, *, axis=None, keepdims=False, out=None):
-    """
-    Returns the indices of the maximum values of the input array ``x`` along a
-    specified axis.
-
-    When the maximum value occurs multiple times, the indices corresponding to
-    the first occurrence are returned.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which to search. If ``None``, returns the index of the
-            maximum value of the flattened array.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the indices of the first occurrence of the
-            maximum values. If the entire array was searched, a
-            zero-dimensional array is returned. The returned array has the
-            default array index data type for the device of ``x``.
-    """
-    return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis)
-
-
-def argmin(x, /, *, axis=None, keepdims=False, out=None):
-    """
-    Returns the indices of the minimum values of the input array ``x`` along a
-    specified axis.
-
-    When the minimum value occurs multiple times, the indices corresponding to
-    the first occurrence are returned.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which to search. If ``None``, returns the index of the
-            minimum value of the flattened array.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and the
-            expected data type of the result.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the indices of the first occurrence of the
-            minimum values. If the entire array was searched, a
-            zero-dimensional array is returned. The returned array has the
-            default array index data type for the device of ``x``.
-    """
-    return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis)
-
-
-def count_nonzero(x, /, *, axis=None, keepdims=False, out=None):
-    """
-    Counts the number of elements in the input array ``x`` which are non-zero.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which to count. If a tuple of unique integers,
-            the number of non-zero values are computed over multiple axes.
-            If ``None``, the number of non-zero values is computed over the
-            entire array.
-            Default: ``None``.
-        keepdims (Optional[bool]):
-            if ``True``, the reduced axes (dimensions) are included in the
-            result as singleton dimensions, so that the returned array remains
-            compatible with the input arrays according to Array Broadcasting
-            rules. Otherwise, if ``False``, the reduced axes are not included
-            in the returned array. Default: ``False``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of ``out`` must match the expected shape and data
-            type.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            an array containing the count of non-zero values. If the sum was
-            computed over the entire array, a zero-dimensional array is
-            returned. The returned array will have the default array index data
-            type.
-    """
-    if x.dtype != dpt.bool:
-        x = dpt.astype(x, dpt.bool, copy=False)
-    return sum(
-        x,
-        axis=axis,
-        dtype=ti.default_device_index_type(x.sycl_device),
-        keepdims=keepdims,
-        out=out,
-    )
diff --git a/dpctl/tensor/_reshape.py b/dpctl/tensor/_reshape.py
deleted file mode 100644
index 627ed56559..0000000000
--- a/dpctl/tensor/_reshape.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import operator
-
-import numpy as np
-
-import dpctl.tensor as dpt
-import dpctl.utils
-from dpctl.tensor._tensor_impl import (
-    _copy_usm_ndarray_for_reshape,
-    _ravel_multi_index,
-    _unravel_index,
-)
-
-__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`."
-
-
-def _make_unit_indexes(shape):
-    """
-    Construct a diagonal matrix with with one on the diagonal
-    except if the corresponding element of shape is 1.
-    """
-    nd = len(shape)
-    mi = np.zeros((nd, nd), dtype="u4")
-    for i, dim in enumerate(shape):
-        mi[i, i] = 1 if dim > 1 else 0
-    return mi
-
-
-def ti_unravel_index(flat_index, shape, order="C"):
-    return _unravel_index(flat_index, shape, order)
-
-
-def ti_ravel_multi_index(multi_index, shape, order="C"):
-    return _ravel_multi_index(multi_index, shape, order)
-
-
-def reshaped_strides(old_sh, old_sts, new_sh, order="C"):
-    """
-    When reshaping array with `old_sh` shape and `old_sts` strides
-    into the new shape `new_sh`, returns the new stride if the reshape
-    can be a view, otherwise returns `None`.
-    """
-    eye_new_mi = _make_unit_indexes(new_sh)
-    new_sts = [
-        sum(
-            st_i * ind_i
-            for st_i, ind_i in zip(
-                old_sts, ti_unravel_index(flat_index, old_sh, order=order)
-            )
-        )
-        for flat_index in [
-            ti_ravel_multi_index(unitvec, new_sh, order=order)
-            for unitvec in eye_new_mi
-        ]
-    ]
-    eye_old_mi = _make_unit_indexes(old_sh)
-    check_sts = [
-        sum(
-            st_i * ind_i
-            for st_i, ind_i in zip(
-                new_sts, ti_unravel_index(flat_index, new_sh, order=order)
-            )
-        )
-        for flat_index in [
-            ti_ravel_multi_index(unitvec, old_sh, order=order)
-            for unitvec in eye_old_mi
-        ]
-    ]
-    valid = all(
-        check_st == old_st or old_dim == 1
-        for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh)
-    )
-    return new_sts if valid else None
-
-
-def reshape(X, /, shape, *, order="C", copy=None):
-    """reshape(x, shape, order="C")
-
-    Reshapes array ``x`` into new shape.
-
-    Args:
-        x (usm_ndarray):
-            input array
-        shape (Tuple[int]):
-            the desired shape of the resulting array.
-        order ("C", "F", optional):
-            memory layout of the resulting array
-            if a copy is found to be necessary. Supported
-            choices are ``"C"`` for C-contiguous, or row-major layout;
-            and ``"F"`` for F-contiguous, or column-major layout.
-
-    Returns:
-        usm_ndarray:
-            Reshaped array is a view, if possible,
-            and a copy otherwise with memory layout as indicated
-            by ``order`` keyword.
-    """
-    if not isinstance(X, dpt.usm_ndarray):
-        raise TypeError
-    if not isinstance(shape, (list, tuple)):
-        shape = (shape,)
-    if order in "cfCF":
-        order = order.upper()
-    else:
-        raise ValueError(
-            f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}"
-        )
-    if copy not in (True, False, None):
-        raise ValueError(
-            f"Keyword 'copy' not recognized. Expecting True, False, "
-            f"or None, got {copy}"
-        )
-    shape = [operator.index(d) for d in shape]
-    negative_ones_count = 0
-    for nshi in shape:
-        if nshi == -1:
-            negative_ones_count = negative_ones_count + 1
-        if (nshi < -1) or negative_ones_count > 1:
-            raise ValueError(
-                "Target shape should have at most 1 negative "
-                "value which can only be -1"
-            )
-    if negative_ones_count:
-        sz = -np.prod(shape)
-        if sz == 0:
-            raise ValueError(
-                f"Can not reshape array of size {X.size} into "
-                f"shape {tuple(i for i in shape if i >= 0)}"
-            )
-        v = X.size // sz
-        shape = [v if d == -1 else d for d in shape]
-    if X.size != np.prod(shape):
-        raise ValueError(f"Can not reshape into {shape}")
-    if X.size:
-        newsts = reshaped_strides(X.shape, X.strides, shape, order=order)
-    else:
-        newsts = (1,) * len(shape)
-    copy_required = newsts is None
-    if copy_required and (copy is False):
-        raise ValueError(
-            "Reshaping the array requires a copy, but no copying was "
-            "requested by using copy=False"
-        )
-    copy_q = X.sycl_queue
-    if copy_required or (copy is True):
-        # must perform a copy
-        copy_q = X.sycl_queue
-        flat_res = dpt.usm_ndarray(
-            (X.size,),
-            dtype=X.dtype,
-            buffer=X.usm_type,
-            buffer_ctor_kwargs={"queue": copy_q},
-        )
-        _manager = dpctl.utils.SequentialOrderManager[copy_q]
-        dep_evs = _manager.submitted_events
-        if order == "C":
-            hev, r_e = _copy_usm_ndarray_for_reshape(
-                src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
-            )
-        else:
-            X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1))
-            hev, r_e = _copy_usm_ndarray_for_reshape(
-                src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
-            )
-        _manager.add_event_pair(hev, r_e)
-        return dpt.usm_ndarray(
-            tuple(shape), dtype=X.dtype, buffer=flat_res, order=order
-        )
-    # can form a view
-    if (len(shape) == X.ndim) and all(
-        s1 == s2 for s1, s2 in zip(shape, X.shape)
-    ):
-        return X
-    return dpt.usm_ndarray(
-        shape,
-        dtype=X.dtype,
-        buffer=X,
-        strides=tuple(newsts),
-        offset=X._element_offset,
-    )
diff --git a/dpctl/tensor/_scalar_utils.py b/dpctl/tensor/_scalar_utils.py
deleted file mode 100644
index 8b6aa01c86..0000000000
--- a/dpctl/tensor/_scalar_utils.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numbers
-
-import numpy as np
-
-import dpctl.memory as dpm
-import dpctl.tensor as dpt
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
-
-from ._type_utils import (
-    WeakBooleanType,
-    WeakComplexType,
-    WeakFloatingType,
-    WeakIntegralType,
-    _to_device_supported_dtype,
-)
-
-
-def _get_queue_usm_type(o):
-    """Return SYCL device where object `o` allocated memory, or None."""
-    if isinstance(o, dpt.usm_ndarray):
-        return o.sycl_queue, o.usm_type
-    elif hasattr(o, "__sycl_usm_array_interface__"):
-        try:
-            m = dpm.as_usm_memory(o)
-            return m.sycl_queue, m.get_usm_type()
-        except Exception:
-            return None, None
-    return None, None
-
-
-def _get_dtype(o, dev):
-    if isinstance(o, dpt.usm_ndarray):
-        return o.dtype
-    if hasattr(o, "__sycl_usm_array_interface__"):
-        return dpt.asarray(o).dtype
-    if _is_buffer(o):
-        host_dt = np.array(o).dtype
-        dev_dt = _to_device_supported_dtype(host_dt, dev)
-        return dev_dt
-    if hasattr(o, "dtype"):
-        dev_dt = _to_device_supported_dtype(o.dtype, dev)
-        return dev_dt
-    if isinstance(o, bool):
-        return WeakBooleanType(o)
-    if isinstance(o, int):
-        return WeakIntegralType(o)
-    if isinstance(o, float):
-        return WeakFloatingType(o)
-    if isinstance(o, complex):
-        return WeakComplexType(o)
-    return np.object_
-
-
-def _validate_dtype(dt) -> bool:
-    return isinstance(
-        dt,
-        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
-    ) or (
-        isinstance(dt, dpt.dtype)
-        and dt
-        in [
-            dpt.bool,
-            dpt.int8,
-            dpt.uint8,
-            dpt.int16,
-            dpt.uint16,
-            dpt.int32,
-            dpt.uint32,
-            dpt.int64,
-            dpt.uint64,
-            dpt.float16,
-            dpt.float32,
-            dpt.float64,
-            dpt.complex64,
-            dpt.complex128,
-        ]
-    )
-
-
-def _get_shape(o):
-    if isinstance(o, dpt.usm_ndarray):
-        return o.shape
-    if _is_buffer(o):
-        return memoryview(o).shape
-    if isinstance(o, numbers.Number):
-        return tuple()
-    return getattr(o, "shape", tuple())
-
-
-__all__ = [
-    "_get_dtype",
-    "_get_queue_usm_type",
-    "_get_shape",
-    "_validate_dtype",
-]
diff --git a/dpctl/tensor/_search_functions.py b/dpctl/tensor/_search_functions.py
deleted file mode 100644
index e09535bd3a..0000000000
--- a/dpctl/tensor/_search_functions.py
+++ /dev/null
@@ -1,403 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
-from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
-
-from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK
-from ._scalar_utils import (
-    _get_dtype,
-    _get_queue_usm_type,
-    _get_shape,
-    _validate_dtype,
-)
-from ._type_utils import (
-    WeakBooleanType,
-    WeakComplexType,
-    WeakFloatingType,
-    WeakIntegralType,
-    _all_data_types,
-    _can_cast,
-    _is_weak_dtype,
-    _strong_dtype_num_kind,
-    _to_device_supported_dtype,
-    _weak_type_num_kind,
-)
-
-
-def _default_dtype_from_weak_type(dt, dev):
-    if isinstance(dt, WeakBooleanType):
-        return dpt.bool
-    if isinstance(dt, WeakIntegralType):
-        return dpt.dtype(ti.default_device_int_type(dev))
-    if isinstance(dt, WeakFloatingType):
-        return dpt.dtype(ti.default_device_fp_type(dev))
-    if isinstance(dt, WeakComplexType):
-        return dpt.dtype(ti.default_device_complex_type(dev))
-
-
-def _resolve_two_weak_types(o1_dtype, o2_dtype, dev):
-    "Resolves two weak data types per NEP-0050"
-    if _is_weak_dtype(o1_dtype):
-        if _is_weak_dtype(o2_dtype):
-            return _default_dtype_from_weak_type(
-                o1_dtype, dev
-            ), _default_dtype_from_weak_type(o2_dtype, dev)
-        o1_kind_num = _weak_type_num_kind(o1_dtype)
-        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
-        if o1_kind_num > o2_kind_num:
-            if isinstance(o1_dtype, WeakIntegralType):
-                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
-            if isinstance(o1_dtype, WeakComplexType):
-                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
-                    return dpt.complex64, o2_dtype
-                return (
-                    _to_device_supported_dtype(dpt.complex128, dev),
-                    o2_dtype,
-                )
-            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
-        else:
-            return o2_dtype, o2_dtype
-    elif _is_weak_dtype(o2_dtype):
-        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
-        o2_kind_num = _weak_type_num_kind(o2_dtype)
-        if o2_kind_num > o1_kind_num:
-            if isinstance(o2_dtype, WeakIntegralType):
-                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
-            if isinstance(o2_dtype, WeakComplexType):
-                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
-                    return o1_dtype, dpt.complex64
-                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
-            return (
-                o1_dtype,
-                _to_device_supported_dtype(dpt.float64, dev),
-            )
-        else:
-            return o1_dtype, o1_dtype
-    else:
-        return o1_dtype, o2_dtype
-
-
-def _where_result_type(dt1, dt2, dev):
-    res_dtype = dpt.result_type(dt1, dt2)
-    fp16 = dev.has_aspect_fp16
-    fp64 = dev.has_aspect_fp64
-
-    all_dts = _all_data_types(fp16, fp64)
-    if res_dtype in all_dts:
-        return res_dtype
-    else:
-        for res_dtype_ in all_dts:
-            if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast(
-                dt2, res_dtype_, fp16, fp64
-            ):
-                return res_dtype_
-        return None
-
-
-def where(condition, x1, x2, /, *, order="K", out=None):
-    """
-    Returns :class:`dpctl.tensor.usm_ndarray` with elements chosen
-    from ``x1`` or ``x2`` depending on ``condition``.
-
-    Args:
-        condition (usm_ndarray): When ``True`` yields from ``x1``,
-            and otherwise yields from ``x2``.
-            Must be compatible with ``x1`` and ``x2`` according
-            to broadcasting rules.
-        x1 (Union[usm_ndarray, bool, int, float, complex]):
-            Array from which values are chosen when ``condition`` is ``True``.
-            Must be compatible with ``condition`` and ``x2`` according
-            to broadcasting rules.
-        x2 (Union[usm_ndarray, bool, int, float, complex]):
-            Array from which values are chosen when ``condition`` is not
-            ``True``.
-            Must be compatible with ``condition`` and ``x2`` according
-            to broadcasting rules.
-        order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional):
-            Memory layout of the new output array,
-            if parameter ``out`` is ``None``.
-            Default: ``"K"``.
-        out (Optional[usm_ndarray]):
-            the array into which the result is written.
-            The data type of `out` must match the expected shape and the
-            expected data type of the result.
-            If ``None`` then a new array is returned. Default: ``None``.
-
-    Returns:
-        usm_ndarray:
-            An array with elements from ``x1`` where ``condition`` is ``True``,
-            and elements from ``x2`` elsewhere.
-
-    The data type of the returned array is determined by applying
-    the Type Promotion Rules to ``x1`` and ``x2``.
-    """
-    if not isinstance(condition, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}"
-        )
-    if order not in ["K", "C", "F", "A"]:
-        order = "K"
-    q1, condition_usm_type = condition.sycl_queue, condition.usm_type
-    q2, x1_usm_type = _get_queue_usm_type(x1)
-    q3, x2_usm_type = _get_queue_usm_type(x2)
-    if q2 is None and q3 is None:
-        exec_q = q1
-        out_usm_type = condition_usm_type
-    elif q3 is None:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2))
-        if exec_q is None:
-            raise ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                condition_usm_type,
-                x1_usm_type,
-            )
-        )
-    elif q2 is None:
-        exec_q = dpctl.utils.get_execution_queue((q1, q3))
-        if exec_q is None:
-            raise ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                condition_usm_type,
-                x2_usm_type,
-            )
-        )
-    else:
-        exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
-        if exec_q is None:
-            raise ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        out_usm_type = dpctl.utils.get_coerced_usm_type(
-            (
-                condition_usm_type,
-                x1_usm_type,
-                x2_usm_type,
-            )
-        )
-    dpctl.utils.validate_usm_type(out_usm_type, allow_none=False)
-    condition_shape = condition.shape
-    x1_shape = _get_shape(x1)
-    x2_shape = _get_shape(x2)
-    if not all(
-        isinstance(s, (tuple, list))
-        for s in (
-            x1_shape,
-            x2_shape,
-        )
-    ):
-        raise TypeError(
-            "Shape of arguments can not be inferred. "
-            "Arguments are expected to be "
-            "lists, tuples, or both"
-        )
-    try:
-        res_shape = _broadcast_shape_impl(
-            [
-                condition_shape,
-                x1_shape,
-                x2_shape,
-            ]
-        )
-    except ValueError:
-        raise ValueError(
-            "operands could not be broadcast together with shapes "
-            f"{condition_shape}, {x1_shape}, and {x2_shape}"
-        )
-    sycl_dev = exec_q.sycl_device
-    x1_dtype = _get_dtype(x1, sycl_dev)
-    x2_dtype = _get_dtype(x2, sycl_dev)
-    if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)):
-        raise ValueError("Operands have unsupported data types")
-    x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev)
-    out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev)
-    if out_dtype is None:
-        raise TypeError(
-            "function 'where' does not support input "
-            f"types ({x1_dtype}, {x2_dtype}), "
-            "and the inputs could not be safely coerced "
-            "to any supported types according to the casting rule ''safe''."
-        )
-
-    orig_out = out
-    if out is not None:
-        if not isinstance(out, dpt.usm_ndarray):
-            raise TypeError(
-                "output array must be of usm_ndarray type, got " f"{type(out)}"
-            )
-
-        if not out.flags.writable:
-            raise ValueError("provided `out` array is read-only")
-
-        if out.shape != res_shape:
-            raise ValueError(
-                "The shape of input and output arrays are "
-                f"inconsistent. Expected output shape is {res_shape}, "
-                f"got {out.shape}"
-            )
-
-        if out_dtype != out.dtype:
-            raise ValueError(
-                f"Output array of type {out_dtype} is needed, "
-                f"got {out.dtype}"
-            )
-
-        if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None:
-            raise ExecutionPlacementError(
-                "Input and output allocation queues are not compatible"
-            )
-
-        if ti._array_overlap(condition, out) and not ti._same_logical_tensors(
-            condition, out
-        ):
-            out = dpt.empty_like(out)
-
-        if isinstance(x1, dpt.usm_ndarray):
-            if (
-                ti._array_overlap(x1, out)
-                and not ti._same_logical_tensors(x1, out)
-                and x1_dtype == out_dtype
-            ):
-                out = dpt.empty_like(out)
-
-        if isinstance(x2, dpt.usm_ndarray):
-            if (
-                ti._array_overlap(x2, out)
-                and not ti._same_logical_tensors(x2, out)
-                and x2_dtype == out_dtype
-            ):
-                out = dpt.empty_like(out)
-
-    if order == "A":
-        order = (
-            "F"
-            if all(
-                arr.flags.f_contiguous
-                for arr in (
-                    condition,
-                    x1,
-                    x2,
-                )
-            )
-            else "C"
-        )
-    if not isinstance(x1, dpt.usm_ndarray):
-        x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
-    if not isinstance(x2, dpt.usm_ndarray):
-        x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
-
-    if condition.size == 0:
-        if out is not None:
-            return out
-        else:
-            if order == "K":
-                return _empty_like_triple_orderK(
-                    condition,
-                    x1,
-                    x2,
-                    out_dtype,
-                    res_shape,
-                    out_usm_type,
-                    exec_q,
-                )
-            else:
-                return dpt.empty(
-                    res_shape,
-                    dtype=out_dtype,
-                    order=order,
-                    usm_type=out_usm_type,
-                    sycl_queue=exec_q,
-                )
-
-    _manager = SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if x1_dtype != out_dtype:
-        if order == "K":
-            _x1 = _empty_like_orderK(x1, out_dtype)
-        else:
-            _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order)
-        ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs
-        )
-        x1 = _x1
-        _manager.add_event_pair(ht_copy1_ev, copy1_ev)
-
-    if x2_dtype != out_dtype:
-        if order == "K":
-            _x2 = _empty_like_orderK(x2, out_dtype)
-        else:
-            _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order)
-        ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs
-        )
-        x2 = _x2
-        _manager.add_event_pair(ht_copy2_ev, copy2_ev)
-
-    if out is None:
-        if order == "K":
-            out = _empty_like_triple_orderK(
-                condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q
-            )
-        else:
-            out = dpt.empty(
-                res_shape,
-                dtype=out_dtype,
-                order=order,
-                usm_type=out_usm_type,
-                sycl_queue=exec_q,
-            )
-
-    if condition_shape != res_shape:
-        condition = dpt.broadcast_to(condition, res_shape)
-    if x1_shape != res_shape:
-        x1 = dpt.broadcast_to(x1, res_shape)
-    if x2_shape != res_shape:
-        x2 = dpt.broadcast_to(x2, res_shape)
-
-    dep_evs = _manager.submitted_events
-    hev, where_ev = ti._where(
-        condition=condition,
-        x1=x1,
-        x2=x2,
-        dst=out,
-        sycl_queue=exec_q,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(hev, where_ev)
-    if not (orig_out is None or orig_out is out):
-        # Copy the out data from temporary buffer to original memory
-        ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=out,
-            dst=orig_out,
-            sycl_queue=exec_q,
-            depends=[where_ev],
-        )
-        _manager.add_event_pair(ht_copy_out_ev, cpy_ev)
-        out = orig_out
-
-    return out
diff --git a/dpctl/tensor/_searchsorted.py b/dpctl/tensor/_searchsorted.py
deleted file mode 100644
index 131759b5ce..0000000000
--- a/dpctl/tensor/_searchsorted.py
+++ /dev/null
@@ -1,157 +0,0 @@
-from typing import Literal, Union
-
-import dpctl
-import dpctl.utils as du
-
-from ._copy_utils import _empty_like_orderK
-from ._ctors import empty
-from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
-from ._tensor_impl import _take as ti_take
-from ._tensor_impl import (
-    default_device_index_type as ti_default_device_index_type,
-)
-from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
-from ._type_utils import isdtype, result_type
-from ._usmarray import usm_ndarray
-
-
-def searchsorted(
-    x1: usm_ndarray,
-    x2: usm_ndarray,
-    /,
-    *,
-    side: Literal["left", "right"] = "left",
-    sorter: Union[usm_ndarray, None] = None,
-) -> usm_ndarray:
-    """searchsorted(x1, x2, side='left', sorter=None)
-
-    Finds the indices into `x1` such that, if the corresponding elements
-    in `x2` were inserted before the indices, the order of `x1`, when sorted
-    in ascending order, would be preserved.
-
-    Args:
-        x1 (usm_ndarray):
-            input array. Must be a one-dimensional array. If `sorter` is
-            `None`, must be sorted in ascending order; otherwise, `sorter` must
-            be an array of indices that sort `x1` in ascending order.
-        x2 (usm_ndarray):
-            array containing search values.
-        side (Literal["left", "right]):
-            argument controlling which index is returned if a value lands
-            exactly on an edge. If `x2` is an array of rank `N` where
-            `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the
-            return array `ret` contains the position `i` such that
-            if `side="left"`, it is the first index such that
-            `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size`
-            if `v > x1[-1]`;
-            and if `side="right"`, it is the first position `i` such that
-            `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size`
-            if `v >= x1[-1]`. Default: `"left"`.
-        sorter (Optional[usm_ndarray]):
-            array of indices that sort `x1` in ascending order. The array must
-            have the same shape as `x1` and have an integral data type.
-            Out of bound index values of `sorter` array are treated using
-            `"wrap"` mode documented in :py:func:`dpctl.tensor.take`.
-            Default: `None`.
-    """
-    if not isinstance(x1, usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}")
-    if not isinstance(x2, usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}")
-    if sorter is not None and not isinstance(sorter, usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray, got {type(sorter)}"
-        )
-
-    if side not in ["left", "right"]:
-        raise ValueError(
-            "Unrecognized value of 'side' keyword argument. "
-            "Expected either 'left' or 'right'"
-        )
-
-    if sorter is None:
-        q = du.get_execution_queue([x1.sycl_queue, x2.sycl_queue])
-    else:
-        q = du.get_execution_queue(
-            [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue]
-        )
-    if q is None:
-        raise du.ExecutionPlacementError(
-            "Execution placement can not be unambiguously "
-            "inferred from input arguments."
-        )
-
-    if x1.ndim != 1:
-        raise ValueError("First argument array must be one-dimensional")
-
-    x1_dt = x1.dtype
-    x2_dt = x2.dtype
-
-    _manager = du.SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    ev = dpctl.SyclEvent()
-    if sorter is not None:
-        if not isdtype(sorter.dtype, "integral"):
-            raise ValueError(
-                f"Sorter array must have integral data type, got {sorter.dtype}"
-            )
-        if x1.shape != sorter.shape:
-            raise ValueError(
-                "Sorter array must be one-dimension with the same "
-                "shape as the first argument array"
-            )
-        res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q)
-        ind = (sorter,)
-        axis = 0
-        wrap_out_of_bound_indices_mode = 0
-        ht_ev, ev = ti_take(
-            x1,
-            ind,
-            res,
-            axis,
-            wrap_out_of_bound_indices_mode,
-            sycl_queue=q,
-            depends=dep_evs,
-        )
-        x1 = res
-        _manager.add_event_pair(ht_ev, ev)
-
-    if x1_dt != x2_dt:
-        dt = result_type(x1, x2)
-        if x1_dt != dt:
-            x1_buf = _empty_like_orderK(x1, dt)
-            dep_evs = _manager.submitted_events
-            ht_ev, ev = ti_copy(
-                src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_ev, ev)
-            x1 = x1_buf
-        if x2_dt != dt:
-            x2_buf = _empty_like_orderK(x2, dt)
-            dep_evs = _manager.submitted_events
-            ht_ev, ev = ti_copy(
-                src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs
-            )
-            _manager.add_event_pair(ht_ev, ev)
-            x2 = x2_buf
-
-    dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type])
-    index_dt = ti_default_device_index_type(q)
-
-    dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type)
-
-    dep_evs = _manager.submitted_events
-    if side == "left":
-        ht_ev, s_ev = _searchsorted_left(
-            hay=x1,
-            needles=x2,
-            positions=dst,
-            sycl_queue=q,
-            depends=dep_evs,
-        )
-    else:
-        ht_ev, s_ev = _searchsorted_right(
-            hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs
-        )
-    _manager.add_event_pair(ht_ev, s_ev)
-    return dst
diff --git a/dpctl/tensor/_set_functions.py b/dpctl/tensor/_set_functions.py
deleted file mode 100644
index 1d4c28b924..0000000000
--- a/dpctl/tensor/_set_functions.py
+++ /dev/null
@@ -1,781 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from typing import NamedTuple, Optional, Union
-
-import dpctl.tensor as dpt
-import dpctl.utils as du
-
-from ._copy_utils import _empty_like_orderK
-from ._scalar_utils import (
-    _get_dtype,
-    _get_queue_usm_type,
-    _get_shape,
-    _validate_dtype,
-)
-from ._tensor_elementwise_impl import _not_equal, _subtract
-from ._tensor_impl import (
-    _copy_usm_ndarray_into_usm_ndarray,
-    _extract,
-    _full_usm_ndarray,
-    _linspace_step,
-    _take,
-    default_device_index_type,
-    mask_positions,
-)
-from ._tensor_sorting_impl import (
-    _argsort_ascending,
-    _isin,
-    _searchsorted_left,
-    _sort_ascending,
-)
-from ._type_utils import (
-    _resolve_weak_types_all_py_ints,
-    _to_device_supported_dtype,
-)
-
-__all__ = [
-    "isin",
-    "unique_values",
-    "unique_counts",
-    "unique_inverse",
-    "unique_all",
-    "UniqueAllResult",
-    "UniqueCountsResult",
-    "UniqueInverseResult",
-]
-
-
-class UniqueAllResult(NamedTuple):
-    values: dpt.usm_ndarray
-    indices: dpt.usm_ndarray
-    inverse_indices: dpt.usm_ndarray
-    counts: dpt.usm_ndarray
-
-
-class UniqueCountsResult(NamedTuple):
-    values: dpt.usm_ndarray
-    counts: dpt.usm_ndarray
-
-
-class UniqueInverseResult(NamedTuple):
-    values: dpt.usm_ndarray
-    inverse_indices: dpt.usm_ndarray
-
-
-def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
-    """unique_values(x)
-
-    Returns the unique elements of an input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array. Inputs with more than one dimension are flattened.
-    Returns:
-        usm_ndarray
-            an array containing the set of unique elements in `x`. The
-            returned array has the same data type as `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    array_api_dev = x.device
-    exec_q = array_api_dev.sycl_queue
-    if x.ndim == 1:
-        fx = x
-    else:
-        fx = dpt.reshape(x, (x.size,), order="C")
-    if fx.size == 0:
-        return fx
-    s = dpt.empty_like(fx, order="C")
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if fx.flags.c_contiguous:
-        ht_ev, sort_ev = _sort_ascending(
-            src=fx,
-            trailing_dims_to_sort=1,
-            dst=s,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    else:
-        tmp = dpt.empty_like(fx, order="C")
-        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        ht_ev, sort_ev = _sort_ascending(
-            src=tmp,
-            trailing_dims_to_sort=1,
-            dst=s,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
-    ht_ev, uneq_ev = _not_equal(
-        src1=s[:-1],
-        src2=s[1:],
-        dst=unique_mask[1:],
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, uneq_ev)
-    # writing into new allocation, no dependencies
-    ht_ev, one_ev = _full_usm_ndarray(
-        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
-    # synchronizing call
-    n_uniques = mask_positions(
-        unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
-    )
-    if n_uniques == fx.size:
-        return s
-    unique_vals = dpt.empty(
-        n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
-    )
-    ht_ev, ex_e = _extract(
-        src=s,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=unique_vals,
-        sycl_queue=exec_q,
-    )
-    _manager.add_event_pair(ht_ev, ex_e)
-    return unique_vals
-
-
-def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
-    """unique_counts(x)
-
-    Returns the unique elements of an input array `x` and the corresponding
-    counts for each unique element in `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array. Inputs with more than one dimension are flattened.
-    Returns:
-        tuple[usm_ndarray, usm_ndarray]
-            a namedtuple `(values, counts)` whose
-
-            * first element is the field name `values` and is an array
-               containing the unique elements of `x`. This array has the
-               same data type as `x`.
-            * second element has the field name `counts` and is an array
-              containing the number of times each unique element occurs in `x`.
-              This array has the same shape as `values` and has the default
-              array index data type.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    array_api_dev = x.device
-    exec_q = array_api_dev.sycl_queue
-    x_usm_type = x.usm_type
-    if x.ndim == 1:
-        fx = x
-    else:
-        fx = dpt.reshape(x, (x.size,), order="C")
-    ind_dt = default_device_index_type(exec_q)
-    if fx.size == 0:
-        return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt))
-    s = dpt.empty_like(fx, order="C")
-
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if fx.flags.c_contiguous:
-        ht_ev, sort_ev = _sort_ascending(
-            src=fx,
-            trailing_dims_to_sort=1,
-            dst=s,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    else:
-        tmp = dpt.empty_like(fx, order="C")
-        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        ht_ev, sort_ev = _sort_ascending(
-            src=tmp,
-            dst=s,
-            trailing_dims_to_sort=1,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q)
-    ht_ev, uneq_ev = _not_equal(
-        src1=s[:-1],
-        src2=s[1:],
-        dst=unique_mask[1:],
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, uneq_ev)
-    # no dependency, since we write into new allocation
-    ht_ev, one_ev = _full_usm_ndarray(
-        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
-    # synchronizing call
-    n_uniques = mask_positions(
-        unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
-    )
-    if n_uniques == fx.size:
-        return UniqueCountsResult(
-            s,
-            dpt.ones(
-                n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
-            ),
-        )
-    unique_vals = dpt.empty(
-        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    # populate unique values
-    ht_ev, ex_e = _extract(
-        src=s,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=unique_vals,
-        sycl_queue=exec_q,
-    )
-    _manager.add_event_pair(ht_ev, ex_e)
-    unique_counts = dpt.empty(
-        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
-    # writing into new allocation, no dependency
-    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
-    _manager.add_event_pair(ht_ev, id_ev)
-    ht_ev, extr_ev = _extract(
-        src=idx,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=unique_counts[:-1],
-        sycl_queue=exec_q,
-        depends=[id_ev],
-    )
-    _manager.add_event_pair(ht_ev, extr_ev)
-    # no dependency, writing into disjoint segmenent of new allocation
-    ht_ev, set_ev = _full_usm_ndarray(
-        x.size, dst=unique_counts[-1], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt.empty_like(unique_counts[1:])
-    ht_ev, sub_ev = _subtract(
-        src1=unique_counts[1:],
-        src2=unique_counts[:-1],
-        dst=_counts,
-        sycl_queue=exec_q,
-        depends=[set_ev, extr_ev],
-    )
-    _manager.add_event_pair(ht_ev, sub_ev)
-    return UniqueCountsResult(unique_vals, _counts)
-
-
-def unique_inverse(x):
-    """unique_inverse
-
-    Returns the unique elements of an input array x and the indices from the
-    set of unique elements that reconstruct `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array. Inputs with more than one dimension are flattened.
-    Returns:
-        tuple[usm_ndarray, usm_ndarray]
-            a namedtuple `(values, inverse_indices)` whose
-
-            * first element has the field name `values` and is an array
-              containing the unique elements of `x`. The array has the same
-              data type as `x`.
-            * second element has the field name `inverse_indices` and is an
-              array containing the indices of values that reconstruct `x`.
-              The array has the same shape as `x` and has the default array
-              index data type.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    array_api_dev = x.device
-    exec_q = array_api_dev.sycl_queue
-    x_usm_type = x.usm_type
-    ind_dt = default_device_index_type(exec_q)
-    if x.ndim == 1:
-        fx = x
-    else:
-        fx = dpt.reshape(x, (x.size,), order="C")
-    sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
-    unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
-    if fx.size == 0:
-        return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape))
-
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if fx.flags.c_contiguous:
-        ht_ev, sort_ev = _argsort_ascending(
-            src=fx,
-            trailing_dims_to_sort=1,
-            dst=sorting_ids,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    else:
-        tmp = dpt.empty_like(fx, order="C")
-        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        ht_ev, sort_ev = _argsort_ascending(
-            src=tmp,
-            trailing_dims_to_sort=1,
-            dst=sorting_ids,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    ht_ev, argsort_ev = _argsort_ascending(
-        src=sorting_ids,
-        trailing_dims_to_sort=1,
-        dst=unsorting_ids,
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, argsort_ev)
-    s = dpt.empty_like(fx)
-    # s = fx[sorting_ids]
-    ht_ev, take_ev = _take(
-        src=fx,
-        ind=(sorting_ids,),
-        dst=s,
-        axis_start=0,
-        mode=0,
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, take_ev)
-    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
-    ht_ev, uneq_ev = _not_equal(
-        src1=s[:-1],
-        src2=s[1:],
-        dst=unique_mask[1:],
-        sycl_queue=exec_q,
-        depends=[take_ev],
-    )
-    _manager.add_event_pair(ht_ev, uneq_ev)
-    # no dependency
-    ht_ev, one_ev = _full_usm_ndarray(
-        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
-    # synchronizing call
-    n_uniques = mask_positions(
-        unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
-    )
-    if n_uniques == fx.size:
-        return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape))
-    unique_vals = dpt.empty(
-        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    ht_ev, uv_ev = _extract(
-        src=s,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=unique_vals,
-        sycl_queue=exec_q,
-    )
-    _manager.add_event_pair(ht_ev, uv_ev)
-    cum_unique_counts = dpt.empty(
-        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
-    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
-    _manager.add_event_pair(ht_ev, id_ev)
-    ht_ev, extr_ev = _extract(
-        src=idx,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=cum_unique_counts[:-1],
-        sycl_queue=exec_q,
-        depends=[id_ev],
-    )
-    _manager.add_event_pair(ht_ev, extr_ev)
-    ht_ev, set_ev = _full_usm_ndarray(
-        x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt.empty_like(cum_unique_counts[1:])
-    ht_ev, sub_ev = _subtract(
-        src1=cum_unique_counts[1:],
-        src2=cum_unique_counts[:-1],
-        dst=_counts,
-        sycl_queue=exec_q,
-        depends=[set_ev, extr_ev],
-    )
-    _manager.add_event_pair(ht_ev, sub_ev)
-
-    inv = dpt.empty_like(x, dtype=ind_dt, order="C")
-    ht_ev, ssl_ev = _searchsorted_left(
-        hay=unique_vals,
-        needles=x,
-        positions=inv,
-        sycl_queue=exec_q,
-        depends=[
-            uv_ev,
-        ],
-    )
-    _manager.add_event_pair(ht_ev, ssl_ev)
-
-    return UniqueInverseResult(unique_vals, inv)
-
-
-def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
-    """unique_all(x)
-
-    Returns the unique elements of an input array `x`, the first occurring
-    indices for each unique element in `x`, the indices from the set of unique
-    elements that reconstruct `x`, and the corresponding counts for each
-    unique element in `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array. Inputs with more than one dimension are flattened.
-    Returns:
-        tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray]
-            a namedtuple `(values, indices, inverse_indices, counts)` whose
-
-            * first element has the field name `values` and is an array
-              containing the unique elements of `x`. The array has the same
-              data type as `x`.
-            * second element has the field name `indices` and is an array
-              the indices (of first occurrences) of `x` that result in
-              `values`. The array has the same shape as `values` and has the
-              default array index data type.
-            * third element has the field name `inverse_indices` and is an
-              array containing the indices of values that reconstruct `x`.
-              The array has the same shape as `x` and has the default array
-              index data type.
-            * fourth element has the field name `counts` and is an array
-              containing the number of times each unique element occurs in `x`.
-              This array has the same shape as `values` and has the default
-              array index data type.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    array_api_dev = x.device
-    exec_q = array_api_dev.sycl_queue
-    x_usm_type = x.usm_type
-    ind_dt = default_device_index_type(exec_q)
-    if x.ndim == 1:
-        fx = x
-    else:
-        fx = dpt.reshape(x, (x.size,), order="C")
-    sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
-    unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
-    if fx.size == 0:
-        # original array contains no data
-        # so it can be safely returned as values
-        return UniqueAllResult(
-            fx,
-            sorting_ids,
-            dpt.reshape(unsorting_ids, x.shape),
-            dpt.empty_like(fx, dtype=ind_dt),
-        )
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if fx.flags.c_contiguous:
-        ht_ev, sort_ev = _argsort_ascending(
-            src=fx,
-            trailing_dims_to_sort=1,
-            dst=sorting_ids,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    else:
-        tmp = dpt.empty_like(fx, order="C")
-        ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        ht_ev, sort_ev = _argsort_ascending(
-            src=tmp,
-            trailing_dims_to_sort=1,
-            dst=sorting_ids,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, sort_ev)
-    ht_ev, args_ev = _argsort_ascending(
-        src=sorting_ids,
-        trailing_dims_to_sort=1,
-        dst=unsorting_ids,
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, args_ev)
-    s = dpt.empty_like(fx)
-    # s = fx[sorting_ids]
-    ht_ev, take_ev = _take(
-        src=fx,
-        ind=(sorting_ids,),
-        dst=s,
-        axis_start=0,
-        mode=0,
-        sycl_queue=exec_q,
-        depends=[sort_ev],
-    )
-    _manager.add_event_pair(ht_ev, take_ev)
-    unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
-    ht_ev, uneq_ev = _not_equal(
-        src1=s[:-1],
-        src2=s[1:],
-        dst=unique_mask[1:],
-        sycl_queue=exec_q,
-        depends=[take_ev],
-    )
-    _manager.add_event_pair(ht_ev, uneq_ev)
-    ht_ev, one_ev = _full_usm_ndarray(
-        fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, one_ev)
-    cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
-    # synchronizing call
-    n_uniques = mask_positions(
-        unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
-    )
-    if n_uniques == fx.size:
-        _counts = dpt.ones(
-            n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
-        )
-        return UniqueAllResult(
-            s,
-            sorting_ids,
-            dpt.reshape(unsorting_ids, x.shape),
-            _counts,
-        )
-    unique_vals = dpt.empty(
-        n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    ht_ev, uv_ev = _extract(
-        src=s,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=unique_vals,
-        sycl_queue=exec_q,
-    )
-    _manager.add_event_pair(ht_ev, uv_ev)
-    cum_unique_counts = dpt.empty(
-        n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
-    )
-    idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
-    ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
-    _manager.add_event_pair(ht_ev, id_ev)
-    ht_ev, extr_ev = _extract(
-        src=idx,
-        cumsum=cumsum,
-        axis_start=0,
-        axis_end=1,
-        dst=cum_unique_counts[:-1],
-        sycl_queue=exec_q,
-        depends=[id_ev],
-    )
-    _manager.add_event_pair(ht_ev, extr_ev)
-    ht_ev, set_ev = _full_usm_ndarray(
-        x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
-    )
-    _manager.add_event_pair(ht_ev, set_ev)
-    _counts = dpt.empty_like(cum_unique_counts[1:])
-    ht_ev, sub_ev = _subtract(
-        src1=cum_unique_counts[1:],
-        src2=cum_unique_counts[:-1],
-        dst=_counts,
-        sycl_queue=exec_q,
-        depends=[set_ev, extr_ev],
-    )
-    _manager.add_event_pair(ht_ev, sub_ev)
-
-    inv = dpt.empty_like(x, dtype=ind_dt, order="C")
-    ht_ev, ssl_ev = _searchsorted_left(
-        hay=unique_vals,
-        needles=x,
-        positions=inv,
-        sycl_queue=exec_q,
-        depends=[
-            uv_ev,
-        ],
-    )
-    _manager.add_event_pair(ht_ev, ssl_ev)
-    return UniqueAllResult(
-        unique_vals,
-        sorting_ids[cum_unique_counts[:-1]],
-        inv,
-        _counts,
-    )
-
-
-def isin(
-    x: Union[dpt.usm_ndarray, int, float, complex, bool],
-    test_elements: Union[dpt.usm_ndarray, int, float, complex, bool],
-    /,
-    *,
-    invert: Optional[bool] = False,
-) -> dpt.usm_ndarray:
-    """isin(x, test_elements, /, *, invert=False)
-
-    Tests `x in test_elements` for each element of `x`. Returns a boolean array
-    with the same shape as `x` that is `True` where the element is in
-    `test_elements`, `False` otherwise.
-
-    Args:
-        x (Union[usm_ndarray, bool, int, float, complex]):
-            input element or elements.
-        test_elements (Union[usm_ndarray, bool, int, float, complex]):
-            elements against which to test each value of `x`.
-        invert (Optional[bool]):
-            if `True`, the output results are inverted, i.e., are equivalent to
-            testing `x not in test_elements` for each element of `x`.
-            Default: `False`.
-
-    Returns:
-        usm_ndarray:
-            an array of the inclusion test results. The returned array has a
-            boolean data type and the same shape as `x`.
-    """
-    q1, x_usm_type = _get_queue_usm_type(x)
-    q2, test_usm_type = _get_queue_usm_type(test_elements)
-    if q1 is None and q2 is None:
-        raise du.ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments. "
-            "One of the arguments must represent USM allocation and "
-            "expose `__sycl_usm_array_interface__` property"
-        )
-    if q1 is None:
-        exec_q = q2
-        res_usm_type = test_usm_type
-    elif q2 is None:
-        exec_q = q1
-        res_usm_type = x_usm_type
-    else:
-        exec_q = du.get_execution_queue((q1, q2))
-        if exec_q is None:
-            raise du.ExecutionPlacementError(
-                "Execution placement can not be unambiguously inferred "
-                "from input arguments."
-            )
-        res_usm_type = du.get_coerced_usm_type(
-            (
-                x_usm_type,
-                test_usm_type,
-            )
-        )
-    du.validate_usm_type(res_usm_type, allow_none=False)
-    sycl_dev = exec_q.sycl_device
-
-    if not isinstance(invert, bool):
-        raise TypeError(
-            "`invert` keyword argument must be of boolean type, "
-            f"got {type(invert)}"
-        )
-
-    x_dt = _get_dtype(x, sycl_dev)
-    test_dt = _get_dtype(test_elements, sycl_dev)
-    if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)):
-        raise ValueError("Operands have unsupported data types")
-
-    x_sh = _get_shape(x)
-    if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0:
-        if invert:
-            return dpt.ones(
-                x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
-            )
-        else:
-            return dpt.zeros(
-                x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
-            )
-
-    dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev)
-    dt = _to_device_supported_dtype(dpt.result_type(dt1, dt2), sycl_dev)
-
-    if not isinstance(x, dpt.usm_ndarray):
-        x_arr = dpt.asarray(
-            x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q
-        )
-    else:
-        x_arr = x
-
-    if not isinstance(test_elements, dpt.usm_ndarray):
-        test_arr = dpt.asarray(
-            test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q
-        )
-    else:
-        test_arr = test_elements
-
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-
-    if x_dt != dt:
-        x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q)
-        ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, ev)
-    else:
-        x_buf = x_arr
-
-    if test_dt != dt:
-        # copy into C-contiguous memory, because the array will be flattened
-        test_buf = dpt.empty_like(
-            test_arr, dtype=dt, order="C", usm_type=res_usm_type
-        )
-        ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
-            src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, ev)
-    else:
-        test_buf = test_arr
-
-    test_buf = dpt.reshape(test_buf, -1)
-    test_buf = dpt.sort(test_buf)
-
-    dst = dpt.empty_like(
-        x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C"
-    )
-
-    dep_evs = _manager.submitted_events
-    ht_ev, s_ev = _isin(
-        needles=x_buf,
-        hay=test_buf,
-        dst=dst,
-        sycl_queue=exec_q,
-        invert=invert,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(ht_ev, s_ev)
-    return dst
diff --git a/dpctl/tensor/_slicing.pxi b/dpctl/tensor/_slicing.pxi
deleted file mode 100644
index eaf9a855fb..0000000000
--- a/dpctl/tensor/_slicing.pxi
+++ /dev/null
@@ -1,371 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numbers
-from operator import index
-from cpython.buffer cimport PyObject_CheckBuffer
-from numpy import ndarray
-
-
-cdef bint _is_buffer(object o):
-    return PyObject_CheckBuffer(o)
-
-
-cdef Py_ssize_t _slice_len(
-    Py_ssize_t sl_start,
-    Py_ssize_t sl_stop,
-    Py_ssize_t sl_step
-):
-    """
-    Compute len(range(sl_start, sl_stop, sl_step))
-    """
-    if sl_start == sl_stop:
-        return 0
-    if sl_step > 0:
-        if sl_start > sl_stop:
-            return 0
-        # 1 + argmax k such htat sl_start + sl_step*k < sl_stop
-        return 1 + ((sl_stop - sl_start - 1) // sl_step)
-    else:
-        if sl_start < sl_stop:
-            return 0
-        return 1 + ((sl_stop - sl_start + 1) // sl_step)
-
-
-cdef bint _is_integral(object x) except *:
-    """Gives True if x is an integral slice spec"""
-    if isinstance(x, (ndarray, usm_ndarray)):
-        if x.ndim > 0:
-            return False
-        if x.dtype.kind not in "ui":
-            return False
-        return True
-    if isinstance(x, bool):
-        return False
-    if isinstance(x, int):
-        return True
-    if _is_buffer(x):
-        mbuf = memoryview(x)
-        if mbuf.ndim == 0:
-            f = mbuf.format
-            return f in "bBhHiIlLqQ"
-        else:
-            return False
-    if callable(getattr(x, "__index__", None)):
-        try:
-            index(x)
-        except (TypeError, ValueError):
-            return False
-        return True
-    return False
-
-
-cdef bint _is_boolean(object x) except *:
-    """Gives True if x is an integral slice spec"""
-    if isinstance(x, (ndarray, usm_ndarray)):
-        if x.ndim > 0:
-            return False
-        if x.dtype.kind not in "b":
-            return False
-        return True
-    if isinstance(x, bool):
-        return True
-    if isinstance(x, (int, float, complex)):
-        return False
-    if _is_buffer(x):
-        mbuf = memoryview(x)
-        if mbuf.ndim == 0:
-            f = mbuf.format
-            return f in "?"
-        else:
-            return False
-    if callable(getattr(x, "__bool__", None)):
-        try:
-            x.__bool__()
-        except (TypeError, ValueError):
-            return False
-        return True
-    return False
-
-
-def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
-    """
-    Give basic slicing index `ind` and array layout information produce
-    a 5-tuple (resulting_shape, resulting_strides, resulting_offset,
-       advanced_ind, resulting_advanced_ind_pos)
-    used to construct a view into underlying array over which advanced
-    indexing, if any, is to be performed.
-
-    Raises IndexError for invalid index `ind`.
-    """
-    _no_advanced_ind = tuple()
-    _no_advanced_pos = -1
-    if ind is Ellipsis:
-        return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos)
-    elif ind is None:
-        return (
-            (1,) + shape,
-            (0,) + strides,
-            offset,
-            _no_advanced_ind,
-            _no_advanced_pos,
-        )
-    elif isinstance(ind, slice):
-        sl_start, sl_stop, sl_step = ind.indices(shape[0])
-        sh0 = _slice_len(sl_start, sl_stop, sl_step)
-        str0 = sl_step * strides[0]
-        new_strides = (
-            strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:]
-        )
-        new_shape = (sh0, ) + shape[1:]
-        is_empty = any(sh_i == 0 for sh_i in new_shape)
-        new_offset = offset if is_empty else offset + sl_start * strides[0]
-        return (
-            new_shape,
-            new_strides,
-            new_offset,
-            _no_advanced_ind,
-            _no_advanced_pos,
-        )
-    elif _is_boolean(ind):
-        if ind:
-            return (
-                (1,) + shape,
-                (0,) + strides,
-                offset,
-                _no_advanced_ind,
-                _no_advanced_pos,
-            )
-        else:
-            return (
-                (0,) + shape,
-                (0,) + strides,
-                offset,
-                _no_advanced_ind,
-                _no_advanced_pos,
-            )
-    elif _is_integral(ind):
-        ind = index(ind)
-        new_shape = shape[1:]
-        new_strides = strides[1:]
-        is_empty = any(sh_i == 0 for sh_i in new_shape)
-        if 0 <= ind < shape[0]:
-            new_offset = offset if is_empty else offset + ind * strides[0]
-            return (
-                new_shape,
-                new_strides,
-                new_offset,
-                _no_advanced_ind,
-                _no_advanced_pos,
-            )
-        elif -shape[0] <= ind < 0:
-            new_offset = (
-                offset if is_empty else offset + (shape[0] + ind) * strides[0]
-            )
-            return (
-                new_shape,
-                new_strides,
-                new_offset,
-                _no_advanced_ind,
-                _no_advanced_pos,
-            )
-        else:
-            raise IndexError(
-                "Index {0} is out of range for axes 0 with "
-                "size {1}".format(ind, shape[0]))
-    elif isinstance(ind, (ndarray, usm_ndarray)):
-        return (shape, strides, offset, (ind,), 0)
-    elif isinstance(ind, tuple):
-        axes_referenced = 0
-        ellipses_count = 0
-        newaxis_count = 0
-        explicit_index = 0
-        seen_arrays_yet = False
-        array_streak_started = False
-        array_streak_interrupted = False
-        for i in ind:
-            if i is None:
-                newaxis_count += 1
-                if array_streak_started:
-                    array_streak_interrupted = True
-            elif i is Ellipsis:
-                ellipses_count += 1
-                if array_streak_started:
-                    array_streak_interrupted = True
-            elif isinstance(i, slice):
-                axes_referenced += 1
-                if array_streak_started:
-                    array_streak_interrupted = True
-            elif _is_boolean(i):
-                newaxis_count += 1
-                if array_streak_started:
-                    array_streak_interrupted = True
-            elif _is_integral(i):
-                axes_referenced += 1
-                if not array_streak_started and array_streak_interrupted:
-                    explicit_index += 1
-            elif isinstance(i, (ndarray, usm_ndarray)):
-                if not seen_arrays_yet:
-                    seen_arrays_yet = True
-                    array_streak_started = True
-                    array_streak_interrupted = False
-                if array_streak_interrupted:
-                    raise IndexError(
-                        "Advanced indexing array specs may not be "
-                        "separated by basic slicing specs."
-                    )
-                dt_k = i.dtype.kind
-                if dt_k == "b" and i.ndim > 0:
-                    axes_referenced += i.ndim
-                elif dt_k in "ui" and i.ndim > 0:
-                    axes_referenced += 1
-                else:
-                    raise IndexError(
-                        "arrays used as indices must be of integer "
-                        "(or boolean) type"
-                    )
-            else:
-                raise IndexError(
-                    "Only integers, slices (`:`), ellipsis (`...`), "
-                    "dpctl.tensor.newaxis (`None`) and integer and "
-                    "boolean arrays are valid indices."
-                )
-        if ellipses_count > 1:
-            raise IndexError(
-                "an index can only have a single ellipsis ('...')")
-        if axes_referenced > len(shape):
-            raise IndexError(
-                "too many indices for an array, array is "
-                "{0}-dimensional, but {1} were indexed".format(
-                    len(shape), axes_referenced))
-        if ellipses_count:
-            ellipses_count = len(shape) - axes_referenced
-        new_shape_len = (newaxis_count + ellipses_count
-                         + axes_referenced - explicit_index)
-        new_shape = list()
-        new_strides = list()
-        new_advanced_ind = list()
-        k = 0
-        new_advanced_start_pos = -1
-        advanced_start_pos_set = False
-        new_offset = offset
-        is_empty = False
-        array_streak = False
-        for i in range(len(ind)):
-            ind_i = ind[i]
-            if (ind_i is Ellipsis):
-                k_new = k + ellipses_count
-                new_shape.extend(shape[k:k_new])
-                new_strides.extend(strides[k:k_new])
-                if any(dim == 0 for dim in shape[k:k_new]):
-                    is_empty = True
-                    new_offset = offset
-                k = k_new
-                if array_streak:
-                    array_streak = False
-            elif ind_i is None:
-                new_shape.append(1)
-                new_strides.append(0)
-                if array_streak:
-                    array_streak = False
-            elif isinstance(ind_i, slice):
-                k_new = k + 1
-                sl_start, sl_stop, sl_step = ind_i.indices(shape[k])
-                sh_i = _slice_len(sl_start, sl_stop, sl_step)
-                str_i = (1 if sh_i == 0 else sl_step) * strides[k]
-                new_shape.append(sh_i)
-                new_strides.append(str_i)
-                if sh_i > 0 and not is_empty:
-                    new_offset = new_offset + sl_start * strides[k]
-                if sh_i == 0:
-                    is_empty = True
-                    new_offset = offset
-                k = k_new
-                if array_streak:
-                    array_streak = False
-            elif _is_boolean(ind_i):
-                new_shape.append(1 if ind_i else 0)
-                new_strides.append(0)
-                if array_streak:
-                    array_streak = False
-            elif _is_integral(ind_i):
-                if array_streak:
-                    if not isinstance(ind_i, (ndarray, usm_ndarray)):
-                        ind_i = index(ind_i)
-                        # integer will be converted to an array,
-                        # still raise if OOB
-                        if not (
-                            0 <= ind_i < shape[k] or -shape[k] <= ind_i < 0
-                        ):
-                            raise IndexError(
-                                "Index {0} is out of range for axes "
-                                "{1} with size {2}".format(ind_i, k, shape[k])
-                            )
-                    new_advanced_ind.append(ind_i)
-                    k_new = k + 1
-                    new_shape.extend(shape[k:k_new])
-                    new_strides.extend(strides[k:k_new])
-                    k = k_new
-                else:
-                    ind_i = index(ind_i)
-                    if 0 <= ind_i < shape[k]:
-                        k_new = k + 1
-                        if not is_empty:
-                            new_offset = new_offset + ind_i * strides[k]
-                        k = k_new
-                    elif -shape[k] <= ind_i < 0:
-                        k_new = k + 1
-                        if not is_empty:
-                            new_offset = (
-                                new_offset + (shape[k] + ind_i) * strides[k]
-                            )
-                        k = k_new
-                    else:
-                        raise IndexError(
-                            "Index {0} is out of range for axes "
-                            "{1} with size {2}".format(ind_i, k, shape[k])
-                        )
-            elif isinstance(ind_i, (ndarray, usm_ndarray)):
-                if not array_streak:
-                    array_streak = True
-                if not advanced_start_pos_set:
-                    new_advanced_start_pos = len(new_shape)
-                    advanced_start_pos_set = True
-                new_advanced_ind.append(ind_i)
-                dt_k = ind_i.dtype.kind
-                if dt_k == "b":
-                    k_new = k + ind_i.ndim
-                else:
-                    k_new = k + 1
-                new_shape.extend(shape[k:k_new])
-                new_strides.extend(strides[k:k_new])
-                k = k_new
-        new_shape.extend(shape[k:])
-        new_strides.extend(strides[k:])
-        new_shape_len += len(shape) - k
-        return (
-            tuple(new_shape),
-            tuple(new_strides),
-            new_offset,
-            tuple(new_advanced_ind),
-            new_advanced_start_pos
-        )
-    else:
-        raise IndexError(
-            "Only integers, slices (`:`), ellipsis (`...`), "
-            "dpctl.tensor.newaxis (`None`) and integer and "
-            "boolean arrays are valid indices."
-        )
diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py
deleted file mode 100644
index 8ac623da38..0000000000
--- a/dpctl/tensor/_sorting.py
+++ /dev/null
@@ -1,434 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import operator
-from typing import NamedTuple
-
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.utils as du
-
-from ._numpy_helper import normalize_axis_index
-from ._tensor_sorting_impl import (
-    _argsort_ascending,
-    _argsort_descending,
-    _radix_argsort_ascending,
-    _radix_argsort_descending,
-    _radix_sort_ascending,
-    _radix_sort_descending,
-    _radix_sort_dtype_supported,
-    _sort_ascending,
-    _sort_descending,
-    _topk,
-)
-
-__all__ = ["sort", "argsort"]
-
-
-def _get_mergesort_impl_fn(descending):
-    return _sort_descending if descending else _sort_ascending
-
-
-def _get_radixsort_impl_fn(descending):
-    return _radix_sort_descending if descending else _radix_sort_ascending
-
-
-def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
-    """sort(x, axis=-1, descending=False, stable=True)
-
-    Returns a sorted copy of an input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which to sort. If set to `-1`, the function
-            must sort along the last axis. Default: `-1`.
-        descending (Optional[bool]):
-            sort order. If `True`, the array must be sorted in descending
-            order (by value). If `False`, the array must be sorted in
-            ascending order (by value). Default: `False`.
-        stable (Optional[bool]):
-            sort stability. If `True`, the returned array must maintain the
-            relative order of `x` values which compare as equal. If `False`,
-            the returned array may or may not maintain the relative order of
-            `x` values which compare as equal. Default: `True`.
-        kind (Optional[Literal["stable", "mergesort", "radixsort"]]):
-            Sorting algorithm. The default is `"stable"`, which uses parallel
-            merge-sort or parallel radix-sort algorithms depending on the
-            array data type.
-    Returns:
-        usm_ndarray:
-            a sorted array. The returned array has the same data type and
-            the same shape as the input array `x`.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
-    nd = x.ndim
-    if nd == 0:
-        axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
-        return dpt.copy(x, order="C")
-    else:
-        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
-    a1 = axis + 1
-    if a1 == nd:
-        perm = list(range(nd))
-        arr = x
-    else:
-        perm = [i for i in range(nd) if i != axis] + [
-            axis,
-        ]
-        arr = dpt.permute_dims(x, perm)
-    if kind is None:
-        kind = "stable"
-    if not isinstance(kind, str) or kind not in [
-        "stable",
-        "radixsort",
-        "mergesort",
-    ]:
-        raise ValueError(
-            "Unsupported kind value. Expected 'stable', 'mergesort', "
-            f"or 'radixsort', but got '{kind}'"
-        )
-    if kind == "mergesort":
-        impl_fn = _get_mergesort_impl_fn(descending)
-    elif kind == "radixsort":
-        if _radix_sort_dtype_supported(x.dtype.num):
-            impl_fn = _get_radixsort_impl_fn(descending)
-        else:
-            raise ValueError(f"Radix sort is not supported for {x.dtype}")
-    else:
-        dt = x.dtype
-        if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]:
-            impl_fn = _get_radixsort_impl_fn(descending)
-        else:
-            impl_fn = _get_mergesort_impl_fn(descending)
-    exec_q = x.sycl_queue
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    if arr.flags.c_contiguous:
-        res = dpt.empty_like(arr, order="C")
-        ht_ev, impl_ev = impl_fn(
-            src=arr,
-            trailing_dims_to_sort=1,
-            dst=res,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    else:
-        tmp = dpt.empty_like(arr, order="C")
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        res = dpt.empty_like(arr, order="C")
-        ht_ev, impl_ev = impl_fn(
-            src=tmp,
-            trailing_dims_to_sort=1,
-            dst=res,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    if a1 != nd:
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt.permute_dims(res, inv_perm)
-    return res
-
-
-def _get_mergeargsort_impl_fn(descending):
-    return _argsort_descending if descending else _argsort_ascending
-
-
-def _get_radixargsort_impl_fn(descending):
-    return _radix_argsort_descending if descending else _radix_argsort_ascending
-
-
-def argsort(x, axis=-1, descending=False, stable=True, kind=None):
-    """argsort(x, axis=-1, descending=False, stable=True)
-
-    Returns the indices that sort an array `x` along a specified axis.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int]):
-            axis along which to sort. If set to `-1`, the function
-            must sort along the last axis. Default: `-1`.
-        descending (Optional[bool]):
-            sort order. If `True`, the array must be sorted in descending
-            order (by value). If `False`, the array must be sorted in
-            ascending order (by value). Default: `False`.
-        stable (Optional[bool]):
-            sort stability. If `True`, the returned array must maintain the
-            relative order of `x` values which compare as equal. If `False`,
-            the returned array may or may not maintain the relative order of
-            `x` values which compare as equal. Default: `True`.
-        kind (Optional[Literal["stable", "mergesort", "radixsort"]]):
-            Sorting algorithm. The default is `"stable"`, which uses parallel
-            merge-sort or parallel radix-sort algorithms depending on the
-            array data type.
-
-    Returns:
-        usm_ndarray:
-            an array of indices. The returned array has the  same shape as
-            the input array `x`. The return array has default array index
-            data type.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
-    nd = x.ndim
-    if nd == 0:
-        axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
-        return dpt.zeros_like(
-            x, dtype=ti.default_device_index_type(x.sycl_queue), order="C"
-        )
-    else:
-        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
-    a1 = axis + 1
-    if a1 == nd:
-        perm = list(range(nd))
-        arr = x
-    else:
-        perm = [i for i in range(nd) if i != axis] + [
-            axis,
-        ]
-        arr = dpt.permute_dims(x, perm)
-    if kind is None:
-        kind = "stable"
-    if not isinstance(kind, str) or kind not in [
-        "stable",
-        "radixsort",
-        "mergesort",
-    ]:
-        raise ValueError(
-            "Unsupported kind value. Expected 'stable', 'mergesort', "
-            f"or 'radixsort', but got '{kind}'"
-        )
-    if kind == "mergesort":
-        impl_fn = _get_mergeargsort_impl_fn(descending)
-    elif kind == "radixsort":
-        if _radix_sort_dtype_supported(x.dtype.num):
-            impl_fn = _get_radixargsort_impl_fn(descending)
-        else:
-            raise ValueError(f"Radix sort is not supported for {x.dtype}")
-    else:
-        dt = x.dtype
-        if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]:
-            impl_fn = _get_radixargsort_impl_fn(descending)
-        else:
-            impl_fn = _get_mergeargsort_impl_fn(descending)
-    exec_q = x.sycl_queue
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    index_dt = ti.default_device_index_type(exec_q)
-    if arr.flags.c_contiguous:
-        res = dpt.empty_like(arr, dtype=index_dt, order="C")
-        ht_ev, impl_ev = impl_fn(
-            src=arr,
-            trailing_dims_to_sort=1,
-            dst=res,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    else:
-        tmp = dpt.empty_like(arr, order="C")
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        res = dpt.empty_like(arr, dtype=index_dt, order="C")
-        ht_ev, impl_ev = impl_fn(
-            src=tmp,
-            trailing_dims_to_sort=1,
-            dst=res,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    if a1 != nd:
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt.permute_dims(res, inv_perm)
-    return res
-
-
-def _get_top_k_largest(mode):
-    modes = {"largest": True, "smallest": False}
-    try:
-        return modes[mode]
-    except KeyError:
-        raise ValueError(
-            f"`mode` must be `largest` or `smallest`. Got `{mode}`."
-        )
-
-
-class TopKResult(NamedTuple):
-    values: dpt.usm_ndarray
-    indices: dpt.usm_ndarray
-
-
-def top_k(x, k, /, *, axis=None, mode="largest"):
-    """top_k(x, k, axis=None, mode="largest")
-
-    Returns the `k` largest or smallest values and their indices in the input
-    array `x` along the specified axis `axis`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        k (int):
-            number of elements to find. Must be a positive integer value.
-        axis (Optional[int]):
-            axis along which to search. If `None`, the search will be performed
-            over the flattened array. Default: ``None``.
-        mode (Literal["largest", "smallest"]):
-            search mode. Must be one of the following modes:
-
-            - `"largest"`: return the `k` largest elements.
-            - `"smallest"`: return the `k` smallest elements.
-
-            Default: `"largest"`.
-
-    Returns:
-        tuple[usm_ndarray, usm_ndarray]
-            a namedtuple `(values, indices)` whose
-
-            * first element `values` will be an array containing the `k`
-              largest or smallest elements of `x`. The array has the same data
-              type as `x`. If `axis` was `None`, `values` will be a
-              one-dimensional array with shape `(k,)` and otherwise, `values`
-              will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]`
-            * second element `indices` will be an array containing indices of
-              `x` that result in `values`. The array will have the same shape
-              as `values` and will have the default array index data type.
-    """
-    largest = _get_top_k_largest(mode)
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}"
-        )
-
-    k = operator.index(k)
-    if k < 0:
-        raise ValueError("`k` must be a positive integer value")
-
-    nd = x.ndim
-    if axis is None:
-        sz = x.size
-        if nd == 0:
-            if k > 1:
-                raise ValueError(f"`k`={k} is out of bounds 1")
-            return TopKResult(
-                dpt.copy(x, order="C"),
-                dpt.zeros_like(
-                    x, dtype=ti.default_device_index_type(x.sycl_queue)
-                ),
-            )
-        arr = x
-        n_search_dims = None
-        res_sh = k
-    else:
-        axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
-        sz = x.shape[axis]
-        a1 = axis + 1
-        if a1 == nd:
-            perm = list(range(nd))
-            arr = x
-        else:
-            perm = [i for i in range(nd) if i != axis] + [
-                axis,
-            ]
-            arr = dpt.permute_dims(x, perm)
-        n_search_dims = 1
-        res_sh = arr.shape[: nd - 1] + (k,)
-
-    if k > sz:
-        raise ValueError(f"`k`={k} is out of bounds {sz}")
-
-    exec_q = x.sycl_queue
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-
-    res_usm_type = arr.usm_type
-    if arr.flags.c_contiguous:
-        vals = dpt.empty(
-            res_sh,
-            dtype=arr.dtype,
-            usm_type=res_usm_type,
-            order="C",
-            sycl_queue=exec_q,
-        )
-        inds = dpt.empty(
-            res_sh,
-            dtype=ti.default_device_index_type(exec_q),
-            usm_type=res_usm_type,
-            order="C",
-            sycl_queue=exec_q,
-        )
-        ht_ev, impl_ev = _topk(
-            src=arr,
-            trailing_dims_to_search=n_search_dims,
-            k=k,
-            largest=largest,
-            vals=vals,
-            inds=inds,
-            sycl_queue=exec_q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    else:
-        tmp = dpt.empty_like(arr, order="C")
-        ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_ev, copy_ev)
-        vals = dpt.empty(
-            res_sh,
-            dtype=arr.dtype,
-            usm_type=res_usm_type,
-            order="C",
-            sycl_queue=exec_q,
-        )
-        inds = dpt.empty(
-            res_sh,
-            dtype=ti.default_device_index_type(exec_q),
-            usm_type=res_usm_type,
-            order="C",
-            sycl_queue=exec_q,
-        )
-        ht_ev, impl_ev = _topk(
-            src=tmp,
-            trailing_dims_to_search=n_search_dims,
-            k=k,
-            largest=largest,
-            vals=vals,
-            inds=inds,
-            sycl_queue=exec_q,
-            depends=[copy_ev],
-        )
-        _manager.add_event_pair(ht_ev, impl_ev)
-    if axis is not None and a1 != nd:
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        vals = dpt.permute_dims(vals, inv_perm)
-        inds = dpt.permute_dims(inds, inv_perm)
-
-    return TopKResult(vals, inds)
diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py
deleted file mode 100644
index 48ce9e1534..0000000000
--- a/dpctl/tensor/_statistical_functions.py
+++ /dev/null
@@ -1,367 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as tei
-import dpctl.tensor._tensor_impl as ti
-import dpctl.tensor._tensor_reductions_impl as tri
-import dpctl.utils as du
-
-from ._numpy_helper import normalize_axis_tuple
-
-
-def _var_impl(x, axis, correction, keepdims):
-    nd = x.ndim
-    if axis is None:
-        axis = tuple(range(nd))
-    if not isinstance(axis, (tuple, list)):
-        axis = (axis,)
-    axis = normalize_axis_tuple(axis, nd, "axis")
-    perm = []
-    nelems = 1
-    for i in range(nd):
-        if i not in axis:
-            perm.append(i)
-        else:
-            nelems *= x.shape[i]
-    red_nd = len(axis)
-    perm = perm + list(axis)
-    q = x.sycl_queue
-    inp_dt = x.dtype
-    res_dt = (
-        inp_dt
-        if inp_dt.kind == "f"
-        else dpt.dtype(ti.default_device_fp_type(q))
-    )
-    res_usm_type = x.usm_type
-
-    _manager = du.SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    if inp_dt != res_dt:
-        buf = dpt.empty_like(x, dtype=res_dt)
-        ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=x, dst=buf, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e_buf, c_e1)
-    else:
-        buf = x
-    # calculate mean
-    buf2 = dpt.permute_dims(buf, perm)
-    res_shape = buf2.shape[: nd - red_nd]
-    # use keepdims=True path for later broadcasting
-    if red_nd == 0:
-        mean_ary = dpt.empty_like(buf)
-        dep_evs = _manager.submitted_events
-        ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e1, c_e2)
-    else:
-        mean_ary = dpt.empty(
-            res_shape,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=q,
-        )
-        dep_evs = _manager.submitted_events
-        ht_e1, r_e1 = tri._sum_over_axis(
-            src=buf2,
-            trailing_dims_to_reduce=red_nd,
-            dst=mean_ary,
-            sycl_queue=q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_e1, r_e1)
-
-        mean_ary_shape = res_shape + (1,) * red_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        mean_ary = dpt.permute_dims(
-            dpt.reshape(mean_ary, mean_ary_shape), inv_perm
-        )
-    # divide in-place to get mean
-    mean_ary_shape = mean_ary.shape
-
-    dep_evs = _manager.submitted_events
-    ht_e2, d_e1 = tei._divide_by_scalar(
-        src=mean_ary, scalar=nelems, dst=mean_ary, sycl_queue=q, depends=dep_evs
-    )
-    _manager.add_event_pair(ht_e2, d_e1)
-
-    # subtract mean from original array to get deviations
-    dev_ary = dpt.empty_like(buf)
-    if mean_ary_shape != buf.shape:
-        mean_ary = dpt.broadcast_to(mean_ary, buf.shape)
-    ht_e4, su_e = tei._subtract(
-        src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
-    )
-    _manager.add_event_pair(ht_e4, su_e)
-    # square deviations
-    ht_e5, sq_e = tei._square(
-        src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e]
-    )
-    _manager.add_event_pair(ht_e5, sq_e)
-
-    # take sum of squared deviations
-    dev_ary2 = dpt.permute_dims(dev_ary, perm)
-    if red_nd == 0:
-        res = dev_ary
-    else:
-        res = dpt.empty(
-            res_shape,
-            dtype=res_dt,
-            usm_type=res_usm_type,
-            sycl_queue=q,
-        )
-        ht_e6, r_e2 = tri._sum_over_axis(
-            src=dev_ary2,
-            trailing_dims_to_reduce=red_nd,
-            dst=res,
-            sycl_queue=q,
-            depends=[sq_e],
-        )
-        _manager.add_event_pair(ht_e6, r_e2)
-
-        if keepdims:
-            res_shape = res_shape + (1,) * red_nd
-            inv_perm = sorted(range(nd), key=lambda d: perm[d])
-            res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
-    res_shape = res.shape
-    # when nelems - correction <= 0, yield nans
-    div = max(nelems - correction, 0)
-    if not div:
-        div = dpt.nan
-    dep_evs = _manager.submitted_events
-    ht_e7, d_e2 = tei._divide_by_scalar(
-        src=res, scalar=div, dst=res, sycl_queue=q, depends=dep_evs
-    )
-    _manager.add_event_pair(ht_e7, d_e2)
-    return res, [d_e2]
-
-
-def mean(x, axis=None, keepdims=False):
-    """mean(x, axis=None, keepdims=False)
-
-    Calculates the arithmetic mean of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which the arithmetic means must be computed. If
-            a tuple of unique integers, the means are computed over multiple
-            axes. If `None`, the mean is computed over the entire array.
-            Default: `None`.
-        keepdims (Optional[bool]):
-            if `True`, the reduced axes (dimensions) are included in the result
-            as singleton dimensions, so that the returned array remains
-            compatible with the input array according to Array Broadcasting
-            rules. Otherwise, if `False`, the reduced axes are not included in
-            the returned array. Default: `False`.
-    Returns:
-        usm_ndarray:
-            an array containing the arithmetic means. If the mean was computed
-            over the entire array, a zero-dimensional array is returned.
-
-            If `x` has a floating-point data type, the returned array will have
-            the same data type as `x`.
-            If `x` has a boolean or integral data type, the returned array
-            will have the default floating point data type for the device
-            where input array `x` is allocated.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-    nd = x.ndim
-    if axis is None:
-        axis = tuple(range(nd))
-    if not isinstance(axis, (tuple, list)):
-        axis = (axis,)
-    axis = normalize_axis_tuple(axis, nd, "axis")
-    perm = []
-    nelems = 1
-    for i in range(nd):
-        if i not in axis:
-            perm.append(i)
-        else:
-            nelems *= x.shape[i]
-    sum_nd = len(axis)
-    perm = perm + list(axis)
-    arr2 = dpt.permute_dims(x, perm)
-    res_shape = arr2.shape[: nd - sum_nd]
-    q = x.sycl_queue
-    inp_dt = x.dtype
-    res_dt = (
-        x.dtype
-        if x.dtype.kind in "fc"
-        else dpt.dtype(ti.default_device_fp_type(q))
-    )
-    res_usm_type = x.usm_type
-    if sum_nd == 0:
-        return dpt.astype(x, res_dt, copy=True)
-
-    _manager = du.SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q):
-        res = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        ht_e1, r_e = tri._sum_over_axis(
-            src=arr2,
-            trailing_dims_to_reduce=sum_nd,
-            dst=res,
-            sycl_queue=q,
-            depends=dep_evs,
-        )
-        _manager.add_event_pair(ht_e1, r_e)
-    else:
-        tmp = dpt.empty(
-            arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs
-        )
-        _manager.add_event_pair(ht_e_cpy, cpy_e)
-        res = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        ht_e_red, r_e = tri._sum_over_axis(
-            src=tmp,
-            trailing_dims_to_reduce=sum_nd,
-            dst=res,
-            sycl_queue=q,
-            depends=[cpy_e],
-        )
-        _manager.add_event_pair(ht_e_red, r_e)
-
-    if keepdims:
-        res_shape = res_shape + (1,) * sum_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
-
-    dep_evs = _manager.submitted_events
-    ht_e2, div_e = tei._divide_by_scalar(
-        src=res, scalar=nelems, dst=res, sycl_queue=q, depends=dep_evs
-    )
-    _manager.add_event_pair(ht_e2, div_e)
-    return res
-
-
-def var(x, axis=None, correction=0.0, keepdims=False):
-    """var(x, axis=None, correction=0.0, keepdims=False)
-
-    Calculates the variance of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which the variances must be computed. If a tuple
-            of unique integers, the variances are computed over multiple axes.
-            If `None`, the variance is computed over the entire array.
-            Default: `None`.
-        correction (Optional[float, int]):
-            degrees of freedom adjustment. The divisor used in calculating the
-            variance is `N - correction`, where `N` corresponds to the total
-            number of elements over which the variance is calculated.
-            Default: `0.0`.
-        keepdims (Optional[bool]):
-            if `True`, the reduced axes (dimensions) are included in the result
-            as singleton dimensions, so that the returned array remains
-            compatible with the input array according to Array Broadcasting
-            rules. Otherwise, if `False`, the reduced axes are not included in
-            the returned array. Default: `False`.
-    Returns:
-        usm_ndarray:
-            an array containing the variances. If the variance was computed
-            over the entire array, a zero-dimensional array is returned.
-
-            If `x` has a real-valued floating-point data type, the returned
-            array will have the same data type as `x`.
-            If `x` has a boolean or integral data type, the returned array
-            will have the default floating point data type for the device
-            where input array `x` is allocated.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    if not isinstance(correction, (int, float)):
-        raise TypeError(
-            "Expected a Python integer or float for `correction`, got"
-            f"{type(x)}"
-        )
-
-    if x.dtype.kind == "c":
-        raise ValueError("`var` does not support complex types")
-
-    res, _ = _var_impl(x, axis, correction, keepdims)
-    return res
-
-
-def std(x, axis=None, correction=0.0, keepdims=False):
-    """std(x, axis=None, correction=0.0, keepdims=False)
-
-    Calculates the standard deviation of elements in the input array `x`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (Optional[int, Tuple[int, ...]]):
-            axis or axes along which the standard deviations must be computed.
-            If a tuple of unique integers, the standard deviations are computed
-            over multiple axes. If `None`, the standard deviation is computed
-            over the entire array. Default: `None`.
-        correction (Optional[float, int]):
-            degrees of freedom adjustment. The divisor used in calculating the
-            standard deviation is `N - correction`, where `N` corresponds to the
-            total number of elements over which the standard deviation is
-            calculated. Default: `0.0`.
-        keepdims (Optional[bool]):
-            if `True`, the reduced axes (dimensions) are included in the result
-            as singleton dimensions, so that the returned array remains
-            compatible with the input array according to Array Broadcasting
-            rules. Otherwise, if `False`, the reduced axes are not included in
-            the returned array. Default: `False`.
-    Returns:
-        usm_ndarray:
-            an array containing the standard deviations. If the standard
-            deviation was computed over the entire array, a zero-dimensional
-            array is returned.
-
-            If `x` has a real-valued floating-point data type, the returned
-            array will have the same data type as `x`.
-            If `x` has a boolean or integral data type, the returned array
-            will have the default floating point data type for the device
-            where input array `x` is allocated.
-    """
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    if not isinstance(correction, (int, float)):
-        raise TypeError(
-            "Expected a Python integer or float for `correction`,"
-            f"got {type(x)}"
-        )
-
-    if x.dtype.kind == "c":
-        raise ValueError("`std` does not support complex types")
-
-    exec_q = x.sycl_queue
-    _manager = du.SequentialOrderManager[exec_q]
-    res, deps = _var_impl(x, axis, correction, keepdims)
-    ht_ev, sqrt_ev = tei._sqrt(
-        src=res, dst=res, sycl_queue=exec_q, depends=deps
-    )
-    _manager.add_event_pair(ht_ev, sqrt_ev)
-    return res
diff --git a/dpctl/tensor/_stride_utils.pxi b/dpctl/tensor/_stride_utils.pxi
deleted file mode 100644
index adbeefe3a6..0000000000
--- a/dpctl/tensor/_stride_utils.pxi
+++ /dev/null
@@ -1,302 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-
-from cpython.mem cimport PyMem_Malloc
-from cpython.ref cimport Py_INCREF
-from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
-
-
-cdef int ERROR_MALLOC = 1
-cdef int ERROR_INTERNAL = -1
-cdef int ERROR_INCORRECT_ORDER = 2
-cdef int ERROR_UNEXPECTED_STRIDES = 3
-
-cdef int USM_ARRAY_C_CONTIGUOUS = 1
-cdef int USM_ARRAY_F_CONTIGUOUS = 2
-cdef int USM_ARRAY_WRITABLE = 4
-
-
-cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr):
-    """
-    Computes number of elements in an array.
-    """
-    cdef Py_ssize_t count = 1
-    for i in range(nd):
-        count *= shape_arr[i]
-    return count
-
-
-cdef int _from_input_shape_strides(
-    int nd, object shape, object strides, int itemsize, char order,
-    Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr,
-    Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp,
-    int *contig
-):
-    """
-    Arguments: nd, shape, strides, itemsize, order
-    Modifies:
-        shape_ptr - pointer to C array for shape values
-        stride_ptr - pointer to C array for strides values
-        nelems - Number of elements in array
-        min_disp = min( dot(strides, index), index for shape)
-        max_disp = max( dor(strides, index), index for shape)
-        contig = enumeration for array contiguity
-    Returns: 0 on success, error code otherwise.
-        On success pointers point to allocated arrays,
-        Otherwise they are set to NULL
-    """
-    cdef int i
-    cdef int j
-    cdef bint all_incr = 1
-    cdef bint all_decr = 1
-    cdef bint strides_inspected = 0
-    cdef Py_ssize_t elem_count = 1
-    cdef Py_ssize_t min_shift = 0
-    cdef Py_ssize_t max_shift = 0
-    cdef Py_ssize_t str_i
-    cdef Py_ssize_t* shape_arr
-    cdef Py_ssize_t* strides_arr
-
-    if (int(order) not in [ord("C"), ord("F"), ord("c"), ord("f")]):
-        return ERROR_INCORRECT_ORDER
-
-    # 0-d array
-    if (nd == 0):
-        contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
-        nelems[0] = 1
-        min_disp[0] = 0
-        max_disp[0] = 0
-        shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
-        strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
-        return 0
-
-    shape_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
-    if (not shape_arr):
-        return ERROR_MALLOC
-    shape_ptr[0] = shape_arr
-    for i in range(0, nd):
-        shape_arr[i] = <Py_ssize_t> shape[i]
-        elem_count *= shape_arr[i]
-    if elem_count == 0:
-        contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
-        nelems[0] = 1
-        min_disp[0] = 0
-        max_disp[0] = 0
-        if strides is None:
-            strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
-        else:
-            strides_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
-            if (not strides_arr):
-                PyMem_Free(shape_ptr[0])
-                shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
-                return ERROR_MALLOC
-            strides_ptr[0] = strides_arr
-            for i in range(0, nd):
-                strides_arr[i] = <Py_ssize_t> strides[i]
-        return 0
-    nelems[0] = elem_count
-    if (strides is None):
-        # no need to allocate and populate strides
-        if order == <char> ord("C") or order == <char> ord("c"):
-            contig[0] = USM_ARRAY_C_CONTIGUOUS
-        else:
-            contig[0] = USM_ARRAY_F_CONTIGUOUS
-        if nd == 1:
-            contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
-        else:
-            j = 0
-            for i in range(nd):
-                if shape_arr[i] > 1:
-                    j = j + 1
-            if j < 2:
-                contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
-        min_disp[0] = 0
-        max_disp[0] = (elem_count - 1)
-        strides_ptr[0] = <Py_ssize_t *>(<size_t>0)
-        return 0
-    elif ((isinstance(strides, (list, tuple)) or hasattr(strides, "tolist"))
-          and len(strides) == nd):
-        strides_arr = <Py_ssize_t*>PyMem_Malloc(nd * sizeof(Py_ssize_t))
-        if (not strides_arr):
-            PyMem_Free(shape_ptr[0])
-            shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
-            return ERROR_MALLOC
-        strides_ptr[0] = strides_arr
-        for i in range(0, nd):
-            str_i = <Py_ssize_t> strides[i]
-            strides_arr[i] = str_i
-            if str_i > 0:
-                max_shift += str_i * (shape_arr[i] - 1)
-            else:
-                min_shift += str_i * (shape_arr[i] - 1)
-        min_disp[0] = min_shift
-        max_disp[0] = max_shift
-        if max_shift == min_shift + (elem_count - 1):
-            if elem_count == 1:
-                contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
-                return 0
-            if nd == 1:
-                if strides_arr[0] == 1:
-                    contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
-                else:
-                    contig[0] = 0
-                return 0
-            i = 0
-            while i < nd:
-                if shape_arr[i] == 1:
-                    i = i + 1
-                    continue
-                j = i + 1
-                while (j < nd and shape_arr[j] == 1):
-                    j = j + 1
-                if j < nd:
-                    strides_inspected = 1
-                    if all_incr:
-                        all_incr = (
-                            (strides_arr[i] > 0) and
-                            (strides_arr[j] > 0) and
-                            (strides_arr[i] <= strides_arr[j])
-                        )
-                    if all_decr:
-                        all_decr = (
-                            (strides_arr[i] > 0) and
-                            (strides_arr[j] > 0) and
-                            (strides_arr[i] >= strides_arr[j])
-                        )
-                    i = j
-                else:
-                    if not strides_inspected:
-                        # all dimensions have size 1 except
-                        # dimension 'i'. Array is both C and F
-                        # contiguous
-                        strides_inspected = 1
-                        all_incr = (strides_arr[i] == 1)
-                        all_decr = all_incr
-                    break
-            # should only set contig flags on actually obtained
-            # values, rather than default values
-            all_incr = all_incr and strides_inspected
-            all_decr = all_decr and strides_inspected
-            if all_incr and all_decr:
-                contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
-            elif all_incr:
-                contig[0] = USM_ARRAY_F_CONTIGUOUS
-            elif all_decr:
-                contig[0] = USM_ARRAY_C_CONTIGUOUS
-            else:
-                contig[0] = 0
-            return 0
-        else:
-            contig[0] = 0  # non-contiguous
-        return 0
-    else:
-        PyMem_Free(shape_ptr[0])
-        shape_ptr[0] = <Py_ssize_t *>(<size_t>0)
-        return ERROR_UNEXPECTED_STRIDES
-    # return ERROR_INTERNAL
-
-
-cdef object _make_int_tuple(int nd, const Py_ssize_t *ary):
-    """
-    Makes Python tuple from C array
-    """
-    cdef tuple res
-    cdef object tmp
-    if (ary):
-        res = PyTuple_New(nd)
-        for i in range(nd):
-            tmp = <object>ary[i]
-            Py_INCREF(tmp)  # SetItem steals the reference
-            PyTuple_SetItem(res, i, tmp)
-        return res
-    else:
-        return None
-
-
-cdef object _make_reversed_int_tuple(int nd, const Py_ssize_t *ary):
-    """
-    Makes Python reversed tuple from C array
-    """
-    cdef tuple res
-    cdef object tmp
-    cdef int i
-    cdef int nd_1
-    if (ary):
-        res = PyTuple_New(nd)
-        nd_1 = nd - 1
-        for i in range(nd):
-            tmp = <object>ary[i]
-            Py_INCREF(tmp)  # SetItem steals the reference
-            PyTuple_SetItem(res, nd_1 - i, tmp)
-        return res
-    else:
-        return None
-
-
-cdef object _c_contig_strides(int nd, Py_ssize_t *shape):
-    """
-    Makes Python tuple for strides of C-contiguous array
-    """
-    cdef tuple cc_strides = PyTuple_New(nd)
-    cdef object si = 1
-    cdef int i
-    cdef int nd_1 = nd - 1
-    for i in range(0, nd):
-        Py_INCREF(si)  # SetItem steals the reference
-        PyTuple_SetItem(cc_strides, nd_1 - i, si)
-        si = si * shape[nd_1 - i]
-    return cc_strides
-
-
-cdef object _f_contig_strides(int nd, Py_ssize_t *shape):
-    """
-    Makes Python tuple for strides of F-contiguous array
-    """
-    cdef tuple fc_strides = PyTuple_New(nd)
-    cdef object si = 1
-    for i in range(0, nd):
-        Py_INCREF(si)  # SetItem steals the reference
-        PyTuple_SetItem(fc_strides, i, si)
-        si = si * shape[i]
-    return fc_strides
-
-cdef object _swap_last_two(tuple t):
-    """
-    Swap last two elements of a tuple
-    """
-    cdef int nd = len(t)
-    cdef tuple res
-    cdef int i
-    cdef object tmp
-    if (nd < 2):
-        return t
-    res = PyTuple_New(nd)
-    # copy all elements except the last two
-    for i in range(0, nd-2):
-        tmp = t[i]
-        Py_INCREF(tmp)  # SetItem steals the reference
-        PyTuple_SetItem(res, i, tmp)
-    # swap the last two elements
-    tmp = t[nd-1]
-    Py_INCREF(tmp)  # SetItem steals
-    PyTuple_SetItem(res, nd - 2, tmp)
-    tmp = t[nd-2]
-    Py_INCREF(tmp)  # SetItem steals
-    PyTuple_SetItem(res, nd - 1, tmp)
-    return res
diff --git a/dpctl/tensor/_testing.py b/dpctl/tensor/_testing.py
deleted file mode 100644
index 704494d568..0000000000
--- a/dpctl/tensor/_testing.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-import dpctl.tensor as dpt
-import dpctl.utils as du
-
-from ._manipulation_functions import _broadcast_shape_impl
-from ._type_utils import _to_device_supported_dtype
-
-
-def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
-    z1r = dpt.real(z1)
-    z1i = dpt.imag(z1)
-    z2r = dpt.real(z2)
-    z2i = dpt.imag(z2)
-    if equal_nan:
-        check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
-            dpt.isnan(z1i) == dpt.isnan(z2i)
-        )
-    else:
-        check1 = (
-            dpt.logical_not(dpt.any(dpt.isnan(z1r)))
-            and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
-        ) and (
-            dpt.logical_not(dpt.any(dpt.isnan(z2r)))
-            and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
-        )
-    if not check1:
-        return check1
-    mr = dpt.isinf(z1r)
-    mi = dpt.isinf(z1i)
-    check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
-    if not check2:
-        return check2
-    check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
-    if not check3:
-        return check3
-    mr = dpt.isfinite(z1r)
-    mi = dpt.isfinite(z1i)
-    mv1 = z1r[mr]
-    mv2 = z2r[mr]
-    check4 = dpt.all(
-        dpt.abs(mv1 - mv2)
-        < dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
-    )
-    if not check4:
-        return check4
-    mv1 = z1i[mi]
-    mv2 = z2i[mi]
-    check5 = dpt.all(
-        dpt.abs(mv1 - mv2)
-        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
-    )
-    return check5
-
-
-def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
-    if equal_nan:
-        check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
-    else:
-        check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
-            dpt.any(dpt.isnan(r2))
-        )
-    if not check1:
-        return check1
-    mr = dpt.isinf(r1)
-    check2 = dpt.all(mr == dpt.isinf(r2))
-    if not check2:
-        return check2
-    check3 = dpt.all(r1[mr] == r2[mr])
-    if not check3:
-        return check3
-    m = dpt.isfinite(r1)
-    mv1 = r1[m]
-    mv2 = r2[m]
-    check4 = dpt.all(
-        dpt.abs(mv1 - mv2)
-        <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
-    )
-    return check4
-
-
-def _allclose_others(r1, r2):
-    return dpt.all(r1 == r2)
-
-
-def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
-    """allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False)
-
-    Returns True if two arrays are element-wise equal within tolerances.
-
-    The testing is based on the following elementwise comparison:
-
-           abs(a - b) <= max(atol, rtol * max(abs(a), abs(b)))
-    """
-    if not isinstance(a1, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray type, got {type(a1)}."
-        )
-    if not isinstance(a2, dpt.usm_ndarray):
-        raise TypeError(
-            f"Expected dpctl.tensor.usm_ndarray type, got {type(a2)}."
-        )
-    atol = float(atol)
-    rtol = float(rtol)
-    if atol < 0.0 or rtol < 0.0:
-        raise ValueError(
-            "Absolute and relative tolerances must be non-negative"
-        )
-    equal_nan = bool(equal_nan)
-    exec_q = du.get_execution_queue(tuple(a.sycl_queue for a in (a1, a2)))
-    if exec_q is None:
-        raise du.ExecutionPlacementError(
-            "Execution placement can not be unambiguously inferred "
-            "from input arguments."
-        )
-    res_sh = _broadcast_shape_impl([a1.shape, a2.shape])
-    b1 = a1
-    b2 = a2
-    if b1.dtype == b2.dtype:
-        res_dt = b1.dtype
-    else:
-        res_dt = np.promote_types(b1.dtype, b2.dtype)
-        res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
-        b1 = dpt.astype(b1, res_dt)
-        b2 = dpt.astype(b2, res_dt)
-
-    b1 = dpt.broadcast_to(b1, res_sh)
-    b2 = dpt.broadcast_to(b2, res_sh)
-
-    k = b1.dtype.kind
-    if k == "c":
-        return _allclose_complex_fp(b1, b2, atol, rtol, equal_nan)
-    elif k == "f":
-        return _allclose_real_fp(b1, b2, atol, rtol, equal_nan)
-    else:
-        return _allclose_others(b1, b2)
diff --git a/dpctl/tensor/_type_utils.py b/dpctl/tensor/_type_utils.py
deleted file mode 100644
index c78674ff18..0000000000
--- a/dpctl/tensor/_type_utils.py
+++ /dev/null
@@ -1,981 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import annotations
-
-import numpy as np
-
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-
-
-def _all_data_types(_fp16, _fp64):
-    _non_fp_types = [
-        dpt.bool,
-        dpt.int8,
-        dpt.uint8,
-        dpt.int16,
-        dpt.uint16,
-        dpt.int32,
-        dpt.uint32,
-        dpt.int64,
-        dpt.uint64,
-    ]
-    if _fp64:
-        if _fp16:
-            return _non_fp_types + [
-                dpt.float16,
-                dpt.float32,
-                dpt.float64,
-                dpt.complex64,
-                dpt.complex128,
-            ]
-        else:
-            return _non_fp_types + [
-                dpt.float32,
-                dpt.float64,
-                dpt.complex64,
-                dpt.complex128,
-            ]
-    else:
-        if _fp16:
-            return _non_fp_types + [
-                dpt.float16,
-                dpt.float32,
-                dpt.complex64,
-            ]
-        else:
-            return _non_fp_types + [
-                dpt.float32,
-                dpt.complex64,
-            ]
-
-
-def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool):
-    """
-    Return True if data type `dt` is the
-    maximal size inexact data type
-    """
-    if _fp64:
-        return dt in [dpt.float64, dpt.complex128]
-    return dt in [dpt.float32, dpt.complex64]
-
-
-def _dtype_supported_by_device_impl(
-    dt: dpt.dtype, has_fp16: bool, has_fp64: bool
-) -> bool:
-    if has_fp64:
-        if not has_fp16:
-            if dt is dpt.float16:
-                return False
-    else:
-        if dt is dpt.float64:
-            return False
-        elif dt is dpt.complex128:
-            return False
-        if not has_fp16 and dt is dpt.float16:
-            return False
-    return True
-
-
-def _can_cast(
-    from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe"
-) -> bool:
-    """
-    Can `from_` be cast to `to_` safely on a device with
-    fp16 and fp64 aspects as given?
-    """
-    if not _dtype_supported_by_device_impl(to_, _fp16, _fp64):
-        return False
-    can_cast_v = np.can_cast(from_, to_, casting=casting)  # ask NumPy
-    if _fp16 and _fp64:
-        return can_cast_v
-    if not can_cast_v:
-        if (
-            from_.kind in "biu"
-            and to_.kind in "fc"
-            and _is_maximal_inexact_type(to_, _fp16, _fp64)
-        ):
-            return True
-
-    return can_cast_v
-
-
-def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64):
-    if has_fp64:
-        if not has_fp16:
-            if dt is dpt.float16:
-                return dpt.float32
-    else:
-        if dt is dpt.float64:
-            return dpt.float32
-        elif dt is dpt.complex128:
-            return dpt.complex64
-        if not has_fp16 and dt is dpt.float16:
-            return dpt.float32
-    return dt
-
-
-def _to_device_supported_dtype(dt, dev):
-    has_fp16 = dev.has_aspect_fp16
-    has_fp64 = dev.has_aspect_fp64
-
-    return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64)
-
-
-def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev):
-    return True
-
-
-def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev):
-    # if the kind of result is different from the kind of input, we use the
-    # default floating-point dtype for the resulting kind. This guarantees
-    # alignment of reciprocal and divide output types.
-    if buf_dt.kind != arg_dtype.kind:
-        default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev)
-        if res_dt == default_dt:
-            return True
-        else:
-            return False
-    else:
-        return True
-
-
-def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev):
-    # negative is not defined for boolean data type
-    if arg_dtype.char == "?":
-        raise ValueError(
-            "The `negative` function, the `-` operator, is not supported "
-            "for inputs of data type bool, use the `~` operator or the "
-            "`logical_not` function instead"
-        )
-    else:
-        return True
-
-
-def _acceptance_fn_subtract(
-    arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev
-):
-    # subtract is not defined for boolean data type
-    if arg1_dtype.char == "?" and arg2_dtype.char == "?":
-        raise ValueError(
-            "The `subtract` function, the `-` operator, is not supported "
-            "for inputs of data type bool, use the `^` operator,  the "
-            "`bitwise_xor`, or the `logical_xor` function instead"
-        )
-    else:
-        return True
-
-
-def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn):
-    res_dt = query_fn(arg_dtype)
-    if res_dt:
-        return None, res_dt
-
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    all_dts = _all_data_types(_fp16, _fp64)
-    for buf_dt in all_dts:
-        if _can_cast(arg_dtype, buf_dt, _fp16, _fp64):
-            res_dt = query_fn(buf_dt)
-            if res_dt:
-                acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev)
-                if acceptable:
-                    return buf_dt, res_dt
-                else:
-                    continue
-
-    return None, None
-
-
-def _get_device_default_dtype(dt_kind, sycl_dev):
-    if dt_kind == "b":
-        return dpt.dtype(ti.default_device_bool_type(sycl_dev))
-    elif dt_kind == "i":
-        return dpt.dtype(ti.default_device_int_type(sycl_dev))
-    elif dt_kind == "u":
-        return dpt.dtype(ti.default_device_uint_type(sycl_dev))
-    elif dt_kind == "f":
-        return dpt.dtype(ti.default_device_fp_type(sycl_dev))
-    elif dt_kind == "c":
-        return dpt.dtype(ti.default_device_complex_type(sycl_dev))
-    raise RuntimeError
-
-
-def _acceptance_fn_default_binary(
-    arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev
-):
-    return True
-
-
-def _acceptance_fn_divide(
-    arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev
-):
-    # both are being promoted, if the kind of result is
-    # different than the kind of original input dtypes,
-    # we use default dtype for the resulting kind.
-    # This covers, e.g. (array_dtype_i1 / array_dtype_u1)
-    # result of which in divide is double (in NumPy), but
-    # regular type promotion rules peg at float16
-    if (ret_buf1_dt.kind != arg1_dtype.kind) and (
-        ret_buf2_dt.kind != arg2_dtype.kind
-    ):
-        default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev)
-        if res_dt == default_dt:
-            return True
-        else:
-            return False
-    else:
-        return True
-
-
-def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn):
-    res_dt = query_fn(arg1_dtype, arg2_dtype)
-    if res_dt:
-        return None, None, res_dt
-
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    all_dts = _all_data_types(_fp16, _fp64)
-    for buf1_dt in all_dts:
-        for buf2_dt in all_dts:
-            if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast(
-                arg2_dtype, buf2_dt, _fp16, _fp64
-            ):
-                res_dt = query_fn(buf1_dt, buf2_dt)
-                if res_dt:
-                    ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt
-                    ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt
-                    if ret_buf1_dt is None or ret_buf2_dt is None:
-                        return ret_buf1_dt, ret_buf2_dt, res_dt
-                    else:
-                        acceptable = acceptance_fn(
-                            arg1_dtype,
-                            arg2_dtype,
-                            ret_buf1_dt,
-                            ret_buf2_dt,
-                            res_dt,
-                            sycl_dev,
-                        )
-                        if acceptable:
-                            return ret_buf1_dt, ret_buf2_dt, res_dt
-                        else:
-                            continue
-
-    return None, None, None
-
-
-def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev):
-    res_dt = query_fn(arg1_dtype, arg2_dtype)
-    if res_dt:
-        return None, res_dt
-
-    _fp16 = sycl_dev.has_aspect_fp16
-    _fp64 = sycl_dev.has_aspect_fp64
-    if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"):
-        res_dt = query_fn(arg1_dtype, arg1_dtype)
-        if res_dt:
-            return arg1_dtype, res_dt
-
-    return None, None
-
-
-class WeakBooleanType:
-    "Python type representing type of Python boolean objects"
-
-    def __init__(self, o):
-        self.o_ = o
-
-    def get(self):
-        return self.o_
-
-
-class WeakIntegralType:
-    "Python type representing type of Python integral objects"
-
-    def __init__(self, o):
-        self.o_ = o
-
-    def get(self):
-        return self.o_
-
-
-class WeakFloatingType:
-    """Python type representing type of Python floating point objects"""
-
-    def __init__(self, o):
-        self.o_ = o
-
-    def get(self):
-        return self.o_
-
-
-class WeakComplexType:
-    """Python type representing type of Python complex floating point objects"""
-
-    def __init__(self, o):
-        self.o_ = o
-
-    def get(self):
-        return self.o_
-
-
-def _weak_type_num_kind(o):
-    _map = {"?": 0, "i": 1, "f": 2, "c": 3}
-    if isinstance(o, WeakBooleanType):
-        return _map["?"]
-    if isinstance(o, WeakIntegralType):
-        return _map["i"]
-    if isinstance(o, WeakFloatingType):
-        return _map["f"]
-    if isinstance(o, WeakComplexType):
-        return _map["c"]
-    raise TypeError(
-        f"Unexpected type {o} while expecting "
-        "`WeakBooleanType`, `WeakIntegralType`,"
-        "`WeakFloatingType`, or `WeakComplexType`."
-    )
-
-
-def _strong_dtype_num_kind(o):
-    _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3}
-    if not isinstance(o, dpt.dtype):
-        raise TypeError
-    k = o.kind
-    if k in _map:
-        return _map[k]
-    raise ValueError(f"Unrecognized kind {k} for dtype {o}")
-
-
-def _is_weak_dtype(dtype):
-    return isinstance(
-        dtype,
-        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
-    )
-
-
-def _resolve_weak_types(o1_dtype, o2_dtype, dev):
-    "Resolves weak data type per NEP-0050"
-    if _is_weak_dtype(o1_dtype):
-        if _is_weak_dtype(o2_dtype):
-            raise ValueError
-        o1_kind_num = _weak_type_num_kind(o1_dtype)
-        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
-        if o1_kind_num > o2_kind_num:
-            if isinstance(o1_dtype, WeakIntegralType):
-                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
-            if isinstance(o1_dtype, WeakComplexType):
-                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
-                    return dpt.complex64, o2_dtype
-                return (
-                    _to_device_supported_dtype(dpt.complex128, dev),
-                    o2_dtype,
-                )
-            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
-        else:
-            return o2_dtype, o2_dtype
-    elif _is_weak_dtype(o2_dtype):
-        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
-        o2_kind_num = _weak_type_num_kind(o2_dtype)
-        if o2_kind_num > o1_kind_num:
-            if isinstance(o2_dtype, WeakIntegralType):
-                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
-            if isinstance(o2_dtype, WeakComplexType):
-                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
-                    return o1_dtype, dpt.complex64
-                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
-            return (
-                o1_dtype,
-                _to_device_supported_dtype(dpt.float64, dev),
-            )
-        else:
-            return o1_dtype, o1_dtype
-    else:
-        return o1_dtype, o2_dtype
-
-
-def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
-    "Resolves weak data type per NEP-0050 for comparisons and"
-    " divide, where result type is known and special behavior"
-    "is needed to handle mixed integer kinds and Python integers"
-    "without overflow"
-    if _is_weak_dtype(o1_dtype):
-        if _is_weak_dtype(o2_dtype):
-            raise ValueError
-        o1_kind_num = _weak_type_num_kind(o1_dtype)
-        o2_kind_num = _strong_dtype_num_kind(o2_dtype)
-        if o1_kind_num > o2_kind_num:
-            if isinstance(o1_dtype, WeakIntegralType):
-                return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype
-            if isinstance(o1_dtype, WeakComplexType):
-                if o2_dtype is dpt.float16 or o2_dtype is dpt.float32:
-                    return dpt.complex64, o2_dtype
-                return (
-                    _to_device_supported_dtype(dpt.complex128, dev),
-                    o2_dtype,
-                )
-            return _to_device_supported_dtype(dpt.float64, dev), o2_dtype
-        else:
-            if o1_kind_num == o2_kind_num and isinstance(
-                o1_dtype, WeakIntegralType
-            ):
-                o1_val = o1_dtype.get()
-                o2_iinfo = dpt.iinfo(o2_dtype)
-                if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max):
-                    return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype
-            return o2_dtype, o2_dtype
-    elif _is_weak_dtype(o2_dtype):
-        o1_kind_num = _strong_dtype_num_kind(o1_dtype)
-        o2_kind_num = _weak_type_num_kind(o2_dtype)
-        if o2_kind_num > o1_kind_num:
-            if isinstance(o2_dtype, WeakIntegralType):
-                return o1_dtype, dpt.dtype(ti.default_device_int_type(dev))
-            if isinstance(o2_dtype, WeakComplexType):
-                if o1_dtype is dpt.float16 or o1_dtype is dpt.float32:
-                    return o1_dtype, dpt.complex64
-                return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev)
-            return (
-                o1_dtype,
-                _to_device_supported_dtype(dpt.float64, dev),
-            )
-        else:
-            if o1_kind_num == o2_kind_num and isinstance(
-                o2_dtype, WeakIntegralType
-            ):
-                o2_val = o2_dtype.get()
-                o1_iinfo = dpt.iinfo(o1_dtype)
-                if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max):
-                    return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val))
-            return o1_dtype, o1_dtype
-    else:
-        return o1_dtype, o2_dtype
-
-
-def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
-    "Resolves weak data types per NEP-0050,"
-    "where the second and third arguments are"
-    "permitted to be weak types"
-    if _is_weak_dtype(st_dtype):
-        raise ValueError
-    if _is_weak_dtype(dtype1):
-        if _is_weak_dtype(dtype2):
-            kind_num1 = _weak_type_num_kind(dtype1)
-            kind_num2 = _weak_type_num_kind(dtype2)
-            st_kind_num = _strong_dtype_num_kind(st_dtype)
-
-            if kind_num1 > st_kind_num:
-                if isinstance(dtype1, WeakIntegralType):
-                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
-                elif isinstance(dtype1, WeakComplexType):
-                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
-                        ret_dtype1 = dpt.complex64
-                    ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev)
-                else:
-                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
-            else:
-                ret_dtype1 = st_dtype
-
-            if kind_num2 > st_kind_num:
-                if isinstance(dtype2, WeakIntegralType):
-                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
-                elif isinstance(dtype2, WeakComplexType):
-                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
-                        ret_dtype2 = dpt.complex64
-                    ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev)
-                else:
-                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
-            else:
-                ret_dtype2 = st_dtype
-
-            return ret_dtype1, ret_dtype2
-
-        max_dt_num_kind, max_dtype = max(
-            [
-                (_strong_dtype_num_kind(st_dtype), st_dtype),
-                (_strong_dtype_num_kind(dtype2), dtype2),
-            ]
-        )
-        dt1_kind_num = _weak_type_num_kind(dtype1)
-        if dt1_kind_num > max_dt_num_kind:
-            if isinstance(dtype1, WeakIntegralType):
-                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
-            if isinstance(dtype1, WeakComplexType):
-                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
-                    return dpt.complex64, dtype2
-                return (
-                    _to_device_supported_dtype(dpt.complex128, dev),
-                    dtype2,
-                )
-            return _to_device_supported_dtype(dpt.float64, dev), dtype2
-        else:
-            return max_dtype, dtype2
-    elif _is_weak_dtype(dtype2):
-        max_dt_num_kind, max_dtype = max(
-            [
-                (_strong_dtype_num_kind(st_dtype), st_dtype),
-                (_strong_dtype_num_kind(dtype1), dtype1),
-            ]
-        )
-        dt2_kind_num = _weak_type_num_kind(dtype2)
-        if dt2_kind_num > max_dt_num_kind:
-            if isinstance(dtype2, WeakIntegralType):
-                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
-            if isinstance(dtype2, WeakComplexType):
-                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
-                    return dtype1, dpt.complex64
-                return (
-                    dtype1,
-                    _to_device_supported_dtype(dpt.complex128, dev),
-                )
-            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
-        else:
-            return dtype1, max_dtype
-    else:
-        # both are strong dtypes
-        # return unmodified
-        return dtype1, dtype2
-
-
-def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
-    "Resolves one weak data type with one strong data type per NEP-0050"
-    if _is_weak_dtype(st_dtype):
-        raise ValueError
-    if _is_weak_dtype(dtype):
-        st_kind_num = _strong_dtype_num_kind(st_dtype)
-        kind_num = _weak_type_num_kind(dtype)
-        if kind_num > st_kind_num:
-            if isinstance(dtype, WeakIntegralType):
-                return dpt.dtype(ti.default_device_int_type(dev))
-            if isinstance(dtype, WeakComplexType):
-                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
-                    return dpt.complex64
-                return _to_device_supported_dtype(dpt.complex128, dev)
-            return _to_device_supported_dtype(dpt.float64, dev)
-        else:
-            return st_dtype
-    else:
-        return dtype
-
-
-class finfo_object:
-    """
-    `numpy.finfo` subclass which returns Python floating-point scalars for
-    `eps`, `max`, `min`, and `smallest_normal` attributes.
-    """
-
-    def __init__(self, dtype):
-        _supported_dtype([dpt.dtype(dtype)])
-        self._finfo = np.finfo(dtype)
-
-    @property
-    def bits(self):
-        """
-        number of bits occupied by the real-valued floating-point data type.
-        """
-        return int(self._finfo.bits)
-
-    @property
-    def smallest_normal(self):
-        """
-        smallest positive real-valued floating-point number with full
-        precision.
-        """
-        return float(self._finfo.smallest_normal)
-
-    @property
-    def tiny(self):
-        """an alias for `smallest_normal`"""
-        return float(self._finfo.tiny)
-
-    @property
-    def eps(self):
-        """
-        difference between 1.0 and the next smallest representable real-valued
-        floating-point number larger than 1.0 according to the IEEE-754
-        standard.
-        """
-        return float(self._finfo.eps)
-
-    @property
-    def epsneg(self):
-        """
-        difference between 1.0 and the next smallest representable real-valued
-        floating-point number smaller than 1.0 according to the IEEE-754
-        standard.
-        """
-        return float(self._finfo.epsneg)
-
-    @property
-    def min(self):
-        """smallest representable real-valued number."""
-        return float(self._finfo.min)
-
-    @property
-    def max(self):
-        "largest representable real-valued number."
-        return float(self._finfo.max)
-
-    @property
-    def resolution(self):
-        "the approximate decimal resolution of this type."
-        return float(self._finfo.resolution)
-
-    @property
-    def precision(self):
-        """
-        the approximate number of decimal digits to which this kind of
-        floating point type is precise.
-        """
-        return float(self._finfo.precision)
-
-    @property
-    def dtype(self):
-        """
-        the dtype for which finfo returns information. For complex input, the
-        returned dtype is the associated floating point dtype for its real and
-        complex components.
-        """
-        return self._finfo.dtype
-
-    def __str__(self):
-        return self._finfo.__str__()
-
-    def __repr__(self):
-        return self._finfo.__repr__()
-
-
-def can_cast(from_, to, /, *, casting="safe") -> bool:
-    """ can_cast(from, to, casting="safe")
-
-    Determines if one data type can be cast to another data type according \
-    to Type Promotion Rules.
-
-    Args:
-       from_ (Union[usm_ndarray, dtype]):
-           source data type. If `from_` is an array, a device-specific type
-           promotion rules apply.
-       to (dtype):
-           target data type
-       casting (Optional[str]):
-            controls what kind of data casting may occur.
-
-                * "no" means data types should not be cast at all.
-                * "safe" means only casts that preserve values are allowed.
-                * "same_kind" means only safe casts and casts within a kind,
-                  like `float64` to `float32`, are allowed.
-                * "unsafe" means any data conversion can be done.
-
-            Default: `"safe"`.
-
-    Returns:
-        bool:
-            Gives `True` if cast can occur according to the casting rule.
-
-    Device-specific type promotion rules take into account which data type are
-    and are not supported by a specific device.
-    """
-    if isinstance(to, dpt.usm_ndarray):
-        raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.")
-
-    dtype_to = dpt.dtype(to)
-    _supported_dtype([dtype_to])
-
-    if isinstance(from_, dpt.usm_ndarray):
-        dtype_from = from_.dtype
-        return _can_cast(
-            dtype_from,
-            dtype_to,
-            from_.sycl_device.has_aspect_fp16,
-            from_.sycl_device.has_aspect_fp64,
-            casting=casting,
-        )
-    else:
-        dtype_from = dpt.dtype(from_)
-        _supported_dtype([dtype_from])
-        # query casting as if all dtypes are supported
-        return _can_cast(dtype_from, dtype_to, True, True, casting=casting)
-
-
-def result_type(*arrays_and_dtypes):
-    """
-    result_type(*arrays_and_dtypes)
-
-    Returns the dtype that results from applying the Type Promotion Rules to \
-        the arguments.
-
-    Args:
-        arrays_and_dtypes (Union[usm_ndarray, dtype]):
-            An arbitrary length sequence of usm_ndarray objects or dtypes.
-
-    Returns:
-        dtype:
-            The dtype resulting from an operation involving the
-            input arrays and dtypes.
-    """
-    dtypes = []
-    devices = []
-    weak_dtypes = []
-    for arg_i in arrays_and_dtypes:
-        if isinstance(arg_i, dpt.usm_ndarray):
-            devices.append(arg_i.sycl_device)
-            dtypes.append(arg_i.dtype)
-        elif isinstance(arg_i, int):
-            weak_dtypes.append(WeakIntegralType(arg_i))
-        elif isinstance(arg_i, float):
-            weak_dtypes.append(WeakFloatingType(arg_i))
-        elif isinstance(arg_i, complex):
-            weak_dtypes.append(WeakComplexType(arg_i))
-        elif isinstance(arg_i, bool):
-            weak_dtypes.append(WeakBooleanType(arg_i))
-        else:
-            dt = dpt.dtype(arg_i)
-            _supported_dtype([dt])
-            dtypes.append(dt)
-
-    has_fp16 = True
-    has_fp64 = True
-    target_dev = None
-    if devices:
-        inspected = False
-        for d in devices:
-            if inspected:
-                unsame_fp16_support = d.has_aspect_fp16 != has_fp16
-                unsame_fp64_support = d.has_aspect_fp64 != has_fp64
-                if unsame_fp16_support or unsame_fp64_support:
-                    raise ValueError(
-                        "Input arrays reside on devices "
-                        "with different device supports; "
-                        "unable to determine which "
-                        "device-specific type promotion rules "
-                        "to use."
-                    )
-            else:
-                has_fp16 = d.has_aspect_fp16
-                has_fp64 = d.has_aspect_fp64
-                target_dev = d
-                inspected = True
-
-    if not dtypes and weak_dtypes:
-        dtypes.append(weak_dtypes[0].get())
-
-    if not (has_fp16 and has_fp64):
-        for dt in dtypes:
-            if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64):
-                raise ValueError(
-                    f"Argument {dt} is not supported by the device"
-                )
-        res_dt = np.result_type(*dtypes)
-        res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64)
-        for wdt in weak_dtypes:
-            pair = _resolve_weak_types(wdt, res_dt, target_dev)
-            res_dt = np.result_type(*pair)
-            res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64)
-    else:
-        res_dt = np.result_type(*dtypes)
-        if weak_dtypes:
-            weak_dt_obj = [wdt.get() for wdt in weak_dtypes]
-            res_dt = np.result_type(res_dt, *weak_dt_obj)
-
-    return res_dt
-
-
-def iinfo(dtype, /):
-    """iinfo(dtype)
-
-    Returns machine limits for integer data types.
-
-    Args:
-        dtype (dtype, usm_ndarray):
-            integer dtype or
-            an array with integer dtype.
-
-    Returns:
-        iinfo_object:
-            An object with the following attributes:
-
-            * bits: int
-                number of bits occupied by the data type
-            * max: int
-                largest representable number.
-            * min: int
-                smallest representable number.
-            * dtype: dtype
-                integer data type.
-    """
-    if isinstance(dtype, dpt.usm_ndarray):
-        dtype = dtype.dtype
-    _supported_dtype([dpt.dtype(dtype)])
-    return np.iinfo(dtype)
-
-
-def finfo(dtype, /):
-    """finfo(type)
-
-    Returns machine limits for floating-point data types.
-
-    Args:
-        dtype (dtype, usm_ndarray): floating-point dtype or
-            an array with floating point data type.
-            If complex, the information is about its component
-            data type.
-
-    Returns:
-        finfo_object:
-            an object have the following attributes:
-
-                * bits: int
-                    number of bits occupied by dtype.
-                * eps: float
-                    difference between 1.0 and the next smallest representable
-                    real-valued floating-point number larger than 1.0 according
-                    to the IEEE-754 standard.
-                * max: float
-                    largest representable real-valued number.
-                * min: float
-                    smallest representable real-valued number.
-                * smallest_normal: float
-                    smallest positive real-valued floating-point number with
-                    full precision.
-                * dtype: dtype
-                    real-valued floating-point data type.
-
-    """
-    if isinstance(dtype, dpt.usm_ndarray):
-        dtype = dtype.dtype
-    _supported_dtype([dpt.dtype(dtype)])
-    return finfo_object(dtype)
-
-
-def _supported_dtype(dtypes):
-    for dtype in dtypes:
-        if dtype.char not in "?bBhHiIlLqQefdFD":
-            raise ValueError(f"Dpctl doesn't support dtype {dtype}.")
-    return True
-
-
-def isdtype(dtype, kind):
-    """isdtype(dtype, kind)
-
-    Returns a boolean indicating whether a provided `dtype` is
-    of a specified data type `kind`.
-
-    See [array API](array_api) for more information.
-
-    [array_api]: https://data-apis.org/array-api/latest/
-    """
-
-    if not isinstance(dtype, np.dtype):
-        raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}")
-
-    if isinstance(kind, np.dtype):
-        return dtype == kind
-
-    elif isinstance(kind, str):
-        if kind == "bool":
-            return dtype == np.dtype("bool")
-        elif kind == "signed integer":
-            return dtype.kind == "i"
-        elif kind == "unsigned integer":
-            return dtype.kind == "u"
-        elif kind == "integral":
-            return dtype.kind in "iu"
-        elif kind == "real floating":
-            return dtype.kind == "f"
-        elif kind == "complex floating":
-            return dtype.kind == "c"
-        elif kind == "numeric":
-            return dtype.kind in "iufc"
-        else:
-            raise ValueError(f"Unrecognized data type kind: {kind}")
-
-    elif isinstance(kind, tuple):
-        return any(isdtype(dtype, k) for k in kind)
-
-    else:
-        raise TypeError(f"Unsupported data type kind: {kind}")
-
-
-def _default_accumulation_dtype(inp_dt, q):
-    """Gives default output data type for given input data
-    type `inp_dt` when accumulation is performed on queue `q`
-    """
-    inp_kind = inp_dt.kind
-    if inp_kind in "bi":
-        res_dt = dpt.dtype(ti.default_device_int_type(q))
-        if inp_dt.itemsize > res_dt.itemsize:
-            res_dt = inp_dt
-    elif inp_kind in "u":
-        res_dt = dpt.dtype(ti.default_device_uint_type(q))
-        res_ii = dpt.iinfo(res_dt)
-        inp_ii = dpt.iinfo(inp_dt)
-        if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max:
-            pass
-        else:
-            res_dt = inp_dt
-    elif inp_kind in "fc":
-        res_dt = inp_dt
-
-    return res_dt
-
-
-def _default_accumulation_dtype_fp_types(inp_dt, q):
-    """Gives default output data type for given input data
-    type `inp_dt` when accumulation is performed on queue `q`
-    and the accumulation supports only floating-point data types
-    """
-    inp_kind = inp_dt.kind
-    if inp_kind in "biu":
-        res_dt = dpt.dtype(ti.default_device_fp_type(q))
-        can_cast_v = dpt.can_cast(inp_dt, res_dt)
-        if not can_cast_v:
-            _fp64 = q.sycl_device.has_aspect_fp64
-            res_dt = dpt.float64 if _fp64 else dpt.float32
-    elif inp_kind in "f":
-        res_dt = inp_dt
-    elif inp_kind in "c":
-        raise ValueError("function not defined for complex types")
-
-    return res_dt
-
-
-__all__ = [
-    "_find_buf_dtype",
-    "_find_buf_dtype2",
-    "_to_device_supported_dtype",
-    "_acceptance_fn_default_unary",
-    "_acceptance_fn_reciprocal",
-    "_acceptance_fn_default_binary",
-    "_acceptance_fn_divide",
-    "_acceptance_fn_negative",
-    "_acceptance_fn_subtract",
-    "_resolve_one_strong_one_weak_types",
-    "_resolve_one_strong_two_weak_types",
-    "_resolve_weak_types",
-    "_resolve_weak_types_all_py_ints",
-    "_weak_type_num_kind",
-    "_strong_dtype_num_kind",
-    "can_cast",
-    "finfo",
-    "iinfo",
-    "isdtype",
-    "result_type",
-    "WeakBooleanType",
-    "WeakIntegralType",
-    "WeakFloatingType",
-    "WeakComplexType",
-    "_default_accumulation_dtype",
-    "_default_accumulation_dtype_fp_types",
-    "_find_buf_dtype_in_place_op",
-]
diff --git a/dpctl/tensor/_types.pxi b/dpctl/tensor/_types.pxi
deleted file mode 100644
index c36147bb1c..0000000000
--- a/dpctl/tensor/_types.pxi
+++ /dev/null
@@ -1,157 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# these typenum values are aligned to values in NumPy
-cdef:
-    int UAR_BOOL = 0  # pragma: no cover
-    int UAR_BYTE = 1  # pragma: no cover
-    int UAR_UBYTE = 2  # pragma: no cover
-    int UAR_SHORT = 3  # pragma: no cover
-    int UAR_USHORT = 4  # pragma: no cover
-    int UAR_INT = 5  # pragma: no cover
-    int UAR_UINT = 6  # pragma: no cover
-    int UAR_LONG = 7  # pragma: no cover
-    int UAR_ULONG = 8  # pragma: no cover
-    int UAR_LONGLONG = 9  # pragma: no cover
-    int UAR_ULONGLONG = 10  # pragma: no cover
-    int UAR_FLOAT = 11  # pragma: no cover
-    int UAR_DOUBLE = 12  # pragma: no cover
-    int UAR_CFLOAT = 14  # pragma: no cover
-    int UAR_CDOUBLE = 15  # pragma: no cover
-    int UAR_TYPE_SENTINEL = 17  # pragma: no cover
-    int UAR_HALF = 23  # pragma: no cover
-
-cdef int type_bytesize(int typenum):
-    """
-    NPY_BOOL=0         : 1
-    NPY_BYTE=1         : 1
-    NPY_UBYTE=2        : 1
-    NPY_SHORT=3        : 2
-    NPY_USHORT=4       : 2
-    NPY_INT=5          : sizeof(int)
-    NPY_UINT=6         : sizeof(unsigned int)
-    NPY_LONG=7         : sizeof(long)
-    NPY_ULONG=8        : sizeof(unsigned long)
-    NPY_LONGLONG=9     : 8
-    NPY_ULONGLONG=10   : 8
-    NPY_FLOAT=11       : 4
-    NPY_DOUBLE=12      : 8
-    NPY_LONGDOUBLE=13  : N/A
-    NPY_CFLOAT=14      : 8
-    NPY_CDOUBLE=15     : 16
-    NPY_CLONGDOUBLE=16 : N/A
-    NPY_HALF=23        : 2
-    """
-    cdef int *type_to_bytesize = [
-        1,
-        sizeof(char),
-        sizeof(unsigned char),
-        sizeof(short),
-        sizeof(unsigned short),
-        sizeof(int),
-        sizeof(unsigned int),
-        sizeof(long),
-        sizeof(unsigned long),
-        sizeof(long long),
-        sizeof(unsigned long long),
-        sizeof(float),
-        sizeof(double), -1,
-        sizeof(float complex),
-        sizeof(double complex), -1]
-
-    if typenum < 0:  # pragma: no cover
-        return -1
-    if typenum > 16:
-        if typenum == 23:
-            return 2
-        return -1
-
-    return type_to_bytesize[typenum]
-
-
-cdef str _make_typestr(int typenum):
-    """
-    Make typestring from type number
-    """
-    cdef type_to_str = ["|b", "|i", "|u", "|i", "|u",
-                        "|i", "|u", "|i", "|u", "|i", "|u",
-                        "|f", "|f", "", "|c", "|c", ""]
-
-    if (typenum < 0):  # pragma: no cover
-        return ""
-    if (typenum > 16):
-        if (typenum == 23):
-            return "|f2"
-        return ""  # pragma: no cover
-
-    return type_to_str[typenum] + str(type_bytesize(typenum))
-
-
-cdef int typenum_from_format(str s):
-    """
-    Internal utility to convert string describing type format
-
-    Format is [<|=>][biufc]#
-    Shortcuts for formats are i, u, d, D
-    """
-    if not s:
-        return -1
-    try:
-        dt = np.dtype(s)
-    except Exception:
-        return -1
-    if (dt.byteorder == ">"):
-        return -2
-    return dt.num
-
-
-cdef int descr_to_typenum(object dtype):
-    """
-    Returns typenum for argumentd dtype that has attribute descr,
-    assumed numpy.dtype
-    """
-    obj = getattr(dtype, "descr")
-    if (not isinstance(obj, list) or len(obj) != 1):
-        return -1    # token for ValueError
-    obj = obj[0]
-    if (
-        not isinstance(obj, tuple) or len(obj) != 2 or obj[0]
-    ):  # pragma: no cover
-        return -1
-    obj = obj[1]
-    if not isinstance(obj, str):  # pragma: no cover
-        return -1
-    return typenum_from_format(obj)
-
-
-cdef int dtype_to_typenum(dtype):
-    if isinstance(dtype, str):
-        return typenum_from_format(dtype)
-    elif isinstance(dtype, bytes):
-        return typenum_from_format(dtype.decode("UTF-8"))
-    elif hasattr(dtype, "descr"):
-        return descr_to_typenum(dtype)
-    else:
-        try:
-            dt = np.dtype(dtype)
-        except TypeError:
-            return -3
-        except Exception:  # pragma: no cover
-            return -1
-        if hasattr(dt, "descr"):
-            return descr_to_typenum(dt)
-        else:  # pragma: no cover
-            return -3  # token for TypeError
diff --git a/dpctl/tensor/_usmarray.pxd b/dpctl/tensor/_usmarray.pxd
deleted file mode 100644
index 861d7f01a0..0000000000
--- a/dpctl/tensor/_usmarray.pxd
+++ /dev/null
@@ -1,76 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-
-cimport dpctl
-
-
-cdef public api int USM_ARRAY_C_CONTIGUOUS
-cdef public api int USM_ARRAY_F_CONTIGUOUS
-cdef public api int USM_ARRAY_WRITABLE
-
-cdef public api int UAR_BOOL
-cdef public api int UAR_BYTE
-cdef public api int UAR_UBYTE
-cdef public api int UAR_SHORT
-cdef public api int UAR_USHORT
-cdef public api int UAR_INT
-cdef public api int UAR_UINT
-cdef public api int UAR_LONG
-cdef public api int UAR_ULONG
-cdef public api int UAR_LONGLONG
-cdef public api int UAR_ULONGLONG
-cdef public api int UAR_FLOAT
-cdef public api int UAR_DOUBLE
-cdef public api int UAR_CFLOAT
-cdef public api int UAR_CDOUBLE
-cdef public api int UAR_TYPE_SENTINEL
-cdef public api int UAR_HALF
-
-
-cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]:
-    # data fields
-    cdef char* data_
-    cdef int nd_
-    cdef Py_ssize_t *shape_
-    cdef Py_ssize_t *strides_
-    cdef int typenum_
-    cdef int flags_
-    cdef object base_
-    cdef object array_namespace_
-    # make usm_ndarray weak-referenceable
-    cdef object __weakref__
-
-    cdef void _reset(usm_ndarray self)
-    cdef void _cleanup(usm_ndarray self)
-    cdef Py_ssize_t get_offset(usm_ndarray self) except *
-
-    cdef char* get_data(self)
-    cdef int get_ndim(self)
-    cdef Py_ssize_t * get_shape(self)
-    cdef Py_ssize_t * get_strides(self)
-    cdef int get_typenum(self)
-    cdef int get_itemsize(self)
-    cdef int get_flags(self)
-    cdef object get_base(self)
-    cdef dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *
-    cdef dpctl.SyclQueue get_sycl_queue(self)
-
-    cdef _set_writable_flag(self, int)
-
-    cdef __cythonbufferdefaults__ = {"mode": "strided"}
diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx
deleted file mode 100644
index 8b9cf92a42..0000000000
--- a/dpctl/tensor/_usmarray.pyx
+++ /dev/null
@@ -1,1967 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-# cython: linetrace=True
-
-import numpy as np
-
-import dpctl
-import dpctl.memory as dpmem
-
-from .._backend cimport DPCTLSyclUSMRef
-from .._sycl_device_factory cimport _cached_default_device
-
-from ._data_types import bool as dpt_bool
-from ._device import Device
-from ._print import usm_ndarray_repr, usm_ndarray_str
-
-from cpython.mem cimport PyMem_Free
-from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
-
-cimport dpctl as c_dpctl
-cimport dpctl.memory as c_dpmem
-cimport dpctl.tensor._dlpack as c_dlpack
-
-from ._dlpack import get_build_dlpack_version
-
-from .._sycl_device_factory cimport _cached_default_device
-
-from enum import IntEnum
-
-import dpctl.tensor._flags as _flags
-from dpctl.tensor._tensor_impl import default_device_fp_type
-
-include "_stride_utils.pxi"
-include "_types.pxi"
-include "_slicing.pxi"
-
-
-class DLDeviceType(IntEnum):
-    """
-    An :class:`enum.IntEnum` for the types of DLDevices supported by the DLPack
-    protocol.
-
-        ``kDLCPU``:
-            CPU (host) device
-        ``kDLCUDA``:
-            CUDA GPU device
-        ``kDLCUDAHost``:
-            Pinned CUDA CPU memory by cudaMallocHost
-        ``kDLOpenCL``:
-            OpenCL device
-        ``kDLVulkan``:
-            Vulkan buffer
-        ``kDLMetal``:
-            Metal for Apple GPU
-        ``kDLVPI``:
-            Verilog simulator buffer
-        ``kDLROCM``:
-            ROCm GPU device
-        ``kDLROCMHost``:
-            Pinned ROCm CPU memory allocated by hipMallocHost
-        ``kDLExtDev``:
-            Reserved extension device type used to test new devices
-        ``kDLCUDAManaged``:
-            CUDA managed/unified memory allocated by cudaMallocManaged
-        ``kDLOneAPI``:
-            Unified shared memory allocated on a oneAPI non-partitioned device
-        ``kDLWebGPU``:
-            Device support for WebGPU standard
-        ``kDLHexagon``:
-            Qualcomm Hexagon DSP
-        ``kDLMAIA``:
-            Microsoft MAIA device
-        ``kDLTrn``:
-            AWS Trainium device
-    """
-    kDLCPU = c_dlpack.device_CPU
-    kDLCUDA = c_dlpack.device_CUDA
-    kDLCUDAHost = c_dlpack.device_CUDAHost
-    kDLCUDAManaged = c_dlpack.device_CUDAManaged
-    kDLROCM = c_dlpack.device_DLROCM
-    kDLROCMHost = c_dlpack.device_ROCMHost
-    kDLOpenCL = c_dlpack.device_OpenCL
-    kDLVulkan = c_dlpack.device_Vulkan
-    kDLMetal = c_dlpack.device_Metal
-    kDLVPI = c_dlpack.device_VPI
-    kDLOneAPI = c_dlpack.device_OneAPI
-    kDLWebGPU = c_dlpack.device_WebGPU
-    kDLHexagon = c_dlpack.device_Hexagon
-    kDLMAIA = c_dlpack.device_MAIA
-    kDLTrn = c_dlpack.device_Trn
-
-
-cdef class InternalUSMArrayError(Exception):
-    """
-    An InternalUSMArrayError exception is raised when internal
-    inconsistency has been detected in :class:`.usm_ndarray`.
-    """
-    pass
-
-
-cdef object _as_zero_dim_ndarray(object usm_ary):
-    "Convert size-1 array to NumPy 0d array"
-    mem_view = dpmem.as_usm_memory(usm_ary)
-    usm_ary.sycl_queue.wait()
-    host_buf = mem_view.copy_to_host()
-    view = host_buf.view(usm_ary.dtype)
-    view.shape = tuple()
-    return view
-
-
-cdef inline void _check_0d_scalar_conversion(object usm_ary) except *:
-    "Raise TypeError if array cannot be converted to a Python scalar"
-    if (usm_ary.ndim != 0):
-        raise TypeError(
-            "only 0-dimensional arrays can be converted to Python scalars"
-        )
-
-
-cdef int _copy_writable(int lhs_flags, int rhs_flags):
-    "Copy the WRITABLE flag to lhs_flags from rhs_flags"
-    return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE)
-
-
-cdef bint _is_host_cpu(object dl_device):
-    "Check if dl_device denotes (kDLCPU, 0)"
-    cdef object dl_type
-    cdef object dl_id
-    cdef Py_ssize_t n_elems = -1
-
-    try:
-        n_elems = len(dl_device)
-    except TypeError:
-        pass
-
-    if n_elems != 2:
-        return False
-
-    dl_type = dl_device[0]
-    dl_id = dl_device[1]
-    if isinstance(dl_type, str):
-        return (dl_type == "kDLCPU" and dl_id == 0)
-
-    return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0)
-
-
-cdef void _validate_and_use_stream(
-    object stream, c_dpctl.SyclQueue self_queue
-) except *:
-    if (stream is None or stream == self_queue):
-        pass
-    else:
-        if not isinstance(stream, dpctl.SyclQueue):
-            raise TypeError(
-                "stream argument type was expected to be dpctl.SyclQueue,"
-                f" got {type(stream)} instead"
-            )
-        ev = self_queue.submit_barrier()
-        stream.submit_barrier(dependent_events=[ev])
-
-cdef class usm_ndarray:
-    """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \
-           offset=0, order="C", buffer_ctor_kwargs=dict(), \
-           array_namespace=None)
-
-    An array object represents a multidimensional tensor of numeric
-    elements stored in a USM allocation on a SYCL device.
-
-    Arg:
-        shape (int, tuple):
-            Shape of the array to be created.
-        dtype (str, dtype):
-            Array data type, i.e. the type of array elements.
-            If ``dtype`` has the value ``None``, it is determined by default
-            floating point type supported by target device.
-            The supported types are
-
-                ``bool``:
-                    boolean type
-                ``int8``, ``int16``, ``int32``, ``int64``:
-                    signed integer types
-                ``uint8``, ``uint16``, ``uint32``, ``uint64``:
-                    unsigned integer types
-                ``float16``:
-                    half-precision floating type,
-                    supported if target device's property
-                    ``has_aspect_fp16`` is ``True``
-                ``float32``, ``complex64``:
-                    single-precision real and complex floating types
-                ``float64``, ``complex128``:
-                    double-precision real and complex floating
-                    types, supported if target device's property
-                    ``has_aspect_fp64`` is ``True``.
-
-            Default: ``None``.
-        strides (tuple, optional):
-            Strides of the array to be created in elements.
-            If ``strides`` has the value ``None``, it is determined by the
-            ``shape`` of the array and the requested ``order``.
-            Default: ``None``.
-        buffer (str, object, optional):
-            A string corresponding to the type of USM allocation to make,
-            or a Python object representing a USM memory allocation, i.e.
-            :class:`dpctl.memory.MemoryUSMDevice`,
-            :class:`dpctl.memory.MemoryUSMShared`, or
-            :class:`dpctl.memory.MemoryUSMHost`. Recognized strings are
-            ``"device"``, ``"shared"``, or ``"host"``. Additional arguments to
-            the USM memory allocators can be passed in a dictionary specified
-            via ``buffer_ctor_kwrds`` keyword parameter.
-            Default: ``"device"``.
-        offset (int, optional):
-            Offset of the array element with all zero indexes relative to the
-            start of the provided `buffer` in elements. The argument is ignored
-            if the ``buffer`` value is a string and the memory is allocated by
-            the constructor. Default: ``0``.
-        order ({"C", "F"}, optional):
-            The memory layout of the array when constructing using a new
-            allocation. Value ``"C"`` corresponds to C-contiguous, or row-major
-            memory layout, while value ``"F"`` corresponds to F-contiguous, or
-            column-major layout. Default: ``"C"``.
-        buffer_ctor_kwargs (dict, optional):
-            Dictionary with keyword parameters to use when creating a new USM
-            memory allocation. See :class:`dpctl.memory.MemoryUSMShared` for
-            supported keyword arguments.
-        array_namespace (module, optional):
-            Array namespace module associated with this array.
-            Default: ``None``.
-
-    ``buffer`` can be ``"shared"``, ``"host"``, ``"device"`` to allocate
-    new device memory by calling respective constructor with
-    the specified ``buffer_ctor_kwrds``; ``buffer`` can be an
-    instance of :class:`dpctl.memory.MemoryUSMShared`,
-    :class:`dpctl.memory.MemoryUSMDevice`, or
-    :class:`dpctl.memory.MemoryUSMHost`; ``buffer`` can also be
-    another :class:`dpctl.tensor.usm_ndarray` instance, in which case its
-    underlying ``MemoryUSM*`` buffer is used.
-    """
-
-    cdef void _reset(usm_ndarray self):
-        """
-        Initializes member fields
-        """
-        self.base_ = None
-        self.array_namespace_ = None
-        self.nd_ = -1
-        self.data_ = <char *>0
-        self.shape_ = <Py_ssize_t *>0
-        self.strides_ = <Py_ssize_t *>0
-        self.flags_ = 0
-
-    cdef void _cleanup(usm_ndarray self):
-        if (self.shape_):
-            PyMem_Free(self.shape_)
-        if (self.strides_):
-            PyMem_Free(self.strides_)
-        self._reset()
-
-    def __cinit__(self, shape, dtype=None, strides=None, buffer="device",
-                  Py_ssize_t offset=0, order="C",
-                  buffer_ctor_kwargs=dict(),
-                  array_namespace=None):
-        """
-        strides and offset must be given in units of array elements.
-        buffer can be strings ('device'|'shared'|'host' to allocate new memory)
-        or ``dpctl.memory.MemoryUSM*`` buffers, or ``usm_ndarray`` instances.
-        """
-        cdef int nd = 0
-        cdef int typenum = 0
-        cdef int itemsize = 0
-        cdef int err = 0
-        cdef int contig_flag = 0
-        cdef int writable_flag = USM_ARRAY_WRITABLE
-        cdef Py_ssize_t *shape_ptr = NULL
-        cdef Py_ssize_t ary_nelems = 0
-        cdef Py_ssize_t ary_nbytes = 0
-        cdef Py_ssize_t *strides_ptr = NULL
-        cdef Py_ssize_t _offset = offset
-        cdef Py_ssize_t ary_min_displacement = 0
-        cdef Py_ssize_t ary_max_displacement = 0
-        cdef bint is_fp64 = False
-        cdef bint is_fp16 = False
-
-        self._reset()
-        if not isinstance(shape, (list, tuple)):
-            if hasattr(shape, "tolist"):
-                fn = getattr(shape, "tolist")
-                if callable(fn):
-                    shape = shape.tolist()
-            if not isinstance(shape, (list, tuple)):
-                try:
-                    <Py_ssize_t> shape
-                    shape = [shape, ]
-                except Exception as e:
-                    raise TypeError(
-                        "Argument shape must a non-negative integer, "
-                        "or a list/tuple of such integers."
-                    ) from e
-        nd = len(shape)
-        if dtype is None:
-            if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
-                q = buffer.sycl_queue
-            else:
-                q = buffer_ctor_kwargs.get("queue")
-            if q is not None:
-                dtype = default_device_fp_type(q)
-            else:
-                dev = _cached_default_device()
-                dtype = "f8" if dev.has_aspect_fp64 else "f4"
-        typenum = dtype_to_typenum(dtype)
-        if (typenum < 0):
-            if typenum == -2:
-                raise ValueError(
-                    "Data type '" + str(dtype) +
-                    "' can only have native byteorder."
-                )
-            elif typenum == -1:
-                raise ValueError(
-                    "Data type '" + str(dtype) + "' is not understood."
-                )
-            raise TypeError(
-                f"Expected string or a dtype object, got {type(dtype)}"
-            )
-        itemsize = type_bytesize(typenum)
-        if (itemsize < 1):
-            raise TypeError(
-                "dtype=" + np.dtype(dtype).name + " is not supported."
-            )
-        # allocate host C-arrays for shape, strides
-        err = _from_input_shape_strides(
-            nd, shape, strides, itemsize, <char> ord(order),
-            &shape_ptr, &strides_ptr, &ary_nelems,
-            &ary_min_displacement, &ary_max_displacement, &contig_flag
-        )
-        if (err):
-            self._cleanup()
-            if err == ERROR_MALLOC:
-                raise MemoryError("Memory allocation for shape/strides "
-                                  "array failed.")
-            elif err == ERROR_INCORRECT_ORDER:
-                raise ValueError(
-                    "Unsupported order='{}' given. "
-                    "Supported values are 'C' or 'F'.".format(order))
-            elif err == ERROR_UNEXPECTED_STRIDES:
-                raise ValueError(
-                    "strides={} is not understood".format(strides))
-            else:
-                raise InternalUSMArrayError(
-                    " .. while processing shape and strides.")
-        ary_nbytes = (ary_max_displacement -
-                      ary_min_displacement + 1) * itemsize
-        if isinstance(buffer, dpmem._memory._Memory):
-            _buffer = buffer
-        elif isinstance(buffer, (str, bytes)):
-            if isinstance(buffer, bytes):
-                buffer = buffer.decode("UTF-8")
-            _offset = -ary_min_displacement
-            if (buffer == "shared"):
-                _buffer = dpmem.MemoryUSMShared(ary_nbytes,
-                                                **buffer_ctor_kwargs)
-            elif (buffer == "device"):
-                _buffer = dpmem.MemoryUSMDevice(ary_nbytes,
-                                                **buffer_ctor_kwargs)
-            elif (buffer == "host"):
-                _buffer = dpmem.MemoryUSMHost(ary_nbytes,
-                                              **buffer_ctor_kwargs)
-            else:
-                self._cleanup()
-                raise ValueError(
-                    "buffer='{}' is not understood. "
-                    "Recognized values are 'device', 'shared',  'host', "
-                    "an instance of `MemoryUSM*` object, or a usm_ndarray"
-                    "".format(buffer)
-                )
-        elif isinstance(buffer, usm_ndarray):
-            if not buffer.flags.writable:
-                writable_flag = 0
-            _buffer = buffer.usm_data
-        else:
-            self._cleanup()
-            raise ValueError("buffer='{}' was not understood.".format(buffer))
-        if (shape_to_elem_count(nd, shape_ptr) > 0 and
-            (_offset + ary_min_displacement < 0 or
-             (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes)):
-            self._cleanup()
-            raise ValueError(("buffer='{}' can not accommodate "
-                              "the requested array.").format(buffer))
-        is_fp64 = (typenum == UAR_DOUBLE or typenum == UAR_CDOUBLE)
-        is_fp16 = (typenum == UAR_HALF)
-        if (is_fp64 or is_fp16):
-            if (
-                (is_fp64 and not _buffer.sycl_device.has_aspect_fp64) or
-                (is_fp16 and not _buffer.sycl_device.has_aspect_fp16)
-            ):
-                raise ValueError(
-                    f"Device {_buffer.sycl_device.name} does"
-                    f" not support {dtype} natively."
-                )
-        self.base_ = _buffer
-        self.data_ = (<char *> (<size_t> _buffer._pointer)) + itemsize * _offset
-        self.shape_ = shape_ptr
-        self.strides_ = strides_ptr
-        self.typenum_ = typenum
-        self.flags_ = (contig_flag | writable_flag)
-        self.nd_ = nd
-        self.array_namespace_ = array_namespace
-
-    def __dealloc__(self):
-        self._cleanup()
-
-    @property
-    def _pointer(self):
-        """
-        Returns USM pointer to the start of array (element with zero
-        multi-index) encoded as integer.
-        """
-        return <size_t> self.get_data()
-
-    cdef Py_ssize_t get_offset(self) except *:
-        cdef char *mem_ptr = NULL
-        cdef char *ary_ptr = self.get_data()
-        mem_ptr = <char *>(<size_t> self.base_._pointer)
-        byte_offset = ary_ptr - mem_ptr
-        item_size = self.get_itemsize()
-        if (byte_offset % item_size):
-            raise InternalUSMArrayError(
-                "byte_offset is not a multiple of item_size.")
-        return byte_offset // item_size
-
-    @property
-    def _element_offset(self):
-        """Returns the offset of the zero-index element of the array, in
-        elements, relative to the start of memory allocation"""
-        return self.get_offset()
-
-    @property
-    def _byte_bounds(self):
-        """Returns a 2-tuple with pointers to the end-points of the array
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor
-
-                x = tensor.ones((3, 10, 7))
-                y = tensor.flip(x[:, 1::2], axis=1)
-
-                beg_p, end_p = y._byte_bounds
-                # Bytes taken to store this array
-                bytes_extent = end_p - beg_p
-
-                # C-contiguous copy is more compact
-                yc = tensor.copy(y, order="C")
-                beg_pc, end_pc = yc._byte_bounds
-                assert bytes_extent < end_pc - beg_pc
-        """
-        cdef Py_ssize_t min_disp = 0
-        cdef Py_ssize_t max_disp = 0
-        cdef Py_ssize_t step_ = 0
-        cdef Py_ssize_t dim_ = 0
-        cdef int it = 0
-        cdef Py_ssize_t _itemsize = self.get_itemsize()
-
-        if (
-            (self.flags_ & USM_ARRAY_C_CONTIGUOUS)
-            or (self.flags_ & USM_ARRAY_F_CONTIGUOUS)
-        ):
-            return (
-                self._pointer,
-                self._pointer + shape_to_elem_count(
-                    self.nd_, self.shape_
-                ) * _itemsize
-            )
-
-        for it in range(self.nd_):
-            dim_ = self.shape[it]
-            if dim_ > 0:
-                step_ = self.strides[it]
-                if step_ > 0:
-                    max_disp += step_ * (dim_ - 1)
-                else:
-                    min_disp += step_ * (dim_ - 1)
-
-        return (
-            self._pointer + min_disp * _itemsize,
-            self._pointer + (max_disp + 1) * _itemsize
-        )
-
-    cdef char* get_data(self):
-        """Returns the USM pointer for this array."""
-        return self.data_
-
-    cdef int get_ndim(self):
-        """
-        Returns the number of indices needed to address
-        an element of this array.
-        """
-        return self.nd_
-
-    cdef Py_ssize_t* get_shape(self):
-        """
-        Returns pointer to shape C-array for this array.
-
-        C-array has at least ``ndim`` non-negative elements,
-        which determine the range of permissible indices
-        addressing individual elements of this array.
-        """
-        return self.shape_
-
-    cdef Py_ssize_t* get_strides(self):
-        """
-        Returns pointer to strides C-array for this array.
-
-        The pointer can be NULL (contiguous array), or the
-        array size is at least ``ndim`` elements
-        """
-        return self.strides_
-
-    cdef int get_typenum(self):
-        """Returns typenum corresponding to values of this array"""
-        return self.typenum_
-
-    cdef int get_itemsize(self):
-        """
-        Returns itemsize of this arrays in bytes
-        """
-        return type_bytesize(self.typenum_)
-
-    cdef int get_flags(self):
-        """Returns flags of this array"""
-        return self.flags_
-
-    cdef object get_base(self):
-        """Returns the object owning the USM data addressed by this array"""
-        return self.base_
-
-    cdef c_dpctl.SyclQueue get_sycl_queue(self):
-        cdef c_dpmem._Memory mem
-        if not isinstance(self.base_, dpctl.memory._Memory):
-            raise InternalUSMArrayError(
-                "This array has unexpected memory owner"
-            )
-        mem = <c_dpmem._Memory> self.base_
-        return mem.queue
-
-    cdef c_dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *:
-        """
-        Returns a copy of DPCTLSyclQueueRef associated with array
-        """
-        cdef c_dpctl.SyclQueue q = self.get_sycl_queue()
-        cdef c_dpctl.DPCTLSyclQueueRef QRef = q.get_queue_ref()
-        cdef c_dpctl.DPCTLSyclQueueRef QRefCopy = NULL
-        if QRef is not NULL:
-            QRefCopy = c_dpctl.DPCTLQueue_Copy(QRef)
-            return QRefCopy
-        else:
-            raise InternalUSMArrayError(
-                "Memory owner of this array is corrupted"
-            )
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        """
-        Gives ``__sycl_usm_array_interface__`` dictionary describing
-        the array.
-        """
-        cdef Py_ssize_t byte_offset = -1
-        cdef int item_size = -1
-        cdef Py_ssize_t elem_offset = -1
-        cdef char *mem_ptr = NULL
-        cdef char *ary_ptr = NULL
-        if (not isinstance(self.base_, dpmem._memory._Memory)):
-            raise InternalUSMArrayError(
-                "Invalid instance of usm_ndarray encountered. "
-                "Private field base_ has an unexpected type {}.".format(
-                    type(self.base_)
-                )
-            )
-        ary_iface = self.base_.__sycl_usm_array_interface__
-        mem_ptr = <char *>(<size_t> ary_iface["data"][0])
-        ary_ptr = <char *>(<size_t> self.data_)
-        ro_flag = False if (self.flags_ & USM_ARRAY_WRITABLE) else True
-        ary_iface["data"] = (<size_t> mem_ptr, ro_flag)
-        ary_iface["shape"] = self.shape
-        if (self.strides_):
-            ary_iface["strides"] = _make_int_tuple(self.nd_, self.strides_)
-        else:
-            if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
-                ary_iface["strides"] = None
-            elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
-                ary_iface["strides"] = _f_contig_strides(self.nd_, self.shape_)
-            else:
-                raise InternalUSMArrayError(
-                    "USM Array is not contiguous and has empty strides"
-                )
-        ary_iface["typestr"] = _make_typestr(self.typenum_)
-        byte_offset = ary_ptr - mem_ptr
-        item_size = self.get_itemsize()
-        if (byte_offset % item_size):
-            raise InternalUSMArrayError(
-                "byte_offset is not a multiple of item_size.")
-        elem_offset = byte_offset // item_size
-        ary_iface["offset"] = elem_offset
-        # must wait for content of the memory to finalize
-        self.sycl_queue.wait()
-        return ary_iface
-
-    @property
-    def ndim(self):
-        """
-        Gives the number of indices needed to address elements of this array.
-        """
-        return self.nd_
-
-    @property
-    def usm_data(self):
-        """
-        Gives USM memory object underlying :class:`.usm_ndarray` instance.
-        """
-        return self.get_base()
-
-    @property
-    def shape(self):
-        """
-        Elements of the shape tuple give the lengths of the
-        respective array dimensions.
-
-        Setting shape is allowed only when reshaping to the requested
-        dimensions can be returned as view, otherwise :exc:`AttributeError`
-        is raised. Use :func:`dpctl.tensor.reshape` to reshape the array
-        in all cases.
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor
-
-                x = tensor.arange(899)
-                x.shape = (29, 31)
-        """
-        if self.nd_ > 0:
-            return _make_int_tuple(self.nd_, self.shape_)
-        else:
-            return tuple()
-
-    @shape.setter
-    def shape(self, new_shape):
-        """
-        Modifies usm_ndarray instance in-place by changing its metadata
-        about the shape and the strides of the array, or raises
-        `AttributeError` exception if in-place change is not possible.
-
-        Args:
-            new_shape: (tuple, int)
-                New shape. Only non-negative values are supported.
-                The new shape may not lead to the change in the
-                number of elements in the array.
-
-        Whether the array can be reshape in-place depends on its
-        strides. Use :func:`dpctl.tensor.reshape` function which
-        always succeeds to reshape the array by performing a copy
-        if necessary.
-        """
-        cdef int new_nd = -1
-        cdef Py_ssize_t nelems = -1
-        cdef int err = 0
-        cdef Py_ssize_t min_disp = 0
-        cdef Py_ssize_t max_disp = 0
-        cdef int contig_flag = 0
-        cdef Py_ssize_t *shape_ptr = NULL
-        cdef Py_ssize_t *strides_ptr = NULL
-        cdef Py_ssize_t size = -1
-        import operator
-
-        from ._reshape import reshaped_strides
-
-        try:
-            new_nd = len(new_shape)
-        except TypeError:
-            new_nd = 1
-            new_shape = (new_shape,)
-        try:
-            new_shape = tuple(operator.index(dim) for dim in new_shape)
-        except TypeError:
-            raise TypeError(
-                "Target shape must be a finite iterable of integers"
-            )
-        size = shape_to_elem_count(self.nd_, self.shape_)
-        if not np.prod(new_shape) == size:
-            raise TypeError(
-                f"Can not reshape array of size {self.size} into {new_shape}"
-            )
-        if size > 0:
-            new_strides = reshaped_strides(
-               self.shape,
-               self.strides,
-               new_shape
-            )
-        else:
-            new_strides = (1,) * len(new_shape)
-        if new_strides is None:
-            raise AttributeError(
-                "Incompatible shape for in-place modification. "
-                "Use `reshape()` to make a copy with the desired shape."
-            )
-        err = _from_input_shape_strides(
-            new_nd, new_shape, new_strides,
-            self.get_itemsize(),
-            b"C",
-            &shape_ptr, &strides_ptr,
-            &nelems, &min_disp, &max_disp, &contig_flag
-        )
-        if (err == 0):
-            if (self.shape_):
-                PyMem_Free(self.shape_)
-            if (self.strides_):
-                PyMem_Free(self.strides_)
-            self.flags_ = (contig_flag | (self.flags_ & USM_ARRAY_WRITABLE))
-            self.nd_ = new_nd
-            self.shape_ = shape_ptr
-            self.strides_ = strides_ptr
-        else:
-            raise InternalUSMArrayError(
-                "Encountered in shape setter, error code {err}".format(err)
-            )
-
-    @property
-    def strides(self):
-        """
-        Returns memory displacement in array elements, upon unit
-        change of respective index.
-
-        For example, for strides ``(s1, s2, s3)`` and multi-index
-        ``(i1, i2, i3)`` position of the respective element relative
-        to zero multi-index element is ``s1*s1 + s2*i2 + s3*i3``.
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor
-
-                x = tensor.zeros((20, 30))
-                xv = x[10:, :15]
-
-                multi_id = (3, 5)
-                byte_displacement = xv[multi_id]._pointer - xv[0, 0]._pointer
-                element_displacement = sum(
-                    i * s for i, s in zip(multi_id, xv.strides)
-                )
-                assert byte_displacement == element_displacement * xv.itemsize
-        """
-        if (self.strides_):
-            return _make_int_tuple(self.nd_, self.strides_)
-        else:
-            if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
-                return _c_contig_strides(self.nd_, self.shape_)
-            elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
-                return _f_contig_strides(self.nd_, self.shape_)
-            else:
-                raise ValueError("Inconsistent usm_ndarray data")
-
-    @property
-    def flags(self):
-        """
-        Returns :class:`dpctl.tensor._flags.Flags` object.
-        """
-        return _flags.Flags(self, self.flags_)
-
-    cdef _set_writable_flag(self, int flag):
-        cdef int mask = (USM_ARRAY_WRITABLE if flag else 0)
-        self.flags_ = _copy_writable(self.flags_, mask)
-
-    @property
-    def usm_type(self):
-        """
-        USM type of underlying memory. Possible values are:
-
-            * ``"device"``
-                USM-device allocation in device memory, only accessible
-                to kernels executed on the device
-            * ``"shared"``
-                USM-shared allocation in device memory, accessible both
-                from the device and from host
-            * ``"host"``
-                USM-host allocation in host memory, accessible both
-                from the device and from host
-
-        See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html
-        """
-        return self.base_.get_usm_type()
-
-    @property
-    def itemsize(self):
-        """
-        Size of array element in bytes.
-        """
-        return self.get_itemsize()
-
-    @property
-    def nbytes(self):
-        """
-        Total bytes consumed by the elements of the array.
-        """
-        return (
-            shape_to_elem_count(self.nd_, self.shape_) *
-            self.get_itemsize())
-
-    @property
-    def size(self):
-        """
-        Number of elements in the array.
-        """
-        return shape_to_elem_count(self.nd_, self.shape_)
-
-    @property
-    def dtype(self):
-        """
-        Returns NumPy's dtype corresponding to the type of the array elements.
-        """
-        return np.dtype(_make_typestr(self.typenum_))
-
-    @property
-    def sycl_queue(self):
-        """
-        Returns :class:`dpctl.SyclQueue` object associated with USM data.
-        """
-        return self.get_sycl_queue()
-
-    @property
-    def sycl_device(self):
-        """
-        Returns :class:`dpctl.SyclDevice` object on which USM data
-        was allocated.
-        """
-        q = self.sycl_queue
-        return q.sycl_device
-
-    @property
-    def device(self):
-        """
-        Returns :class:`dpctl.tensor.Device` object representing
-        residence of the array data.
-
-        The ``Device`` object represents Array API notion of the
-        device, and contains :class:`dpctl.SyclQueue` associated
-        with this array. Hence, ``.device`` property provides
-        information distinct from ``.sycl_device`` property.
-
-        :Example:
-
-            .. code-block:: python
-
-                >>> from dpctl import tensor
-                >>> x = tensor.ones(10)
-                >>> x.device
-                Device(level_zero:gpu:0)
-        """
-        return Device.create_device(self.sycl_queue)
-
-    @property
-    def sycl_context(self):
-        """
-        Returns :class:`dpctl.SyclContext` object to which USM data is bound.
-        """
-        q = self.sycl_queue
-        return q.sycl_context
-
-    @property
-    def T(self):
-        """Returns transposed array for 2D array, raises ``ValueError``
-        otherwise.
-        """
-        if self.nd_ == 2:
-            return _transpose(self)
-        else:
-            raise ValueError(
-                "array.T requires array to have 2 dimensions. "
-                "Use array.mT to transpose stacks of matrices and "
-                "dpctl.tensor.permute_dims() to permute dimensions."
-            )
-
-    @property
-    def mT(self):
-        """ Returns array (a view) where the last two dimensions are
-        transposed.
-        """
-        if self.nd_ < 2:
-            raise ValueError(
-                "array.mT requires array to have at least 2 dimensions."
-            )
-        return _m_transpose(self)
-
-    @property
-    def real(self):
-        """
-        Returns view into real component for arrays with
-        complex data-types and returns itself for all other
-        data-types.
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor
-
-                # Create complex array from
-                # arrays of real and imaginary parts
-
-                re = tensor.linspace(-1, 1, num=100, dtype="f4")
-                im = tensor.full_like(re, fill_value=tensor.pi)
-
-                z = tensor.empty_like(re, dtype="c8")
-                z.real[:] = re
-                z.imag[:] = im
-        """
-        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
-        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
-            # elements are real
-            return self
-        if (self.typenum_ < UAR_TYPE_SENTINEL):
-            return _real_view(self)
-
-    @property
-    def imag(self):
-        """ Returns view into imaginary component for arrays with
-        complex data-types and returns new zero array for all other
-        data-types.
-
-        :Example:
-
-            .. code-block:: python
-
-                from dpctl import tensor
-
-                # Reset imaginary part of complex array
-
-                z = tensor.ones(100, dtype="c8")
-                z.imag[:] = dpt.pi/2
-        """
-        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
-        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
-            # elements are real
-            return _zero_like(self)
-        if (self.typenum_ < UAR_TYPE_SENTINEL):
-            return _imag_view(self)
-
-    def __getitem__(self, ind):
-        cdef tuple _meta = _basic_slice_meta(
-            ind, (<object>self).shape, (<object> self).strides,
-            self.get_offset())
-        cdef usm_ndarray res
-        cdef int i = 0
-        cdef bint matching = 1
-
-        if len(_meta) < 5:
-            raise RuntimeError
-
-        res = usm_ndarray.__new__(
-            usm_ndarray,
-            _meta[0],
-            dtype=_make_typestr(self.typenum_),
-            strides=_meta[1],
-            buffer=self.base_,
-            offset=_meta[2]
-        )
-        res.array_namespace_ = self.array_namespace_
-
-        adv_ind = _meta[3]
-        adv_ind_start_p = _meta[4]
-
-        if adv_ind_start_p < 0:
-            res.flags_ = _copy_writable(res.flags_, self.flags_)
-            return res
-
-        from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
-
-        # if len(adv_ind == 1), the (only) element is always an array
-        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
-            key_ = adv_ind[0]
-            adv_ind_end_p = key_.ndim + adv_ind_start_p
-            if adv_ind_end_p > res.ndim:
-                raise IndexError("too many indices for the array")
-            key_shape = key_.shape
-            arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
-            for i in range(key_.ndim):
-                if matching:
-                    if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
-                        matching = 0
-            if not matching:
-                raise IndexError(
-                    "boolean index did not match indexed array in dimensions"
-                )
-            res = _extract_impl(res, key_, axis=adv_ind_start_p)
-            res.flags_ = _copy_writable(res.flags_, self.flags_)
-            return res
-
-        if any(
-            (
-                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
-            ) for ind in adv_ind
-        ):
-            adv_ind_int = list()
-            for ind in adv_ind:
-                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
-                    adv_ind_int.extend(_nonzero_impl(ind))
-                else:
-                    adv_ind_int.append(ind)
-            res = _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p)
-            res.flags_ = _copy_writable(res.flags_, self.flags_)
-            return res
-
-        res = _take_multi_index(res, adv_ind, adv_ind_start_p)
-        res.flags_ = _copy_writable(res.flags_, self.flags_)
-        return res
-
-    def to_device(self, target_device, /, *, stream=None):
-        """ to_device(target_device, /, *, stream=None)
-
-        Transfers this array to specified target device.
-
-        :Example:
-            .. code-block:: python
-
-                import dpctl
-                import dpctl.tensor as dpt
-
-                x = dpt.full(10**6, 2, dtype="int64")
-                q_prof = dpctl.SyclQueue(
-                    x.sycl_device, property="enable_profiling")
-                # return a view with profile-enabled queue
-                y = x.to_device(q_prof)
-                timer = dpctl.SyclTimer()
-                with timer(q_prof):
-                    z = y * y
-                print(timer.dt)
-
-        Args:
-            target_device (object):
-                Array API concept of target device.
-                It can be a oneAPI filter selector string,
-                an instance of :class:`dpctl.SyclDevice` corresponding to a
-                non-partitioned SYCL device, an instance of
-                :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device`
-                object returned by :attr:`dpctl.tensor.usm_ndarray.device`.
-            stream (:class:`dpctl.SyclQueue`, optional):
-                Execution queue to synchronize with. If ``None``,
-                synchronization is not performed.
-
-        Returns:
-            usm_ndarray:
-                A view if data copy is not required, and a copy otherwise.
-                If copying is required, it is done by copying from the original
-                allocation device to the host, followed by copying from host
-                to the target device.
-        """
-        cdef c_dpctl.DPCTLSyclQueueRef QRef = NULL
-        cdef c_dpmem._Memory arr_buf
-        d = Device.create_device(target_device)
-
-        _validate_and_use_stream(stream, self.sycl_queue)
-
-        if (d.sycl_context == self.sycl_context):
-            arr_buf = <c_dpmem._Memory> self.usm_data
-            QRef = (<c_dpctl.SyclQueue> d.sycl_queue).get_queue_ref()
-            view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-                <DPCTLSyclUSMRef>arr_buf.get_data_ptr(),
-                arr_buf.nbytes,
-                QRef,
-                memory_owner=arr_buf
-            )
-            res = usm_ndarray(
-                self.shape,
-                self.dtype,
-                buffer=view_buffer,
-                strides=self.strides,
-                offset=self.get_offset()
-            )
-            res.flags_ = self.flags_
-            return res
-        else:
-            nbytes = self.usm_data.nbytes
-            copy_buffer = type(self.usm_data)(
-                nbytes, queue=d.sycl_queue
-            )
-            copy_buffer.copy_from_device(self.usm_data)
-            res = usm_ndarray(
-                self.shape,
-                self.dtype,
-                buffer=copy_buffer,
-                strides=self.strides,
-                offset=self.get_offset()
-            )
-            res.flags_ = self.flags_
-            return res
-
-    def _set_namespace(self, mod):
-        """ Sets array namespace to given module `mod`. """
-        self.array_namespace_ = mod
-
-    def __array_namespace__(self, api_version=None):
-        """
-        Returns array namespace, member functions of which
-        implement data API.
-
-        Args:
-            api_version (str, optional)
-                Request namespace compliant with given version of
-                array API. If ``None``, namespace for the most
-                recent supported version is returned.
-                Default: ``None``.
-        """
-        if api_version is not None:
-            from ._array_api import __array_api_version__
-            if not isinstance(api_version, str):
-                raise TypeError(f"Expected type str, got {type(api_version)}")
-            if api_version != __array_api_version__:
-                raise ValueError(f"Only {__array_api_version__} is supported")
-        return (
-            self.array_namespace_
-            if self.array_namespace_ is not None
-            else dpctl.tensor
-        )
-
-    def __bool__(self):
-        if self.size == 1:
-            _check_0d_scalar_conversion(self)
-            view = _as_zero_dim_ndarray(self)
-            return view.__bool__()
-
-        if self.size == 0:
-            raise ValueError(
-                "The truth value of an empty array is ambiguous"
-            )
-
-        raise ValueError(
-            "The truth value of an array with more than one element is "
-            "ambiguous. Use dpctl.tensor.any() or dpctl.tensor.all()"
-        )
-
-    def __float__(self):
-        if self.size == 1:
-            _check_0d_scalar_conversion(self)
-            view = _as_zero_dim_ndarray(self)
-            return view.__float__()
-
-        raise ValueError(
-            "only size-1 arrays can be converted to Python scalars"
-        )
-
-    def __complex__(self):
-        if self.size == 1:
-            _check_0d_scalar_conversion(self)
-            view = _as_zero_dim_ndarray(self)
-            return view.__complex__()
-
-        raise ValueError(
-            "only size-1 arrays can be converted to Python scalars"
-        )
-
-    def __int__(self):
-        if self.size == 1:
-            _check_0d_scalar_conversion(self)
-            view = _as_zero_dim_ndarray(self)
-            return view.__int__()
-
-        raise ValueError(
-            "only size-1 arrays can be converted to Python scalars"
-        )
-
-    def __index__(self):
-        if np.issubdtype(self.dtype, np.integer):
-            return int(self)
-
-        raise IndexError("only integer arrays are valid indices")
-
-    def __abs__(self):
-        return dpctl.tensor.abs(self)
-
-    def __add__(self, other):
-        """
-        Implementation for operator.add
-        """
-        return dpctl.tensor.add(self, other)
-
-    def __and__(self, other):
-        "Implementation for operator.and"
-        return dpctl.tensor.bitwise_and(self, other)
-
-    def __dlpack__(
-        self, *, stream=None, max_version=None, dl_device=None, copy=None
-    ):
-        """
-        Produces DLPack capsule.
-
-        Args:
-            stream (:class:`dpctl.SyclQueue`, optional):
-                Execution queue to synchronize with.
-                If ``None``, synchronization is not performed.
-                Default: ``None``.
-            max_version (tuple[int, int], optional):
-                The maximum DLPack version the consumer (caller of
-                ``__dlpack__``) supports. As ``__dlpack__`` may not
-                always return a DLPack capsule with version
-                `max_version`, the consumer must verify the version
-                even if this argument is passed.
-                Default: ``None``.
-            dl_device (tuple[enum.Enum, int], optional):
-                The device the returned DLPack capsule will be
-                placed on.
-                The device must be a 2-tuple matching the format of
-                ``__dlpack_device__`` method, an integer enumerator
-                representing the device type followed by an integer
-                representing the index of the device.
-                Default: ``None``.
-            copy (bool, optional):
-                Boolean indicating whether or not to copy the input.
-
-                * If ``copy`` is ``True``, the input will always be
-                  copied.
-                * If ``False``, a ``BufferError`` will be raised if a
-                  copy is deemed necessary.
-                * If ``None``, a copy will be made only if deemed
-                  necessary, otherwise, the existing memory buffer will
-                  be reused.
-
-                Default: ``None``.
-
-        Raises:
-            MemoryError:
-                when host memory can not be allocated.
-            DLPackCreationError:
-                when array is allocated on a partitioned
-                SYCL device, or with a non-default context.
-            BufferError:
-                when a copy is deemed necessary but ``copy``
-                is ``False`` or when the provided ``dl_device``
-                cannot be handled.
-        """
-        if max_version is None:
-            # legacy path for DLManagedTensor
-            # copy kwarg ignored because copy flag can't be set
-            _caps = c_dlpack.to_dlpack_capsule(self)
-            _validate_and_use_stream(stream, self.sycl_queue)
-            return _caps
-        else:
-            if not isinstance(max_version, tuple) or len(max_version) != 2:
-                raise TypeError(
-                    "`__dlpack__` expects `max_version` to be a "
-                    "2-tuple of integers `(major, minor)`, instead "
-                    f"got {max_version}"
-                )
-            dpctl_dlpack_version = get_build_dlpack_version()
-            if max_version[0] >= dpctl_dlpack_version[0]:
-                # DLManagedTensorVersioned path
-                if dl_device is not None:
-                    if not isinstance(dl_device, tuple) or len(dl_device) != 2:
-                        raise TypeError(
-                            "`__dlpack__` expects `dl_device` to be a 2-tuple "
-                            "of `(device_type, device_id)`, instead "
-                            f"got {dl_device}"
-                        )
-                    if dl_device != self.__dlpack_device__():
-                        if copy is False:
-                            raise BufferError(
-                                "array cannot be placed on the requested "
-                                "device without a copy"
-                            )
-                        if _is_host_cpu(dl_device):
-                            if stream is not None:
-                                raise ValueError(
-                                    "`stream` must be `None` when `dl_device` "
-                                    "is of type `kDLCPU`"
-                                )
-                            from ._copy_utils import _copy_to_numpy
-                            _arr = _copy_to_numpy(self)
-                            _arr.flags["W"] = self.flags["W"]
-                            return c_dlpack.numpy_to_dlpack_versioned_capsule(
-                                _arr, True
-                            )
-                        else:
-                            raise BufferError(
-                                f"targeting `dl_device` {dl_device} with "
-                                "`__dlpack__` is not yet implemented"
-                            )
-                if copy is None:
-                    copy = False
-                # TODO: strategy for handling stream on different device
-                # from dl_device
-                if copy:
-                    _validate_and_use_stream(stream, self.sycl_queue)
-                    nbytes = self.usm_data.nbytes
-                    copy_buffer = type(self.usm_data)(
-                        nbytes, queue=self.sycl_queue
-                    )
-                    copy_buffer.copy_from_device(self.usm_data)
-                    _copied_arr = usm_ndarray(
-                        self.shape,
-                        self.dtype,
-                        buffer=copy_buffer,
-                        strides=self.strides,
-                        offset=self.get_offset()
-                    )
-                    _copied_arr.flags_ = self.flags_
-                    _caps = c_dlpack.to_dlpack_versioned_capsule(
-                        _copied_arr, copy
-                    )
-                else:
-                    _caps = c_dlpack.to_dlpack_versioned_capsule(self, copy)
-                    _validate_and_use_stream(stream, self.sycl_queue)
-                return _caps
-            else:
-                # legacy path for DLManagedTensor
-                _caps = c_dlpack.to_dlpack_capsule(self)
-                _validate_and_use_stream(stream, self.sycl_queue)
-                return _caps
-
-    def __dlpack_device__(self):
-        """
-        Gives a tuple (``device_type``, ``device_id``) corresponding to
-        ``DLDevice`` entry in ``DLTensor`` in DLPack protocol.
-
-        The tuple describes the non-partitioned device where the array has been
-        allocated, or the non-partitioned parent device of the allocation
-        device.
-
-        See :class:`dpctl.tensor.DLDeviceType` for a list of devices supported
-        by the DLPack protocol.
-
-        Raises:
-            DLPackCreationError:
-                when the ``device_id`` could not be determined.
-        """
-        try:
-            dev_id = self.sycl_device.get_device_id()
-        except ValueError as e:
-            raise c_dlpack.DLPackCreationError(
-                "Could not determine id of the device where array was "
-                "allocated."
-            )
-        return (
-            DLDeviceType.kDLOneAPI,
-            dev_id,
-        )
-
-    def __eq__(self, other):
-        return dpctl.tensor.equal(self, other)
-
-    def __floordiv__(self, other):
-        return dpctl.tensor.floor_divide(self, other)
-
-    def __ge__(self, other):
-        return dpctl.tensor.greater_equal(self, other)
-
-    def __gt__(self, other):
-        return dpctl.tensor.greater(self, other)
-
-    def __invert__(self):
-        return dpctl.tensor.bitwise_invert(self)
-
-    def __le__(self, other):
-        return dpctl.tensor.less_equal(self, other)
-
-    def __len__(self):
-        if (self.nd_):
-            return self.shape[0]
-        else:
-            raise TypeError("len() of unsized object")
-
-    def __lshift__(self, other):
-        return dpctl.tensor.bitwise_left_shift(self, other)
-
-    def __lt__(self, other):
-        return dpctl.tensor.less(self, other)
-
-    def __matmul__(self, other):
-        return dpctl.tensor.matmul(self, other)
-
-    def __mod__(self, other):
-        return dpctl.tensor.remainder(self, other)
-
-    def __mul__(self, other):
-        return dpctl.tensor.multiply(self, other)
-
-    def __ne__(self, other):
-        return dpctl.tensor.not_equal(self, other)
-
-    def __neg__(self):
-        return dpctl.tensor.negative(self)
-
-    def __or__(self, other):
-        return dpctl.tensor.bitwise_or(self, other)
-
-    def __pos__(self):
-        return dpctl.tensor.positive(self)
-
-    def __pow__(self, other):
-        return dpctl.tensor.pow(self, other)
-
-    def __rshift__(self, other):
-        return dpctl.tensor.bitwise_right_shift(self, other)
-
-    def __setitem__(self, key, rhs):
-        cdef tuple _meta
-        cdef usm_ndarray Xv
-
-        if (self.flags_ & USM_ARRAY_WRITABLE) == 0:
-            raise ValueError("Can not modify read-only array.")
-
-        _meta = _basic_slice_meta(
-            key, (<object>self).shape, (<object> self).strides,
-            self.get_offset()
-        )
-
-        if len(_meta) < 5:
-            raise RuntimeError
-
-        Xv = usm_ndarray.__new__(
-            usm_ndarray,
-            _meta[0],
-            dtype=_make_typestr(self.typenum_),
-            strides=_meta[1],
-            buffer=self.base_,
-            offset=_meta[2],
-        )
-        # set namespace
-        Xv.array_namespace_ = self.array_namespace_
-
-        from ._copy_utils import (
-            _copy_from_numpy_into,
-            _copy_from_usm_ndarray_to_usm_ndarray,
-            _nonzero_impl,
-            _place_impl,
-            _put_multi_index,
-        )
-
-        adv_ind = _meta[3]
-        adv_ind_start_p = _meta[4]
-
-        if adv_ind_start_p < 0:
-            # basic slicing
-            if isinstance(rhs, usm_ndarray):
-                _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
-            else:
-                if hasattr(rhs, "__sycl_usm_array_interface__"):
-                    from dpctl.tensor import asarray
-                    try:
-                        rhs_ar = asarray(rhs)
-                        _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
-                    except Exception:
-                        raise ValueError(
-                            f"Input of type {type(rhs)} could not be "
-                            "converted to usm_ndarray"
-                        )
-                else:
-                    rhs_np = np.asarray(rhs)
-                    if type_bytesize(rhs_np.dtype.num) < 0:
-                        raise ValueError(
-                            f"Input of type {type(rhs)} can not be "
-                            "assigned to usm_ndarray because of "
-                            f"unsupported data type '{rhs_np.dtype}'"
-                        )
-                    try:
-                        _copy_from_numpy_into(Xv, rhs_np)
-                    except Exception:
-                        raise ValueError(
-                            f"Input of type {type(rhs)} could not be "
-                            "copied into dpctl.tensor.usm_ndarray"
-                        )
-            return
-
-        if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
-            _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p)
-            return
-
-        if any(
-            (
-                isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
-            ) for ind in adv_ind
-        ):
-            adv_ind_int = list()
-            for ind in adv_ind:
-                if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
-                    adv_ind_int.extend(_nonzero_impl(ind))
-                else:
-                    adv_ind_int.append(ind)
-            _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs)
-            return
-
-        _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs)
-        return
-
-    def __sub__(self, other):
-        return dpctl.tensor.subtract(self, other)
-
-    def __truediv__(self, other):
-        return dpctl.tensor.divide(self, other)
-
-    def __xor__(self, other):
-        return dpctl.tensor.bitwise_xor(self, other)
-
-    def __radd__(self, other):
-        return dpctl.tensor.add(other, self)
-
-    def __rand__(self, other):
-        return dpctl.tensor.bitwise_and(other, self)
-
-    def __rfloordiv__(self, other):
-        return dpctl.tensor.floor_divide(other, self)
-
-    def __rlshift__(self, other):
-        return dpctl.tensor.bitwise_left_shift(other, self)
-
-    def __rmatmul__(self, other):
-        return dpctl.tensor.matmul(other, self)
-
-    def __rmod__(self, other):
-        return dpctl.tensor.remainder(other, self)
-
-    def __rmul__(self, other):
-        return dpctl.tensor.multiply(other, self)
-
-    def __ror__(self, other):
-        return dpctl.tensor.bitwise_or(other, self)
-
-    def __rpow__(self, other):
-        return dpctl.tensor.pow(other, self)
-
-    def __rrshift__(self, other):
-        return dpctl.tensor.bitwise_right_shift(other, self)
-
-    def __rsub__(self, other):
-        return dpctl.tensor.subtract(other, self)
-
-    def __rtruediv__(self, other):
-        return dpctl.tensor.divide(other, self)
-
-    def __rxor__(self, other):
-        return dpctl.tensor.bitwise_xor(other, self)
-
-    def __iadd__(self, other):
-        return dpctl.tensor.add._inplace_op(self, other)
-
-    def __iand__(self, other):
-        return dpctl.tensor.bitwise_and._inplace_op(self, other)
-
-    def __ifloordiv__(self, other):
-        return dpctl.tensor.floor_divide._inplace_op(self, other)
-
-    def __ilshift__(self, other):
-        return dpctl.tensor.bitwise_left_shift._inplace_op(self, other)
-
-    def __imatmul__(self, other):
-        return dpctl.tensor.matmul(self, other, out=self, dtype=self.dtype)
-
-    def __imod__(self, other):
-        return dpctl.tensor.remainder._inplace_op(self, other)
-
-    def __imul__(self, other):
-        return dpctl.tensor.multiply._inplace_op(self, other)
-
-    def __ior__(self, other):
-        return dpctl.tensor.bitwise_or._inplace_op(self, other)
-
-    def __ipow__(self, other):
-        return dpctl.tensor.pow._inplace_op(self, other)
-
-    def __irshift__(self, other):
-        return dpctl.tensor.bitwise_right_shift._inplace_op(self, other)
-
-    def __isub__(self, other):
-        return dpctl.tensor.subtract._inplace_op(self, other)
-
-    def __itruediv__(self, other):
-        return dpctl.tensor.divide._inplace_op(self, other)
-
-    def __ixor__(self, other):
-        return dpctl.tensor.bitwise_xor._inplace_op(self, other)
-
-    def __str__(self):
-        return usm_ndarray_str(self)
-
-    def __repr__(self):
-        return usm_ndarray_repr(self)
-
-    def __array__(self, dtype=None, /, *, copy=None):
-        """NumPy's array protocol method to disallow implicit conversion.
-
-        Without this definition, `numpy.asarray(usm_ar)` converts
-        usm_ndarray instance into NumPy array with data type `object`
-        and every element being 0d usm_ndarray.
-
-        https://github.com/IntelPython/dpctl/pull/1384#issuecomment-1707212972
-        """
-        raise TypeError(
-            "Implicit conversion to a NumPy array is not allowed. "
-            "Use `dpctl.tensor.asnumpy` to copy data from this "
-            "`dpctl.tensor.usm_ndarray` instance to NumPy array"
-        )
-
-
-cdef usm_ndarray _real_view(usm_ndarray ary):
-    """
-    View into real parts of a complex type array
-    """
-    cdef int r_typenum_ = -1
-    cdef usm_ndarray r = None
-    cdef Py_ssize_t offset_elems = 0
-
-    if (ary.typenum_ == UAR_CFLOAT):
-        r_typenum_ = UAR_FLOAT
-    elif (ary.typenum_ == UAR_CDOUBLE):
-        r_typenum_ = UAR_DOUBLE
-    else:
-        raise InternalUSMArrayError(
-            "_real_view call on array of non-complex type.")
-
-    offset_elems = ary.get_offset() * 2
-    r = usm_ndarray.__new__(
-        usm_ndarray,
-        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
-        dtype=_make_typestr(r_typenum_),
-        strides=tuple(2 * si for si in ary.strides),
-        buffer=ary.base_,
-        offset=offset_elems,
-        order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
-    )
-    r.flags_ = _copy_writable(r.flags_, ary.flags_)
-    r.array_namespace_ = ary.array_namespace_
-    return r
-
-
-cdef usm_ndarray _imag_view(usm_ndarray ary):
-    """
-    View into imaginary parts of a complex type array
-    """
-    cdef int r_typenum_ = -1
-    cdef usm_ndarray r = None
-    cdef Py_ssize_t offset_elems = 0
-
-    if (ary.typenum_ == UAR_CFLOAT):
-        r_typenum_ = UAR_FLOAT
-    elif (ary.typenum_ == UAR_CDOUBLE):
-        r_typenum_ = UAR_DOUBLE
-    else:
-        raise InternalUSMArrayError(
-            "_imag_view call on array of non-complex type.")
-
-    # displace pointer to imaginary part
-    offset_elems = 2 * ary.get_offset() + 1
-    r = usm_ndarray.__new__(
-        usm_ndarray,
-        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
-        dtype=_make_typestr(r_typenum_),
-        strides=tuple(2 * si for si in ary.strides),
-        buffer=ary.base_,
-        offset=offset_elems,
-        order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
-    )
-    r.flags_ = _copy_writable(r.flags_, ary.flags_)
-    r.array_namespace_ = ary.array_namespace_
-    return r
-
-
-cdef usm_ndarray _transpose(usm_ndarray ary):
-    """
-    Construct transposed array without copying the data
-    """
-    cdef usm_ndarray r = usm_ndarray.__new__(
-        usm_ndarray,
-        _make_reversed_int_tuple(ary.nd_, ary.shape_),
-        dtype=_make_typestr(ary.typenum_),
-        strides=(
-            _make_reversed_int_tuple(ary.nd_, ary.strides_)
-            if (ary.strides_) else None),
-        buffer=ary.base_,
-        order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
-        offset=ary.get_offset()
-    )
-    r.flags_ = _copy_writable(r.flags_, ary.flags_)
-    return r
-
-
-cdef usm_ndarray _m_transpose(usm_ndarray ary):
-    """
-    Construct matrix transposed array
-    """
-    cdef usm_ndarray r = usm_ndarray.__new__(
-        usm_ndarray,
-        _swap_last_two(_make_int_tuple(ary.nd_, ary.shape_)),
-        dtype=_make_typestr(ary.typenum_),
-        strides=_swap_last_two(ary.strides),
-        buffer=ary.base_,
-        order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
-        offset=ary.get_offset()
-    )
-    r.flags_ = _copy_writable(r.flags_, ary.flags_)
-    return r
-
-
-cdef usm_ndarray _zero_like(usm_ndarray ary):
-    """
-    Make C-contiguous array of zero elements with same shape,
-    type, device, and sycl_queue as ary.
-    """
-    cdef dt = _make_typestr(ary.typenum_)
-    cdef usm_ndarray r = usm_ndarray(
-        _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
-        dtype=dt,
-        buffer=ary.base_.get_usm_type(),
-        buffer_ctor_kwargs={"queue": ary.get_sycl_queue()},
-    )
-    r.base_.memset()
-    return r
-
-
-cdef api char* UsmNDArray_GetData(usm_ndarray arr):
-    """Get allocation pointer of zero index element of array """
-    return arr.get_data()
-
-
-cdef api int UsmNDArray_GetNDim(usm_ndarray arr):
-    """Get array rank: length of its shape"""
-    return arr.get_ndim()
-
-
-cdef api Py_ssize_t* UsmNDArray_GetShape(usm_ndarray arr):
-    """Get host pointer to shape vector"""
-    return arr.get_shape()
-
-
-cdef api Py_ssize_t* UsmNDArray_GetStrides(usm_ndarray arr):
-    """Get host pointer to strides vector"""
-    return arr.get_strides()
-
-
-cdef api int UsmNDArray_GetTypenum(usm_ndarray arr):
-    """Get type number for data type of array elements"""
-    return arr.get_typenum()
-
-
-cdef api int UsmNDArray_GetElementSize(usm_ndarray arr):
-    """Get array element size in bytes"""
-    return arr.get_itemsize()
-
-
-cdef api int UsmNDArray_GetFlags(usm_ndarray arr):
-    """Get flags of array"""
-    return arr.get_flags()
-
-
-cdef api c_dpctl.DPCTLSyclQueueRef UsmNDArray_GetQueueRef(usm_ndarray arr):
-    """Get DPCTLSyclQueueRef for queue associated with the array"""
-    return arr.get_queue_ref()
-
-
-cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr):
-    """Get offset of zero-index array element from the beginning of the USM
-    allocation"""
-    return arr.get_offset()
-
-
-cdef api object UsmNDArray_GetUSMData(usm_ndarray arr):
-    """Get USM data object underlying the array"""
-    return arr.get_base()
-
-
-cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag):
-    """Set/unset USM_ARRAY_WRITABLE in the given array `arr`."""
-    arr._set_writable_flag(flag)
-
-
-cdef api object UsmNDArray_MakeSimpleFromMemory(
-    int nd, const Py_ssize_t *shape, int typenum,
-    c_dpmem._Memory mobj, Py_ssize_t offset, char order
-):
-    """Create contiguous usm_ndarray.
-
-    Args:
-        nd: number of dimensions (non-negative)
-        shape: array of nd non-negative array's sizes along each dimension
-        typenum: array elemental type number
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        offset: distance between element with zero multi-index and the
-                start of allocation
-        order: Memory layout of the array. Use 'C' for C-contiguous or
-               row-major layout; 'F' for F-contiguous or column-major layout
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef object shape_tuple = _make_int_tuple(nd, <Py_ssize_t *>shape)
-    cdef usm_ndarray arr = usm_ndarray(
-        shape_tuple,
-        dtype=_make_typestr(typenum),
-        buffer=mobj,
-        offset=offset,
-        order=<bytes>(order)
-    )
-    return arr
-
-
-cdef api object UsmNDArray_MakeSimpleFromPtr(
-    size_t nelems,
-    int typenum,
-    c_dpctl.DPCTLSyclUSMRef ptr,
-    c_dpctl.DPCTLSyclQueueRef QRef,
-    object owner
-):
-    """Create 1D contiguous usm_ndarray from pointer.
-
-    Args:
-        nelems: number of elements in array
-        typenum: array elemental type number
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        owner: Python object managing lifetime of USM allocation.
-               Value None implies transfer of USM allocation ownership
-               to the created array object.
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef int itemsize = type_bytesize(typenum)
-    if (itemsize < 1):
-        raise ValueError(
-            "dtype with typenum=" + str(typenum) + " is not supported."
-        )
-    cdef size_t nbytes = (<size_t> itemsize) * nelems
-    cdef c_dpmem._Memory mobj
-    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-        ptr, nbytes, QRef, memory_owner=owner
-    )
-    cdef usm_ndarray arr = usm_ndarray(
-        (nelems,),
-        dtype=_make_typestr(typenum),
-        buffer=mobj
-    )
-    return arr
-
-cdef api object UsmNDArray_MakeFromPtr(
-    int nd,
-    const Py_ssize_t *shape,
-    int typenum,
-    const Py_ssize_t *strides,
-    c_dpctl.DPCTLSyclUSMRef ptr,
-    c_dpctl.DPCTLSyclQueueRef QRef,
-    Py_ssize_t offset,
-    object owner
-):
-    """
-    General usm_ndarray constructor from externally made USM-allocation.
-
-    Args:
-        nd: number of dimensions (non-negative)
-        shape: array of nd non-negative array's sizes along each dimension
-        typenum: array elemental type number
-        strides: array of nd strides along each dimension in elements
-        ptr: pointer to the start of allocation
-        QRef: DPCTLSyclQueueRef associated with the allocation
-        offset: distance between element with zero multi-index and the
-                start of allocation
-        owner: Python object managing lifetime of USM allocation.
-               Value None implies transfer of USM allocation ownership
-               to the created array object.
-    Returns:
-        Created usm_ndarray instance
-    """
-    cdef int itemsize = type_bytesize(typenum)
-    cdef size_t nelems = 1
-    cdef Py_ssize_t min_disp = 0
-    cdef Py_ssize_t max_disp = 0
-    cdef Py_ssize_t step_ = 0
-    cdef Py_ssize_t dim_ = 0
-    cdef it = 0
-    cdef c_dpmem._Memory mobj
-    cdef usm_ndarray arr
-    cdef object obj_shape
-    cdef object obj_strides
-
-    if (itemsize < 1):
-        raise ValueError(
-            "dtype with typenum=" + str(typenum) + " is not supported."
-        )
-    if (nd < 0):
-        raise ValueError("Dimensionality must be non-negative")
-    if (ptr is NULL or QRef is NULL):
-        raise ValueError(
-            "Non-null USM allocation pointer and QRef are expected"
-        )
-    if (nd == 0):
-        # case of 0d scalars
-        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-            ptr, itemsize, QRef, memory_owner=owner
-        )
-        arr = usm_ndarray(
-            tuple(),
-            dtype=_make_typestr(typenum),
-            buffer=mobj
-        )
-        return arr
-    if (shape is NULL or strides is NULL):
-        raise ValueError("Both shape and stride vectors are required")
-    for it in range(nd):
-        dim_ = shape[it]
-        if dim_ < 0:
-            raise ValueError(
-                f"Dimension along axis {it} must be non-negative"
-            )
-        nelems *= dim_
-        if dim_ > 0:
-            step_ = strides[it]
-            if step_ > 0:
-                max_disp += step_ * (dim_ - 1)
-            else:
-                min_disp += step_ * (dim_ - 1)
-
-    obj_shape = _make_int_tuple(nd, shape)
-    obj_strides = _make_int_tuple(nd, strides)
-    if nelems == 0:
-        mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-            ptr, itemsize, QRef, memory_owner=owner
-        )
-        arr = usm_ndarray(
-            obj_shape,
-            dtype=_make_typestr(typenum),
-            strides=obj_strides,
-            buffer=mobj,
-            offset=0
-        )
-        return arr
-    if offset + min_disp < 0:
-        raise ValueError(
-            "Given shape, strides and offset reference out-of-bound memory"
-        )
-    nbytes = (<size_t> itemsize) * (offset + max_disp + 1)
-    mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
-        ptr, nbytes, QRef, memory_owner=owner
-    )
-    arr = usm_ndarray(
-        obj_shape,
-        dtype=_make_typestr(typenum),
-        strides=obj_strides,
-        buffer=mobj,
-        offset=offset
-    )
-    return arr
-
-
-def _is_object_with_buffer_protocol(o):
-    "Returns True if object supports Python buffer protocol"
-    return _is_buffer(o)
diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py
deleted file mode 100644
index 3ccf283dbf..0000000000
--- a/dpctl/tensor/_utility_functions.py
+++ /dev/null
@@ -1,491 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import builtins
-import operator
-
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-import dpctl.tensor._tensor_reductions_impl as tri
-import dpctl.utils as du
-
-from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
-from ._scalar_utils import (
-    _get_dtype,
-    _get_queue_usm_type,
-    _get_shape,
-    _validate_dtype,
-)
-from ._type_utils import (
-    _resolve_one_strong_one_weak_types,
-    _resolve_one_strong_two_weak_types,
-)
-
-
-def _boolean_reduction(x, axis, keepdims, func):
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
-
-    nd = x.ndim
-    if axis is None:
-        red_nd = nd
-        # case of a scalar
-        if red_nd == 0:
-            return dpt.astype(x, dpt.bool)
-        x_tmp = x
-        res_shape = tuple()
-        perm = list(range(nd))
-    else:
-        if not isinstance(axis, (tuple, list)):
-            axis = (axis,)
-        axis = normalize_axis_tuple(axis, nd, "axis")
-
-        red_nd = len(axis)
-        # check for axis=()
-        if red_nd == 0:
-            return dpt.astype(x, dpt.bool)
-        perm = [i for i in range(nd) if i not in axis] + list(axis)
-        x_tmp = dpt.permute_dims(x, perm)
-        res_shape = x_tmp.shape[: nd - red_nd]
-
-    exec_q = x.sycl_queue
-    res_usm_type = x.usm_type
-
-    _manager = du.SequentialOrderManager[exec_q]
-    dep_evs = _manager.submitted_events
-    # always allocate the temporary as
-    # int32 and usm-device  to ensure that atomic updates
-    # are supported
-    res_tmp = dpt.empty(
-        res_shape,
-        dtype=dpt.int32,
-        usm_type="device",
-        sycl_queue=exec_q,
-    )
-    hev0, ev0 = func(
-        src=x_tmp,
-        trailing_dims_to_reduce=red_nd,
-        dst=res_tmp,
-        sycl_queue=exec_q,
-        depends=dep_evs,
-    )
-    _manager.add_event_pair(hev0, ev0)
-
-    # copy to boolean result array
-    res = dpt.empty(
-        res_shape,
-        dtype=dpt.bool,
-        usm_type=res_usm_type,
-        sycl_queue=exec_q,
-    )
-    hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0]
-    )
-    _manager.add_event_pair(hev1, ev1)
-
-    if keepdims:
-        res_shape = res_shape + (1,) * red_nd
-        inv_perm = sorted(range(nd), key=lambda d: perm[d])
-        res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
-    return res
-
-
-def all(x, /, *, axis=None, keepdims=False):
-    """
-    all(x, axis=None, keepdims=False)
-
-    Tests whether all input array elements evaluate to True along a given axis.
-
-    Args:
-        x (usm_ndarray): Input array.
-        axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes)
-            along which to perform a logical AND reduction.
-            When `axis` is `None`, a logical AND reduction
-            is performed over all dimensions of `x`.
-            If `axis` is negative, the axis is counted from
-            the last dimension to the first.
-            Default: `None`.
-        keepdims (bool, optional): If `True`, the reduced axes are included
-            in the result as singleton dimensions, and the result is
-            broadcastable to the input array shape.
-            If `False`, the reduced axes are not included in the result.
-            Default: `False`.
-
-    Returns:
-        usm_ndarray:
-            An array with a data type of `bool`
-            containing the results of the logical AND reduction.
-    """
-    return _boolean_reduction(x, axis, keepdims, tri._all)
-
-
-def any(x, /, *, axis=None, keepdims=False):
-    """
-    any(x, axis=None, keepdims=False)
-
-    Tests whether any input array elements evaluate to True along a given axis.
-
-    Args:
-        x (usm_ndarray): Input array.
-        axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes)
-            along which to perform a logical OR reduction.
-            When `axis` is `None`, a logical OR reduction
-            is performed over all dimensions of `x`.
-            If `axis` is negative, the axis is counted from
-            the last dimension to the first.
-            Default: `None`.
-        keepdims (bool, optional): If `True`, the reduced axes are included
-            in the result as singleton dimensions, and the result is
-            broadcastable to the input array shape.
-            If `False`, the reduced axes are not included in the result.
-            Default: `False`.
-
-    Returns:
-        usm_ndarray:
-            An array with a data type of `bool`
-            containing the results of the logical OR reduction.
-    """
-    return _boolean_reduction(x, axis, keepdims, tri._any)
-
-
-def _validate_diff_shape(sh1, sh2, axis):
-    """Utility for validating that two shapes `sh1` and `sh2`
-    are possible to concatenate along `axis`."""
-    if not sh2:
-        # scalars will always be accepted
-        return True
-    else:
-        sh1_ndim = len(sh1)
-        if sh1_ndim == len(sh2) and builtins.all(
-            sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis
-        ):
-            return True
-        else:
-            return False
-
-
-def _concat_diff_input(arr, axis, prepend, append):
-    """
-    Concatenates `arr`, `prepend` and, `append` along `axis`,
-    where `arr` is an array and `prepend` and `append` are
-    any mixture of arrays and scalars.
-    """
-    if prepend is not None and append is not None:
-        q1, x_usm_type = arr.sycl_queue, arr.usm_type
-        q2, prepend_usm_type = _get_queue_usm_type(prepend)
-        q3, append_usm_type = _get_queue_usm_type(append)
-        if q2 is None and q3 is None:
-            exec_q = q1
-            coerced_usm_type = x_usm_type
-        elif q3 is None:
-            exec_q = du.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise du.ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            coerced_usm_type = du.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    prepend_usm_type,
-                )
-            )
-        elif q2 is None:
-            exec_q = du.get_execution_queue((q1, q3))
-            if exec_q is None:
-                raise du.ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            coerced_usm_type = du.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    append_usm_type,
-                )
-            )
-        else:
-            exec_q = du.get_execution_queue((q1, q2, q3))
-            if exec_q is None:
-                raise du.ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            coerced_usm_type = du.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    prepend_usm_type,
-                    append_usm_type,
-                )
-            )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
-        arr_shape = arr.shape
-        prepend_shape = _get_shape(prepend)
-        append_shape = _get_shape(append)
-        if not builtins.all(
-            isinstance(s, (tuple, list))
-            for s in (
-                prepend_shape,
-                append_shape,
-            )
-        ):
-            raise TypeError(
-                "Shape of arguments can not be inferred. "
-                "Arguments are expected to be "
-                "lists, tuples, or both"
-            )
-        valid_prepend_shape = _validate_diff_shape(
-            arr_shape, prepend_shape, axis
-        )
-        if not valid_prepend_shape:
-            raise ValueError(
-                f"`diff` argument `prepend` with shape {prepend_shape} is "
-                f"invalid for first input with shape {arr_shape}"
-            )
-        valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis)
-        if not valid_append_shape:
-            raise ValueError(
-                f"`diff` argument `append` with shape {append_shape} is invalid"
-                f" for first input with shape {arr_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        arr_dtype = arr.dtype
-        prepend_dtype = _get_dtype(prepend, sycl_dev)
-        append_dtype = _get_dtype(append, sycl_dev)
-        if not builtins.all(
-            _validate_dtype(o) for o in (prepend_dtype, append_dtype)
-        ):
-            raise ValueError("Operands have unsupported data types")
-        prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types(
-            arr_dtype, prepend_dtype, append_dtype, sycl_dev
-        )
-        if isinstance(prepend, dpt.usm_ndarray):
-            a_prepend = prepend
-        else:
-            a_prepend = dpt.asarray(
-                prepend,
-                dtype=prepend_dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        if isinstance(append, dpt.usm_ndarray):
-            a_append = append
-        else:
-            a_append = dpt.asarray(
-                append,
-                dtype=append_dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        if not prepend_shape:
-            prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
-        if not append_shape:
-            append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_append = dpt.broadcast_to(a_append, append_shape)
-        return dpt.concat((a_prepend, arr, a_append), axis=axis)
-    elif prepend is not None:
-        q1, x_usm_type = arr.sycl_queue, arr.usm_type
-        q2, prepend_usm_type = _get_queue_usm_type(prepend)
-        if q2 is None:
-            exec_q = q1
-            coerced_usm_type = x_usm_type
-        else:
-            exec_q = du.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise du.ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            coerced_usm_type = du.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    prepend_usm_type,
-                )
-            )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
-        arr_shape = arr.shape
-        prepend_shape = _get_shape(prepend)
-        if not isinstance(prepend_shape, (tuple, list)):
-            raise TypeError(
-                "Shape of argument can not be inferred. "
-                "Argument is expected to be a "
-                "list or tuple"
-            )
-        valid_prepend_shape = _validate_diff_shape(
-            arr_shape, prepend_shape, axis
-        )
-        if not valid_prepend_shape:
-            raise ValueError(
-                f"`diff` argument `prepend` with shape {prepend_shape} is "
-                f"invalid for first input with shape {arr_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        arr_dtype = arr.dtype
-        prepend_dtype = _get_dtype(prepend, sycl_dev)
-        if not _validate_dtype(prepend_dtype):
-            raise ValueError("Operand has unsupported data type")
-        prepend_dtype = _resolve_one_strong_one_weak_types(
-            arr_dtype, prepend_dtype, sycl_dev
-        )
-        if isinstance(prepend, dpt.usm_ndarray):
-            a_prepend = prepend
-        else:
-            a_prepend = dpt.asarray(
-                prepend,
-                dtype=prepend_dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        if not prepend_shape:
-            prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
-        return dpt.concat((a_prepend, arr), axis=axis)
-    elif append is not None:
-        q1, x_usm_type = arr.sycl_queue, arr.usm_type
-        q2, append_usm_type = _get_queue_usm_type(append)
-        if q2 is None:
-            exec_q = q1
-            coerced_usm_type = x_usm_type
-        else:
-            exec_q = du.get_execution_queue((q1, q2))
-            if exec_q is None:
-                raise du.ExecutionPlacementError(
-                    "Execution placement can not be unambiguously inferred "
-                    "from input arguments."
-                )
-            coerced_usm_type = du.get_coerced_usm_type(
-                (
-                    x_usm_type,
-                    append_usm_type,
-                )
-            )
-        du.validate_usm_type(coerced_usm_type, allow_none=False)
-        arr_shape = arr.shape
-        append_shape = _get_shape(append)
-        if not isinstance(append_shape, (tuple, list)):
-            raise TypeError(
-                "Shape of argument can not be inferred. "
-                "Argument is expected to be a "
-                "list or tuple"
-            )
-        valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis)
-        if not valid_append_shape:
-            raise ValueError(
-                f"`diff` argument `append` with shape {append_shape} is invalid"
-                f" for first input with shape {arr_shape}"
-            )
-        sycl_dev = exec_q.sycl_device
-        arr_dtype = arr.dtype
-        append_dtype = _get_dtype(append, sycl_dev)
-        if not _validate_dtype(append_dtype):
-            raise ValueError("Operand has unsupported data type")
-        append_dtype = _resolve_one_strong_one_weak_types(
-            arr_dtype, append_dtype, sycl_dev
-        )
-        if isinstance(append, dpt.usm_ndarray):
-            a_append = append
-        else:
-            a_append = dpt.asarray(
-                append,
-                dtype=append_dtype,
-                usm_type=coerced_usm_type,
-                sycl_queue=exec_q,
-            )
-        if not append_shape:
-            append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
-            a_append = dpt.broadcast_to(a_append, append_shape)
-        return dpt.concat((arr, a_append), axis=axis)
-    else:
-        arr1 = arr
-    return arr1
-
-
-def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
-    """
-    Calculates the `n`-th discrete forward difference of `x` along `axis`.
-
-    Args:
-        x (usm_ndarray):
-            input array.
-        axis (int):
-            axis along which to compute the difference. A valid axis must be on
-            the interval `[-N, N)`, where `N` is the rank (number of
-            dimensions) of `x`.
-            Default: `-1`
-        n (int):
-            number of times to recursively compute the difference.
-            Default: `1`.
-        prepend (Union[usm_ndarray, bool, int, float, complex]):
-            value or values to prepend to the specified axis before taking the
-            difference.
-            Must have the same shape as `x` except along `axis`, which can have
-            any shape.
-            Default: `None`.
-        append (Union[usm_ndarray, bool, int, float, complex]):
-            value or values to append to the specified axis before taking the
-            difference.
-            Must have the same shape as `x` except along `axis`, which can have
-            any shape.
-            Default: `None`.
-
-    Returns:
-        usm_ndarray:
-            an array containing the `n`-th differences. The array will have the
-            same shape as `x`, except along `axis`, which will have shape:
-            ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n``
-
-            The data type of the returned array is determined by the Type
-            Promotion Rules.
-    """
-
-    if not isinstance(x, dpt.usm_ndarray):
-        raise TypeError(
-            "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(x)}"
-        )
-    x_nd = x.ndim
-    axis = normalize_axis_index(operator.index(axis), x_nd)
-    n = operator.index(n)
-    if n < 0:
-        raise ValueError(f"`n` must be positive, got {n}")
-    arr = _concat_diff_input(x, axis, prepend, append)
-    if n == 0:
-        return arr
-    # form slices and recurse
-    sl0 = tuple(
-        slice(None) if i != axis else slice(1, None) for i in range(x_nd)
-    )
-    sl1 = tuple(
-        slice(None) if i != axis else slice(None, -1) for i in range(x_nd)
-    )
-
-    diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract
-    if n > 1:
-        arr_tmp0 = diff_op(arr[sl0], arr[sl1])
-        arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1])
-        n = n - 2
-        if n > 0:
-            sl3 = tuple(
-                slice(None) if i != axis else slice(None, -2)
-                for i in range(x_nd)
-            )
-            for _ in range(n):
-                arr_tmp0_sliced = arr_tmp0[sl3]
-                diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced)
-                arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced
-        arr = arr_tmp1
-    else:
-        arr = diff_op(arr[sl0], arr[sl1])
-    return arr
diff --git a/dpctl/tensor/include/dlpack/.clang-format b/dpctl/tensor/include/dlpack/.clang-format
deleted file mode 100644
index 9d159247d5..0000000000
--- a/dpctl/tensor/include/dlpack/.clang-format
+++ /dev/null
@@ -1,2 +0,0 @@
-DisableFormat: true
-SortIncludes: false
diff --git a/dpctl/tensor/include/dlpack/LICENSE.third-party b/dpctl/tensor/include/dlpack/LICENSE.third-party
deleted file mode 100644
index 20a9c8a7b4..0000000000
--- a/dpctl/tensor/include/dlpack/LICENSE.third-party
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "{}"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2017 by Contributors
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/dpctl/tensor/include/dlpack/README.md b/dpctl/tensor/include/dlpack/README.md
deleted file mode 100644
index 3a7bc6d422..0000000000
--- a/dpctl/tensor/include/dlpack/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# DLPack header
-
-The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remote at tag v1.0rc commit [`62100c1`](https://github.com/dmlc/dlpack/commit/62100c123144ae7a80061f4220be2dbd3cbaefc7).
-
-The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/62100c123144ae7a80061f4220be2dbd3cbaefc7/include/dlpack/dlpack.h
-
-License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE
diff --git a/dpctl/tensor/include/dlpack/dlpack.h b/dpctl/tensor/include/dlpack/dlpack.h
deleted file mode 100644
index 9a710ebde6..0000000000
--- a/dpctl/tensor/include/dlpack/dlpack.h
+++ /dev/null
@@ -1,639 +0,0 @@
-/*!
- *  Copyright (c) 2017 -  by Contributors
- * \file dlpack.h
- * \brief The common header of DLPack.
- */
-#ifndef DLPACK_DLPACK_H_
-#define DLPACK_DLPACK_H_
-
-/**
- * \brief Compatibility with C++
- */
-#ifdef __cplusplus
-#define DLPACK_EXTERN_C extern "C"
-#else
-#define DLPACK_EXTERN_C
-#endif
-
-/*! \brief The current major version of dlpack */
-#define DLPACK_MAJOR_VERSION 1
-
-/*! \brief The current minor version of dlpack */
-#define DLPACK_MINOR_VERSION 2
-
-/*! \brief DLPACK_DLL prefix for windows */
-#ifdef _WIN32
-#ifdef DLPACK_EXPORTS
-#define DLPACK_DLL __declspec(dllexport)
-#else
-#define DLPACK_DLL __declspec(dllimport)
-#endif
-#else
-#define DLPACK_DLL
-#endif
-
-#include <stdint.h>
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!
- * \brief The DLPack version.
- *
- * A change in major version indicates that we have changed the
- * data layout of the ABI - DLManagedTensorVersioned.
- *
- * A change in minor version indicates that we have added new
- * code, such as a new device type, but the ABI is kept the same.
- *
- * If an obtained DLPack tensor has a major version that disagrees
- * with the version number specified in this header file
- * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
- * (and it is safe to do so). It is not safe to access any other fields
- * as the memory layout will have changed.
- *
- * In the case of a minor version mismatch, the tensor can be safely used as
- * long as the consumer knows how to interpret all fields. Minor version
- * updates indicate the addition of enumeration values.
- */
-typedef struct {
-  /*! \brief DLPack major version. */
-  uint32_t major;
-  /*! \brief DLPack minor version. */
-  uint32_t minor;
-} DLPackVersion;
-
-/*!
- * \brief The device type in DLDevice.
- */
-#ifdef __cplusplus
-typedef enum : int32_t {
-#else
-typedef enum {
-#endif
-  /*! \brief CPU device */
-  kDLCPU = 1,
-  /*! \brief CUDA GPU device */
-  kDLCUDA = 2,
-  /*!
-   * \brief Pinned CUDA CPU memory by cudaMallocHost
-   */
-  kDLCUDAHost = 3,
-  /*! \brief OpenCL devices. */
-  kDLOpenCL = 4,
-  /*! \brief Vulkan buffer for next generation graphics. */
-  kDLVulkan = 7,
-  /*! \brief Metal for Apple GPU. */
-  kDLMetal = 8,
-  /*! \brief Verilog simulator buffer */
-  kDLVPI = 9,
-  /*! \brief ROCm GPUs for AMD GPUs */
-  kDLROCM = 10,
-  /*!
-   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
-   */
-  kDLROCMHost = 11,
-  /*!
-   * \brief Reserved extension device type,
-   * used for quickly test extension device
-   * The semantics can differ depending on the implementation.
-   */
-  kDLExtDev = 12,
-  /*!
-   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
-   */
-  kDLCUDAManaged = 13,
-  /*!
-   * \brief Unified shared memory allocated on a oneAPI non-partititioned
-   * device. Call to oneAPI runtime is required to determine the device
-   * type, the USM allocation type and the sycl context it is bound to.
-   *
-   */
-  kDLOneAPI = 14,
-  /*! \brief GPU support for next generation WebGPU standard. */
-  kDLWebGPU = 15,
-  /*! \brief Qualcomm Hexagon DSP */
-  kDLHexagon = 16,
-  /*! \brief Microsoft MAIA devices */
-  kDLMAIA = 17,
-  /*! \brief AWS Trainium */
-  kDLTrn = 18,
-} DLDeviceType;
-
-/*!
- * \brief A Device for Tensor and operator.
- */
-typedef struct {
-  /*! \brief The device type used in the device. */
-  DLDeviceType device_type;
-  /*!
-   * \brief The device index.
-   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
-   */
-  int32_t device_id;
-} DLDevice;
-
-/*!
- * \brief The type code options DLDataType.
- */
-typedef enum {
-  /*! \brief signed integer */
-  kDLInt = 0U,
-  /*! \brief unsigned integer */
-  kDLUInt = 1U,
-  /*! \brief IEEE floating point */
-  kDLFloat = 2U,
-  /*!
-   * \brief Opaque handle type, reserved for testing purposes.
-   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
-   */
-  kDLOpaqueHandle = 3U,
-  /*! \brief bfloat16 */
-  kDLBfloat = 4U,
-  /*!
-   * \brief complex number
-   * (C/C++/Python layout: compact struct per complex number)
-   */
-  kDLComplex = 5U,
-  /*! \brief boolean */
-  kDLBool = 6U,
-  /*! \brief FP8 data types */
-  kDLFloat8_e3m4 = 7U,
-  kDLFloat8_e4m3 = 8U,
-  kDLFloat8_e4m3b11fnuz = 9U,
-  kDLFloat8_e4m3fn = 10U,
-  kDLFloat8_e4m3fnuz = 11U,
-  kDLFloat8_e5m2 = 12U,
-  kDLFloat8_e5m2fnuz = 13U,
-  kDLFloat8_e8m0fnu = 14U,
-  /*! \brief FP6 data types
-   * Setting bits != 6 is currently unspecified, and the producer must ensure it is set
-   * while the consumer must stop importing if the value is unexpected.
-   */
-  kDLFloat6_e2m3fn = 15U,
-  kDLFloat6_e3m2fn = 16U,
-  /*! \brief FP4 data types
-   * Setting bits != 4 is currently unspecified, and the producer must ensure it is set
-   * while the consumer must stop importing if the value is unexpected.
-   */
-  kDLFloat4_e2m1fn = 17U,
-} DLDataTypeCode;
-
-/*!
- * \brief The data type the tensor can hold. The data type is assumed to follow the
- * native endian-ness. An explicit error message should be raised when attempting to
- * export an array with non-native endianness
- *
- *  Examples
- *   - float: type_code = 2, bits = 32, lanes = 1
- *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
- *   - int8: type_code = 0, bits = 8, lanes = 1
- *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
- *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits)
- *   - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
- *   - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
- *   - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
- *
- *  When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e.,
- *  for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element.
- */
-typedef struct {
-  /*!
-   * \brief Type code of base types.
-   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
-   * footprint, but the value should be one of DLDataTypeCode enum values.
-   * */
-  uint8_t code;
-  /*!
-   * \brief Number of bits, common choices are 8, 16, 32.
-   */
-  uint8_t bits;
-  /*! \brief Number of lanes in the type, used for vector types. */
-  uint16_t lanes;
-} DLDataType;
-
-/*!
- * \brief Plain C Tensor object, does not manage memory.
- */
-typedef struct {
-  /*!
-   * \brief The data pointer points to the allocated data. This will be CUDA
-   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
-   * types. This pointer is always aligned to 256 bytes as in CUDA. The
-   * `byte_offset` field should be used to point to the beginning of the data.
-   *
-   * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
-   * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
-   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
-   * (after which this note will be updated); at the moment it is recommended
-   * to not rely on the data pointer being correctly aligned.
-   *
-   * For given DLTensor, the size of memory required to store the contents of
-   * data is calculated as follows:
-   *
-   * \code{.c}
-   * static inline size_t GetDataSize(const DLTensor* t) {
-   *   size_t size = 1;
-   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
-   *     size *= t->shape[i];
-   *   }
-   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
-   *   return size;
-   * }
-   * \endcode
-   *
-   * Note that if the tensor is of size zero, then the data pointer should be
-   * set to `NULL`.
-   */
-  void* data;
-  /*! \brief The device of the tensor */
-  DLDevice device;
-  /*! \brief Number of dimensions */
-  int32_t ndim;
-  /*! \brief The data type of the pointer*/
-  DLDataType dtype;
-  /*!
-   * \brief The shape of the tensor
-   *
-   *  When ndim == 0, shape can be set to NULL.
-   */
-  int64_t* shape;
-  /*!
-   * \brief strides of the tensor (in number of elements, not bytes),
-   *  can not be NULL if ndim != 0, must points to
-   *  an array of ndim elements that specifies the strides,
-   *  so consumer can always rely on strides[dim] being valid for 0 <= dim < ndim.
-   *
-   *  When ndim == 0, strides can be set to NULL.
-   *
-   *  \note Before DLPack v1.2, strides can be NULL to indicate contiguous data.
-   *        This is not allowed in DLPack v1.2 and later. The rationale
-   *        is to simplify the consumer handling.
-   */
-  int64_t* strides;
-  /*! \brief The offset in bytes to the beginning pointer to data */
-  uint64_t byte_offset;
-} DLTensor;
-
-/*!
- * \brief C Tensor object, manage memory of DLTensor. This data structure is
- *  intended to facilitate the borrowing of DLTensor by another framework. It is
- *  not meant to transfer the tensor. When the borrowing framework doesn't need
- *  the tensor, it should call the deleter to notify the host that the resource
- *  is no longer needed.
- *
- * \note This data structure is used as Legacy DLManagedTensor
- *       in DLPack exchange and is deprecated after DLPack v0.8
- *       Use DLManagedTensorVersioned instead.
- *       This data structure may get renamed or deleted in future versions.
- *
- * \sa DLManagedTensorVersioned
- */
-typedef struct DLManagedTensor {
-  /*! \brief DLTensor which is being memory managed */
-  DLTensor dl_tensor;
-  /*! \brief the context of the original host framework of DLManagedTensor in
-   *   which DLManagedTensor is used in the framework. It can also be NULL.
-   */
-  void * manager_ctx;
-  /*!
-   * \brief Destructor - this should be called
-   * to destruct the manager_ctx  which backs the DLManagedTensor. It can be
-   * NULL if there is no way for the caller to provide a reasonable destructor.
-   * The destructor deletes the argument self as well.
-   */
-  void (*deleter)(struct DLManagedTensor * self);
-} DLManagedTensor;
-
-// bit masks used in the DLManagedTensorVersioned
-
-/*! \brief bit mask to indicate that the tensor is read only. */
-#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
-
-/*!
- * \brief bit mask to indicate that the tensor is a copy made by the producer.
- *
- * If set, the tensor is considered solely owned throughout its lifetime by the
- * consumer, until the producer-provided deleter is invoked.
- */
-#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
-
-/*!
- * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
- *
- * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
- * be set by the producer to signal that a tensor of sub-byte type is padded.
- */
-#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
-
-/*!
- * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
- *
- * This data structure is intended to facilitate the borrowing of DLTensor by
- * another framework. It is not meant to transfer the tensor. When the borrowing
- * framework doesn't need the tensor, it should call the deleter to notify the
- * host that the resource is no longer needed.
- *
- * \note This is the current standard DLPack exchange data structure.
- */
-typedef struct DLManagedTensorVersioned {
-  /*!
-   * \brief The API and ABI version of the current managed Tensor
-   */
-  DLPackVersion version;
-  /*!
-   * \brief the context of the original host framework.
-   *
-   * Stores DLManagedTensorVersioned is used in the
-   * framework. It can also be NULL.
-   */
-  void *manager_ctx;
-  /*!
-   * \brief Destructor.
-   *
-   * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
-   * It can be NULL if there is no way for the caller to provide a reasonable
-   * destructor. The destructor deletes the argument self as well.
-   */
-  void (*deleter)(struct DLManagedTensorVersioned *self);
-  /*!
-   * \brief Additional bitmask flags information about the tensor.
-   *
-   * By default the flags should be set to 0.
-   *
-   * \note Future ABI changes should keep everything until this field
-   *       stable, to ensure that deleter can be correctly called.
-   *
-   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
-   * \sa DLPACK_FLAG_BITMASK_IS_COPIED
-   */
-  uint64_t flags;
-  /*! \brief DLTensor which is being memory managed */
-  DLTensor dl_tensor;
-} DLManagedTensorVersioned;
-
-//----------------------------------------------------------------------
-// DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions
-//----------------------------------------------------------------------
-/*!
- * \brief Request a producer library to create a new tensor.
- *
- * Create a new `DLManagedTensorVersioned` within the context of the producer
- * library. The allocation is defined via the prototype DLTensor.
- *
- * This function is exposed by the framework through the DLPackExchangeAPI.
- *
- * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
- *        and device fields are used.
- * \param out The output DLManagedTensorVersioned.
- * \param error_ctx Context for `SetError`.
- * \param SetError The function to set the error.
- * \return The owning DLManagedTensorVersioned* or NULL on failure.
- *         SetError is called exactly when NULL is returned (the implementor
- *         must ensure this).
- * \note - As a C function, must not thrown C++ exceptions.
- *       - Error propagation via SetError to avoid any direct need
- *         of Python API. Due to this `SetError` may have to ensure the GIL is
- *         held since it will presumably set a Python error.
- *
- * \sa DLPackExchangeAPI
- */
-typedef int (*DLPackManagedTensorAllocator)(                                         //
-  DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,              //
-  void (*SetError)(void* error_ctx, const char* kind, const char* message)           //
-);
-
-/*!
- * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
- *
- * This function does not perform any stream synchronization. The consumer should query
- * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
- *
- * This function is exposed by the framework through the DLPackExchangeAPI.
- *
- * \param py_object The Python object to convert. Must have the same type
- *        as the one the `DLPackExchangeAPI` was discovered from.
- * \return The owning DLManagedTensorVersioned* or NULL on failure with a
- *         Python exception set. If the data cannot be described using DLPack
- *         this should be a BufferError if possible.
- * \note - As a C function, must not thrown C++ exceptions.
- *
- * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
- */
-typedef int (*DLPackManagedTensorFromPyObjectNoSync)(                 //
-  void* py_object,                                                    //
-  DLManagedTensorVersioned** out                                      //
-);
-
-/*!
- * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
- *
- * This function provides a faster interface for temporary, non-owning, exchange.
- * The producer (implementor) still owns the memory of data, strides, shape.
- * The liveness of the DLTensor and the data it views is only guaranteed until
- * control is returned.
- *
- * This function currently assumes that the producer (implementor) can fill
- * in the DLTensor shape and strides without the need for temporary allocations.
- *
- * This function does not perform any stream synchronization. The consumer should query
- * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
- *
- * This function is exposed by the framework through the DLPackExchangeAPI.
- *
- * \param py_object The Python object to convert. Must have the same type
- *        as the one the `DLPackExchangeAPI` was discovered from.
- * \param out The output DLTensor, whose space is pre-allocated on stack.
- * \return 0 on success, -1 on failure with a Python exception set.
- * \note - As a C function, must not thrown C++ exceptions.
- *
- * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
- */
-typedef int (*DLPackDLTensorFromPyObjectNoSync)(                      //
-  void* py_object,                                                    //
-  DLTensor* out                                                       //
-);
-
-/*!
- * \brief Obtain the current work stream of a device.
- *
- * Obtain the current work stream of a device from the producer framework.
- * For example, it should map to torch.cuda.current_stream in PyTorch.
- *
- * When device_type is kDLCPU, the consumer do not have to query the stream
- * and the producer can simply return NULL when queried.
- * The consumer do not have to do anything on stream sync or setting.
- * So CPU only framework can just provide a dummy implementation that
- * always set out_current_stream[0] to NULL.
- *
- * \param device_type The device type.
- * \param device_id The device id.
- * \param out_current_stream The output current work stream.
- *
- * \return 0 on success, -1 on failure with a Python exception set.
- * \note - As a C function, must not thrown C++ exceptions.
- *
- * \sa DLPackExchangeAPI
- */
-typedef int (*DLPackCurrentWorkStream)(                         //
-  DLDeviceType device_type,                                     //
-  int32_t device_id,                                            //
-  void** out_current_stream                                     //
-);
-
-/*!
- * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
- *
- * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
- * producer (implementor) library with the correct type.
- *
- * This function does not perform any stream synchronization.
- *
- * This function is exposed by the framework through the DLPackExchangeAPI.
- *
- * \param tensor The DLManagedTensorVersioned to convert the ownership of the
- *        tensor is stolen.
- * \param out_py_object The output Python object.
- * \return 0 on success, -1 on failure with a Python exception set.
- *
- * \sa DLPackExchangeAPI
- */
-typedef int (*DLPackManagedTensorToPyObjectNoSync)(                //
-  DLManagedTensorVersioned* tensor,                                //
-  void** out_py_object                                             //
-);
-
-/*!
- * \brief DLPackExchangeAPI stable header.
- * \sa DLPackExchangeAPI
- */
-typedef struct DLPackExchangeAPIHeader {
-  /*!
-   * \brief The provided DLPack version the consumer must check major version
-   *        compatibility before using this struct.
-   */
-  DLPackVersion version;
-  /*!
-   * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
-   *
-   * It must be NULL if the framework does not support older versions.
-   * If the current major version is larger than the one supported by the
-   * consumer, the consumer may walk this to find an earlier supported version.
-   *
-   * \sa DLPackExchangeAPI
-   */
-  struct DLPackExchangeAPIHeader* prev_api;
-} DLPackExchangeAPIHeader;
-
-/*!
- * \brief Framework-specific function pointers table for DLPack exchange.
- *
- * Additionally to `__dlpack__()` we define a C function table sharable by
- * Python implementations via `__c_dlpack_exchange_api__`.
- * This attribute must be set on the type as a Python integer compatible
- * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`.
- *
- * A consumer library may use a pattern such as:
- *
- * \code
- *
- * PyObject *api_obj = type(tensor_obj).__c_dlpack_exchange_api__;  // as C-code
- * MyDLPackExchangeAPI *api = PyLong_AsVoidPtr(api_obj);
- * if (api == NULL && PyErr_Occurred()) { goto handle_error; }
- *
- * \endcode
- *
- * Note that this must be defined on the type. The consumer should look up the
- * attribute on the type and may cache the result for each unique type.
- *
- * The precise API table is given by:
- * \code
- * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
- *   MyDLPackExchangeAPI() {
- *     header.version.major = DLPACK_MAJOR_VERSION;
- *     header.version.minor = DLPACK_MINOR_VERSION;
- *     header.prev_version_api = nullptr;
- *
- *     managed_tensor_allocator = MyDLPackManagedTensorAllocator;
- *     managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync;
- *     managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync;
- *     dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync;
- *     current_work_stream = MyDLPackCurrentWorkStream;
- *  }
- *
- *  static const DLPackExchangeAPI* Global() {
- *     static MyDLPackExchangeAPI inst;
- *     return &inst;
- *  }
- * };
- * \endcode
- *
- * Guidelines for leveraging DLPackExchangeAPI:
- *
- * There are generally two kinds of consumer needs for DLPack exchange:
- * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel
- *       with the data from x, y, z. The consumer is also expected to run the kernel with the same
- *       stream context as the producer. For example, when x, y, z is torch.Tensor,
- *       consumer should query exchange_api->current_work_stream to get the
- *       current stream and launch the kernel with the same stream.
- *       This setup is necessary for no synchronization in kernel launch and maximum compatibility
- *       with CUDA graph capture in the producer.
- *       This is the desirable behavior for library extension support for frameworks like PyTorch.
- * - N1: data ingestion and retention
- *
- * Note that obj.__dlpack__() API should provide useful ways for N1.
- * The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0
- * with the support of the function pointer current_work_stream.
- *
- * Array/Tensor libraries should statically create and initialize this structure
- * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
- * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
- *
- * One simple way to do so is to create a static instance of DLPackExchangeAPI
- * within the framework and return a pointer to it. The following code
- * shows an example to do so in C++. It should also be reasonably easy
- * to do so in other languages.
- */
-typedef struct DLPackExchangeAPI {
-  /*!
-   * \brief The header that remains stable across versions.
-   */
-  DLPackExchangeAPIHeader header;
-  /*!
-   * \brief Producer function pointer for DLPackManagedTensorAllocator
-   *        This function must not be NULL.
-   * \sa DLPackManagedTensorAllocator
-   */
-  DLPackManagedTensorAllocator managed_tensor_allocator;
-  /*!
-   * \brief Producer function pointer for DLPackManagedTensorFromPyObject
-   *        This function must be not NULL.
-   * \sa DLPackManagedTensorFromPyObject
-   */
-  DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
-  /*!
-   * \brief Producer function pointer for DLPackManagedTensorToPyObject
-   *        This function must be not NULL.
-   * \sa DLPackManagedTensorToPyObject
-   */
-  DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
-  /*!
-   * \brief Producer function pointer for DLPackDLTensorFromPyObject
-   *        This function can be NULL when the producer does not support this function.
-   * \sa DLPackDLTensorFromPyObjectNoSync
-   */
-  DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
-  /*!
-   * \brief Producer function pointer for DLPackCurrentWorkStream
-   *        This function must be not NULL.
-   * \sa DLPackCurrentWorkStream
-   */
-  DLPackCurrentWorkStream current_work_stream;
-} DLPackExchangeAPI;
-
-#ifdef __cplusplus
-}  // DLPACK_EXTERN_C
-#endif
-#endif  // DLPACK_DLPACK_H_
diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
deleted file mode 100644
index 407ea9e19a..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
+++ /dev/null
@@ -1,1410 +0,0 @@
-//===  accumulators.hpp - Implementation of accumulator kernels --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for accumulators (cumulative sum, prod, etc.).
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace accumulators
-{
-
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-template <typename T> T ceiling_quotient(T n, T m) { return (n + m - 1) / m; }
-
-template <typename inputT, typename outputT> struct NonZeroIndicator
-{
-    constexpr NonZeroIndicator() {}
-
-    outputT operator()(const inputT &val) const
-    {
-        static constexpr outputT out_one(1);
-        static constexpr outputT out_zero(0);
-        static constexpr inputT val_zero(0);
-
-        return (val == val_zero) ? out_zero : out_one;
-    }
-};
-
-template <typename T> struct NoOpTransformer
-{
-    constexpr NoOpTransformer() {}
-
-    T operator()(const T &val) const { return val; }
-};
-
-template <typename srcTy, typename dstTy> struct CastTransformer
-{
-    constexpr CastTransformer() {}
-
-    dstTy operator()(const srcTy &val) const
-    {
-        using dpctl::tensor::type_utils::convert_impl;
-        return convert_impl<dstTy, srcTy>(val);
-    }
-};
-
-template <typename ScanOpT, typename T> struct needs_workaround
-{
-    // workaround needed due to crash in JITing on CPU
-    // remove when CMPLRLLVM-65813 is resolved
-    static constexpr bool value = su_ns::IsSyclLogicalAnd<T, ScanOpT>::value ||
-                                  su_ns::IsSyclLogicalOr<T, ScanOpT>::value;
-};
-
-template <typename BinOpT, typename T> struct can_use_inclusive_scan_over_group
-{
-    static constexpr bool value = sycl::has_known_identity<BinOpT, T>::value &&
-                                  !needs_workaround<BinOpT, T>::value;
-};
-
-namespace detail
-{
-template <typename T> class stack_t
-{
-    T *src_;
-    std::size_t size_;
-    T *local_scans_;
-
-public:
-    stack_t() : src_{}, size_{}, local_scans_{} {}
-    stack_t(T *src, std::size_t sz, T *local_scans)
-        : src_(src), size_(sz), local_scans_(local_scans)
-    {
-    }
-    ~stack_t(){};
-
-    T *get_src_ptr() const { return src_; }
-
-    std::size_t get_size() const { return size_; }
-
-    T *get_local_scans_ptr() const { return local_scans_; }
-};
-
-template <typename T> class stack_strided_t
-{
-    T *src_;
-    std::size_t size_;
-    T *local_scans_;
-    std::size_t local_stride_;
-
-public:
-    stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {}
-    stack_strided_t(T *src,
-                    std::size_t sz,
-                    T *local_scans,
-                    std::size_t local_stride)
-        : src_(src), size_(sz), local_scans_(local_scans),
-          local_stride_(local_stride)
-    {
-    }
-    ~stack_strided_t(){};
-
-    T *get_src_ptr() const { return src_; }
-
-    std::size_t get_size() const { return size_; }
-
-    T *get_local_scans_ptr() const { return local_scans_; }
-
-    std::size_t get_local_stride() const { return local_stride_; }
-};
-
-} // end of namespace detail
-
-// Iterative cumulative summation
-
-using nwiT = std::uint32_t;
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial>
-class inclusive_scan_iter_local_scan_blocked_krn;
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial>
-class inclusive_scan_iter_local_scan_striped_krn;
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial = false>
-sycl::event
-inclusive_scan_base_step_blocked(sycl::queue &exec_q,
-                                 const std::uint32_t wg_size,
-                                 const std::size_t iter_nelems,
-                                 const std::size_t acc_nelems,
-                                 const inputT *input,
-                                 outputT *output,
-                                 const std::size_t s0,
-                                 const std::size_t s1,
-                                 const IterIndexerT &iter_indexer,
-                                 const InpIndexerT &inp_indexer,
-                                 const OutIndexerT &out_indexer,
-                                 TransformerT transformer,
-                                 const ScanOpT &scan_op,
-                                 outputT identity,
-                                 std::size_t &acc_groups,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    acc_groups = ceiling_quotient<std::size_t>(acc_nelems, n_wi * wg_size);
-
-    sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using slmT = sycl::local_accessor<outputT, 1>;
-
-        auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size);
-        auto lws = sycl::range<1>(wg_size);
-
-        auto ndRange = sycl::nd_range<1>(gws, lws);
-
-        slmT slm_iscan_tmp(lws, cgh);
-
-        using KernelName = inclusive_scan_iter_local_scan_blocked_krn<
-            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
-            TransformerT, ScanOpT, include_initial>;
-
-        cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
-                                                      std::move(slm_iscan_tmp)](
-                                                  sycl::nd_item<1> it) {
-            const std::size_t gid = it.get_global_id(0);
-            const std::size_t lid = it.get_local_id(0);
-
-            const std::uint32_t wg_size = it.get_local_range(0);
-            const std::size_t reduce_chunks = acc_groups * wg_size;
-            const std::size_t iter_gid = gid / reduce_chunks;
-            const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks);
-
-            const std::size_t i = chunk_gid * n_wi;
-            const auto &iter_offsets = iter_indexer(iter_gid);
-            const auto &inp_iter_offset = iter_offsets.get_first_offset();
-            const auto &out_iter_offset = iter_offsets.get_second_offset();
-
-            std::array<outputT, n_wi> local_iscan;
-
-#pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                const std::size_t i_m_wi = i + m_wi;
-                if constexpr (!include_initial) {
-                    local_iscan[m_wi] =
-                        (i_m_wi < acc_nelems)
-                            ? transformer(input[inp_iter_offset +
-                                                inp_indexer(s0 + s1 * i_m_wi)])
-                            : identity;
-                }
-                else {
-                    // shift input to the left by a single element relative to
-                    // output
-                    local_iscan[m_wi] =
-                        (i_m_wi < acc_nelems && i_m_wi > 0)
-                            ? transformer(
-                                  input[inp_iter_offset +
-                                        inp_indexer((s0 + s1 * i_m_wi) - 1)])
-                            : identity;
-                }
-            }
-
-#pragma unroll
-            for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) {
-                local_iscan[m_wi] =
-                    scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]);
-            }
-            // local_iscan is now result of
-            // inclusive scan of locally stored inputs
-
-            outputT wg_iscan_val;
-            if constexpr (can_use_inclusive_scan_over_group<ScanOpT,
-                                                            outputT>::value)
-            {
-                wg_iscan_val = sycl::inclusive_scan_over_group(
-                    it.get_group(), local_iscan.back(), scan_op, identity);
-            }
-            else {
-                wg_iscan_val = su_ns::custom_inclusive_scan_over_group(
-                    it.get_group(), it.get_sub_group(), slm_iscan_tmp,
-                    local_iscan.back(), identity, scan_op);
-                // ensure all finished reading from SLM, to avoid race condition
-                // with subsequent writes into SLM
-                it.barrier(sycl::access::fence_space::local_space);
-            }
-
-            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
-            it.barrier(sycl::access::fence_space::local_space);
-            const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid];
-
-#pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier);
-            }
-
-            const std::size_t start = std::min(i, acc_nelems);
-            const std::size_t end = std::min(i + n_wi, acc_nelems);
-            const nwiT m_max = static_cast<nwiT>(end - start);
-            for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) {
-                output[out_iter_offset + out_indexer(i + m_wi)] =
-                    local_iscan[m_wi];
-            }
-        });
-    });
-
-    return inc_scan_phase1_ev;
-}
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial = false>
-sycl::event
-inclusive_scan_base_step_striped(sycl::queue &exec_q,
-                                 const std::uint32_t wg_size,
-                                 const std::size_t iter_nelems,
-                                 const std::size_t acc_nelems,
-                                 const inputT *input,
-                                 outputT *output,
-                                 const std::size_t s0,
-                                 const std::size_t s1,
-                                 const IterIndexerT &iter_indexer,
-                                 const InpIndexerT &inp_indexer,
-                                 const OutIndexerT &out_indexer,
-                                 TransformerT transformer,
-                                 const ScanOpT &scan_op,
-                                 outputT identity,
-                                 std::size_t &acc_groups,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size;
-    acc_groups =
-        ceiling_quotient<std::size_t>(acc_nelems, reduce_nelems_per_wg);
-
-    sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using slmT = sycl::local_accessor<outputT, 1>;
-
-        const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size};
-        const auto &lRange = sycl::range<1>{wg_size};
-
-        const auto &ndRange = sycl::nd_range<1>{gRange, lRange};
-
-        slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh);
-
-        using KernelName = inclusive_scan_iter_local_scan_striped_krn<
-            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
-            TransformerT, ScanOpT, include_initial>;
-
-        cgh.parallel_for<KernelName>(ndRange, [=, slm_iscan_tmp =
-                                                      std::move(slm_iscan_tmp)](
-                                                  sycl::nd_item<1> it) {
-            const std::uint32_t lid = it.get_local_linear_id();
-            const std::uint32_t wg_size = it.get_local_range(0);
-
-            const auto &sg = it.get_sub_group();
-            const std::uint32_t sgSize = sg.get_max_local_range()[0];
-            const std::size_t sgroup_id = sg.get_group_id()[0];
-            const std::uint32_t lane_id = sg.get_local_id()[0];
-
-            const std::size_t flat_group_id = it.get_group(0);
-            const std::size_t iter_gid = flat_group_id / acc_groups;
-            const std::size_t acc_group_id =
-                flat_group_id - (iter_gid * acc_groups);
-
-            const auto &iter_offsets = iter_indexer(iter_gid);
-            const auto &inp_iter_offset = iter_offsets.get_first_offset();
-            const auto &out_iter_offset = iter_offsets.get_second_offset();
-
-            std::array<outputT, n_wi> local_iscan{};
-
-            const std::size_t inp_id0 = acc_group_id * n_wi * wg_size +
-                                        sgroup_id * n_wi * sgSize + lane_id;
-
-#pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                const std::size_t inp_id = inp_id0 + m_wi * sgSize;
-                if constexpr (!include_initial) {
-                    local_iscan[m_wi] =
-                        (inp_id < acc_nelems)
-                            ? transformer(input[inp_iter_offset +
-                                                inp_indexer(s0 + s1 * inp_id)])
-                            : identity;
-                }
-                else {
-                    // shift input to the left by a single element relative to
-                    // output
-                    local_iscan[m_wi] =
-                        (inp_id < acc_nelems && inp_id > 0)
-                            ? transformer(
-                                  input[inp_iter_offset +
-                                        inp_indexer((s0 + s1 * inp_id) - 1)])
-                            : identity;
-                }
-            }
-
-            // change layout from striped to blocked
-            {
-                {
-                    const std::uint32_t local_offset0 = lid * n_wi;
-#pragma unroll
-                    for (std::uint32_t i = 0; i < n_wi; ++i) {
-                        slm_iscan_tmp[local_offset0 + i] = local_iscan[i];
-                    }
-
-                    it.barrier(sycl::access::fence_space::local_space);
-                }
-
-                {
-                    const std::uint32_t block_offset =
-                        sgroup_id * sgSize * n_wi;
-                    const std::uint32_t disp0 = lane_id * n_wi;
-#pragma unroll
-                    for (nwiT i = 0; i < n_wi; ++i) {
-                        const std::uint32_t disp = disp0 + i;
-
-                        // disp == lane_id1 + i1 * sgSize;
-                        const std::uint32_t i1 = disp / sgSize;
-                        const std::uint32_t lane_id1 = disp - i1 * sgSize;
-
-                        const std::uint32_t disp_exchanged =
-                            (lane_id1 * n_wi + i1);
-
-                        local_iscan[i] =
-                            slm_iscan_tmp[block_offset + disp_exchanged];
-                    }
-
-                    it.barrier(sycl::access::fence_space::local_space);
-                }
-            }
-
-#pragma unroll
-            for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) {
-                local_iscan[m_wi] =
-                    scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]);
-            }
-            // local_iscan is now result of
-            // inclusive scan of locally stored inputs
-
-            outputT wg_iscan_val;
-            if constexpr (can_use_inclusive_scan_over_group<ScanOpT,
-                                                            outputT>::value)
-            {
-                wg_iscan_val = sycl::inclusive_scan_over_group(
-                    it.get_group(), local_iscan.back(), scan_op, identity);
-            }
-            else {
-                wg_iscan_val = su_ns::custom_inclusive_scan_over_group(
-                    it.get_group(), sg, slm_iscan_tmp, local_iscan.back(),
-                    identity, scan_op);
-                // ensure all finished reading from SLM, to avoid race condition
-                // with subsequent writes into SLM
-                it.barrier(sycl::access::fence_space::local_space);
-            }
-
-            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
-            it.barrier(sycl::access::fence_space::local_space);
-            const outputT modifier = (lid == 0) ? identity : slm_iscan_tmp[lid];
-
-#pragma unroll
-            for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier);
-            }
-
-            it.barrier(sycl::access::fence_space::local_space);
-
-            // convert back to blocked layout
-            {
-                {
-                    const std::uint32_t local_offset0 = lid * n_wi;
-#pragma unroll
-                    for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                        slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi];
-                    }
-
-                    it.barrier(sycl::access::fence_space::local_space);
-                }
-            }
-
-            {
-                const std::uint32_t block_offset =
-                    sgroup_id * sgSize * n_wi + lane_id;
-#pragma unroll
-                for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) {
-                    const std::uint32_t m_wi_scaled = m_wi * sgSize;
-                    const std::size_t out_id = inp_id0 + m_wi_scaled;
-                    if (out_id < acc_nelems) {
-                        output[out_iter_offset + out_indexer(out_id)] =
-                            slm_iscan_tmp[block_offset + m_wi_scaled];
-                    }
-                }
-            }
-        });
-    });
-
-    return inc_scan_phase1_ev;
-}
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial = false>
-sycl::event
-inclusive_scan_base_step(sycl::queue &exec_q,
-                         const std::uint32_t wg_size,
-                         const std::size_t iter_nelems,
-                         const std::size_t acc_nelems,
-                         const inputT *input,
-                         outputT *output,
-                         const std::size_t s0,
-                         const std::size_t s1,
-                         const IterIndexerT &iter_indexer,
-                         const InpIndexerT &inp_indexer,
-                         const OutIndexerT &out_indexer,
-                         TransformerT transformer,
-                         const ScanOpT &scan_op,
-                         outputT identity,
-                         std::size_t &acc_groups,
-                         const std::vector<sycl::event> &depends = {})
-{
-    // For small stride use striped load/store.
-    // Threshold value chosen experimentally.
-    if (s1 <= 16) {
-        return inclusive_scan_base_step_striped<
-            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
-            TransformerT, ScanOpT, include_initial>(
-            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
-            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
-            identity, acc_groups, depends);
-    }
-    else {
-        return inclusive_scan_base_step_blocked<
-            inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT,
-            TransformerT, ScanOpT, include_initial>(
-            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
-            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
-            identity, acc_groups, depends);
-    }
-}
-
-template <typename outputT, nwiT n_wi, typename ScanOpT>
-class inclusive_scan_1d_iter_chunk_update_krn;
-
-template <typename UpdateKernelName,
-          typename outputT,
-          nwiT n_wi,
-          typename ScanOpT>
-sycl::event update_local_chunks_1d(sycl::queue &exec_q,
-                                   outputT *src,
-                                   std::size_t src_size,
-                                   const outputT *local_scans,
-                                   std::size_t chunk_size,
-                                   const sycl::event &dependent_event)
-{
-    const auto &ctx = exec_q.get_context();
-    const auto &dev = exec_q.get_device();
-
-    const auto &kernel_id = sycl::get_kernel_id<UpdateKernelName>();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-    // output[ chunk_size * (i + 1) + j] += temp[i]
-    sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependent_event);
-        cgh.use_kernel_bundle(kb);
-
-        static constexpr nwiT updates_per_wi = n_wi;
-        const std::size_t n_items =
-            ceiling_quotient<std::size_t>(src_size, sg_size * n_wi) * sg_size;
-
-        sycl::range<1> gRange{n_items};
-        sycl::range<1> lRange{sg_size};
-        sycl::nd_range<1> ndRange{gRange, lRange};
-
-        cgh.parallel_for<UpdateKernelName>(
-            ndRange,
-            [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) {
-                static constexpr ScanOpT scan_op{};
-                static constexpr outputT identity =
-                    su_ns::Identity<ScanOpT, outputT>::value;
-
-                const std::uint32_t lws = ndit.get_local_range(0);
-                const std::size_t block_offset = ndit.get_group(0) * n_wi * lws;
-#pragma unroll
-                for (std::size_t i = 0; i < updates_per_wi; ++i) {
-                    const std::size_t src_id =
-                        block_offset + ndit.get_local_id(0) + i * lws;
-                    if (src_id < src_size) {
-                        const std::size_t scan_id = (src_id / chunk_size);
-                        const outputT modifier =
-                            (scan_id > 0) ? local_scans[scan_id - 1] : identity;
-                        src[src_id] = scan_op(src[src_id], modifier);
-                    }
-                }
-            });
-    });
-
-    return update_event;
-}
-
-/*
- * output[j] = sum( input[s0 + i * s1], 0 <= i <= j)
- * for 0 <= j < n_elems
- */
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename IndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial>
-sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q,
-                                   const std::uint32_t wg_size,
-                                   const std::size_t n_elems,
-                                   const inputT *input,
-                                   outputT *output,
-                                   const std::size_t s0,
-                                   const std::size_t s1,
-                                   const IndexerT &indexer,
-                                   const TransformerT &transformer,
-                                   std::vector<sycl::event> &host_tasks,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    static constexpr ScanOpT scan_op{};
-    static constexpr outputT identity =
-        su_ns::Identity<ScanOpT, outputT>::value;
-
-    static constexpr std::size_t _iter_nelems = 1;
-
-    using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer;
-    static constexpr IterIndexerT _no_op_iter_indexer{};
-
-    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr NoOpIndexerT _no_op_indexer{};
-
-    std::size_t n_groups;
-    sycl::event inc_scan_phase1_ev =
-        inclusive_scan_base_step<inputT, outputT, n_wi, IterIndexerT, IndexerT,
-                                 NoOpIndexerT, TransformerT, ScanOpT,
-                                 include_initial>(
-            exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1,
-            _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op,
-            identity, n_groups, depends);
-
-    sycl::event dependent_event = inc_scan_phase1_ev;
-    if (n_groups > 1) {
-        const std::size_t chunk_size = wg_size * n_wi;
-
-        // how much of temporary allocation do we need
-        std::size_t n_groups_ = n_groups;
-        std::size_t temp_size = 0;
-        while (n_groups_ > 1) {
-            const std::size_t this_size = (n_groups_ - 1);
-            temp_size += this_size;
-            n_groups_ = ceiling_quotient(this_size, chunk_size);
-        }
-
-        // allocate
-        auto temp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<outputT>(temp_size,
-                                                                     exec_q);
-        outputT *temp = temp_owner.get();
-
-        std::vector<detail::stack_t<outputT>> stack{};
-
-        // inclusive scans over blocks
-        n_groups_ = n_groups;
-        outputT *src = output;
-        outputT *local_scans = temp;
-
-        using NoOpTransformerT = NoOpTransformer<outputT>;
-        static constexpr NoOpTransformerT _no_op_transformer{};
-        std::size_t size_to_update = n_elems;
-        while (n_groups_ > 1) {
-
-            const std::size_t src_size = n_groups_ - 1;
-            dependent_event =
-                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
-                                         NoOpIndexerT, NoOpIndexerT,
-                                         NoOpTransformerT, ScanOpT>(
-                    exec_q, wg_size, _iter_nelems, src_size, src, local_scans,
-                    chunk_size - 1, chunk_size, _no_op_iter_indexer,
-                    _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op,
-                    identity, n_groups_, // n_groups_ is modified in place
-                    {dependent_event});
-            stack.push_back({src, size_to_update, local_scans});
-            src = local_scans;
-            local_scans += src_size;
-            size_to_update = src_size;
-        }
-
-        for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size();
-             ++reverse_stack_id)
-        {
-            const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
-
-            const auto &stack_elem = stack[stack_id];
-            outputT *src = stack_elem.get_src_ptr();
-            const std::size_t src_size = stack_elem.get_size();
-            const outputT *local_scans = stack_elem.get_local_scans_ptr();
-
-            using UpdateKernelName =
-                class inclusive_scan_1d_iter_chunk_update_krn<outputT, n_wi,
-                                                              ScanOpT>;
-
-            dependent_event = update_local_chunks_1d<UpdateKernelName, outputT,
-                                                     n_wi, ScanOpT>(
-                exec_q, src, src_size, local_scans, chunk_size,
-                dependent_event);
-        }
-
-        sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {dependent_event}, temp_owner);
-
-        host_tasks.push_back(free_ev);
-    }
-
-    return dependent_event;
-}
-
-typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-template <typename srcT,
-          typename dstT,
-          typename transformerT,
-          typename AccumulateOpT,
-          bool include_initial>
-sycl::event
-accumulate_1d_contig_impl(sycl::queue &q,
-                          std::size_t n_elems,
-                          const char *src,
-                          char *dst,
-                          std::vector<sycl::event> &host_tasks,
-                          const std::vector<sycl::event> &depends = {})
-{
-    const srcT *src_data_ptr = reinterpret_cast<const srcT *>(src);
-    dstT *dst_data_ptr = reinterpret_cast<dstT *>(dst);
-
-    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr NoOpIndexerT flat_indexer{};
-    static constexpr transformerT transformer{};
-
-    static constexpr std::size_t s0 = 0;
-    static constexpr std::size_t s1 = 1;
-
-    sycl::event comp_ev;
-    const sycl::device &dev = q.get_device();
-    if (dev.has(sycl::aspect::cpu)) {
-        static constexpr nwiT n_wi_for_cpu = 8;
-        const std::uint32_t wg_size = 256;
-        comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_cpu, NoOpIndexerT,
-                                         transformerT, AccumulateOpT,
-                                         include_initial>(
-            q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1,
-            flat_indexer, transformer, host_tasks, depends);
-    }
-    else {
-        static constexpr nwiT n_wi_for_gpu = 4;
-        // base_scan_striped algorithm does not execute correctly
-        // on HIP device with wg_size > 64
-        const std::uint32_t wg_size =
-            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
-        comp_ev = inclusive_scan_iter_1d<srcT, dstT, n_wi_for_gpu, NoOpIndexerT,
-                                         transformerT, AccumulateOpT,
-                                         include_initial>(
-            q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1,
-            flat_indexer, transformer, host_tasks, depends);
-    }
-    return comp_ev;
-}
-
-template <typename outputT,
-          nwiT n_wi,
-          typename IterIndexerT,
-          typename IndexerT,
-          typename ScanOpT>
-class inclusive_scan_final_chunk_update_krn;
-
-template <typename UpdateKernelName,
-          typename outputT,
-          nwiT n_wi,
-          typename OutIterIndexerT,
-          typename OutIndexerT,
-          typename ScanOpT>
-sycl::event final_update_local_chunks(sycl::queue &exec_q,
-                                      std::size_t iter_nelems,
-                                      outputT *src,
-                                      std::size_t src_size,
-                                      const outputT *local_scans,
-                                      std::size_t chunk_size,
-                                      std::size_t local_stride,
-                                      const OutIterIndexerT &out_iter_indexer,
-                                      const OutIndexerT &out_indexer,
-                                      sycl::event dependent_event)
-{
-    const auto &kernel_id = sycl::get_kernel_id<UpdateKernelName>();
-
-    auto const &ctx = exec_q.get_context();
-    auto const &dev = exec_q.get_device();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-    static constexpr nwiT updates_per_wi = n_wi;
-    const std::size_t updates_per_sg = sg_size * updates_per_wi;
-    const std::size_t update_nelems =
-        ceiling_quotient(src_size, updates_per_sg) * sg_size;
-
-    sycl::range<2> gRange{iter_nelems, update_nelems};
-    sycl::range<2> lRange{1, sg_size};
-
-    sycl::nd_range<2> ndRange{gRange, lRange};
-
-    sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependent_event);
-
-        cgh.parallel_for<UpdateKernelName>(
-            ndRange, [chunk_size, src_size, local_stride, src, local_scans,
-                      out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) {
-                static constexpr ScanOpT scan_op{};
-                static constexpr outputT identity =
-                    su_ns::Identity<ScanOpT, outputT>::value;
-
-                const std::uint32_t lws = ndit.get_local_range(1);
-
-                const std::size_t iter_gid = ndit.get_group(0);
-
-                const std::size_t src_axis_id0 =
-                    ndit.get_group(1) * updates_per_wi * lws +
-                    ndit.get_local_id(1);
-                const std::size_t src_iter_id = out_iter_indexer(iter_gid);
-#pragma unroll
-                for (nwiT i = 0; i < updates_per_wi; ++i) {
-                    const std::size_t src_axis_id = src_axis_id0 + i * lws;
-                    const std::size_t src_id =
-                        out_indexer(src_axis_id) + src_iter_id;
-
-                    if (src_axis_id < src_size) {
-                        const std::size_t scan_axis_id =
-                            src_axis_id / chunk_size;
-                        const std::size_t scan_id =
-                            scan_axis_id + iter_gid * local_stride;
-
-                        const outputT modifier = (scan_axis_id > 0)
-                                                     ? local_scans[scan_id - 1]
-                                                     : identity;
-
-                        src[src_id] = scan_op(src[src_id], modifier);
-                    }
-                }
-            });
-    });
-
-    return update_event;
-}
-
-template <typename outputT, nwiT n_wi, typename ScanOpT>
-class inclusive_scan_iter_chunk_update_krn;
-
-template <typename UpdateKernelName,
-          typename outputT,
-          nwiT n_wi,
-          typename ScanOpT>
-sycl::event update_local_chunks(sycl::queue &exec_q,
-                                std::size_t iter_nelems,
-                                outputT *src,
-                                std::size_t src_size,
-                                const outputT *local_scans,
-                                std::size_t chunk_size,
-                                std::size_t local_stride,
-                                sycl::event dependent_event)
-{
-    static constexpr NoOpIndexer out_indexer{};
-    static constexpr NoOpIndexer iter_out_indexer{};
-
-    return final_update_local_chunks<UpdateKernelName, outputT, n_wi,
-                                     NoOpIndexer, NoOpIndexer, ScanOpT>(
-        exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
-        local_stride, iter_out_indexer, out_indexer, dependent_event);
-}
-
-template <typename inputT,
-          typename outputT,
-          nwiT n_wi,
-          typename InpIterIndexerT,
-          typename OutIterIndexerT,
-          typename InpIndexerT,
-          typename OutIndexerT,
-          typename TransformerT,
-          typename ScanOpT,
-          bool include_initial>
-sycl::event inclusive_scan_iter(sycl::queue &exec_q,
-                                const std::uint32_t wg_size,
-                                const std::size_t iter_nelems,
-                                const std::size_t acc_nelems,
-                                const inputT *input,
-                                outputT *output,
-                                const std::size_t s0,
-                                const std::size_t s1,
-                                const InpIterIndexerT &inp_iter_indexer,
-                                const OutIterIndexerT &out_iter_indexer,
-                                const InpIndexerT &inp_indexer,
-                                const OutIndexerT &out_indexer,
-                                const TransformerT &transformer,
-                                std::vector<sycl::event> &host_tasks,
-                                const std::vector<sycl::event> &depends = {})
-{
-    static constexpr ScanOpT scan_op{};
-    static constexpr outputT identity =
-        su_ns::Identity<ScanOpT, outputT>::value;
-
-    using IterIndexerT =
-        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-            InpIterIndexerT, OutIterIndexerT>;
-    const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer};
-
-    std::size_t acc_groups;
-    sycl::event inc_scan_phase1_ev =
-        inclusive_scan_base_step<inputT, outputT, n_wi, IterIndexerT,
-                                 InpIndexerT, OutIndexerT, TransformerT,
-                                 ScanOpT, include_initial>(
-            exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1,
-            iter_indexer, inp_indexer, out_indexer, transformer, scan_op,
-            identity, acc_groups, depends);
-
-    sycl::event dependent_event = inc_scan_phase1_ev;
-    if (acc_groups > 1) {
-        const std::size_t chunk_size = wg_size * n_wi;
-
-        // how much of temporary allocation do we need
-        std::size_t acc_groups_ = acc_groups;
-        std::size_t temp_size = 0;
-        while (acc_groups_ > 1) {
-            const std::size_t this_size = (acc_groups_ - 1);
-            temp_size += this_size;
-            acc_groups_ = ceiling_quotient<std::size_t>(this_size, chunk_size);
-        }
-
-        // allocate
-        auto temp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<outputT>(
-                iter_nelems * temp_size, exec_q);
-        outputT *temp = temp_owner.get();
-
-        std::vector<detail::stack_strided_t<outputT>> stack{};
-
-        // inclusive scans over blocks
-        acc_groups_ = acc_groups;
-        outputT *src = output;
-        outputT *local_scans = temp;
-
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        static constexpr NoOpIndexerT _no_op_indexer{};
-        using NoOpTransformerT = NoOpTransformer<outputT>;
-        static constexpr NoOpTransformerT _no_op_transformer{};
-        std::size_t size_to_update = acc_nelems;
-
-        {
-            std::size_t src_size = acc_groups - 1;
-            using LocalScanIndexerT =
-                dpctl::tensor::offset_utils::Strided1DIndexer;
-            const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems,
-                                                      /* step */ src_size};
-
-            using IterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    OutIterIndexerT, LocalScanIndexerT>;
-            const IterIndexerT iter_indexer_{out_iter_indexer,
-                                             scan_iter_indexer};
-
-            dependent_event =
-                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
-                                         OutIndexerT, NoOpIndexerT,
-                                         NoOpTransformerT, ScanOpT>(
-                    exec_q, wg_size, iter_nelems, src_size, src, local_scans,
-                    chunk_size - 1, chunk_size, iter_indexer_, out_indexer,
-                    _no_op_indexer, _no_op_transformer, scan_op, identity,
-                    acc_groups_, // acc_groups_ is modified in place
-                    {dependent_event});
-            stack.push_back({src, size_to_update, local_scans, src_size});
-            src = local_scans;
-            local_scans += src_size * iter_nelems;
-            size_to_update = src_size;
-        }
-
-        while (acc_groups_ > 1) {
-            std::size_t src_size = acc_groups_ - 1;
-
-            using LocalScanIndexerT =
-                dpctl::tensor::offset_utils::Strided1DIndexer;
-            const LocalScanIndexerT scan1_iter_indexer{
-                /* size */ iter_nelems,
-                /* step */ size_to_update};
-            const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems,
-                                                       /* step */ src_size};
-
-            using IterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    LocalScanIndexerT, LocalScanIndexerT>;
-            const IterIndexerT iter_indexer_{scan1_iter_indexer,
-                                             scan2_iter_indexer};
-
-            dependent_event =
-                inclusive_scan_base_step<outputT, outputT, n_wi, IterIndexerT,
-                                         NoOpIndexerT, NoOpIndexerT,
-                                         NoOpTransformerT, ScanOpT>(
-                    exec_q, wg_size, iter_nelems, src_size, src, local_scans,
-                    chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer,
-                    _no_op_indexer, _no_op_transformer, scan_op, identity,
-                    acc_groups_, // acc_groups_ is modified in place
-                    {dependent_event});
-            stack.push_back({src, size_to_update, local_scans, src_size});
-            src = local_scans;
-            local_scans += src_size * iter_nelems;
-            size_to_update = src_size;
-        }
-
-        for (std::size_t reverse_stack_id = 0;
-             reverse_stack_id < stack.size() - 1; ++reverse_stack_id)
-        {
-            const std::size_t stack_id = stack.size() - 1 - reverse_stack_id;
-
-            const auto &stack_elem = stack[stack_id];
-            outputT *src = stack_elem.get_src_ptr();
-            std::size_t src_size = stack_elem.get_size();
-            outputT *local_scans = stack_elem.get_local_scans_ptr();
-            std::size_t local_stride = stack_elem.get_local_stride();
-
-            using UpdateKernelName =
-                class inclusive_scan_iter_chunk_update_krn<outputT, n_wi,
-                                                           ScanOpT>;
-
-            dependent_event =
-                update_local_chunks<UpdateKernelName, outputT, n_wi, ScanOpT>(
-                    exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
-                    local_stride, dependent_event);
-        }
-
-        // last stack element is always directly to output
-        {
-            const auto &stack_elem = stack[0];
-            outputT *src = stack_elem.get_src_ptr();
-            const std::size_t src_size = stack_elem.get_size();
-            outputT *local_scans = stack_elem.get_local_scans_ptr();
-            const std::size_t local_stride = stack_elem.get_local_stride();
-
-            using UpdateKernelName =
-                class inclusive_scan_final_chunk_update_krn<
-                    outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>;
-
-            dependent_event =
-                final_update_local_chunks<UpdateKernelName, outputT, n_wi,
-                                          OutIterIndexerT, OutIndexerT,
-                                          ScanOpT>(
-                    exec_q, iter_nelems, src, src_size, local_scans, chunk_size,
-                    local_stride, out_iter_indexer, out_indexer,
-                    dependent_event);
-        }
-
-        sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {dependent_event}, temp_owner);
-        host_tasks.push_back(free_ev);
-    }
-
-    return dependent_event;
-}
-
-typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    int,
-    const ssize_t *,
-    char *,
-    std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-template <typename srcT,
-          typename dstT,
-          typename transformerT,
-          typename AccumulateOpT,
-          bool include_initial>
-sycl::event
-accumulate_strided_impl(sycl::queue &q,
-                        std::size_t iter_nelems,
-                        std::size_t acc_nelems,
-                        const char *src,
-                        int iter_nd,
-                        const ssize_t *iter_shape_strides,
-                        ssize_t inp_iter_offset,
-                        ssize_t out_iter_offset,
-                        int acc_nd,
-                        const ssize_t *acc_shape_strides,
-                        char *dst,
-                        std::vector<sycl::event> &host_tasks,
-                        const std::vector<sycl::event> &depends = {})
-{
-    const srcT *src_data_ptr = reinterpret_cast<const srcT *>(src);
-    dstT *dst_data_ptr = reinterpret_cast<dstT *>(dst);
-
-    using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides};
-    const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset,
-                                       iter_shape_strides};
-
-    using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-    const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides,
-                                       acc_shape_strides + 2 * acc_nd};
-    const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset,
-                                       iter_shape_strides,
-                                       iter_shape_strides + 2 * iter_nd};
-
-    static constexpr transformerT transformer{};
-
-    static constexpr std::size_t s0 = 0;
-    static constexpr std::size_t s1 = 1;
-
-    const sycl::device &dev = q.get_device();
-    sycl::event comp_ev;
-    if (dev.has(sycl::aspect::cpu)) {
-        static constexpr nwiT n_wi_for_cpu = 8;
-        const std::uint32_t wg_size = 256;
-        comp_ev =
-            inclusive_scan_iter<srcT, dstT, n_wi_for_cpu, InpIndexerT,
-                                OutIndexerT, InpIndexerT, OutIndexerT,
-                                transformerT, AccumulateOpT, include_initial>(
-                q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr,
-                s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer,
-                out_axis_indexer, transformer, host_tasks, depends);
-    }
-    else {
-        static constexpr nwiT n_wi_for_gpu = 4;
-        // base_scan_striped algorithm does not execute correctly
-        // on HIP device with wg_size > 64
-        const std::uint32_t wg_size =
-            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
-        comp_ev =
-            inclusive_scan_iter<srcT, dstT, n_wi_for_gpu, InpIndexerT,
-                                OutIndexerT, InpIndexerT, OutIndexerT,
-                                transformerT, AccumulateOpT, include_initial>(
-                q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr,
-                s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer,
-                out_axis_indexer, transformer, host_tasks, depends);
-    }
-
-    return comp_ev;
-}
-
-typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-template <typename maskT, typename cumsumT, typename transformerT>
-std::size_t cumsum_val_contig_impl(sycl::queue &q,
-                                   std::size_t n_elems,
-                                   const char *mask,
-                                   char *cumsum,
-                                   std::vector<sycl::event> &host_tasks,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
-    cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
-
-    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr NoOpIndexerT flat_indexer{};
-    static constexpr transformerT transformer{};
-
-    static constexpr std::size_t s0 = 0;
-    static constexpr std::size_t s1 = 1;
-    static constexpr bool include_initial = false;
-    using AccumulateOpT = sycl::plus<cumsumT>;
-
-    sycl::event comp_ev;
-    const sycl::device &dev = q.get_device();
-    if (dev.has(sycl::aspect::cpu)) {
-        static constexpr nwiT n_wi_for_cpu = 8;
-        const std::uint32_t wg_size = 256;
-        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
-                                         NoOpIndexerT, transformerT,
-                                         AccumulateOpT, include_initial>(
-            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
-            flat_indexer, transformer, host_tasks, depends);
-    }
-    else {
-        static constexpr nwiT n_wi_for_gpu = 4;
-        // base_scan_striped algorithm does not execute correctly
-        // on HIP device with wg_size > 64
-        const std::uint32_t wg_size =
-            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
-        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
-                                         NoOpIndexerT, transformerT,
-                                         AccumulateOpT, include_initial>(
-            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
-            flat_indexer, transformer, host_tasks, depends);
-    }
-    cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);
-
-    auto host_usm_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_host<cumsumT>(1, q);
-    cumsumT *last_elem_host_usm = host_usm_owner.get();
-
-    sycl::event copy_e = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(comp_ev);
-        cgh.copy<cumsumT>(last_elem, last_elem_host_usm, 1);
-    });
-    copy_e.wait();
-    std::size_t return_val = static_cast<std::size_t>(*last_elem_host_usm);
-
-    // explicitly free USM host allocation, by envoking deleter of
-    // the unique_ptr
-    host_usm_owner.reset(nullptr);
-
-    return return_val;
-}
-
-template <typename fnT, typename T> struct MaskPositionsContigFactoryForInt32
-{
-    fnT get()
-    {
-        using cumsumT = std::int32_t;
-        fnT fn =
-            cumsum_val_contig_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct MaskPositionsContigFactoryForInt64
-{
-    fnT get()
-    {
-        using cumsumT = std::int64_t;
-        fnT fn =
-            cumsum_val_contig_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct Cumsum1DContigFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral_v<T>) {
-            using cumsumT = std::int64_t;
-            fnT fn =
-                cumsum_val_contig_impl<T, cumsumT, NoOpTransformer<cumsumT>>;
-            return fn;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    int,
-    const ssize_t *,
-    char *,
-    std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-template <typename maskT, typename cumsumT, typename transformerT>
-std::size_t
-cumsum_val_strided_impl(sycl::queue &q,
-                        std::size_t n_elems,
-                        const char *mask,
-                        int nd,
-                        const ssize_t *shape_strides,
-                        char *cumsum,
-                        std::vector<sycl::event> &host_tasks,
-                        const std::vector<sycl::event> &depends = {})
-{
-    const maskT *mask_data_ptr = reinterpret_cast<const maskT *>(mask);
-    cumsumT *cumsum_data_ptr = reinterpret_cast<cumsumT *>(cumsum);
-
-    using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const StridedIndexerT strided_indexer{nd, 0, shape_strides};
-    static constexpr transformerT transformer{};
-
-    static constexpr std::size_t s0 = 0;
-    static constexpr std::size_t s1 = 1;
-    static constexpr bool include_initial = false;
-    using AccumulateOpT = sycl::plus<cumsumT>;
-
-    const sycl::device &dev = q.get_device();
-    sycl::event comp_ev;
-    if (dev.has(sycl::aspect::cpu)) {
-        static constexpr nwiT n_wi_for_cpu = 8;
-        const std::uint32_t wg_size = 256;
-        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_cpu,
-                                         StridedIndexerT, transformerT,
-                                         AccumulateOpT, include_initial>(
-            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
-            strided_indexer, transformer, host_tasks, depends);
-    }
-    else {
-        static constexpr nwiT n_wi_for_gpu = 4;
-        // base_scan_striped algorithm does not execute correctly
-        // on HIP device with wg_size > 64
-        const std::uint32_t wg_size =
-            (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256;
-        comp_ev = inclusive_scan_iter_1d<maskT, cumsumT, n_wi_for_gpu,
-                                         StridedIndexerT, transformerT,
-                                         AccumulateOpT, include_initial>(
-            q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1,
-            strided_indexer, transformer, host_tasks, depends);
-    }
-
-    cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1);
-
-    auto host_usm_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_host<cumsumT>(1, q);
-    cumsumT *last_elem_host_usm = host_usm_owner.get();
-
-    sycl::event copy_e = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(comp_ev);
-        cgh.copy<cumsumT>(last_elem, last_elem_host_usm, 1);
-    });
-    copy_e.wait();
-    std::size_t return_val = static_cast<std::size_t>(*last_elem_host_usm);
-
-    // explicitly free USM-host temporary, by envoking deleter of
-    // the unique_ptr
-    host_usm_owner.reset(nullptr);
-
-    return return_val;
-}
-
-template <typename fnT, typename T> struct MaskPositionsStridedFactoryForInt32
-{
-    fnT get()
-    {
-        using cumsumT = std::int32_t;
-        fnT fn =
-            cumsum_val_strided_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct MaskPositionsStridedFactoryForInt64
-{
-    fnT get()
-    {
-        using cumsumT = std::int64_t;
-        fnT fn =
-            cumsum_val_strided_impl<T, cumsumT, NonZeroIndicator<T, cumsumT>>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct Cumsum1DStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral_v<T>) {
-            using cumsumT = std::int64_t;
-            fnT fn =
-                cumsum_val_strided_impl<T, cumsumT, NoOpTransformer<cumsumT>>;
-            return fn;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-} // namespace accumulators
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl/tensor/libtensor/include/kernels/alignment.hpp
deleted file mode 100644
index 1b92deb7ce..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/alignment.hpp
+++ /dev/null
@@ -1,45 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace alignment_utils
-{
-
-inline constexpr std::size_t required_alignment = 64UL;
-
-template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
-{
-    return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
-}
-
-template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;
-
-} // end of namespace alignment_utils
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
deleted file mode 100644
index 01ba0b21c1..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ /dev/null
@@ -1,851 +0,0 @@
-//=== boolean_advanced_indexing.hpp -                      ------*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for advanced tensor index operations.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace indexing
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-template <typename OrthogIndexerT,
-          typename MaskedSrcIndexerT,
-          typename MaskedDstIndexerT,
-          typename dataT,
-          typename indT,
-          typename LocalAccessorT>
-struct MaskedExtractStridedFunctor
-{
-    MaskedExtractStridedFunctor(const dataT *src_data_p,
-                                const indT *cumsum_data_p,
-                                dataT *dst_data_p,
-                                std::size_t masked_iter_size,
-                                const OrthogIndexerT &orthog_src_dst_indexer_,
-                                const MaskedSrcIndexerT &masked_src_indexer_,
-                                const MaskedDstIndexerT &masked_dst_indexer_,
-                                const LocalAccessorT &lacc_)
-        : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p),
-          masked_nelems(masked_iter_size),
-          orthog_src_dst_indexer(orthog_src_dst_indexer_),
-          masked_src_indexer(masked_src_indexer_),
-          masked_dst_indexer(masked_dst_indexer_), lacc(lacc_)
-    {
-        static_assert(
-            std::is_same_v<indT, typename LocalAccessorT::value_type>);
-    }
-
-    void operator()(sycl::nd_item<2> ndit) const
-    {
-        const std::size_t orthog_i = ndit.get_global_id(0);
-        const std::uint32_t l_i = ndit.get_local_id(1);
-        const std::uint32_t lws = ndit.get_local_range(1);
-
-        const std::size_t masked_i = ndit.get_global_id(1);
-        const std::size_t masked_block_start = masked_i - l_i;
-
-        const std::size_t max_offset = masked_nelems + 1;
-        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
-            const std::size_t offset = masked_block_start + i;
-            lacc[i] = (offset == 0)           ? indT(0)
-                      : (offset < max_offset) ? cumsum[offset - 1]
-                                              : cumsum[masked_nelems - 1] + 1;
-        }
-
-        sycl::group_barrier(ndit.get_group());
-
-        const indT current_running_count = lacc[l_i + 1];
-        const bool mask_set = (masked_i == 0)
-                                  ? (current_running_count == 1)
-                                  : (current_running_count == lacc[l_i] + 1);
-
-        // dst[cumsum[i] - 1, j] = src[i, j]
-        //     if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
-        if (mask_set && (masked_i < masked_nelems)) {
-            const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i);
-
-            const std::size_t total_src_offset =
-                masked_src_indexer(masked_i) +
-                orthog_offsets.get_first_offset();
-            const std::size_t total_dst_offset =
-                masked_dst_indexer(current_running_count - 1) +
-                orthog_offsets.get_second_offset();
-
-            dst[total_dst_offset] = src[total_src_offset];
-        }
-    }
-
-private:
-    const dataT *src = nullptr;
-    const indT *cumsum = nullptr;
-    dataT *dst = nullptr;
-    std::size_t masked_nelems = 0;
-    // has nd, shape, src_strides, dst_strides for
-    // dimensions that ARE NOT masked
-    OrthogIndexerT orthog_src_dst_indexer;
-    // has nd, shape, src_strides for
-    // dimensions that ARE masked
-    MaskedSrcIndexerT masked_src_indexer;
-    // has 1, dst_strides for dimensions that ARE masked
-    MaskedDstIndexerT masked_dst_indexer;
-    LocalAccessorT lacc;
-};
-
-template <typename OrthogIndexerT,
-          typename MaskedDstIndexerT,
-          typename MaskedRhsIndexerT,
-          typename dataT,
-          typename indT,
-          typename LocalAccessorT>
-struct MaskedPlaceStridedFunctor
-{
-    MaskedPlaceStridedFunctor(dataT *dst_data_p,
-                              const indT *cumsum_data_p,
-                              const dataT *rhs_data_p,
-                              std::size_t masked_iter_size,
-                              const OrthogIndexerT &orthog_dst_rhs_indexer_,
-                              const MaskedDstIndexerT &masked_dst_indexer_,
-                              const MaskedRhsIndexerT &masked_rhs_indexer_,
-                              const LocalAccessorT &lacc_)
-        : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p),
-          masked_nelems(masked_iter_size),
-          orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_),
-          masked_dst_indexer(masked_dst_indexer_),
-          masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_)
-    {
-        static_assert(
-            std::is_same_v<indT, typename LocalAccessorT::value_type>);
-    }
-
-    void operator()(sycl::nd_item<2> ndit) const
-    {
-        const std::size_t orthog_i = ndit.get_global_id(0);
-        const std::uint32_t l_i = ndit.get_local_id(1);
-        const std::uint32_t lws = ndit.get_local_range(1);
-
-        const std::size_t masked_i = ndit.get_global_id(1);
-        const std::size_t masked_block_start = masked_i - l_i;
-
-        const std::size_t max_offset = masked_nelems + 1;
-        for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
-            const std::size_t offset = masked_block_start + i;
-            lacc[i] = (offset == 0)           ? indT(0)
-                      : (offset < max_offset) ? cumsum[offset - 1]
-                                              : cumsum[masked_nelems - 1] + 1;
-        }
-
-        sycl::group_barrier(ndit.get_group());
-
-        const indT current_running_count = lacc[l_i + 1];
-        const bool mask_set = (masked_i == 0)
-                                  ? (current_running_count == 1)
-                                  : (current_running_count == lacc[l_i] + 1);
-
-        // src[i, j] = rhs[cumsum[i] - 1, j]
-        // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1)
-        if (mask_set && (masked_i < masked_nelems)) {
-            const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i);
-
-            const std::size_t total_dst_offset =
-                masked_dst_indexer(masked_i) +
-                orthog_offsets.get_first_offset();
-            const std::size_t total_rhs_offset =
-                masked_rhs_indexer(current_running_count - 1) +
-                orthog_offsets.get_second_offset();
-
-            dst[total_dst_offset] = rhs[total_rhs_offset];
-        }
-    }
-
-private:
-    dataT *dst = nullptr;
-    const indT *cumsum = nullptr;
-    const dataT *rhs = nullptr;
-    std::size_t masked_nelems = 0;
-    // has nd, shape, dst_strides, rhs_strides for
-    // dimensions that ARE NOT masked
-    OrthogIndexerT orthog_dst_rhs_indexer;
-    // has nd, shape, dst_strides for
-    // dimensions that ARE masked
-    MaskedDstIndexerT masked_dst_indexer;
-    // has 1, rhs_strides for dimensions that ARE masked
-    MaskedRhsIndexerT masked_rhs_indexer;
-    LocalAccessorT lacc;
-};
-
-// ======= Masked extraction ================================
-
-namespace detail
-{
-
-template <std::size_t I, std::size_t... IR>
-std::size_t _get_lws_impl(std::size_t n)
-{
-    if constexpr (sizeof...(IR) == 0) {
-        return I;
-    }
-    else {
-        return (n < I) ? _get_lws_impl<IR...>(n) : I;
-    }
-}
-
-inline std::size_t get_lws(std::size_t n)
-{
-    static constexpr std::size_t lws0 = 256u;
-    static constexpr std::size_t lws1 = 128u;
-    static constexpr std::size_t lws2 = 64u;
-    return _get_lws_impl<lws0, lws1, lws2>(n);
-}
-
-} // end of namespace detail
-
-template <typename MaskedDstIndexerT, typename dataT, typename indT>
-class masked_extract_all_slices_contig_impl_krn;
-
-typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    const char *,
-    const char *,
-    char *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename dataT, typename indT>
-sycl::event masked_extract_all_slices_contig_impl(
-    sycl::queue &exec_q,
-    ssize_t iteration_size,
-    const char *src_p,
-    const char *cumsum_p,
-    char *dst_p,
-    ssize_t dst_size, // dst is 1D
-    ssize_t dst_stride,
-    const std::vector<sycl::event> &depends = {})
-{
-    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
-
-    static constexpr NoOpIndexer masked_src_indexer{};
-    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
-                                              /* step */ dst_stride);
-
-    using KernelName =
-        class masked_extract_all_slices_contig_impl_krn<Strided1DIndexer, dataT,
-                                                        indT>;
-
-    using LocalAccessorT = sycl::local_accessor<indT, 1>;
-    using Impl =
-        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer, NoOpIndexer,
-                                           Strided1DIndexer, dataT, indT,
-                                           LocalAccessorT>;
-
-    const std::size_t masked_extent = iteration_size;
-
-    const std::size_t lws = detail::get_lws(masked_extent);
-
-    const std::size_t n_groups = (iteration_size + lws - 1) / lws;
-
-    sycl::range<2> gRange{1, n_groups * lws};
-    sycl::range<2> lRange{1, lws};
-
-    sycl::nd_range<2> ndRange(gRange, lRange);
-
-    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
-    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
-    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
-        LocalAccessorT lacc(lacc_size, cgh);
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent,
-                          orthog_src_dst_indexer, masked_src_indexer,
-                          masked_dst_indexer, lacc));
-    });
-
-    return comp_ev;
-}
-
-template <typename MaskedSrcIndexerT,
-          typename MaskedDstIndexerT,
-          typename dataT,
-          typename indT>
-class masked_extract_all_slices_strided_impl_krn;
-
-typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    const char *,
-    const char *,
-    char *,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename dataT, typename indT>
-sycl::event masked_extract_all_slices_strided_impl(
-    sycl::queue &exec_q,
-    ssize_t iteration_size,
-    const char *src_p,
-    const char *cumsum_p,
-    char *dst_p,
-    int nd,
-    const ssize_t
-        *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd
-    ssize_t dst_size,              // dst is 1D
-    ssize_t dst_stride,
-    const std::vector<sycl::event> &depends = {})
-{
-    static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{};
-
-    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
-     * *_packed_shape_strides) */
-    const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides);
-    const Strided1DIndexer masked_dst_indexer(/* size */ dst_size,
-                                              /* step */ dst_stride);
-
-    using KernelName = class masked_extract_all_slices_strided_impl_krn<
-        StridedIndexer, Strided1DIndexer, dataT, indT>;
-
-    using LocalAccessorT = sycl::local_accessor<indT, 1>;
-    using Impl =
-        struct MaskedExtractStridedFunctor<TwoZeroOffsets_Indexer,
-                                           StridedIndexer, Strided1DIndexer,
-                                           dataT, indT, LocalAccessorT>;
-
-    const std::size_t masked_nelems = iteration_size;
-
-    const std::size_t lws = detail::get_lws(masked_nelems);
-
-    const std::size_t n_groups = (masked_nelems + lws - 1) / lws;
-
-    sycl::range<2> gRange{1, n_groups * lws};
-    sycl::range<2> lRange{1, lws};
-
-    sycl::nd_range<2> ndRange(gRange, lRange);
-
-    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
-    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
-    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size = std::min(lws, masked_nelems) + 1;
-        LocalAccessorT lacc(lacc_size, cgh);
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size,
-                          orthog_src_dst_indexer, masked_src_indexer,
-                          masked_dst_indexer, lacc));
-    });
-
-    return comp_ev;
-}
-
-typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    ssize_t,
-    const char *,
-    const char *,
-    char *,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename OrthoIndexerT,
-          typename MaskedSrcIndexerT,
-          typename MaskedDstIndexerT,
-          typename dataT,
-          typename indT>
-class masked_extract_some_slices_strided_impl_krn;
-
-template <typename dataT, typename indT>
-sycl::event masked_extract_some_slices_strided_impl(
-    sycl::queue &exec_q,
-    ssize_t orthog_nelems,
-    ssize_t masked_nelems,
-    const char *src_p,
-    const char *cumsum_p,
-    char *dst_p,
-    int orthog_nd,
-    // [ortho_shape, ortho_src_strides, // ortho_dst_strides],
-    // length 3*ortho_nd
-    const ssize_t *packed_ortho_src_dst_shape_strides,
-    ssize_t ortho_src_offset,
-    ssize_t ortho_dst_offset,
-    int masked_nd,
-    // [masked_src_shape, masked_src_strides],
-    // length 2*masked_nd, mask_dst is 1D
-    const ssize_t *packed_masked_src_shape_strides,
-    ssize_t masked_dst_size,
-    ssize_t masked_dst_stride,
-    const std::vector<sycl::event> &depends = {})
-{
-    const TwoOffsets_StridedIndexer orthog_src_dst_indexer{
-        orthog_nd, ortho_src_offset, ortho_dst_offset,
-        packed_ortho_src_dst_shape_strides};
-
-    const StridedIndexer masked_src_indexer{masked_nd, 0,
-                                            packed_masked_src_shape_strides};
-    const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size,
-                                              /* step */ masked_dst_stride};
-
-    using KernelName = class masked_extract_some_slices_strided_impl_krn<
-        TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT,
-        indT>;
-
-    using LocalAccessorT = sycl::local_accessor<indT, 1>;
-    using Impl =
-        struct MaskedExtractStridedFunctor<TwoOffsets_StridedIndexer,
-                                           StridedIndexer, Strided1DIndexer,
-                                           dataT, indT, LocalAccessorT>;
-
-    const std::size_t masked_extent = masked_nelems;
-
-    const std::size_t lws = detail::get_lws(masked_extent);
-
-    const std::size_t n_groups = ((masked_extent + lws - 1) / lws);
-    const std::size_t orthog_extent = static_cast<std::size_t>(orthog_nelems);
-
-    sycl::range<2> gRange{orthog_extent, n_groups * lws};
-    sycl::range<2> lRange{1, lws};
-
-    sycl::nd_range<2> ndRange(gRange, lRange);
-
-    const dataT *src_tp = reinterpret_cast<const dataT *>(src_p);
-    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
-    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size =
-            std::min<std::size_t>(lws, masked_extent) + 1;
-        LocalAccessorT lacc(lacc_size, cgh);
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems,
-                          orthog_src_dst_indexer, masked_src_indexer,
-                          masked_dst_indexer, lacc));
-    });
-
-    return comp_ev;
-}
-
-template <typename fnT, typename T>
-struct MaskExtractAllSlicesContigFactoryForInt32
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_all_slices_contig_impl<T, std::int32_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskExtractAllSlicesContigFactoryForInt64
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_all_slices_contig_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskExtractAllSlicesStridedFactoryForInt32
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_all_slices_strided_impl<T, std::int32_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskExtractAllSlicesStridedFactoryForInt64
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_all_slices_strided_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskExtractSomeSlicesStridedFactoryForInt32
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_some_slices_strided_impl<T, std::int32_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskExtractSomeSlicesStridedFactoryForInt64
-{
-    fnT get()
-    {
-        fnT fn = masked_extract_some_slices_strided_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-// Masked placement
-
-template <typename OrthoIndexerT,
-          typename MaskedDstIndexerT,
-          typename MaskedRhsIndexerT,
-          typename dataT,
-          typename indT>
-class masked_place_all_slices_strided_impl_krn;
-
-typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    char *,
-    const char *,
-    const char *,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename dataT, typename indT>
-sycl::event masked_place_all_slices_strided_impl(
-    sycl::queue &exec_q,
-    ssize_t iteration_size,
-    char *dst_p,
-    const char *cumsum_p,
-    const char *rhs_p,
-    int nd,
-    const ssize_t
-        *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd
-    ssize_t rhs_size,              // rhs is 1D
-    ssize_t rhs_stride,
-    const std::vector<sycl::event> &depends = {})
-{
-    static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};
-
-    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
-     * *_packed_shape_strides) */
-    const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides);
-    const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride);
-
-    using KernelName = class masked_place_all_slices_strided_impl_krn<
-        TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT,
-        indT>;
-
-    static constexpr std::size_t nominal_lws = 256;
-    const std::size_t masked_extent = iteration_size;
-    const std::size_t lws = std::min(masked_extent, nominal_lws);
-
-    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
-
-    sycl::range<2> gRange{1, n_groups * lws};
-    sycl::range<2> lRange{1, lws};
-    sycl::nd_range<2> ndRange{gRange, lRange};
-
-    using LocalAccessorT = sycl::local_accessor<indT, 1>;
-    using Impl =
-        MaskedPlaceStridedFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
-                                  Strided1DCyclicIndexer, dataT, indT,
-                                  LocalAccessorT>;
-
-    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
-    const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
-    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size = std::min(masked_extent, lws) + 1;
-        LocalAccessorT lacc(lacc_size, cgh);
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size,
-                          orthog_dst_rhs_indexer, masked_dst_indexer,
-                          masked_rhs_indexer, lacc));
-    });
-
-    return comp_ev;
-}
-
-typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    ssize_t,
-    char *,
-    const char *,
-    const char *,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    int,
-    ssize_t const *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename OrthoIndexerT,
-          typename MaskedSrcIndexerT,
-          typename MaskedDstIndexerT,
-          typename dataT,
-          typename indT>
-class masked_place_some_slices_strided_impl_krn;
-
-template <typename dataT, typename indT>
-sycl::event masked_place_some_slices_strided_impl(
-    sycl::queue &exec_q,
-    ssize_t orthog_nelems,
-    ssize_t masked_nelems,
-    char *dst_p,
-    const char *cumsum_p,
-    const char *rhs_p,
-    int orthog_nd,
-    // [ortho_shape, ortho_dst_strides, ortho_rhs_strides],
-    // length 3*ortho_nd
-    const ssize_t *packed_ortho_dst_rhs_shape_strides,
-    ssize_t ortho_dst_offset,
-    ssize_t ortho_rhs_offset,
-    int masked_nd,
-    // [masked_dst_shape, masked_dst_strides],
-    // length 2*masked_nd, mask_dst is 1D
-    const ssize_t *packed_masked_dst_shape_strides,
-    ssize_t masked_rhs_size,
-    ssize_t masked_rhs_stride,
-    const std::vector<sycl::event> &depends = {})
-{
-    const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{
-        orthog_nd, ortho_dst_offset, ortho_rhs_offset,
-        packed_ortho_dst_rhs_shape_strides};
-
-    /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
-     * *_packed_shape_strides) */
-    const StridedIndexer masked_dst_indexer{masked_nd, 0,
-                                            packed_masked_dst_shape_strides};
-    const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size,
-                                                    masked_rhs_stride};
-
-    using KernelName = class masked_place_some_slices_strided_impl_krn<
-        TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer,
-        dataT, indT>;
-
-    static constexpr std::size_t nominal_lws = 256;
-    const std::size_t orthog_extent = orthog_nelems;
-    const std::size_t masked_extent = masked_nelems;
-    const std::size_t lws = std::min(masked_extent, nominal_lws);
-
-    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
-
-    sycl::range<2> gRange{orthog_extent, n_groups * lws};
-    sycl::range<2> lRange{1, lws};
-    sycl::nd_range<2> ndRange{gRange, lRange};
-
-    using LocalAccessorT = sycl::local_accessor<indT, 1>;
-    using Impl =
-        MaskedPlaceStridedFunctor<TwoOffsets_StridedIndexer, StridedIndexer,
-                                  Strided1DCyclicIndexer, dataT, indT,
-                                  LocalAccessorT>;
-
-    dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
-    const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
-    const indT *cumsum_tp = reinterpret_cast<const indT *>(cumsum_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size = std::min(masked_extent, lws) + 1;
-        LocalAccessorT lacc(lacc_size, cgh);
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems,
-                          orthog_dst_rhs_indexer, masked_dst_indexer,
-                          masked_rhs_indexer, lacc));
-    });
-
-    return comp_ev;
-}
-
-template <typename fnT, typename T>
-struct MaskPlaceAllSlicesStridedFactoryForInt32
-{
-    fnT get()
-    {
-        fnT fn = masked_place_all_slices_strided_impl<T, std::int32_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskPlaceAllSlicesStridedFactoryForInt64
-{
-    fnT get()
-    {
-        fnT fn = masked_place_all_slices_strided_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskPlaceSomeSlicesStridedFactoryForInt32
-{
-    fnT get()
-    {
-        fnT fn = masked_place_some_slices_strided_impl<T, std::int32_t>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T>
-struct MaskPlaceSomeSlicesStridedFactoryForInt64
-{
-    fnT get()
-    {
-        fnT fn = masked_place_some_slices_strided_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-// Non-zero
-
-template <typename T1, typename T2> class non_zero_indexes_krn;
-
-typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
-    sycl::queue &,
-    ssize_t,
-    ssize_t,
-    int,
-    const char *,
-    char *,
-    const ssize_t *,
-    std::vector<sycl::event> const &);
-
-template <typename indT1, typename indT2>
-sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
-                                  ssize_t iter_size,
-                                  ssize_t nz_elems,
-                                  int nd,
-                                  const char *cumsum_cp,
-                                  char *indexes_cp,
-                                  const ssize_t *mask_shape,
-                                  std::vector<sycl::event> const &depends)
-{
-    const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
-    indT2 *indexes_data = reinterpret_cast<indT2 *>(indexes_cp);
-
-    static constexpr std::size_t nominal_lws = 256u;
-    const std::size_t masked_extent = iter_size;
-    const std::size_t lws = std::min(masked_extent, nominal_lws);
-
-    const std::size_t n_groups = (masked_extent + lws - 1) / lws;
-    sycl::range<1> gRange{n_groups * lws};
-    sycl::range<1> lRange{lws};
-
-    sycl::nd_range<1> ndRange{gRange, lRange};
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lacc_size = std::min(lws, masked_extent) + 1;
-        sycl::local_accessor<indT1, 1> lacc(lacc_size, cgh);
-
-        using KernelName = class non_zero_indexes_krn<indT1, indT2>;
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
-            const std::size_t group_i = ndit.get_group(0);
-            const std::uint32_t l_i = ndit.get_local_id(0);
-            const std::uint32_t lws = ndit.get_local_range(0);
-
-            const std::size_t masked_block_start = group_i * lws;
-
-            for (std::uint32_t i = l_i; i < lacc.size(); i += lws) {
-                const std::size_t offset = masked_block_start + i;
-                lacc[i] = (offset == 0) ? indT1(0)
-                          : (offset - 1 < masked_extent)
-                              ? cumsum_data[offset - 1]
-                              : cumsum_data[masked_extent - 1] + 1;
-            }
-
-            sycl::group_barrier(ndit.get_group());
-
-            const std::size_t i = masked_block_start + l_i;
-            const auto cs_val = lacc[l_i];
-            const bool cond = (lacc[l_i + 1] == cs_val + 1);
-
-            if (cond && (i < masked_extent)) {
-                ssize_t i_ = static_cast<ssize_t>(i);
-                for (int dim = nd; --dim > 0;) {
-                    const auto sd = mask_shape[dim];
-                    const ssize_t q = i_ / sd;
-                    const ssize_t r = (i_ - q * sd);
-                    indexes_data[cs_val + dim * nz_elems] =
-                        static_cast<indT2>(r);
-                    i_ = q;
-                }
-                indexes_data[cs_val] = static_cast<indT2>(i_);
-            }
-        });
-    });
-
-    return comp_ev;
-}
-
-} // namespace indexing
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
deleted file mode 100644
index 815ebbcbc9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/clip.hpp
+++ /dev/null
@@ -1,349 +0,0 @@
-//=== clip.hpp -  Implementation of clip kernels ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for dpctl.tensor.clip.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "kernels/alignment.hpp"
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace clip
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-using dpctl::tensor::kernels::alignment_utils::
-    disabled_sg_loadstore_wrapper_krn;
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-using dpctl::tensor::sycl_utils::sub_group_load;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-template <typename T> T clip(const T &x, const T &min, const T &max)
-{
-    using dpctl::tensor::type_utils::is_complex;
-    if constexpr (is_complex<T>::value) {
-        using dpctl::tensor::math_utils::max_complex;
-        using dpctl::tensor::math_utils::min_complex;
-        return min_complex(max_complex(x, min), max);
-    }
-    else if constexpr (std::is_floating_point_v<T> ||
-                       std::is_same_v<T, sycl::half>)
-    {
-        auto tmp = (std::isnan(x) || x > min) ? x : min;
-        return (std::isnan(tmp) || tmp < max) ? tmp : max;
-    }
-    else if constexpr (std::is_same_v<T, bool>) {
-        return (x || min) && max;
-    }
-    else {
-        auto tmp = (x > min) ? x : min;
-        return (tmp < max) ? tmp : max;
-    }
-}
-
-template <typename T,
-          std::uint8_t vec_sz = 4,
-          std::uint8_t n_vecs = 2,
-          bool enable_sg_loadstore = true>
-class ClipContigFunctor
-{
-private:
-    std::size_t nelems = 0;
-    const T *x_p = nullptr;
-    const T *min_p = nullptr;
-    const T *max_p = nullptr;
-    T *dst_p = nullptr;
-
-public:
-    ClipContigFunctor(std::size_t nelems_,
-                      const T *x_p_,
-                      const T *min_p_,
-                      const T *max_p_,
-                      T *dst_p_)
-        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
-          dst_p(dst_p_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
-
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
-            const std::uint16_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
-
-            const std::size_t start =
-                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems, start + nelems_per_sg);
-
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
-            }
-        }
-        else {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                 sg.get_group_id()[0] * sgSize);
-
-            if (base + nelems_per_wi * sgSize < nelems) {
-                sycl::vec<T, vec_sz> dst_vec;
-#pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    const std::size_t idx = base + it * sgSize;
-                    auto x_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&x_p[idx]);
-                    auto min_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&min_p[idx]);
-                    auto max_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&max_p[idx]);
-                    auto dst_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&dst_p[idx]);
-
-                    const sycl::vec<T, vec_sz> x_vec =
-                        sub_group_load<vec_sz>(sg, x_multi_ptr);
-                    const sycl::vec<T, vec_sz> min_vec =
-                        sub_group_load<vec_sz>(sg, min_multi_ptr);
-                    const sycl::vec<T, vec_sz> max_vec =
-                        sub_group_load<vec_sz>(sg, max_multi_ptr);
-#pragma unroll
-                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
-                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
-                                               max_vec[vec_id]);
-                    }
-                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
-                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
-                }
-            }
-        }
-    }
-};
-
-template <typename T, int vec_sz, int n_vecs> class clip_contig_kernel;
-
-typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    const char *,
-    const char *,
-    char *,
-    const std::vector<sycl::event> &);
-
-template <typename T>
-sycl::event clip_contig_impl(sycl::queue &q,
-                             std::size_t nelems,
-                             const char *x_cp,
-                             const char *min_cp,
-                             const char *max_cp,
-                             char *dst_cp,
-                             const std::vector<sycl::event> &depends)
-{
-    const T *x_tp = reinterpret_cast<const T *>(x_cp);
-    const T *min_tp = reinterpret_cast<const T *>(min_cp);
-    const T *max_tp = reinterpret_cast<const T *>(max_cp);
-    T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        std::size_t lws = 64;
-        static constexpr std::uint8_t vec_sz = 4;
-        static constexpr std::uint8_t n_vecs = 2;
-        const std::size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
-
-        if (is_aligned<required_alignment>(x_cp) &&
-            is_aligned<required_alignment>(min_cp) &&
-            is_aligned<required_alignment>(max_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
-            using Impl =
-                ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
-            using Impl =
-                ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
-        }
-    });
-
-    return clip_ev;
-}
-
-template <typename T, typename IndexerT> class ClipStridedFunctor
-{
-private:
-    const T *x_p = nullptr;
-    const T *min_p = nullptr;
-    const T *max_p = nullptr;
-    T *dst_p = nullptr;
-    IndexerT indexer;
-
-public:
-    ClipStridedFunctor(const T *x_p_,
-                       const T *min_p_,
-                       const T *max_p_,
-                       T *dst_p_,
-                       const IndexerT &indexer_)
-        : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_),
-          indexer(indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        std::size_t gid = id[0];
-        auto offsets = indexer(static_cast<ssize_t>(gid));
-        dst_p[offsets.get_fourth_offset()] = clip(
-            x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
-            max_p[offsets.get_third_offset()]);
-    }
-};
-
-template <typename T, typename IndexerT> class clip_strided_kernel;
-
-typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const char *,
-    const char *,
-    const char *,
-    char *,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T>
-sycl::event clip_strided_impl(sycl::queue &q,
-                              std::size_t nelems,
-                              int nd,
-                              const char *x_cp,
-                              const char *min_cp,
-                              const char *max_cp,
-                              char *dst_cp,
-                              const ssize_t *shape_strides,
-                              ssize_t x_offset,
-                              ssize_t min_offset,
-                              ssize_t max_offset,
-                              ssize_t dst_offset,
-                              const std::vector<sycl::event> &depends)
-{
-    const T *x_tp = reinterpret_cast<const T *>(x_cp);
-    const T *min_tp = reinterpret_cast<const T *>(min_cp);
-    const T *max_tp = reinterpret_cast<const T *>(max_cp);
-    T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const FourOffsets_StridedIndexer indexer{
-            nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};
-
-        using KernelName = clip_strided_kernel<T, FourOffsets_StridedIndexer>;
-        using Impl = ClipStridedFunctor<T, FourOffsets_StridedIndexer>;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(nelems),
-            Impl(x_tp, min_tp, max_tp, dst_tp, indexer));
-    });
-
-    return clip_ev;
-}
-
-template <typename fnT, typename T> struct ClipStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = clip_strided_impl<T>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct ClipContigFactory
-{
-    fnT get()
-    {
-
-        fnT fn = clip_contig_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace clip
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
deleted file mode 100644
index e734f30497..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp
+++ /dev/null
@@ -1,560 +0,0 @@
-//=== constructors.hpp -  -----------------------------------*-C++-*--/===//
-//===              Implementation of tensor constructors kernels ------===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor constructors.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/strided_iters.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace constructors
-{
-
-using dpctl::tensor::ssize_t;
-
-/*!
-  @defgroup CtorKernels
- */
-
-template <typename Ty> class linear_sequence_step_kernel;
-template <typename Ty, typename wTy> class linear_sequence_affine_kernel;
-template <typename Ty> class full_strided_kernel;
-template <typename Ty> class eye_kernel;
-
-using namespace dpctl::tensor::offset_utils;
-
-template <typename Ty> class LinearSequenceStepFunctor
-{
-private:
-    Ty *p = nullptr;
-    Ty start_v;
-    Ty step_v;
-
-public:
-    LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv)
-        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), step_v(dv)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        auto i = wiid.get(0);
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<Ty>::value) {
-            p[i] = Ty{start_v.real() + i * step_v.real(),
-                      start_v.imag() + i * step_v.imag()};
-        }
-        else {
-            p[i] = start_v + i * step_v;
-        }
-    }
-};
-
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified by typed starting value and
- * increment.
- *
- * @param q  Sycl queue to which the kernel is submitted
- * @param nelems Length of the sequence
- * @param start_v Typed starting value of the sequence
- * @param step_v  Typed increment of the sequence
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty>
-sycl::event lin_space_step_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                Ty start_v,
-                                Ty step_v,
-                                char *array_data,
-                                const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
-    sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.parallel_for<linear_sequence_step_kernel<Ty>>(
-            sycl::range<1>{nelems},
-            LinearSequenceStepFunctor<Ty>(array_data, start_v, step_v));
-    });
-
-    return lin_space_step_event;
-}
-
-// Constructor to populate tensor with linear sequence defined by
-// start and and data
-
-template <typename Ty, typename wTy> class LinearSequenceAffineFunctor
-{
-private:
-    Ty *p = nullptr;
-    Ty start_v;
-    Ty end_v;
-    std::size_t n;
-
-public:
-    LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den)
-        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1),
-          n((den == 0) ? 1 : den)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        auto i = wiid.get(0);
-        wTy wc = wTy(i) / n;
-        wTy w = wTy(n - i) / n;
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<Ty>::value) {
-            using reT = typename Ty::value_type;
-            auto _w = static_cast<reT>(w);
-            auto _wc = static_cast<reT>(wc);
-            auto re_comb = sycl::fma(start_v.real(), _w, reT(0));
-            re_comb =
-                sycl::fma(end_v.real(), _wc,
-                          re_comb); // start_v.real() * _w + end_v.real() * _wc;
-            auto im_comb =
-                sycl::fma(start_v.imag(), _w,
-                          reT(0)); // start_v.imag() * _w + end_v.imag() * _wc;
-            im_comb = sycl::fma(end_v.imag(), _wc, im_comb);
-            Ty affine_comb = Ty{re_comb, im_comb};
-            p[i] = affine_comb;
-        }
-        else if constexpr (std::is_floating_point<Ty>::value) {
-            Ty _w = static_cast<Ty>(w);
-            Ty _wc = static_cast<Ty>(wc);
-            auto affine_comb =
-                sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc;
-            affine_comb = sycl::fma(end_v, _wc, affine_comb);
-            p[i] = affine_comb;
-        }
-        else {
-            using dpctl::tensor::type_utils::convert_impl;
-            auto affine_comb = start_v * w + end_v * wc;
-            p[i] = convert_impl<Ty, decltype(affine_comb)>(affine_comb);
-        }
-    }
-};
-
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified by typed starting and end values.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Length of the sequence.
- * @param start_v Stating value of the sequence.
- * @param end_v   End-value of the sequence.
- * @param include_endpoint  Whether the end-value is included in the sequence.
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty>
-sycl::event lin_space_affine_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  Ty start_v,
-                                  Ty end_v,
-                                  bool include_endpoint,
-                                  char *array_data,
-                                  const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
-
-    const bool device_supports_doubles =
-        exec_q.get_device().has(sycl::aspect::fp64);
-    const std::size_t den = (include_endpoint) ? nelems - 1 : nelems;
-
-    sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        if (device_supports_doubles) {
-            using KernelName = linear_sequence_affine_kernel<Ty, double>;
-            using Impl = LinearSequenceAffineFunctor<Ty, double>;
-
-            cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
-                                         Impl(array_data, start_v, end_v, den));
-        }
-        else {
-            using KernelName = linear_sequence_affine_kernel<Ty, float>;
-            using Impl = LinearSequenceAffineFunctor<Ty, float>;
-
-            cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
-                                         Impl(array_data, start_v, end_v, den));
-        }
-    });
-
-    return lin_space_affine_event;
-}
-
-/* ================ Full ================== */
-
-/*!
- * @brief Function to submit kernel to fill given contiguous memory allocation
- * with specified value.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Length of the sequence
- * @param fill_v  Value to fill the array with
- * @param dst_p Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename dstTy>
-sycl::event full_contig_impl(sycl::queue &q,
-                             std::size_t nelems,
-                             dstTy fill_v,
-                             char *dst_p,
-                             const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    sycl::event fill_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        dstTy *p = reinterpret_cast<dstTy *>(dst_p);
-        cgh.fill<dstTy>(p, fill_v, nelems);
-    });
-
-    return fill_ev;
-}
-
-template <typename Ty, typename IndexerT> class FullStridedFunctor
-{
-private:
-    Ty *p = nullptr;
-    Ty fill_v;
-    IndexerT indexer;
-
-public:
-    FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_)
-        : p(p_), fill_v(fill_v_), indexer(indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        auto offset = indexer(id.get(0));
-        p[offset] = fill_v;
-    }
-};
-
-/*!
- * @brief Function to submit kernel to fill given contiguous memory allocation
- * with specified value.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nd  Array dimensionality
- * @param nelems  Length of the sequence
- * @param shape_strides  Kernel accessible USM pointer to packed shape and
- * strides of array.
- * @param fill_v  Value to fill the array with
- * @param dst_p  Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename dstTy>
-sycl::event full_strided_impl(sycl::queue &q,
-                              int nd,
-                              std::size_t nelems,
-                              const ssize_t *shape_strides,
-                              dstTy fill_v,
-                              char *dst_p,
-                              const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-
-    dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
-
-    using dpctl::tensor::offset_utils::StridedIndexer;
-    const StridedIndexer strided_indexer(nd, 0, shape_strides);
-
-    sycl::event fill_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using KernelName = full_strided_kernel<dstTy>;
-        using Impl = FullStridedFunctor<dstTy, StridedIndexer>;
-
-        cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
-                                     Impl(dst_tp, fill_v, strided_indexer));
-    });
-
-    return fill_ev;
-}
-
-/* ================ Eye ================== */
-
-typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &,
-                                    std::size_t nelems, // num_elements
-                                    ssize_t start,
-                                    ssize_t end,
-                                    ssize_t step,
-                                    char *, // dst_data_ptr
-                                    const std::vector<sycl::event> &);
-
-template <typename Ty> class EyeFunctor
-{
-private:
-    Ty *p = nullptr;
-    ssize_t start_v;
-    ssize_t end_v;
-    ssize_t step_v;
-
-public:
-    EyeFunctor(char *dst_p,
-               const ssize_t v0,
-               const ssize_t v1,
-               const ssize_t dv)
-        : p(reinterpret_cast<Ty *>(dst_p)), start_v(v0), end_v(v1), step_v(dv)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        Ty set_v = 0;
-        ssize_t i = static_cast<ssize_t>(wiid.get(0));
-        if (i >= start_v and i <= end_v) {
-            if ((i - start_v) % step_v == 0) {
-                set_v = 1;
-            }
-        }
-        p[i] = set_v;
-    }
-};
-
-/*!
- * @brief Function to populate 2D array with eye matrix.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Number of elements to assign.
- * @param start   Position of the first non-zero value.
- * @param end     Position of the last non-zero value.
- * @param step    Number of array elements between non-zeros.
- * @param array_data Kernel accessible USM pointer for the destination array.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return  Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty>
-sycl::event eye_impl(sycl::queue &exec_q,
-                     std::size_t nelems,
-                     const ssize_t start,
-                     const ssize_t end,
-                     const ssize_t step,
-                     char *array_data,
-                     const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
-    sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using KernelName = eye_kernel<Ty>;
-        using Impl = EyeFunctor<Ty>;
-
-        cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
-                                     Impl(array_data, start, end, step));
-    });
-
-    return eye_event;
-}
-
-/*!
- * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
- * @ingroup CtorKernels
- */
-template <typename fnT, typename Ty> struct EyeFactory
-{
-    fnT get()
-    {
-        fnT f = eye_impl<Ty>;
-        return f;
-    }
-};
-
-/* =========================== Tril and triu ============================== */
-
-// define function type
-typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &,
-                                    ssize_t,   // inner_range  //ssize_t
-                                    ssize_t,   // outer_range
-                                    char *,    // src_data_ptr
-                                    char *,    // dst_data_ptr
-                                    ssize_t,   // nd
-                                    ssize_t *, // shape_and_strides
-                                    ssize_t,   // k
-                                    const std::vector<sycl::event> &,
-                                    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy triangular matrices from source stack to destination
- * stack.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param inner_range  Number of elements in each matrix.
- * @param outer_range  Number of matrices to copy.
- * @param src_p  Kernel accessible USM pointer for the source array.
- * @param dst_p  Kernel accessible USM pointer for the destination array.
- * @param nd  The array dimensionality of source and destination arrays.
- * @param shape_and_strides  Kernel accessible USM pointer to packed shape and
- * strides of arrays.
- * @param k Position of the diagonal above/below which to copy filling the rest
- * with zero elements.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- * @param additional_depends  List of additional events to wait for before
- * starting computations, if any.
- *
- * @return  Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty, bool> class tri_kernel;
-template <typename Ty, bool upper>
-sycl::event tri_impl(sycl::queue &exec_q,
-                     ssize_t inner_range,
-                     ssize_t outer_range,
-                     char *src_p,
-                     char *dst_p,
-                     ssize_t nd,
-                     ssize_t *shape_and_strides,
-                     ssize_t k,
-                     const std::vector<sycl::event> &depends,
-                     const std::vector<sycl::event> &additional_depends)
-{
-    static constexpr int d2 = 2;
-    ssize_t src_s = nd;
-    ssize_t dst_s = 2 * nd;
-    ssize_t nd_1 = nd - 1;
-    ssize_t nd_2 = nd - 2;
-    Ty *src = reinterpret_cast<Ty *>(src_p);
-    Ty *dst = reinterpret_cast<Ty *>(dst_p);
-
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
-
-    sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        cgh.parallel_for<tri_kernel<Ty, upper>>(
-            sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) {
-                ssize_t outer_gid = idx[0] / inner_range;
-                ssize_t inner_gid = idx[0] - inner_range * outer_gid;
-
-                ssize_t src_inner_offset = 0, dst_inner_offset = 0;
-                bool to_copy{false};
-
-                {
-                    using dpctl::tensor::strides::CIndexer_array;
-                    CIndexer_array<d2, ssize_t> indexer_i(
-                        {shape_and_strides[nd_2], shape_and_strides[nd_1]});
-                    indexer_i.set(inner_gid);
-                    const std::array<ssize_t, d2> &inner = indexer_i.get();
-                    src_inner_offset =
-                        inner[0] * shape_and_strides[src_s + nd_2] +
-                        inner[1] * shape_and_strides[src_s + nd_1];
-                    dst_inner_offset =
-                        inner[0] * shape_and_strides[dst_s + nd_2] +
-                        inner[1] * shape_and_strides[dst_s + nd_1];
-
-                    if constexpr (upper)
-                        to_copy = (inner[0] + k >= inner[1]);
-                    else
-                        to_copy = (inner[0] + k <= inner[1]);
-                }
-
-                ssize_t src_offset = 0;
-                ssize_t dst_offset = 0;
-                {
-                    using dpctl::tensor::strides::CIndexer_vector;
-                    CIndexer_vector<ssize_t> outer(nd - d2);
-                    outer.get_displacement(
-                        outer_gid, shape_and_strides, shape_and_strides + src_s,
-                        shape_and_strides + dst_s, src_offset, dst_offset);
-                }
-
-                src_offset += src_inner_offset;
-                dst_offset += dst_inner_offset;
-
-                dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0);
-            });
-    });
-    return tri_ev;
-}
-
-/*!
- * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
- * @ingroup CtorKernels
- */
-template <typename fnT, typename Ty> struct TrilGenericFactory
-{
-    fnT get()
-    {
-        fnT f = tri_impl<Ty, /*tril*/ true>;
-        return f;
-    }
-};
-
-/*!
- * @brief  Factory to get function pointer of type `fnT` for data type `Ty`.
- * @ingroup CtorKernels
- */
-template <typename fnT, typename Ty> struct TriuGenericFactory
-{
-    fnT get()
-    {
-        fnT f = tri_impl<Ty, /*triu*/ false>;
-        return f;
-    }
-};
-
-} // namespace constructors
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
deleted file mode 100644
index 023c3d8717..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ /dev/null
@@ -1,1262 +0,0 @@
-//=== copy_and_cast.hpp - Implementation of copy-and-cast kernels *-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor copying and value casting.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "kernels/alignment.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace copy_and_cast
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-using dpctl::tensor::kernels::alignment_utils::
-    disabled_sg_loadstore_wrapper_krn;
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-using dpctl::tensor::sycl_utils::sub_group_load;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-template <typename srcT, typename dstT, typename IndexerT>
-class copy_cast_generic_kernel;
-
-template <typename srcT,
-          typename dstT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class copy_cast_contig_kernel;
-
-template <typename srcT, typename dstT, typename IndexerT>
-class copy_cast_from_host_kernel;
-
-template <typename srcT, typename dstT, typename IndexerT>
-class copy_cast_from_host_contig_kernel;
-
-template <typename srcTy, typename dstTy> class Caster
-{
-public:
-    Caster() = default;
-    dstTy operator()(const srcTy &src) const
-    {
-        using dpctl::tensor::type_utils::convert_impl;
-        return convert_impl<dstTy, srcTy>(src);
-    }
-};
-
-template <typename srcT, typename dstT, typename CastFnT, typename IndexerT>
-class GenericCopyFunctor
-{
-private:
-    const srcT *src_ = nullptr;
-    dstT *dst_ = nullptr;
-    IndexerT indexer_;
-
-public:
-    GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer)
-        : src_(src_p), dst_(dst_p), indexer_(indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
-        const ssize_t &src_offset = offsets.get_first_offset();
-        const ssize_t &dst_offset = offsets.get_second_offset();
-
-        static constexpr CastFnT fn{};
-        dst_[dst_offset] = fn(src_[src_offset]);
-    }
-};
-
-/*!
-  @defgroup CopyAndCastKernels
- */
-
-/*!
- * @brief Function pointer type for generic array cast and copying function.
- */
-typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to
- `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
-
-   Both arrays have array dimensionality specified via argument `nd`. The
- `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the
- first `nd` elements encode common shape, second `nd` elements contain strides
- of `src` array, and the trailing `nd` elements contain strides of `dst` array.
- `src_p` and `dst_p` represent pointers into respective arrays, but the start of
- iteration begins at offset of `src_offset` elements for `src` array and at
- offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue
- `q` with events `depends` and `additional_depends` as dependencies.
-
-   @param  q       Sycl queue to which the kernel is submitted.
-   @param  nelems  Number of elements to cast and copy.
-   @param  nd      Array dimensionality, i.e. number of indices needed to
- identify an element of each array.
-   @param  shape_and_strides  Kernel accessible USM pointer to packed shape and
- strides.
-   @param  src_p   Kernel accessible USM pointer for the source array
-   @param  src_offset  Offset to the beginning of iteration in number of
- elements of source array from `src_p`.
-   @param  dst_p   Kernel accessible USM pointer for the destination array
-   @param  dst_offset  Offset to the beginning of iteration in number of
- elements of destination array from `dst_p`.
-   @param  depends  List of events to wait for before starting computations, if
- any.
-   @param  additional_depends Additional list of events to wait for before
- starting computations, if any.
-
-   @return  Event to wait on to ensure that computation completes.
-   @ingroup CopyAndCastKernels
- */
-template <typename dstTy, typename srcTy>
-sycl::event
-copy_and_cast_generic_impl(sycl::queue &q,
-                           std::size_t nelems,
-                           int nd,
-                           const ssize_t *shape_and_strides,
-                           const char *src_p,
-                           ssize_t src_offset,
-                           char *dst_p,
-                           ssize_t dst_offset,
-                           const std::vector<sycl::event> &depends,
-                           const std::vector<sycl::event> &additional_depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
-
-    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset,
-                                                shape_and_strides};
-        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
-        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
-
-        cgh.parallel_for<class copy_cast_generic_kernel<
-            srcTy, dstTy, TwoOffsets_StridedIndexer>>(
-            sycl::range<1>(nelems),
-            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>,
-                               TwoOffsets_StridedIndexer>(src_tp, dst_tp,
-                                                          indexer));
-    });
-
-    return copy_and_cast_ev;
-}
-
-/*!
- * @brief Factory to get generic function pointer of type `fnT` for given source
- * data type `S` and destination data type `D`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S> struct CopyAndCastGenericFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_generic_impl<D, S>;
-        return f;
-    }
-};
-
-// Specialization of copy_and_cast for contiguous arrays
-
-template <typename srcT,
-          typename dstT,
-          typename CastFnT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-class ContigCopyFunctor
-{
-private:
-    std::size_t nelems;
-    const srcT *src_p = nullptr;
-    dstT *dst_p = nullptr;
-
-public:
-    ContigCopyFunctor(const std::size_t nelems_,
-                      const srcT *src_p_,
-                      dstT *dst_p_)
-        : nelems(nelems_), src_p(src_p_), dst_p(dst_p_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static constexpr CastFnT fn{};
-
-        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
-
-        using dpctl::tensor::type_utils::is_complex_v;
-        if constexpr (!enable_sg_loadstore || is_complex_v<srcT> ||
-                      is_complex_v<dstT>)
-        {
-            std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-
-            // start = (gid / sgSize) * elems_per_sg + (gid % sgSize)
-            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
-            const std::size_t start =
-                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems, start + elems_per_sg);
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                dst_p[offset] = fn(src_p[offset]);
-            }
-        }
-        else {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems) {
-                sycl::vec<dstT, vec_sz> dst_vec;
-
-#pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto src_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&src_p[offset]);
-                    auto dst_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&dst_p[offset]);
-
-                    const sycl::vec<srcT, vec_sz> src_vec =
-                        sub_group_load<vec_sz>(sg, src_multi_ptr);
-#pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; k++) {
-                        dst_vec[k] = fn(src_vec[k]);
-                    }
-                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t start = base + sg.get_local_id()[0];
-                for (std::size_t k = start; k < nelems; k += sgSize) {
-                    dst_p[k] = fn(src_p[k]);
-                }
-            }
-        }
-    }
-};
-
-/*!
- * @brief Function pointer type for contiguous array cast and copy function.
- */
-typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray
- to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`.
-
-   Both arrays have the same number of elements `nelems`.
- `src_cp` and `dst_cp` represent char pointers to the start of respective
- arrays. Kernel is submitted to sycl queue `q` with events `depends` as
- dependencies.
-
-   @param  q       Sycl queue to which the kernel is submitted.
-   @param  nelems  Number of elements to cast and copy.
-   @param  src_p   Kernel accessible USM pointer for the source array
-   @param  dst_p   Kernel accessible USM pointer for the destination array
-   @param  depends  List of events to wait for before starting computations, if
- any.
-
-   @return  Event to wait on to ensure that computation completes.
-   @ingroup CopyAndCastKernels
- */
-template <typename dstTy, typename srcTy>
-sycl::event copy_and_cast_contig_impl(sycl::queue &q,
-                                      std::size_t nelems,
-                                      const char *src_cp,
-                                      char *dst_cp,
-                                      const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
-
-    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_cp);
-        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_cp);
-
-        std::size_t lws = 64;
-        static constexpr std::uint32_t vec_sz = 4;
-        static constexpr std::uint32_t n_vecs = 2;
-        const std::size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
-
-        if (is_aligned<required_alignment>(src_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName =
-                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
-                                  n_vecs, enable_sg_loadstore>(nelems, src_tp,
-                                                               dst_tp));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName =
-                copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
-                                  n_vecs, disable_sg_loadstore>(nelems, src_tp,
-                                                                dst_tp));
-        }
-    });
-
-    return copy_and_cast_ev;
-}
-
-/*!
- * @brief Factory to get specialized function pointer for casting and copying
- * contiguous arrays.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S> struct CopyAndCastContigFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_contig_impl<D, S>;
-        return f;
-    }
-};
-
-// Specialization of copy_and_cast for 1D arrays
-
-/*!
- * @brief Factory to get function pointer for casting and copying 1D arrays.
- * @ingroup CopyAndCastKernels
- */
-typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const std::array<ssize_t, 1> &,
-    const std::array<ssize_t, 1> &,
-    const std::array<ssize_t, 1> &,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Factory to get function pointer for casting and copying 2D arrays.
- * @ingroup CopyAndCastKernels
- */
-typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const std::array<ssize_t, 2> &,
-    const std::array<ssize_t, 2> &,
-    const std::array<ssize_t, 2> &,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Specialized for given array dimension function to copy `nelems`
- elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy`
- to `dstTy`.
-
-   Both arrays have array dimensionality known at compile time and specified in
- template parameters `nd`. Arrays' shape and strides are provided as
- `std::array`. `src_p` and `dst_p` represent pointers into respective arrays,
- but the start of iteration begins at offset of `src_offset` elements for `src`
- array and at offset `dst_offset` elements for `dst` array. Kernel is submitted
- to sycl queue `q` with events `depends` as dependencies.
-
-   @param q  The queue where the routine should be executed.
-   @param nelems  Number of elements to cast and copy.
-   @param shape   Common shape of the arrays.
-   @param src_strides Strides of the source array.
-   @param dst_strides Strides of the destination array.
-   @param src_p  Kernel accessible USM pointer for the source array
-   @param src_offset  Offset to the beginning of iteration in number of elements
- of the source array from `src_p`.
-   @param dst_p  Kernel accessible USM pointer for the destination array
-   @param dst_offset  Offset to the beginning of iteration in number of elements
- of the destination array from `src_p`.
-   @param depends  List of events to wait for before starting computations, if
- any.
-
-   @return  Event to wait on to ensure that computation completes.
- * @ingroup CopyAndCastKernels
- */
-template <typename dstTy, typename srcTy, int nd>
-sycl::event
-copy_and_cast_nd_specialized_impl(sycl::queue &q,
-                                  std::size_t nelems,
-                                  const std::array<ssize_t, nd> &shape,
-                                  const std::array<ssize_t, nd> &src_strides,
-                                  const std::array<ssize_t, nd> &dst_strides,
-                                  const char *src_p,
-                                  ssize_t src_offset,
-                                  char *dst_p,
-                                  ssize_t dst_offset,
-                                  const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
-
-    sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) {
-        using IndexerT = TwoOffsets_FixedDimStridedIndexer<nd>;
-        const IndexerT indexer{shape, src_strides, dst_strides, src_offset,
-                               dst_offset};
-        const srcTy *src_tp = reinterpret_cast<const srcTy *>(src_p);
-        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
-
-        cgh.depends_on(depends);
-        cgh.parallel_for<
-            class copy_cast_generic_kernel<srcTy, dstTy, IndexerT>>(
-            sycl::range<1>(nelems),
-            GenericCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, IndexerT>(
-                src_tp, dst_tp, indexer));
-    });
-
-    return copy_and_cast_ev;
-}
-
-/*!
- * @brief Factory to get 1D-specialized function pointer of type `fnT` for given
- * source data type `S` and destination data type `D`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S> struct CopyAndCast1DFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_nd_specialized_impl<D, S, 1>;
-        return f;
-    }
-};
-
-/*!
- * @brief Factory to get 2D-specialized function pointer of type `fnT` for given
- * source data type `S` and destination data type `D`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S> struct CopyAndCast2DFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_nd_specialized_impl<D, S, 2>;
-        return f;
-    }
-};
-
-// ====================== Copying from host to USM
-
-template <typename AccessorT,
-          typename dstTy,
-          typename CastFnT,
-          typename IndexerT>
-class GenericCopyFromHostFunctor
-{
-private:
-    AccessorT src_acc_;
-    dstTy *dst_ = nullptr;
-    IndexerT indexer_;
-
-public:
-    GenericCopyFromHostFunctor(const AccessorT &src_acc,
-                               dstTy *dst_p,
-                               const IndexerT &indexer)
-        : src_acc_(src_acc), dst_(dst_p), indexer_(indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        const auto &offsets = indexer_(static_cast<ssize_t>(wiid.get(0)));
-        const ssize_t &src_offset = offsets.get_first_offset();
-        const ssize_t &dst_offset = offsets.get_second_offset();
-
-        CastFnT fn{};
-        dst_[dst_offset] = fn(src_acc_[src_offset]);
-    }
-};
-
-typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
- * into usm_ndarray with elements of type `srcTy`.
- *
- * Function to cast and copy elements from numpy.ndarray specified by typeless
- * `host_src_p` and the `src_offset` given in the number of array elements.
- * Arrays' metadata are given in packed USM vector of length `3*nd` whose first
- * `nd` elements contain arrays' shape, next `nd` elements specify source
- * strides in elements (not bytes), and trailing `nd` elements specify
- * destination array strides. Kernel dependencies are given by two vectors of
- * events: `depends` and `additional_depends`. The function execution is
- * complete at the return.
- *
- * @param q  The queue where the routine should be executed.
- * @param nelems Number of elements to cast and copy.
- * @param nd The dimensionality of arrays
- * @param shape_and_strides  Kernel accessible USM pointer to packed shape and
- * strides.
- * @param host_src_p  Host (not USM allocated) pointer associated with the
- * source array.
- * @param src_offset  Offset to the beginning of iteration in number of elements
- * of the source array from `host_src_p`.
- * @param src_min_nelem_offset  Smallest value of offset relative to
- * `host_src_p` in number of elements attained while iterating over elements of
- * the source array.
- * @param src_max_nelem_offset  Largest value of offset relative to `host_src_p`
- * in number of elements attained while iterating over elements of the source
- * array.
- * @param dst_p  USM pointer associated with the destination array.
- * @param dst_offset  Offset to the beginning of iteration in number of elements
- * of the destination array from `dst_p`.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- * @param additional_depends List of additional events to wait for before
- * starting computations, if any.
- *
- * @ingroup CopyAndCastKernels
- */
-template <typename dstTy, typename srcTy>
-void copy_and_cast_from_host_impl(
-    sycl::queue &q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *host_src_p,
-    ssize_t src_offset,
-    ssize_t src_min_nelem_offset,
-    ssize_t src_max_nelem_offset,
-    char *dst_p,
-    ssize_t dst_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1;
-
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
-
-    sycl::buffer<srcTy, 1> npy_buf(
-        reinterpret_cast<const srcTy *>(host_src_p) + src_min_nelem_offset,
-        sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}});
-
-    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
-
-        const TwoOffsets_StridedIndexer indexer{
-            nd, src_offset - src_min_nelem_offset, dst_offset,
-            const_cast<const ssize_t *>(shape_and_strides)};
-
-        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p);
-
-        cgh.parallel_for<copy_cast_from_host_kernel<srcTy, dstTy,
-                                                    TwoOffsets_StridedIndexer>>(
-            sycl::range<1>(nelems),
-            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
-                                       Caster<srcTy, dstTy>,
-                                       TwoOffsets_StridedIndexer>(
-                npy_acc, dst_tp, indexer));
-    });
-
-    // perform explicit synchronization. Implicit synchronization would be
-    // performed by sycl::buffer destructor.
-    copy_and_cast_from_host_ev.wait();
-
-    return;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given NumPy array
- * source data type `S` and destination data type `D`.
- * @defgroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S>
-struct CopyAndCastFromHostFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_from_host_impl<D, S>;
-        return f;
-    }
-};
-
-typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,  /* nelems */
-    const char *, /* src_pointer */
-    ssize_t,      /* src_offset */
-    char *,       /* dst_pointer */
-    ssize_t,      /* dst_offset */
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy from NumPy's ndarray with elements of type `srcTy`
- * into usm_ndarray with elements of type `srcTy` for contiguous arrays.
- *
- * Function to cast and copy elements from numpy.ndarray specified by typeless
- * `host_src_p` and the `src_offset` given in the number of array elements.
- * Kernel dependencies are given by two vectors of
- * events: `depends` and `additional_depends`. The function execution is
- * complete at the return.
- *
- * @param q  The queue where the routine should be executed.
- * @param nelems Number of elements to cast and copy.
- * @param src_stride The stride of source array in elements
- * @param dst_stride The stride of destimation array in elements
- * @param host_src_p  Host (not USM allocated) pointer associated with the
- * source array.
- * @param src_offset  Offset to the beginning of iteration in number of elements
- * of the source array from `host_src_p`.
- * @param dst_p  USM pointer associated with the destination array.
- * @param dst_offset  Offset to the beginning of iteration in number of elements
- * of the destination array from `dst_p`.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @ingroup CopyAndCastKernels
- */
-template <typename dstTy, typename srcTy>
-void copy_and_cast_from_host_contig_impl(
-    sycl::queue &q,
-    std::size_t nelems,
-    const char *host_src_p,
-    ssize_t src_offset,
-    char *dst_p,
-    ssize_t dst_offset,
-    const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<dstTy>(q);
-    dpctl::tensor::type_utils::validate_type_for_device<srcTy>(q);
-
-    sycl::buffer<srcTy, 1> npy_buf(
-        reinterpret_cast<const srcTy *>(host_src_p) + src_offset,
-        sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}});
-
-    sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only);
-
-        using IndexerT = TwoOffsets_CombinedIndexer<NoOpIndexer, NoOpIndexer>;
-        static constexpr NoOpIndexer src_indexer{};
-        static constexpr NoOpIndexer dst_indexer{};
-        static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer,
-                                                            dst_indexer};
-
-        dstTy *dst_tp = reinterpret_cast<dstTy *>(dst_p) + dst_offset;
-
-        cgh.parallel_for<
-            copy_cast_from_host_contig_kernel<srcTy, dstTy, IndexerT>>(
-            sycl::range<1>(nelems),
-            GenericCopyFromHostFunctor<decltype(npy_acc), dstTy,
-                                       Caster<srcTy, dstTy>, IndexerT>(
-                npy_acc, dst_tp, indexer));
-    });
-
-    // perform explicit synchronization. Implicit synchronization would be
-    // performed by sycl::buffer destructor.
-    copy_and_cast_from_host_ev.wait();
-
-    return;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given NumPy array
- * source data type `S` and destination data type `D`.
- * @defgroup CopyAndCastKernels
- */
-template <typename fnT, typename D, typename S>
-struct CopyAndCastFromHostContigFactory
-{
-    fnT get()
-    {
-        fnT f = copy_and_cast_from_host_contig_impl<D, S>;
-        return f;
-    }
-};
-
-// =============== Copying for reshape ================== //
-
-template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
-class copy_for_reshape_generic_kernel;
-
-template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
-class GenericCopyForReshapeFunctor
-{
-private:
-    const Ty *src_p = nullptr;
-    Ty *dst_p = nullptr;
-    SrcIndexerT src_indexer_;
-    DstIndexerT dst_indexer_;
-
-public:
-    GenericCopyForReshapeFunctor(const char *src_ptr,
-                                 char *dst_ptr,
-                                 const SrcIndexerT &src_indexer,
-                                 const DstIndexerT &dst_indexer)
-        : src_p(reinterpret_cast<const Ty *>(src_ptr)),
-          dst_p(reinterpret_cast<Ty *>(dst_ptr)), src_indexer_(src_indexer),
-          dst_indexer_(dst_indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        const ssize_t src_offset = src_indexer_(wiid.get(0));
-        const ssize_t dst_offset = dst_indexer_(wiid.get(0));
-
-        dst_p[dst_offset] = src_p[src_offset];
-    }
-};
-
-// define function type
-typedef sycl::event (*copy_for_reshape_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,     // num_elements
-    int,             // src_nd
-    int,             // dst_nd
-    const ssize_t *, // packed shapes and strides
-    const char *,    // src_data_ptr
-    char *,          // dst_data_ptr
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy content of array while reshaping.
- *
- * Submits a kernel to perform a copy `dst[unravel_index(i,
- * dst.shape)] = src[unravel_undex(i, src.shape)]`.
- *
- * @param  q      The execution queue where kernel is submitted.
- * @param  nelems The number of elements to copy
- * @param  src_nd Array dimension of the source array
- * @param  dst_nd Array dimension of the destination array
- * @param  packed_shapes_and_strides Kernel accessible USM array of size
- * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape,
- * dst_strides]`.
- * @param  src_p  Typeless USM pointer to the buffer of the source array
- * @param  dst_p  Typeless USM pointer to the buffer of the destination array
- * @param  depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @ingroup CopyAndCastKernels
- */
-template <typename Ty>
-sycl::event
-copy_for_reshape_generic_impl(sycl::queue &q,
-                              std::size_t nelems,
-                              int src_nd,
-                              int dst_nd,
-                              const ssize_t *packed_shapes_and_strides,
-                              const char *src_p,
-                              char *dst_p,
-                              const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        // packed_shapes_and_strides:
-        //   USM array of size 2*(src_nd + dst_nd)
-        //   [ src_shape; src_strides; dst_shape; dst_strides ]
-
-        const ssize_t *src_shape_and_strides =
-            const_cast<const ssize_t *>(packed_shapes_and_strides);
-
-        const ssize_t *dst_shape_and_strides = const_cast<const ssize_t *>(
-            packed_shapes_and_strides + (2 * src_nd));
-
-        const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides};
-        const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides};
-
-        using KernelName =
-            copy_for_reshape_generic_kernel<Ty, StridedIndexer, StridedIndexer>;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(nelems),
-            GenericCopyForReshapeFunctor<Ty, StridedIndexer, StridedIndexer>(
-                src_p, dst_p, src_indexer, dst_indexer));
-    });
-
-    return copy_for_reshape_ev;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given array data
- * type `Ty`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename Ty> struct CopyForReshapeGenericFactory
-{
-    fnT get()
-    {
-        fnT f = copy_for_reshape_generic_impl<Ty>;
-        return f;
-    }
-};
-
-// ================== Copying for roll ================== //
-
-/*! @brief Functor to cyclically roll global_id to the left */
-struct LeftRolled1DTransformer
-{
-    LeftRolled1DTransformer(std::size_t offset, std::size_t size)
-        : offset_(offset), size_(size)
-    {
-    }
-
-    std::size_t operator()(std::size_t gid) const
-    {
-        const std::size_t shifted_gid =
-            ((gid < offset_) ? gid + size_ - offset_ : gid - offset_);
-        return shifted_gid;
-    }
-
-private:
-    std::size_t offset_ = 0;
-    std::size_t size_ = 1;
-};
-
-/*! @brief Indexer functor to compose indexer and transformer */
-template <typename IndexerT, typename TransformerT> struct CompositionIndexer
-{
-    CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {}
-
-    auto operator()(std::size_t gid) const { return f_(t_(gid)); }
-
-private:
-    IndexerT f_;
-    TransformerT t_;
-};
-
-/*! @brief Indexer functor to find offset for nd-shifted indices lifted from
- * iteration id */
-struct RolledNDIndexer
-{
-    RolledNDIndexer(int nd,
-                    const ssize_t *shape,
-                    const ssize_t *strides,
-                    const ssize_t *ndshifts,
-                    ssize_t starting_offset)
-        : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts),
-          starting_offset_(starting_offset)
-    {
-    }
-
-    ssize_t operator()(std::size_t gid) const { return compute_offset(gid); }
-
-private:
-    int nd_ = -1;
-    const ssize_t *shape_ = nullptr;
-    const ssize_t *strides_ = nullptr;
-    const ssize_t *ndshifts_ = nullptr;
-    ssize_t starting_offset_ = 0;
-
-    ssize_t compute_offset(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd_);
-        ssize_t relative_offset_(0);
-        _ind.get_left_rolled_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape_,    // shape ptr
-            strides_,  // strides ptr
-            ndshifts_, // shifts ptr
-            relative_offset_);
-        return starting_offset_ + relative_offset_;
-    }
-};
-
-template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
-class copy_for_roll_strided_kernel;
-
-template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
-class StridedCopyForRollFunctor
-{
-private:
-    const Ty *src_p = nullptr;
-    Ty *dst_p = nullptr;
-    SrcIndexerT src_indexer_;
-    DstIndexerT dst_indexer_;
-
-public:
-    StridedCopyForRollFunctor(const Ty *src_ptr,
-                              Ty *dst_ptr,
-                              const SrcIndexerT &src_indexer,
-                              const DstIndexerT &dst_indexer)
-        : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer),
-          dst_indexer_(dst_indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wiid) const
-    {
-        const std::size_t gid = wiid.get(0);
-
-        const ssize_t src_offset = src_indexer_(gid);
-        const ssize_t dst_offset = dst_indexer_(gid);
-
-        dst_p[dst_offset] = src_p[src_offset];
-    }
-};
-
-// define function type
-typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,     // shift
-    std::size_t,     // num_elements
-    int,             // common_nd
-    const ssize_t *, // packed shapes and strides
-    const char *,    // src_data_ptr
-    ssize_t,         // src_offset
-    char *,          // dst_data_ptr
-    ssize_t,         // dst_offset
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to copy content of array with a shift.
- *
- * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems ,
- * dst.shape)] = src[unravel_undex(i, src.shape)]`.
- *
- * @param  q      The execution queue where kernel is submitted.
- * @param  shift  The shift in flat indexing, must be non-negative.
- * @param  nelems The number of elements to copy
- * @param  nd     Array dimensionality of the destination and source arrays
- * @param  packed_shapes_and_strides Kernel accessible USM array
- * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`.
- * @param  src_p  Typeless USM pointer to the buffer of the source array
- * @param  src_offset Displacement of first element of src relative src_p in
- * elements
- * @param  dst_p  Typeless USM pointer to the buffer of the destination array
- * @param  dst_offset Displacement of first element of dst relative dst_p in
- * elements
- * @param  depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @ingroup CopyAndCastKernels
- */
-template <typename Ty>
-sycl::event copy_for_roll_strided_impl(sycl::queue &q,
-                                       std::size_t shift,
-                                       std::size_t nelems,
-                                       int nd,
-                                       const ssize_t *packed_shapes_and_strides,
-                                       const char *src_p,
-                                       ssize_t src_offset,
-                                       char *dst_p,
-                                       ssize_t dst_offset,
-                                       const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        // packed_shapes_and_strides:
-        //   USM array of size 3 * nd
-        //   [ common_shape; src_strides; dst_strides ]
-
-        const StridedIndexer src_indexer{nd, src_offset,
-                                         packed_shapes_and_strides};
-        const LeftRolled1DTransformer left_roll_transformer{shift, nelems};
-
-        using CompositeIndexerT =
-            CompositionIndexer<StridedIndexer, LeftRolled1DTransformer>;
-
-        const CompositeIndexerT rolled_src_indexer(src_indexer,
-                                                   left_roll_transformer);
-
-        UnpackedStridedIndexer dst_indexer{nd, dst_offset,
-                                           packed_shapes_and_strides,
-                                           packed_shapes_and_strides + 2 * nd};
-
-        using KernelName = copy_for_roll_strided_kernel<Ty, CompositeIndexerT,
-                                                        UnpackedStridedIndexer>;
-
-        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
-        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(nelems),
-            StridedCopyForRollFunctor<Ty, CompositeIndexerT,
-                                      UnpackedStridedIndexer>(
-                src_tp, dst_tp, rolled_src_indexer, dst_indexer));
-    });
-
-    return copy_for_roll_ev;
-}
-
-// define function type
-typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,  // shift
-    std::size_t,  // num_elements
-    const char *, // src_data_ptr
-    ssize_t,      // src_offset
-    char *,       // dst_data_ptr
-    ssize_t,      // dst_offset
-    const std::vector<sycl::event> &);
-
-template <typename Ty> class copy_for_roll_contig_kernel;
-
-/*!
- * @brief Function to copy content of array with a shift.
- *
- * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems ,
- * dst.shape)] = src[unravel_undex(i, src.shape)]`.
- *
- * @param  q      The execution queue where kernel is submitted.
- * @param  shift  The shift in flat indexing, must be non-negative.
- * @param  nelems The number of elements to copy
- * @param  src_p  Typeless USM pointer to the buffer of the source array
- * @param  src_offset Displacement of the start of array src relative src_p in
- * elements
- * @param  dst_p  Typeless USM pointer to the buffer of the destination array
- * @param  dst_offset Displacement of the start of array dst relative dst_p in
- * elements
- * @param  depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @ingroup CopyAndCastKernels
- */
-template <typename Ty>
-sycl::event copy_for_roll_contig_impl(sycl::queue &q,
-                                      std::size_t shift,
-                                      std::size_t nelems,
-                                      const char *src_p,
-                                      ssize_t src_offset,
-                                      char *dst_p,
-                                      ssize_t dst_offset,
-                                      const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        static constexpr NoOpIndexer src_indexer{};
-        const LeftRolled1DTransformer roller{shift, nelems};
-
-        const CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>
-            left_rolled_src_indexer{src_indexer, roller};
-        static constexpr NoOpIndexer dst_indexer{};
-
-        using KernelName = copy_for_roll_contig_kernel<Ty>;
-
-        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p) + src_offset;
-        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p) + dst_offset;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(nelems),
-            StridedCopyForRollFunctor<
-                Ty, CompositionIndexer<NoOpIndexer, LeftRolled1DTransformer>,
-                NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer,
-                             dst_indexer));
-    });
-
-    return copy_for_roll_ev;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given array data
- * type `Ty`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename Ty> struct CopyForRollStridedFactory
-{
-    fnT get()
-    {
-        fnT f = copy_for_roll_strided_impl<Ty>;
-        return f;
-    }
-};
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given array data
- * type `Ty`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename Ty> struct CopyForRollContigFactory
-{
-    fnT get()
-    {
-        fnT f = copy_for_roll_contig_impl<Ty>;
-        return f;
-    }
-};
-
-template <typename Ty, typename SrcIndexerT, typename DstIndexerT>
-class copy_for_roll_ndshift_strided_kernel;
-
-// define function type
-typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,     // num_elements
-    int,             // common_nd
-    const ssize_t *, // packed shape, strides, shifts
-    const char *,    // src_data_ptr
-    ssize_t,         // src_offset
-    char *,          // dst_data_ptr
-    ssize_t,         // dst_offset
-    const std::vector<sycl::event> &);
-
-template <typename Ty>
-sycl::event copy_for_roll_ndshift_strided_impl(
-    sycl::queue &q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *packed_shapes_and_strides_and_shifts,
-    const char *src_p,
-    ssize_t src_offset,
-    char *dst_p,
-    ssize_t dst_offset,
-    const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        // packed_shapes_and_strides_and_shifts:
-        //   USM array of size 4 * nd
-        //   [ common_shape; src_strides; dst_strides; shifts ]
-
-        const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts;
-        const ssize_t *src_strides_ptr =
-            packed_shapes_and_strides_and_shifts + nd;
-        const ssize_t *dst_strides_ptr =
-            packed_shapes_and_strides_and_shifts + 2 * nd;
-        const ssize_t *shifts_ptr =
-            packed_shapes_and_strides_and_shifts + 3 * nd;
-
-        const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr,
-                                          shifts_ptr, src_offset};
-
-        const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr,
-                                                 dst_strides_ptr};
-
-        using KernelName = copy_for_roll_strided_kernel<Ty, RolledNDIndexer,
-                                                        UnpackedStridedIndexer>;
-
-        const Ty *src_tp = reinterpret_cast<const Ty *>(src_p);
-        Ty *dst_tp = reinterpret_cast<Ty *>(dst_p);
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(nelems),
-            StridedCopyForRollFunctor<Ty, RolledNDIndexer,
-                                      UnpackedStridedIndexer>(
-                src_tp, dst_tp, src_indexer, dst_indexer));
-    });
-
-    return copy_for_roll_ev;
-}
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for given array data
- * type `Ty`.
- * @ingroup CopyAndCastKernels
- */
-template <typename fnT, typename Ty> struct CopyForRollNDShiftFactory
-{
-    fnT get()
-    {
-        fnT f = copy_for_roll_ndshift_strided_impl<Ty>;
-        return f;
-    }
-};
-
-} // namespace copy_and_cast
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
deleted file mode 100644
index a27a8c78d3..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
+++ /dev/null
@@ -1,639 +0,0 @@
-//=== copy_ascontig.hpp - Implementation of copy-and-cast kernels *-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor copying and value casting.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "kernels/alignment.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace copy_as_contig
-{
-
-using dpctl::tensor::ssize_t;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-template <typename T,
-          typename IndexerT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-class CopyAsCContigFunctor
-{
-private:
-    std::size_t nelems;
-    const T *src_p = nullptr;
-    T *dst_p = nullptr;
-    IndexerT src_indexer;
-
-public:
-    CopyAsCContigFunctor(std::size_t n,
-                         const T *src_,
-                         T *dst_,
-                         const IndexerT &src_indexer_)
-        : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static_assert(vec_sz > 0);
-        static_assert(n_vecs > 0);
-
-        static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
-
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!enable_sg_loadstore || is_complex<T>::value) {
-            const std::uint16_t sgSize =
-                ndit.get_sub_group().get_max_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-
-            // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
-            // gid % sgSize == gid - (gid / sgSize) * sgSize
-            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
-            const std::size_t start =
-                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems, start + elems_per_sg);
-
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                auto src_offset = src_indexer(offset);
-                dst_p[offset] = src_p[src_offset];
-            }
-        }
-        else {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-            const std::uint16_t elems_per_sg = elems_per_wi * sgSize;
-
-            if (base + elems_per_sg < nelems) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    // it == vec_id * vec_sz, for  0 <= vec_id < n_vecs
-                    const std::size_t block_start_id = base + it * sgSize;
-                    auto dst_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&dst_p[block_start_id]);
-
-                    const std::size_t elem_id0 =
-                        block_start_id + sg.get_local_id();
-                    sycl::vec<T, vec_sz> dst_vec;
-#pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
-                        const std::size_t elem_id = elem_id0 + k * sgSize;
-                        const ssize_t src_offset = src_indexer(elem_id);
-                        dst_vec[k] = src_p[src_offset];
-                    }
-                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                const std::size_t k0 = base + lane_id;
-                for (std::size_t k = k0; k < nelems; k += sgSize) {
-                    const ssize_t src_offset = src_indexer(k);
-                    dst_p[k] = src_p[src_offset];
-                }
-            }
-        }
-    }
-};
-
-template <typename T,
-          typename IndexerT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs,
-          bool enable_sg_load,
-          typename KernelName>
-sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
-                                     std::size_t nelems,
-                                     const T *src,
-                                     T *dst,
-                                     const IndexerT &src_indexer,
-                                     const std::vector<sycl::event> &depends)
-{
-    static_assert(vec_sz > 0);
-    static_assert(n_vecs > 0);
-
-    static constexpr std::size_t preferred_lws = 256;
-
-    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
-
-    auto const &ctx = exec_q.get_context();
-    auto const &dev = exec_q.get_device();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t max_sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-    const std::size_t lws =
-        ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size;
-
-    static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
-
-    const std::size_t nelems_per_group = nelems_per_wi * lws;
-    const std::size_t n_groups =
-        (nelems + nelems_per_group - 1) / (nelems_per_group);
-
-    sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.use_kernel_bundle(kb);
-
-        const sycl::range<1> gRange{n_groups * lws};
-        const sycl::range<1> lRange{lws};
-
-        cgh.parallel_for<KernelName>(
-            sycl::nd_range<1>(gRange, lRange),
-            CopyAsCContigFunctor<T, IndexerT, vec_sz, n_vecs, enable_sg_load>(
-                nelems, src, dst, src_indexer));
-    });
-    return copy_ev;
-}
-
-template <typename T,
-          typename IndexT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs,
-          bool enable_sgload>
-class as_contig_krn;
-
-template <typename T>
-sycl::event
-as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
-                                   std::size_t nelems,
-                                   int nd,
-                                   const ssize_t *shape_and_strides,
-                                   const char *src_p,
-                                   char *dst_p,
-                                   const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
-
-    const T *src_tp = reinterpret_cast<const T *>(src_p);
-    T *dst_tp = reinterpret_cast<T *>(dst_p);
-
-    using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides);
-
-    static constexpr std::uint8_t vec_sz = 4u;
-    static constexpr std::uint8_t n_vecs = 2u;
-
-    using dpctl::tensor::kernels::alignment_utils::
-        disabled_sg_loadstore_wrapper_krn;
-    using dpctl::tensor::kernels::alignment_utils::is_aligned;
-    using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-    sycl::event copy_ev;
-    if (is_aligned<required_alignment>(dst_p)) {
-        static constexpr bool enable_sg_load = true;
-        using KernelName =
-            as_contig_krn<T, IndexerT, vec_sz, n_vecs, enable_sg_load>;
-        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
-                                           enable_sg_load, KernelName>(
-            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
-    }
-    else {
-        static constexpr bool disable_sg_load = false;
-        using InnerKernelName =
-            as_contig_krn<T, IndexerT, vec_sz, n_vecs, disable_sg_load>;
-        using KernelName = disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
-        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
-                                           disable_sg_load, KernelName>(
-            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
-    }
-
-    return copy_ev;
-}
-
-typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    char *,
-    const std::vector<sycl::event> &);
-
-template <typename fnT, typename T> struct AsCContigFactory
-{
-    fnT get() { return as_c_contiguous_array_generic_impl<T>; }
-};
-
-template <typename T,
-          typename IndexerT,
-          std::uint16_t tile_size,
-          std::uint16_t n_lines>
-class as_contig_batch_of_square_matrices_krn;
-
-namespace detail
-{
-/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination
-   strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks
-   to avoid race condition
- */
-template <typename T, typename BatchIndexerT>
-sycl::event as_c_contiguous_batch_of_square_matrices_impl(
-    sycl::queue &exec_q,
-    std::size_t batch_nelems,
-    const BatchIndexerT &batch_two_offsets_indexer,
-    std::size_t n,
-    const char *src_p,
-    ssize_t src_ld,
-    char *dst_p,
-    ssize_t dst_ld,
-    const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
-
-    const T *src_tp = reinterpret_cast<const T *>(src_p);
-    T *dst_tp = reinterpret_cast<T *>(dst_p);
-
-    static constexpr std::uint16_t private_tile_size = 4;
-    static constexpr std::uint16_t n_lines = 2;
-    static constexpr std::uint16_t block_size =
-        n_lines * private_tile_size * private_tile_size;
-
-    static constexpr std::uint16_t lws0 = block_size;
-    static constexpr std::uint16_t lws1 = n_lines;
-    static constexpr std::uint16_t nelems_per_wi = (block_size / lws1);
-
-    static_assert(nelems_per_wi * lws1 == block_size);
-    static_assert(nelems_per_wi == private_tile_size * private_tile_size);
-
-    static constexpr std::uint32_t lws = lws0 * lws1;
-
-    const std::size_t n_tiles = (n + block_size - 1) / block_size;
-
-    const ssize_t src_stride = src_ld;
-    const ssize_t dst_stride = dst_ld;
-
-    sycl::range<1> lRange{lws};
-    sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws};
-
-    sycl::nd_range<1> ndRange{gRange, lRange};
-
-    using KernelName =
-        as_contig_batch_of_square_matrices_krn<T, BatchIndexerT,
-                                               private_tile_size, lws1>;
-
-    sycl::event e = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        sycl::local_accessor<T, 1> local_block(block_size * block_size, cgh);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> nd_it) {
-            // 1. Read block from source array into SLM
-            const std::uint32_t lid_lin = nd_it.get_local_linear_id();
-            const std::size_t gr_id_lin = nd_it.get_group_linear_id();
-
-            const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles);
-            const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles);
-
-            const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id);
-            const auto &src_batch_offset = batch_two_offsets.get_first_offset();
-            const auto &dst_batch_offset =
-                batch_two_offsets.get_second_offset();
-
-            // Block id
-            /* 0 <= src_gr_i1 < n_groups_n1 */
-            const std::size_t src_tile_i1 = rem / n_tiles;
-            /* 0 <= src_gr_i0 < n_groups_n0 */
-            const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles;
-
-            // ID of element within the block
-            /* 0 <= src_i1 < lws1 */
-            const std::uint32_t src_i1 = lid_lin / lws0;
-            /* 0 <= src_i0 < lws0 */
-            const std::uint32_t src_i0 = lid_lin - src_i1 * lws0;
-
-            // Matrix element ID
-            const std::size_t src_tile_start0 = src_tile_i0 * block_size;
-            const std::size_t src_tile_start1 = src_tile_i1 * block_size;
-            const std::size_t src_gid0 = (src_tile_start0 + src_i0);
-            const std::size_t src_gid1 = (src_tile_start1 + src_i1);
-
-            // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) *
-            // src_stride
-            const std::size_t src_offset0 =
-                src_batch_offset + src_gid0 * 1 + src_gid1 * src_stride;
-            const std::size_t pr_step_src = lws1 * src_stride;
-
-            const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size;
-            const std::uint32_t pr_step_local = lws1 * block_size;
-
-            for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
-                local_block[local_offset0 + pr_step_local * pr_id] =
-                    (src_gid0 < n && src_gid1 + pr_id * lws1 < n)
-                        ? src_tp[src_offset0 + pr_step_src * pr_id]
-                        : T(0);
-            }
-
-            const std::uint32_t local_dim0 = static_cast<std::uint32_t>(
-                std::min<std::size_t>(src_tile_start0 + block_size, n) -
-                src_tile_start0);
-            const std::uint32_t local_dim1 = static_cast<std::uint32_t>(
-                std::min<std::size_t>(src_tile_start1 + block_size, n) -
-                src_tile_start1);
-
-            sycl::group_barrier(nd_it.get_group(),
-                                sycl::memory_scope::work_group);
-
-            // 2. Permute the block matrix in SLM using two private arrays
-            std::array<T, nelems_per_wi> private_block_01 = {T(0)};
-            std::array<T, nelems_per_wi> private_block_10 = {T(0)};
-
-            // 0 <= lid_lin < lws0 * lws1 ==
-            //       (block_size * block_size / nelems_per_wi) ==
-            //       (block_size/private_tile_size)**2
-            static constexpr std::uint16_t n_private_tiles_per_axis =
-                block_size / private_tile_size;
-            const std::uint16_t local_tile_id0 =
-                lid_lin / n_private_tiles_per_axis;
-            const std::uint16_t local_tile_id1 =
-                lid_lin - local_tile_id0 * n_private_tiles_per_axis;
-
-            if (local_tile_id0 <= local_tile_id1) {
-                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
-                     ++pr_i0)
-                {
-                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
-                         ++pr_i1)
-                    {
-                        const std::uint16_t t0_offset =
-                            local_tile_id0 * private_tile_size;
-                        const std::uint16_t t1_offset =
-                            local_tile_id1 * private_tile_size;
-
-                        const std::uint16_t pr_offset =
-                            pr_i1 * private_tile_size + pr_i0;
-                        const std::uint16_t rel_offset =
-                            pr_i0 + pr_i1 * block_size;
-
-                        // read (local_tile_id0, local_tile_id1)
-                        const std::uint16_t local_01_offset =
-                            (t0_offset + t1_offset * block_size) + rel_offset;
-                        private_block_01[pr_offset] =
-                            local_block[local_01_offset];
-
-                        // read (local_tile_id1, local_tile_id0)
-                        const std::uint16_t local_10_offset =
-                            (t1_offset + t0_offset * block_size) + rel_offset;
-                        private_block_10[pr_offset] =
-                            local_block[local_10_offset];
-                    }
-                }
-            }
-
-            sycl::group_barrier(nd_it.get_group(),
-                                sycl::memory_scope::work_group);
-
-            if (local_tile_id0 <= local_tile_id1) {
-                for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size;
-                     ++pr_i0)
-                {
-                    for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size;
-                         ++pr_i1)
-                    {
-                        const std::uint16_t t0_offset =
-                            local_tile_id0 * private_tile_size;
-                        const std::uint16_t t1_offset =
-                            local_tile_id1 * private_tile_size;
-                        const std::uint16_t pr_offset =
-                            pr_i0 * private_tile_size + pr_i1;
-
-                        const std::uint16_t rel_offset =
-                            pr_i0 + pr_i1 * block_size;
-
-                        // write back permuted private blocks
-                        const std::uint32_t local_01_offset =
-                            (t0_offset + t1_offset * block_size) + rel_offset;
-                        local_block[local_01_offset] =
-                            private_block_10[pr_offset];
-
-                        const std::uint16_t local_10_offset =
-                            (t1_offset + t0_offset * block_size) + rel_offset;
-                        local_block[local_10_offset] =
-                            private_block_01[pr_offset];
-                    }
-                }
-            }
-
-            sycl::group_barrier(nd_it.get_group(),
-                                sycl::memory_scope::work_group);
-
-            // 3. Write out permuted SLM to destination array
-
-            const std::size_t dst_tile_start0 = src_tile_start0;
-            const std::size_t dst_tile_start1 = src_tile_start1;
-
-            if (local_dim0 == block_size && local_dim1 == block_size) {
-                const std::uint16_t dst_i0 = src_i1;
-                const std::uint16_t dst_i1 = src_i0;
-
-                const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
-                const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
-
-                const std::size_t dst_offset0 =
-                    dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
-                const std::size_t pr_step_dst = lws1 * dst_stride;
-
-                const std::uint16_t _local_offset0 =
-                    dst_i0 * block_size + dst_i1;
-                const std::uint16_t _pr_step_local = lws1 * block_size;
-
-                for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) {
-                    if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) {
-                        dst_tp[dst_offset0 + pr_step_dst * pr_id] =
-                            local_block[_local_offset0 +
-                                        _pr_step_local * pr_id];
-                    }
-                }
-            }
-            else {
-                // map local_linear_id into (local_dim0, local_dim1)
-                for (std::uint16_t el_id = lid_lin;
-                     el_id < local_dim0 * local_dim1; el_id += lws0 * lws1)
-                {
-
-                    // 0 <= local_i0 < local_dim0
-                    const std::uint16_t loc_i0 = el_id / local_dim1;
-                    // 0 <= local_i1 < local_dim1
-                    const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1;
-
-                    const std::uint16_t dst_i0 = loc_i0;
-                    const std::uint16_t dst_i1 = loc_i1;
-
-                    const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0);
-                    const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1);
-
-                    const std::size_t dst_offset =
-                        dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1;
-                    const std::uint16_t local_offset =
-                        loc_i0 * block_size + loc_i1;
-
-                    if ((dst_gid1 < n) && (dst_gid0 < n)) {
-                        dst_tp[dst_offset] = local_block[local_offset];
-                    }
-                }
-            }
-        });
-    });
-
-    return e;
-}
-
-} // end of namespace detail
-
-template <typename T>
-sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl(
-    sycl::queue &exec_q,
-    std::size_t batch_nelems,
-    ssize_t src_batch_step,
-    ssize_t dst_batch_step,
-    std::size_t n,
-    const char *src_p,
-    ssize_t src_ld,
-    char *dst_p,
-    ssize_t dst_ld,
-    const std::vector<sycl::event> &depends)
-{
-    using dpctl::tensor::offset_utils::Strided1DIndexer;
-    using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer;
-    using BatchIndexerT =
-        TwoOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer>;
-
-    const auto &src_batch_indexer =
-        Strided1DIndexer(batch_nelems, src_batch_step);
-    const auto &dst_batch_indexer =
-        Strided1DIndexer(batch_nelems, dst_batch_step);
-
-    const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer};
-
-    return detail::as_c_contiguous_batch_of_square_matrices_impl<T,
-                                                                 BatchIndexerT>(
-        exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p,
-        dst_ld, depends);
-}
-
-typedef sycl::event (
-    *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)(
-    sycl::queue &, /* execution queue */
-    std::size_t,   /* number of batch elements */
-    ssize_t,       /* distance between batches in source array */
-    ssize_t,       /* distance between batches in destination array */
-    std::size_t,   /* size of square matrices in the batch */
-    const char *,
-    ssize_t, /* untyped pointer to F-contig source array, and matrix leading
-                dimension */
-    char *,
-    ssize_t, /* untyped pointer to C-contig destination array, and matrix
-                leading dimension */
-    const std::vector<sycl::event> &);
-
-template <typename fnT, typename T>
-struct AsCContig1DBatchOfSquareMatricesFactory
-{
-    fnT get() { return as_c_contiguous_1d_batch_of_square_matrices_impl<T>; }
-};
-
-template <typename T>
-sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl(
-    sycl::queue &exec_q,
-    std::size_t batch_nelems,
-    int batch_nd,
-    const ssize_t *src_batch_shape_strides,
-    const ssize_t dst_batch_step,
-    std::size_t n,
-    const char *src_p,
-    ssize_t src_ld,
-    char *dst_p,
-    ssize_t dst_ld,
-    const std::vector<sycl::event> &depends)
-{
-    using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-    using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer;
-    using BatchIndexerT = TwoOffsets_CombinedIndexer<SrcIndexerT, DstIndexerT>;
-
-    static constexpr ssize_t zero_offset{0};
-
-    const SrcIndexerT src_batch_indexer{batch_nd, zero_offset,
-                                        src_batch_shape_strides};
-    const DstIndexerT dst_batch_indexer{/* size */ batch_nelems,
-                                        /* step */ dst_batch_step};
-
-    const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer,
-                                                  dst_batch_indexer};
-
-    return detail::as_c_contiguous_batch_of_square_matrices_impl<T,
-                                                                 BatchIndexerT>(
-        exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld,
-        dst_p, dst_ld, depends);
-}
-
-typedef sycl::event (
-    *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)(
-    sycl::queue &, /* execution queue */
-    std::size_t,   /* number of matrices in the batch */
-    int,
-    const ssize_t *, /* dimensionality, and packed [shape, src_strides]
-                        describing iteration over batch in source array */
-    ssize_t,         /* distance between batches in destination array */
-    std::size_t,     /* matrix size */
-    const char *,
-    ssize_t, /* untyped pointer to source array of F-contig matrices, and
-                leading dimension of the matrix */
-    char *,
-    ssize_t, /* untyped pointer to destination array of F-contig matrices, and
-                leading dimension of the matrix */
-    const std::vector<sycl::event> &);
-
-template <typename fnT, typename T>
-struct AsCContigNDBatchOfSquareMatricesFactory
-{
-    fnT get() { return as_c_contiguous_nd_batch_of_square_matrices_impl<T>; }
-};
-
-} // namespace copy_as_contig
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
deleted file mode 100644
index 74280e59a2..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-
-namespace dpctl
-{
-namespace tensor
-{
-
-typedef std::ptrdiff_t ssize_t;
-
-}
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
deleted file mode 100644
index d600175cf6..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ /dev/null
@@ -1,232 +0,0 @@
-//=== abs.hpp -   Unary function ABS                     ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ABS(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "cabs_impl.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace abs
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::ssize_t;
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AbsFunctor
-{
-
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::false_type;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &x) const
-    {
-
-        if constexpr (std::is_same_v<argT, bool> ||
-                      (std::is_integral<argT>::value &&
-                       std::is_unsigned<argT>::value))
-        {
-            static_assert(std::is_same_v<resT, argT>);
-            return x;
-        }
-        else {
-            if constexpr (is_complex<argT>::value) {
-                return detail::cabs(x);
-            }
-            else if constexpr (std::is_same_v<argT, sycl::half> ||
-                               std::is_floating_point_v<argT>)
-            {
-                return (sycl::signbit(x) ? -x : x);
-            }
-            else {
-                return sycl::abs(x);
-            }
-        }
-    }
-};
-
-template <typename argT,
-          typename resT = argT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AbsContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           AbsFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename T> struct AbsOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool>,
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy> struct AbsContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class abs_contig_kernel;
-
-template <typename argTy>
-sycl::event abs_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using AbsHS = hyperparam_detail::AbsContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AbsHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AbsHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AbsContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AbsOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = abs_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AbsTypeMapFactory
-{
-    /*! @brief get typeid for output type of abs(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AbsOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AbsStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AbsFunctor<argTy, resTy>>;
-
-template <typename T1, typename T2, typename T3> class abs_strided_kernel;
-
-template <typename argTy>
-sycl::event abs_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AbsStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AbsOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = abs_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace abs
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
deleted file mode 100644
index c59a283ded..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ /dev/null
@@ -1,264 +0,0 @@
-//=== acos.hpp -   Unary function ACOS                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ACOS(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace acos
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AcosFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            if (std::isnan(x)) {
-                /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */
-                if (std::isinf(y)) {
-                    return resT{q_nan, -y};
-                }
-
-                /* all other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-            if (std::isnan(y)) {
-                /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
-                if (std::isinf(x)) {
-                    return resT{q_nan, -std::numeric_limits<realT>::infinity()};
-                }
-                /* acos(0 + I*NaN) = PI/2 + I*NaN with inexact */
-                if (x == realT(0)) {
-                    const realT res_re = sycl::atan(realT(1)) * 2; // PI/2
-                    return resT{res_re, q_nan};
-                }
-
-                /* all other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-
-            /*
-             * For large x or y including acos(+-Inf + I*+-Inf)
-             */
-            static constexpr realT r_eps =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
-                using sycl_complexT = exprm_ns::complex<realT>;
-                sycl_complexT log_in =
-                    exprm_ns::log(exprm_ns::complex<realT>(in));
-
-                const realT wx = log_in.real();
-                const realT wy = log_in.imag();
-                const realT rx = sycl::fabs(wy);
-
-                realT ry = wx + sycl::log(realT(2));
-                return resT{rx, (sycl::signbit(y)) ? ry : -ry};
-            }
-
-            /* ordinary cases */
-            return exprm_ns::acos(exprm_ns::complex<realT>(in)); // acos(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::acos(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AcosContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AcosFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AcosStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AcosFunctor<argTy, resTy>>;
-
-template <typename T> struct AcosOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy> struct AcosContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class acos_contig_kernel;
-
-template <typename argTy>
-sycl::event acos_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using AcosHS = hyperparam_detail::AcosContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AcosHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AcosHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AcosOutputType, AcosContigFunctor, acos_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AcosContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AcosOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = acos_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AcosTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::acos(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AcosOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class acos_strided_kernel;
-
-template <typename argTy>
-sycl::event
-acos_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AcosOutputType, AcosStridedFunctor, acos_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AcosStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AcosOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = acos_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace acos
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
deleted file mode 100644
index 12c20525f5..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ /dev/null
@@ -1,293 +0,0 @@
-//=== acosh.hpp -   Unary function ACOSH                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ACOSH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace acosh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AcoshFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-            /*
-             * acosh(in) = I*acos(in) or -I*acos(in)
-             * where the sign is chosen so Re(acosh(in)) >= 0.
-             * So, we first calculate acos(in) and then acosh(in).
-             */
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            resT acos_in;
-            if (std::isnan(x)) {
-                /* acos(NaN + I*+-Inf) = NaN + I*-+Inf */
-                if (std::isinf(y)) {
-                    acos_in = resT{q_nan, -y};
-                }
-                else {
-                    acos_in = resT{q_nan, q_nan};
-                }
-            }
-            else if (std::isnan(y)) {
-                /* acos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
-                static constexpr realT inf =
-                    std::numeric_limits<realT>::infinity();
-
-                if (std::isinf(x)) {
-                    acos_in = resT{q_nan, -inf};
-                }
-                /* acos(0 + I*NaN) = Pi/2 + I*NaN with inexact */
-                else if (x == realT(0)) {
-                    const realT pi_half = sycl::atan(realT(1)) * 2;
-                    acos_in = resT{pi_half, q_nan};
-                }
-                else {
-                    acos_in = resT{q_nan, q_nan};
-                }
-            }
-
-            static constexpr realT r_eps =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-            /*
-             * For large x or y including acos(+-Inf + I*+-Inf)
-             */
-            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
-                using sycl_complexT = typename exprm_ns::complex<realT>;
-                const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in));
-                const realT wx = log_in.real();
-                const realT wy = log_in.imag();
-                const realT rx = sycl::fabs(wy);
-                realT ry = wx + sycl::log(realT(2));
-                acos_in = resT{rx, (sycl::signbit(y)) ? ry : -ry};
-            }
-            else {
-                /* ordinary cases */
-                acos_in =
-                    exprm_ns::acos(exprm_ns::complex<realT>(in)); // acos(in);
-            }
-
-            /* Now we calculate acosh(z) */
-            const realT rx = std::real(acos_in);
-            const realT ry = std::imag(acos_in);
-
-            /* acosh(NaN + I*NaN) = NaN + I*NaN */
-            if (std::isnan(rx) && std::isnan(ry)) {
-                return resT{ry, rx};
-            }
-            /* acosh(NaN + I*+-Inf) = +Inf + I*NaN */
-            /* acosh(+-Inf + I*NaN) = +Inf + I*NaN */
-            if (std::isnan(rx)) {
-                return resT{sycl::fabs(ry), rx};
-            }
-            /* acosh(0 + I*NaN) = NaN + I*NaN */
-            if (std::isnan(ry)) {
-                return resT{ry, ry};
-            }
-            /* ordinary cases */
-            const realT res_im = sycl::copysign(rx, std::imag(in));
-            return resT{sycl::fabs(ry), res_im};
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::acosh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AcoshContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AcoshFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AcoshStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AcoshFunctor<argTy, resTy>>;
-
-template <typename T> struct AcoshOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AcoshContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class acosh_contig_kernel;
-
-template <typename argTy>
-sycl::event acosh_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using AcoshHS = hyperparam_detail::AcoshContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AcoshHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AcoshHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AcoshOutputType, AcoshContigFunctor, acosh_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AcoshContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AcoshOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = acosh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AcoshTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::acosh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AcoshOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class acosh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-acosh_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AcoshOutputType, AcoshStridedFunctor, acosh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AcoshStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AcoshOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = acosh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace acosh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
deleted file mode 100644
index d65c06500c..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ /dev/null
@@ -1,674 +0,0 @@
-//=== add.hpp -   Binary function ADD                    ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ADD(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace add
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct AddFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
-            using rT1 = typename argT1::value_type;
-            using rT2 = typename argT2::value_type;
-
-            return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
-        }
-        else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
-            using rT1 = typename argT1::value_type;
-
-            return exprm_ns::complex<rT1>(in1) + in2;
-        }
-        else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
-            using rT2 = typename argT2::value_type;
-
-            return in1 + exprm_ns::complex<rT2>(in2);
-        }
-        else {
-            return in1 + in2;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = in1 + in2;
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AddContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            AddFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using AddStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             AddFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct AddOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct AddContigHyperparameterSet
-{
-    using value_type = typename std::disjunction<
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           std::int32_t,
-                                           argTy2,
-                                           std::int32_t,
-                                           2u,
-                                           2u>,
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           std::int32_t,
-                                           argTy2,
-                                           std::int32_t,
-                                           2u,
-                                           2u>,
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           std::int64_t,
-                                           argTy2,
-                                           std::int64_t,
-                                           2u,
-                                           2u>,
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           std::uint64_t,
-                                           argTy2,
-                                           std::uint64_t,
-                                           2u,
-                                           2u>,
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           float,
-                                           argTy2,
-                                           float,
-                                           2u,
-                                           2u>,
-        BinaryContigHyperparameterSetEntry<argTy1,
-                                           double,
-                                           argTy2,
-                                           double,
-                                           1u,
-                                           2u>,
-        ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class add_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event add_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg1_p,
-                            ssize_t arg1_offset,
-                            const char *arg2_p,
-                            ssize_t arg2_offset,
-                            char *res_p,
-                            ssize_t res_offset,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using AddHS = hyperparam_detail::AddContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr auto vec_sz = AddHS::vec_sz;
-    static constexpr auto n_vecs = AddHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                        arg2_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct AddContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = add_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct AddTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::add(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AddOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class add_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event add_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg1_p,
-                             ssize_t arg1_offset,
-                             const char *arg2_p,
-                             ssize_t arg2_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, AddOutputType, AddStridedFunctor, add_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct AddStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = add_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-class add_matrix_row_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-using AddContigMatrixContigRowBroadcastingFunctor =
-    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        AddFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event add_contig_matrix_contig_row_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] + vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
-        argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor,
-        add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
-                                         mat_offset, vec_p, vec_offset, res_p,
-                                         res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct AddContigMatrixContigRowBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename AddOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    add_contig_matrix_contig_row_broadcast_impl<T1, T2, resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event add_contig_row_contig_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] + vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return add_contig_matrix_contig_row_broadcast_impl<argT2, argT1, resT>(
-        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
-        res_offset, depends);
-};
-
-template <typename fnT, typename T1, typename T2>
-struct AddContigRowContigMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename AddOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    add_contig_row_contig_matrix_broadcast_impl<T1, T2, resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT, typename resT> struct AddInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-
-    void operator()(resT &res, const argT &in) { res += in; }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        res += in;
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
-    argT,
-    resT,
-    AddInplaceFunctor<argT, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using AddInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        AddInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class add_inplace_contig_kernel;
-
-/* @brief Types supported by in-place add */
-template <typename argTy, typename resTy> struct AddInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    resTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    resTy,
-                                    std::complex<double>>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct AddInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x += y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (AddInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-add_inplace_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg_p,
-                        ssize_t arg_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    static constexpr auto vec_sz =
-        hyperparam_detail::AddContigHyperparameterSet<resTy, argTy>::vec_sz;
-    static constexpr auto n_vecs =
-        hyperparam_detail::AddContigHyperparameterSet<resTy, argTy>::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, AddInplaceContigFunctor, add_inplace_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
-                        depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct AddInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = add_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class add_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event
-add_inplace_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg_p,
-                         ssize_t arg_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, AddInplaceStridedFunctor, add_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct AddInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = add_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT>
-class add_inplace_row_matrix_broadcast_sg_krn;
-
-template <typename argT, typename resT>
-using AddInplaceRowMatrixBroadcastingFunctor =
-    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
-        argT,
-        resT,
-        AddInplaceFunctor<argT, resT>>;
-
-template <typename argT, typename resT>
-sycl::event add_inplace_row_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
-        argT, resT, AddInplaceRowMatrixBroadcastingFunctor,
-        add_inplace_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1,
-                                                 vec_p, vec_offset, mat_p,
-                                                 mat_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct AddInplaceRowMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!AddInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn = add_inplace_row_matrix_broadcast_impl<T1, T2>;
-                return fn;
-            }
-        }
-    }
-};
-
-} // namespace add
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
deleted file mode 100644
index 843c982de1..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-//=== angle.hpp -   Unary function ANGLE              ------*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ANGLE(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace angle
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AngleFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        using rT = typename argT::value_type;
-
-        return exprm_ns::arg(exprm_ns::complex<rT>(in)); // arg(in);
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AngleContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AngleFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AngleStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AngleFunctor<argTy, resTy>>;
-
-template <typename T> struct AngleOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AngleContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class angle_contig_kernel;
-
-template <typename argTy>
-sycl::event angle_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using AngleHS = hyperparam_detail::AngleContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AngleHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AngleHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AngleOutputType, AngleContigFunctor, angle_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AngleContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AngleOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = angle_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AngleTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::arg(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AngleOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class angle_strided_kernel;
-
-template <typename argTy>
-sycl::event
-angle_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AngleOutputType, AngleStridedFunctor, angle_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AngleStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AngleOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = angle_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace angle
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
deleted file mode 100644
index 3ba00c4198..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ /dev/null
@@ -1,285 +0,0 @@
-//=== asin.hpp -   Unary function ASIN                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ASIN(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace asin
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AsinFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            /*
-             * asin(in) = I * conj( asinh(I * conj(in)) )
-             * so we first calculate w = asinh(I * conj(in)) with
-             * x = real(I * conj(in)) = imag(in)
-             * y = imag(I * conj(in)) = real(in)
-             * and then return {imag(w), real(w)} which is asin(in)
-             */
-            const realT x = std::imag(in);
-            const realT y = std::real(in);
-
-            if (std::isnan(x)) {
-                /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
-                if (std::isinf(y)) {
-                    const realT asinh_re = y;
-                    const realT asinh_im = q_nan;
-                    return resT{asinh_im, asinh_re};
-                }
-                /* asinh(NaN + I*0) = NaN + I*0 */
-                if (y == realT(0)) {
-                    const realT asinh_re = q_nan;
-                    const realT asinh_im = y;
-                    return resT{asinh_im, asinh_re};
-                }
-                /* All other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-            else if (std::isnan(y)) {
-                /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */
-                if (std::isinf(x)) {
-                    const realT asinh_re = x;
-                    const realT asinh_im = q_nan;
-                    return resT{asinh_im, asinh_re};
-                }
-                /* All other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-
-            /*
-             * For large x or y including asinh(+-Inf + I*+-Inf)
-             * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2)   as in ->
-             * infinity The above formula works for the imaginary part as well,
-             * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) +
-             * O(y/in^3) as in -> infinity, uniformly in y
-             */
-            static constexpr realT r_eps =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
-                using sycl_complexT = exprm_ns::complex<realT>;
-                const sycl_complexT z{x, y};
-                realT wx, wy;
-                if (!sycl::signbit(x)) {
-                    const auto log_z = exprm_ns::log(z);
-                    wx = log_z.real() + sycl::log(realT(2));
-                    wy = log_z.imag();
-                }
-                else {
-                    const auto log_mz = exprm_ns::log(-z);
-                    wx = log_mz.real() + sycl::log(realT(2));
-                    wy = log_mz.imag();
-                }
-                const realT asinh_re = sycl::copysign(wx, x);
-                const realT asinh_im = sycl::copysign(wy, y);
-                return resT{asinh_im, asinh_re};
-            }
-            /* ordinary cases */
-            return exprm_ns::asin(
-                exprm_ns::complex<realT>(in)); // sycl::asin(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::asin(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AsinContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AsinFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AsinStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AsinFunctor<argTy, resTy>>;
-
-template <typename T> struct AsinOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AsinContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class asin_contig_kernel;
-
-template <typename argTy>
-sycl::event asin_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using AddHS = hyperparam_detail::AsinContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AddHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AddHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AsinOutputType, AsinContigFunctor, asin_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AsinContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AsinOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = asin_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AsinTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::asin(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AsinOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class asin_strided_kernel;
-
-template <typename argTy>
-sycl::event
-asin_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AsinOutputType, AsinStridedFunctor, asin_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AsinStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AsinOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = asin_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace asin
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
deleted file mode 100644
index 7441e51ae7..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ /dev/null
@@ -1,268 +0,0 @@
-//=== asinh.hpp -   Unary function ASINH                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ASINH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace asinh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AsinhFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            if (std::isnan(x)) {
-                /* asinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
-                if (std::isinf(y)) {
-                    return resT{y, q_nan};
-                }
-                /* asinh(NaN + I*0) = NaN + I*0 */
-                if (y == realT(0)) {
-                    return resT{q_nan, y};
-                }
-                /* All other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-
-            if (std::isnan(y)) {
-                /* asinh(+-Inf + I*NaN) = +-Inf + I*NaN */
-                if (std::isinf(x)) {
-                    return resT{x, q_nan};
-                }
-                /* All other cases involving NaN return NaN + I*NaN. */
-                return resT{q_nan, q_nan};
-            }
-
-            /*
-             * For large x or y including asinh(+-Inf + I*+-Inf)
-             * asinh(in) = sign(x)*log(sign(x)*in) + O(1/in^2)   as in ->
-             * infinity The above formula works for the imaginary part as well,
-             * because Im(asinh(in)) = sign(x)*atan2(sign(x)*y, fabs(x)) +
-             * O(y/in^3) as in -> infinity, uniformly in y
-             */
-            static constexpr realT r_eps =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-
-            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
-                using sycl_complexT = exprm_ns::complex<realT>;
-                sycl_complexT log_in = (sycl::signbit(x))
-                                           ? exprm_ns::log(sycl_complexT(-in))
-                                           : exprm_ns::log(sycl_complexT(in));
-                realT wx = log_in.real() + sycl::log(realT(2));
-                realT wy = log_in.imag();
-
-                const realT res_re = sycl::copysign(wx, x);
-                const realT res_im = sycl::copysign(wy, y);
-                return resT{res_re, res_im};
-            }
-
-            /* ordinary cases */
-            return exprm_ns::asinh(exprm_ns::complex<realT>(in)); // asinh(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::asinh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AsinhContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AsinhFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AsinhStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AsinhFunctor<argTy, resTy>>;
-
-template <typename T> struct AsinhOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AsinhContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class asinh_contig_kernel;
-
-template <typename argTy>
-sycl::event asinh_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using AsinhHS = hyperparam_detail::AsinhContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AsinhHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AsinhHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AsinhOutputType, AsinhContigFunctor, asinh_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AsinhContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AsinhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = asinh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AsinhTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::asinh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AsinhOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class asinh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-asinh_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AsinhOutputType, AsinhStridedFunctor, asinh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AsinhStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AsinhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = asinh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace asinh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
deleted file mode 100644
index 87fad500ab..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ /dev/null
@@ -1,278 +0,0 @@
-//=== atan.hpp -   Unary function ATAN                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ATAN(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace atan
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::vec_size_utils::ContigHyperparameterSetDefault;
-using dpctl::tensor::kernels::vec_size_utils::UnaryContigHyperparameterSetEntry;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AtanFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-            /*
-             * atan(in) = I * conj( atanh(I * conj(in)) )
-             * so we first calculate w = atanh(I * conj(in)) with
-             * x = real(I * conj(in)) = imag(in)
-             * y = imag(I * conj(in)) = real(in)
-             * and then return {imag(w), real(w)} which is atan(in)
-             */
-            const realT x = std::imag(in);
-            const realT y = std::real(in);
-            if (std::isnan(x)) {
-                /* atanh(NaN + I*+-Inf) = sign(NaN)*0 + I*+-Pi/2 */
-                if (std::isinf(y)) {
-                    const realT pi_half = sycl::atan(realT(1)) * 2;
-
-                    const realT atanh_re = sycl::copysign(realT(0), x);
-                    const realT atanh_im = sycl::copysign(pi_half, y);
-                    return resT{atanh_im, atanh_re};
-                }
-                /*
-                 * All other cases involving NaN return NaN + I*NaN.
-                 */
-                return resT{q_nan, q_nan};
-            }
-            else if (std::isnan(y)) {
-                /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */
-                if (std::isinf(x)) {
-                    const realT atanh_re = sycl::copysign(realT(0), x);
-                    const realT atanh_im = q_nan;
-                    return resT{atanh_im, atanh_re};
-                }
-                /* atanh(+-0 + I*NaN) = +-0 + I*NaN */
-                if (x == realT(0)) {
-                    return resT{q_nan, x};
-                }
-                /*
-                 * All other cases involving NaN return NaN + I*NaN.
-                 */
-                return resT{q_nan, q_nan};
-            }
-
-            /*
-             * For large x or y including
-             * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2
-             * The sign of pi/2 depends on the sign of imaginary part of the
-             * input.
-             */
-            static constexpr realT r_eps =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > r_eps || sycl::fabs(y) > r_eps) {
-                const realT pi_half = sycl::atan(realT(1)) * 2;
-
-                const realT atanh_re = realT(0);
-                const realT atanh_im = sycl::copysign(pi_half, y);
-                return resT{atanh_im, atanh_re};
-            }
-            /* ordinary cases */
-            return exprm_ns::atan(exprm_ns::complex<realT>(in)); // atan(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::atan(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AtanContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AtanFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AtanStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AtanFunctor<argTy, resTy>>;
-
-template <typename T> struct AtanOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AtanContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class atan_contig_kernel;
-
-template <typename argTy>
-sycl::event atan_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using AtanHS = hyperparam_detail::AtanContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AtanHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AtanHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AtanOutputType, AtanContigFunctor, atan_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AtanContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AtanOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atan_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AtanTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::atan(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AtanOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class atan_strided_kernel;
-
-template <typename argTy>
-sycl::event
-atan_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AtanOutputType, AtanStridedFunctor, atan_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AtanStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AtanOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atan_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace atan
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
deleted file mode 100644
index 7e5dbe0bb5..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-//=== ATAN2.hpp -   Binary function ATAN2  ------               *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ATAN2(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace atan2
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct Atan2Functor
-{
-
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::false_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if (std::isinf(in2) && !sycl::signbit(in2)) {
-            if (std::isfinite(in1)) {
-                return sycl::copysign(resT(0), in1);
-            }
-        }
-        return sycl::atan2(in1, in2);
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Atan2ContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            Atan2Functor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using Atan2StridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             Atan2Functor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct Atan2OutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct Atan2ContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class atan2_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event atan2_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg1_p,
-                              ssize_t arg1_offset,
-                              const char *arg2_p,
-                              ssize_t arg2_offset,
-                              char *res_p,
-                              ssize_t res_offset,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using Atan2HS =
-        hyperparam_detail::Atan2ContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = Atan2HS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Atan2HS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, Atan2OutputType, Atan2ContigFunctor,
-        atan2_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                             arg1_offset, arg2_p, arg2_offset,
-                                             res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct Atan2ContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Atan2OutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atan2_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct Atan2TypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::atan2(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Atan2OutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class atan2_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-atan2_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg1_p,
-                   ssize_t arg1_offset,
-                   const char *arg2_p,
-                   ssize_t arg2_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, Atan2OutputType, Atan2StridedFunctor,
-        atan2_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                              arg1_offset, arg2_p, arg2_offset, res_p,
-                              res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct Atan2StridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Atan2OutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atan2_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace atan2
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
deleted file mode 100644
index 4148c1545a..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ /dev/null
@@ -1,269 +0,0 @@
-//=== atanh.hpp -   Unary function ATANH                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ATANH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace atanh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct AtanhFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            if (std::isnan(x)) {
-                /* atanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
-                if (std::isinf(y)) {
-                    const realT pi_half = sycl::atan(realT(1)) * 2;
-
-                    const realT res_re = sycl::copysign(realT(0), x);
-                    const realT res_im = sycl::copysign(pi_half, y);
-                    return resT{res_re, res_im};
-                }
-                /*
-                 * All other cases involving NaN return NaN + I*NaN.
-                 */
-                return resT{q_nan, q_nan};
-            }
-            else if (std::isnan(y)) {
-                /* atanh(+-Inf + I*NaN) = +-0 + I*NaN */
-                if (std::isinf(x)) {
-                    const realT res_re = sycl::copysign(realT(0), x);
-                    return resT{res_re, q_nan};
-                }
-                /* atanh(+-0 + I*NaN) = +-0 + I*NaN */
-                if (x == realT(0)) {
-                    return resT{x, q_nan};
-                }
-                /*
-                 * All other cases involving NaN return NaN + I*NaN.
-                 */
-                return resT{q_nan, q_nan};
-            }
-
-            /*
-             * For large x or y including
-             * atanh(+-Inf + I*+-Inf) = 0 + I*+-PI/2
-             * The sign of PI/2 depends on the sign of imaginary part of the
-             * input.
-             */
-            const realT RECIP_EPSILON =
-                realT(1) / std::numeric_limits<realT>::epsilon();
-            if (sycl::fabs(x) > RECIP_EPSILON || sycl::fabs(y) > RECIP_EPSILON)
-            {
-                const realT pi_half = sycl::atan(realT(1)) * 2;
-
-                const realT res_re = realT(0);
-                const realT res_im = sycl::copysign(pi_half, y);
-                return resT{res_re, res_im};
-            }
-            /* ordinary cases */
-            return exprm_ns::atanh(exprm_ns::complex<realT>(in)); // atanh(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::atanh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using AtanhContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           AtanhFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using AtanhStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, AtanhFunctor<argTy, resTy>>;
-
-template <typename T> struct AtanhOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct AtanhContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class atanh_contig_kernel;
-
-template <typename argTy>
-sycl::event atanh_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using AtanhHS = hyperparam_detail::AtanhContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = AtanhHS::vec_sz;
-    static constexpr std::uint8_t n_vec = AtanhHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, AtanhOutputType, AtanhContigFunctor, atanh_contig_kernel, vec_sz,
-        n_vec>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct AtanhContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!AtanhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atanh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct AtanhTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::atanh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename AtanhOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class atanh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-atanh_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, AtanhOutputType, AtanhStridedFunctor, atanh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct AtanhStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!AtanhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = atanh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace atanh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
deleted file mode 100644
index 9c164ea5a2..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ /dev/null
@@ -1,461 +0,0 @@
-//=== bitwise_and.hpp -   Binary function BITWISE_AND  -------- *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise bitwise_and(ar1, ar2) operation.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_and
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct BitwiseAndFunctor
-{
-    static_assert(std::is_same_v<resT, argT1>);
-    static_assert(std::is_same_v<resT, argT2>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using tu_ns::convert_impl;
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            return in1 && in2;
-        }
-        else {
-            return (in1 & in2);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (in1 && in2);
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            return (in1 & in2);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseAndContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    BitwiseAndFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using BitwiseAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    BitwiseAndFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct BitwiseAndOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct BitwiseAndContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_and_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_and_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseAndHS =
-        hyperparam_detail::BitwiseAndContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz;
-    static constexpr std::uint8_t n_vec = BitwiseAndHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndContigFunctor,
-        bitwise_and_contig_kernel, vec_sz, n_vec>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct BitwiseAndContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseAndOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_and_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseAndTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseAndOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class bitwise_and_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_and_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, BitwiseAndOutputType, BitwiseAndStridedFunctor,
-        bitwise_and_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseAndStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseAndOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_and_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct BitwiseAndInplaceFunctor
-{
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    void operator()(resT &res, const argT &in) const
-    {
-        using tu_ns::convert_impl;
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            res = res && in;
-        }
-        else {
-            res &= in;
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (res && in);
-            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            res &= in;
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseAndInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        BitwiseAndInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using BitwiseAndInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        BitwiseAndInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_and_inplace_contig_kernel;
-
-/* @brief Types supported by in-place bitwise AND */
-template <typename argTy, typename resTy>
-struct BitwiseAndInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct BitwiseAndInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x &= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (BitwiseAndInplaceTypePairSupport<argT, resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-bitwise_and_inplace_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg_p,
-                                ssize_t arg_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseAndHS =
-        hyperparam_detail::BitwiseAndContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = BitwiseAndHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseAndHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, BitwiseAndInplaceContigFunctor,
-        bitwise_and_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseAndInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseAndInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_and_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class bitwise_and_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_and_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, BitwiseAndInplaceStridedFunctor,
-        bitwise_and_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseAndInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseAndInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_and_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_and
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
deleted file mode 100644
index c292373575..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//=== bitwise_invert.hpp -   Unary function bitwise_invert      *-C++-*--/===//
-//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of bitwise_invert(x)
-/// function that inverts bits of binary representation of the argument.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_invert
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct BitwiseInvertFunctor
-{
-    static_assert(std::is_same_v<argT, resT>);
-    static_assert(std::is_integral_v<argT> || std::is_same_v<argT, bool>);
-
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::negation<std::is_same<argT, bool>>;
-    using supports_sg_loadstore = typename std::true_type;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (std::is_same_v<argT, bool>) {
-            return !in;
-        }
-        else {
-            return ~in;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        return ~in;
-    }
-};
-
-template <typename argT,
-          typename resT = argT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseInvertContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           BitwiseInvertFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using BitwiseInvertStridedFunctor =
-    elementwise_common::UnaryStridedFunctor<argTy,
-                                            resTy,
-                                            IndexerT,
-                                            BitwiseInvertFunctor<argTy, resTy>>;
-
-template <typename argTy> struct BitwiseInvertOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<argTy, bool>,
-        td_ns::TypeMapResultEntry<argTy, std::uint8_t>,
-        td_ns::TypeMapResultEntry<argTy, std::uint16_t>,
-        td_ns::TypeMapResultEntry<argTy, std::uint32_t>,
-        td_ns::TypeMapResultEntry<argTy, std::uint64_t>,
-        td_ns::TypeMapResultEntry<argTy, std::int8_t>,
-        td_ns::TypeMapResultEntry<argTy, std::int16_t>,
-        td_ns::TypeMapResultEntry<argTy, std::int32_t>,
-        td_ns::TypeMapResultEntry<argTy, std::int64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct BitwiseInvertContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class bitwise_invert_contig_kernel;
-
-template <typename argTy>
-sycl::event
-bitwise_invert_contig_impl(sycl::queue &exec_q,
-                           std::size_t nelems,
-                           const char *arg_p,
-                           char *res_p,
-                           const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseInvertHS =
-        hyperparam_detail::BitwiseInvertContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = BitwiseInvertHS::vec_sz;
-    static constexpr std::uint8_t n_vec = BitwiseInvertHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, BitwiseInvertOutputType, BitwiseInvertContigFunctor,
-        bitwise_invert_contig_kernel, vec_sz, n_vec>(exec_q, nelems, arg_p,
-                                                     res_p, depends);
-}
-
-template <typename fnT, typename T> struct BitwiseInvertContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseInvertOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_invert_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct BitwiseInvertTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::logical_not(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseInvertOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3>
-class bitwise_invert_strided_kernel;
-
-template <typename argTy>
-sycl::event
-bitwise_invert_strided_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            int nd,
-                            const ssize_t *shape_and_strides,
-                            const char *arg_p,
-                            ssize_t arg_offset,
-                            char *res_p,
-                            ssize_t res_offset,
-                            const std::vector<sycl::event> &depends,
-                            const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, BitwiseInvertOutputType, BitwiseInvertStridedFunctor,
-        bitwise_invert_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
-                                       arg_p, arg_offset, res_p, res_offset,
-                                       depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct BitwiseInvertStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseInvertOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_invert_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_invert
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
deleted file mode 100644
index 6511895537..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ /dev/null
@@ -1,482 +0,0 @@
-//=== bitwise_left-shift.hpp - Binary func. BITWISE_LEFT_SHIFT -*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise bitwise_left_shift(ar1, ar2)
-/// operation.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_left_shift
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct BitwiseLeftShiftFunctor
-{
-    static_assert(std::is_integral_v<argT1>);
-    static_assert(std::is_integral_v<argT2>);
-    static_assert(!std::is_same_v<argT1, bool>);
-    static_assert(!std::is_same_v<argT2, bool>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return impl(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        sycl::vec<resT, vec_sz> res;
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            res[i] = impl(in1[i], in2[i]);
-        }
-        return res;
-    }
-
-private:
-    resT impl(const argT1 &in1, const argT2 &in2) const
-    {
-        static constexpr argT2 in1_bitsize =
-            static_cast<argT2>(sizeof(argT1) * 8);
-        static constexpr resT zero = resT(0);
-
-        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
-        // array API spec mandates 0
-        if constexpr (std::is_unsigned_v<argT2>) {
-            return (in2 < in1_bitsize) ? (in1 << in2) : zero;
-        }
-        else {
-            return (in2 < argT2(0))
-                       ? zero
-                       : ((in2 < in1_bitsize) ? (in1 << in2) : zero);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseLeftShiftContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    BitwiseLeftShiftFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using BitwiseLeftShiftStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    BitwiseLeftShiftFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct BitwiseLeftShiftOutputType
-{
-    using ResT = T1;
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct BitwiseLeftShiftContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_left_shift_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_left_shift_contig_impl(sycl::queue &exec_q,
-                               std::size_t nelems,
-                               const char *arg1_p,
-                               ssize_t arg1_offset,
-                               const char *arg2_p,
-                               ssize_t arg2_offset,
-                               char *res_p,
-                               ssize_t res_offset,
-                               const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseLSHS =
-        hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet<argTy1,
-                                                                   argTy2>;
-    static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, BitwiseLeftShiftOutputType,
-        BitwiseLeftShiftContigFunctor, bitwise_left_shift_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-                res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseLeftShiftContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseLeftShiftOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_left_shift_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseLeftShiftTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseLeftShiftOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class bitwise_left_shift_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event bitwise_left_shift_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg1_p,
-    ssize_t arg1_offset,
-    const char *arg2_p,
-    ssize_t arg2_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, BitwiseLeftShiftOutputType,
-        BitwiseLeftShiftStridedFunctor, bitwise_left_shift_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseLeftShiftStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseLeftShiftOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_left_shift_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct BitwiseLeftShiftInplaceFunctor
-{
-    static_assert(std::is_integral_v<argT>);
-    static_assert(!std::is_same_v<argT, bool>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    void operator()(resT &res, const argT &in) const { impl(res, in); }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in) const
-    {
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            impl(res[i], in[i]);
-        }
-    }
-
-private:
-    void impl(resT &res, const argT &in) const
-    {
-        static constexpr argT res_bitsize = static_cast<argT>(sizeof(resT) * 8);
-        static constexpr resT zero = resT(0);
-
-        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
-        // array API spec mandates 0
-        if constexpr (std::is_unsigned_v<argT>) {
-            (in < res_bitsize) ? (res <<= in) : res = zero;
-        }
-        else {
-            (in < argT(0)) ? res = zero
-                           : ((in < res_bitsize) ? (res <<= in) : res = zero);
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseLeftShiftInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        BitwiseLeftShiftInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using BitwiseLeftShiftInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        BitwiseLeftShiftInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_left_shift_inplace_contig_kernel;
-
-/* @brief Types supported by in-place bitwise left shift */
-template <typename argTy, typename resTy>
-struct BitwiseLeftShiftInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct BitwiseLeftShiftInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x <<= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (BitwiseLeftShiftInplaceTypePairSupport<argT,
-                                                             resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_left_shift_inplace_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseLSHS =
-        hyperparam_detail::BitwiseLeftShiftContigHyperparameterSet<resTy,
-                                                                   argTy>;
-    static constexpr std::uint8_t vec_sz = BitwiseLSHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseLSHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, BitwiseLeftShiftInplaceContigFunctor,
-        bitwise_left_shift_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseLeftShiftInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseLeftShiftInplaceTypePairSupport<T1,
-                                                              T2>::is_defined)
-        {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_left_shift_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class bitwise_left_shift_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_left_shift_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor,
-        bitwise_left_shift_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseLeftShiftInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseLeftShiftInplaceTypePairSupport<T1,
-                                                              T2>::is_defined)
-        {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_left_shift_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_left_shift
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
deleted file mode 100644
index 4e20b75138..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-//=== bitwise_or.hpp -   Binary function BITWISE_OR    -------- *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise bitwise_or(ar1, ar2) operation.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_or
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct BitwiseOrFunctor
-{
-    static_assert(std::is_same_v<resT, argT1>);
-    static_assert(std::is_same_v<resT, argT2>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using tu_ns::convert_impl;
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            return in1 || in2;
-        }
-        else {
-            return (in1 | in2);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (in1 || in2);
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            return (in1 | in2);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseOrContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    BitwiseOrFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using BitwiseOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    BitwiseOrFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct BitwiseOrOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct BitwiseOrContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_or_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event bitwise_or_contig_impl(sycl::queue &exec_q,
-                                   std::size_t nelems,
-                                   const char *arg1_p,
-                                   ssize_t arg1_offset,
-                                   const char *arg2_p,
-                                   ssize_t arg2_offset,
-                                   char *res_p,
-                                   ssize_t res_offset,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseOrHS =
-        hyperparam_detail::BitwiseOrContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrContigFunctor,
-        bitwise_or_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct BitwiseOrContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseOrOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_or_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct BitwiseOrTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseOrOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class bitwise_or_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_or_strided_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        int nd,
-                        const ssize_t *shape_and_strides,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends,
-                        const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, BitwiseOrOutputType, BitwiseOrStridedFunctor,
-        bitwise_or_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct BitwiseOrStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseOrOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_or_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct BitwiseOrInplaceFunctor
-{
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    void operator()(resT &res, const argT &in) const
-    {
-        using tu_ns::convert_impl;
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            res = res || in;
-        }
-        else {
-            res |= in;
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (res || in);
-            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            res |= in;
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseOrInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        BitwiseOrInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using BitwiseOrInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        BitwiseOrInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_or_inplace_contig_kernel;
-
-/* @brief Types supported by in-place bitwise OR */
-template <typename argTy, typename resTy> struct BitwiseOrInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct BitwiseOrInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x |= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (BitwiseOrInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-bitwise_or_inplace_contig_impl(sycl::queue &exec_q,
-                               std::size_t nelems,
-                               const char *arg_p,
-                               ssize_t arg_offset,
-                               char *res_p,
-                               ssize_t res_offset,
-                               const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseOrHS =
-        hyperparam_detail::BitwiseOrContigHyperparameterSet<resTy, argTy>;
-
-    static constexpr std::uint8_t vec_sz = BitwiseOrHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseOrHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, BitwiseOrInplaceContigFunctor,
-        bitwise_or_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseOrInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseOrInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_or_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class bitwise_or_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_or_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, BitwiseOrInplaceStridedFunctor,
-        bitwise_or_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseOrInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseOrInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_or_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_or
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
deleted file mode 100644
index 047a33cea9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ /dev/null
@@ -1,488 +0,0 @@
-//=== bitwise_right_shift.hpp - Binary func. BITWISE_RIGHT_SHIFT *-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise bitwise_right_shift(ar1, ar2)
-/// operation.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_right_shift
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct BitwiseRightShiftFunctor
-{
-    static_assert(std::is_same_v<resT, argT1>);
-    static_assert(std::is_integral_v<argT1>);
-    static_assert(std::is_integral_v<argT2>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return impl(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        sycl::vec<resT, vec_sz> res;
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            res[i] = impl(in1[i], in2[i]);
-        }
-        return res;
-    }
-
-private:
-    resT impl(const argT1 &in1, const argT2 &in2) const
-    {
-        static constexpr argT2 in1_bitsize =
-            static_cast<argT2>(sizeof(argT1) * 8);
-        static constexpr resT zero = resT(0);
-
-        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
-        // array API spec mandates 0
-        if constexpr (std::is_unsigned_v<argT2>) {
-            return (in2 < in1_bitsize) ? (in1 >> in2) : zero;
-        }
-        else {
-            return (in2 < argT2(0))
-                       ? zero
-                       : ((in2 < in1_bitsize)
-                              ? (in1 >> in2)
-                              : (in1 < argT1(0) ? resT(-1) : zero));
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseRightShiftContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    BitwiseRightShiftFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using BitwiseRightShiftStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<
-        argT1,
-        argT2,
-        resT,
-        IndexerT,
-        BitwiseRightShiftFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct BitwiseRightShiftOutputType
-{
-    using ResT = T1;
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct BitwiseRightShiftContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_right_shift_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_right_shift_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg1_p,
-                                ssize_t arg1_offset,
-                                const char *arg2_p,
-                                ssize_t arg2_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseRSHS =
-        hyperparam_detail::BitwiseRightShiftContigHyperparameterSet<argTy1,
-                                                                    argTy2>;
-    constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz;
-    constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, BitwiseRightShiftOutputType,
-        BitwiseRightShiftContigFunctor, bitwise_right_shift_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                        arg2_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseRightShiftContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseRightShiftOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_right_shift_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseRightShiftTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseRightShiftOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class bitwise_right_shift_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event bitwise_right_shift_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg1_p,
-    ssize_t arg1_offset,
-    const char *arg2_p,
-    ssize_t arg2_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, BitwiseRightShiftOutputType,
-        BitwiseRightShiftStridedFunctor, bitwise_right_shift_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseRightShiftStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseRightShiftOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_right_shift_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct BitwiseRightShiftInplaceFunctor
-{
-    static_assert(std::is_integral_v<argT>);
-    static_assert(!std::is_same_v<argT, bool>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    void operator()(resT &res, const argT &in) const { impl(res, in); }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in) const
-    {
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            impl(res[i], in[i]);
-        }
-    }
-
-private:
-    void impl(resT &res, const argT &in) const
-    {
-        static constexpr argT res_bitsize = static_cast<argT>(sizeof(resT) * 8);
-        static constexpr resT zero = resT(0);
-
-        // bitshift op with second operand negative, or >= bitwidth(argT1) is UB
-        // array API spec mandates 0
-        if constexpr (std::is_unsigned_v<argT>) {
-            (in < res_bitsize) ? (res >>= in) : res = zero;
-        }
-        else {
-            (in < argT(0)) ? res = zero
-                           : ((in < res_bitsize) ? (res >>= in)
-                              : (res < resT(0))  ? res = resT(-1)
-                                                 : res = zero);
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseRightShiftInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        BitwiseRightShiftInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using BitwiseRightShiftInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        BitwiseRightShiftInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_right_shift_inplace_contig_kernel;
-
-/* @brief Types supported by in-place bitwise right shift */
-template <typename argTy, typename resTy>
-struct BitwiseRightShiftInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct BitwiseRightShiftInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x >>= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (BitwiseRightShiftInplaceTypePairSupport<argT,
-                                                              resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_right_shift_inplace_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseRSHS =
-        hyperparam_detail::BitwiseRightShiftContigHyperparameterSet<resTy,
-                                                                    argTy>;
-
-    // res = OP(res, arg)
-    static constexpr std::uint8_t vec_sz = BitwiseRSHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseRSHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, BitwiseRightShiftInplaceContigFunctor,
-        bitwise_right_shift_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseRightShiftInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_right_shift_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class bitwise_right_shift_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_right_shift_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, BitwiseRightShiftInplaceStridedFunctor,
-        bitwise_right_shift_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseRightShiftInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseRightShiftInplaceTypePairSupport<T1,
-                                                               T2>::is_defined)
-        {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_right_shift_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_right_shift
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
deleted file mode 100644
index d14a12a248..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ /dev/null
@@ -1,463 +0,0 @@
-//=== bitwise_xor.hpp -   Binary function BITWISE_XOR  -------- *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise bitwise_xor(ar1, ar2) operation.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace bitwise_xor
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct BitwiseXorFunctor
-{
-    static_assert(std::is_same_v<resT, argT1>);
-    static_assert(std::is_same_v<resT, argT2>);
-
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (std::is_same_v<resT, bool>) {
-            // (false != false) -> false, (false != true) -> true
-            // (true != false) -> true,  (true != true) -> false
-            return (in1 != in2);
-        }
-        else {
-            return (in1 ^ in2);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (in1 != in2);
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            return (in1 ^ in2);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseXorContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    BitwiseXorFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using BitwiseXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    BitwiseXorFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct BitwiseXorOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct BitwiseXorContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_xor_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_xor_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseXorHS =
-        hyperparam_detail::BitwiseXorContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorContigFunctor,
-        bitwise_xor_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct BitwiseXorContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseXorOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_xor_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseXorTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename BitwiseXorOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class bitwise_xor_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-bitwise_xor_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, BitwiseXorOutputType, BitwiseXorStridedFunctor,
-        bitwise_xor_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseXorStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseXorOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_xor_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct BitwiseXorInplaceFunctor
-{
-    using supports_sg_loadstore = typename std::true_type;
-    using supports_vec = typename std::true_type;
-
-    void operator()(resT &res, const argT &in) const
-    {
-        using tu_ns::convert_impl;
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            res = (res != in);
-        }
-        else {
-            res ^= in;
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in) const
-    {
-
-        if constexpr (std::is_same_v<resT, bool>) {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            auto tmp = (res != in);
-            res = vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-        else {
-            res ^= in;
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using BitwiseXorInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        BitwiseXorInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using BitwiseXorInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        BitwiseXorInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class bitwise_xor_inplace_contig_kernel;
-
-/* @brief Types supported by in-place bitwise XOR */
-template <typename argTy, typename resTy>
-struct BitwiseXorInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct BitwiseXorInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x ^= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (BitwiseXorInplaceTypePairSupport<argT, resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-bitwise_xor_inplace_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg_p,
-                                ssize_t arg_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using BitwiseXorHS =
-        hyperparam_detail::BitwiseXorContigHyperparameterSet<resTy, argTy>;
-
-    static constexpr std::uint8_t vec_sz = BitwiseXorHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = BitwiseXorHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, BitwiseXorInplaceContigFunctor,
-        bitwise_xor_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseXorInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseXorInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_xor_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class bitwise_xor_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event bitwise_xor_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, BitwiseXorInplaceStridedFunctor,
-        bitwise_xor_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct BitwiseXorInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!BitwiseXorInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = bitwise_xor_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace bitwise_xor
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
deleted file mode 100644
index 4cf32725b1..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cabs_impl.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===------- cabs_impl.hpp - Implementation of cabs  -------*-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines an implementation of the complex absolute value.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <limits>
-
-#include "sycl_complex.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace detail
-{
-
-template <typename realT> realT cabs(std::complex<realT> const &z)
-{
-    // Special values for cabs( x + y * 1j):
-    //   * If x is either +infinity or -infinity and y is any value
-    //   (including NaN), the result is +infinity.
-    //   * If x is any value (including NaN) and y is either +infinity or
-    //   -infinity, the result is +infinity.
-    //   * If x is either +0 or -0, the result is equal to abs(y).
-    //   * If y is either +0 or -0, the result is equal to abs(x).
-    //   * If x is NaN and y is a finite number, the result is NaN.
-    //   * If x is a finite number and y is NaN, the result is NaN.
-    //   * If x is NaN and y is NaN, the result is NaN.
-
-    const realT x = std::real(z);
-    const realT y = std::imag(z);
-
-    static constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
-    static constexpr realT p_inf = std::numeric_limits<realT>::infinity();
-
-    const realT res =
-        std::isinf(x)
-            ? p_inf
-            : ((std::isinf(y)
-                    ? p_inf
-                    : ((std::isnan(x)
-                            ? q_nan
-                            : exprm_ns::abs(exprm_ns::complex<realT>(z))))));
-
-    return res;
-}
-
-} // namespace detail
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
deleted file mode 100644
index a26c0b6875..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//=== cbrt.hpp -   Unary function CBRT                   ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of CBRT(x)
-/// function that compute a square root.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace cbrt
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-template <typename argT, typename resT> struct CbrtFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::true_type;
-
-    resT operator()(const argT &in) const { return sycl::cbrt(in); }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using CbrtContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           CbrtFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using CbrtStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, CbrtFunctor<argTy, resTy>>;
-
-template <typename T> struct CbrtOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct CbrtContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class cbrt_contig_kernel;
-
-template <typename argTy>
-sycl::event cbrt_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using CbrtHS = hyperparam_detail::CbrtContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = CbrtHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = CbrtHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct CbrtContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!CbrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cbrt_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct CbrtTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::cbrt(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename CbrtOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class cbrt_strided_kernel;
-
-template <typename argTy>
-sycl::event
-cbrt_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct CbrtStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!CbrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cbrt_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace cbrt
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
deleted file mode 100644
index eb9d576db1..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-//=== ceil.hpp -   Unary function CEIL                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of CEIL(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace ceil
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct CeilFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (std::is_integral_v<argT>) {
-            return in;
-        }
-        else {
-            if (in == 0) {
-                return in;
-            }
-            return sycl::ceil(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using CeilContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           CeilFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using CeilStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, CeilFunctor<argTy, resTy>>;
-
-template <typename T> struct CeilOutputType
-{
-    using value_type =
-        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
-                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
-                                  td_ns::TypeMapResultEntry<T, sycl::half>,
-                                  td_ns::TypeMapResultEntry<T, float>,
-                                  td_ns::TypeMapResultEntry<T, double>,
-                                  td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct CeilContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class ceil_contig_kernel;
-
-template <typename argTy>
-sycl::event ceil_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using CeilHS = hyperparam_detail::CeilContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = CeilHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = CeilHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, CeilOutputType, CeilContigFunctor, ceil_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct CeilContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!CeilOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = ceil_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct CeilTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::ceil(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename CeilOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class ceil_strided_kernel;
-
-template <typename argTy>
-sycl::event
-ceil_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, CeilOutputType, CeilStridedFunctor, ceil_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct CeilStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!CeilOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = ceil_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace ceil
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
deleted file mode 100644
index 1836b61d0e..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ /dev/null
@@ -1,1041 +0,0 @@
-//=== common.hpp -  Common code for elementwise operations ----- *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines common code for elementwise tensor operations.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <utility>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/alignment.hpp"
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common_detail.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace elementwise_common
-{
-
-using dpctl::tensor::ssize_t;
-using dpctl::tensor::kernels::alignment_utils::
-    disabled_sg_loadstore_wrapper_krn;
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-using dpctl::tensor::sycl_utils::sub_group_load;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-/*! @brief Functor for unary function evaluation on contiguous array */
-template <typename argT,
-          typename resT,
-          typename UnaryOperatorT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-struct UnaryContigFunctor
-{
-private:
-    const argT *in = nullptr;
-    resT *out = nullptr;
-    std::size_t nelems_;
-
-public:
-    UnaryContigFunctor(const argT *inp, resT *res, const std::size_t n_elems)
-        : in(inp), out(res), nelems_(n_elems)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
-        UnaryOperatorT op{};
-        /* Each work-item processes vec_sz elements, contiguous in memory */
-        /* NOTE: work-group size must be divisible by sub-group size */
-
-        if constexpr (enable_sg_loadstore && UnaryOperatorT::is_constant::value)
-        {
-            // value of operator is known to be a known constant
-            constexpr resT const_val = UnaryOperatorT::constant_value;
-
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-            if (base + elems_per_wi * sgSize < nelems_) {
-                static constexpr sycl::vec<resT, vec_sz> res_vec(const_val);
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    out[k] = const_val;
-                }
-            }
-        }
-        else if constexpr (enable_sg_loadstore &&
-                           UnaryOperatorT::supports_sg_loadstore::value &&
-                           UnaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-            if (base + elems_per_wi * sgSize < nelems_) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto in_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in[offset]);
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    const sycl::vec<argT, vec_sz> x =
-                        sub_group_load<vec_sz>(sg, in_multi_ptr);
-                    const sycl::vec<resT, vec_sz> res_vec = op(x);
-                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    // scalar call
-                    out[k] = op(in[k]);
-                }
-            }
-        }
-        else if constexpr (enable_sg_loadstore &&
-                           UnaryOperatorT::supports_sg_loadstore::value &&
-                           std::is_same_v<resT, argT>)
-        {
-            // default: use scalar-value function
-
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto in_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in[offset]);
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    sycl::vec<argT, vec_sz> arg_vec =
-                        sub_group_load<vec_sz>(sg, in_multi_ptr);
-#pragma unroll
-                    for (std::uint32_t k = 0; k < vec_sz; ++k) {
-                        arg_vec[k] = op(arg_vec[k]);
-                    }
-                    sub_group_store<vec_sz>(sg, arg_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    out[k] = op(in[k]);
-                }
-            }
-        }
-        else if constexpr (enable_sg_loadstore &&
-                           UnaryOperatorT::supports_sg_loadstore::value)
-        {
-            // default: use scalar-value function
-
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto in_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in[offset]);
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    const sycl::vec<argT, vec_sz> arg_vec =
-                        sub_group_load<vec_sz>(sg, in_multi_ptr);
-                    sycl::vec<resT, vec_sz> res_vec;
-#pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
-                        res_vec[k] = op(arg_vec[k]);
-                    }
-                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    out[k] = op(in[k]);
-                }
-            }
-        }
-        else {
-            const std::uint16_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-            const std::uint16_t elems_per_sg = sgSize * elems_per_wi;
-
-            const std::size_t start =
-                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems_, start + elems_per_sg);
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                out[offset] = op(in[offset]);
-            }
-        }
-    }
-};
-
-template <typename argT, typename resT, typename IndexerT, typename UnaryOpT>
-struct UnaryStridedFunctor
-{
-private:
-    const argT *inp_ = nullptr;
-    resT *res_ = nullptr;
-    IndexerT inp_out_indexer_;
-
-public:
-    UnaryStridedFunctor(const argT *inp_p,
-                        resT *res_p,
-                        const IndexerT &inp_out_indexer)
-        : inp_(inp_p), res_(res_p), inp_out_indexer_(inp_out_indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wid) const
-    {
-        const auto &offsets_ = inp_out_indexer_(wid.get(0));
-        const ssize_t &inp_offset = offsets_.get_first_offset();
-        const ssize_t &res_offset = offsets_.get_second_offset();
-
-        UnaryOpT op{};
-
-        res_[res_offset] = op(inp_[inp_offset]);
-    }
-};
-
-template <typename SizeT>
-SizeT select_lws(const sycl::device &, SizeT n_work_items_needed)
-{
-    // TODO: make the decision based on device descriptors
-
-    // constexpr SizeT few_threshold = (SizeT(1) << 17);
-    static constexpr SizeT med_threshold = (SizeT(1) << 21);
-
-    const SizeT lws =
-        (n_work_items_needed <= med_threshold ? SizeT(128) : SizeT(256));
-
-    return lws;
-}
-
-template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
-          template <typename A,
-                    typename R,
-                    std::uint8_t vs,
-                    std::uint8_t nv,
-                    bool enable>
-          class ContigFunctorT,
-          template <typename A, typename R, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u>
-sycl::event unary_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
-    const std::size_t n_work_items_needed = nelems / elems_per_wi;
-    const std::size_t lws =
-        select_lws(exec_q.get_device(), n_work_items_needed);
-
-    const std::size_t n_groups =
-        ((nelems + lws * elems_per_wi - 1) / (lws * elems_per_wi));
-    const auto gws_range = sycl::range<1>(n_groups * lws);
-    const auto lws_range = sycl::range<1>(lws);
-
-    using resTy = typename UnaryOutputType<argTy>::value_type;
-    using BaseKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
-
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_p);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        if (is_aligned<required_alignment>(arg_p) &&
-            is_aligned<required_alignment>(res_p))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName = BaseKernelName;
-            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                        enable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg_tp, res_tp, nelems));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = false;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
-            using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                        disable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg_tp, res_tp, nelems));
-        }
-    });
-
-    return comp_ev;
-}
-
-template <typename argTy,
-          template <typename T>
-          class UnaryOutputType,
-          template <typename A, typename R, typename I>
-          class StridedFunctorT,
-          template <typename A, typename R, typename I>
-          class kernel_name>
-sycl::event
-unary_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        using resTy = typename UnaryOutputType<argTy>::value_type;
-        using IndexerT =
-            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides};
-
-        const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
-
-        using Impl = StridedFunctorT<argTy, resTy, IndexerT>;
-
-        cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
-            {nelems}, Impl(arg_tp, res_tp, indexer));
-    });
-    return comp_ev;
-}
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          typename BinaryOperatorT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-struct BinaryContigFunctor
-{
-private:
-    const argT1 *in1 = nullptr;
-    const argT2 *in2 = nullptr;
-    resT *out = nullptr;
-    std::size_t nelems_;
-
-public:
-    BinaryContigFunctor(const argT1 *inp1,
-                        const argT2 *inp2,
-                        resT *res,
-                        const std::size_t n_elems)
-        : in1(inp1), in2(inp2), out(res), nelems_(n_elems)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz;
-        BinaryOperatorT op{};
-        /* Each work-item processes vec_sz elements, contiguous in memory */
-        /* NOTE: work-group size must be divisible by sub-group size */
-
-        if constexpr (enable_sg_loadstore &&
-                      BinaryOperatorT::supports_sg_loadstore::value &&
-                      BinaryOperatorT::supports_vec::value && (vec_sz > 1))
-        {
-            auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-                sycl::vec<resT, vec_sz> res_vec;
-
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    std::size_t offset = base + it * sgSize;
-                    auto in1_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in1[offset]);
-                    auto in2_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in2[offset]);
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    const sycl::vec<argT1, vec_sz> arg1_vec =
-                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
-                    const sycl::vec<argT2, vec_sz> arg2_vec =
-                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
-                    res_vec = op(arg1_vec, arg2_vec);
-                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    out[k] = op(in1[k], in2[k]);
-                }
-            }
-        }
-        else if constexpr (enable_sg_loadstore &&
-                           BinaryOperatorT::supports_sg_loadstore::value)
-        {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto in1_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in1[offset]);
-                    auto in2_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&in2[offset]);
-                    auto out_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&out[offset]);
-
-                    const sycl::vec<argT1, vec_sz> arg1_vec =
-                        sub_group_load<vec_sz>(sg, in1_multi_ptr);
-                    const sycl::vec<argT2, vec_sz> arg2_vec =
-                        sub_group_load<vec_sz>(sg, in2_multi_ptr);
-
-                    sycl::vec<resT, vec_sz> res_vec;
-#pragma unroll
-                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
-                        res_vec[vec_id] =
-                            op(arg1_vec[vec_id], arg2_vec[vec_id]);
-                    }
-                    sub_group_store<vec_sz>(sg, res_vec, out_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    out[k] = op(in1[k], in2[k]);
-                }
-            }
-        }
-        else {
-            const std::size_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-            const std::size_t elems_per_sg = sgSize * elems_per_wi;
-
-            const std::size_t start =
-                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems_, start + elems_per_sg);
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                out[offset] = op(in1[offset], in2[offset]);
-            }
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          typename ThreeOffsets_IndexerT,
-          typename BinaryOperatorT>
-struct BinaryStridedFunctor
-{
-private:
-    const argT1 *in1 = nullptr;
-    const argT2 *in2 = nullptr;
-    resT *out = nullptr;
-    ThreeOffsets_IndexerT three_offsets_indexer_;
-
-public:
-    BinaryStridedFunctor(const argT1 *inp1_tp,
-                         const argT2 *inp2_tp,
-                         resT *res_tp,
-                         const ThreeOffsets_IndexerT &inps_res_indexer)
-        : in1(inp1_tp), in2(inp2_tp), out(res_tp),
-          three_offsets_indexer_(inps_res_indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wid) const
-    {
-        const auto &three_offsets_ =
-            three_offsets_indexer_(static_cast<ssize_t>(wid.get(0)));
-
-        const auto &inp1_offset = three_offsets_.get_first_offset();
-        const auto &inp2_offset = three_offsets_.get_second_offset();
-        const auto &out_offset = three_offsets_.get_third_offset();
-
-        BinaryOperatorT op{};
-        out[out_offset] = op(in1[inp1_offset], in2[inp2_offset]);
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          typename BinaryOperatorT>
-struct BinaryContigMatrixContigRowBroadcastingFunctor
-{
-private:
-    const argT1 *mat;
-    const argT2 *padded_vec;
-    resT *res;
-    std::size_t n_elems;
-    std::size_t n1;
-
-public:
-    BinaryContigMatrixContigRowBroadcastingFunctor(const argT1 *mat_tp,
-                                                   const argT2 *row_tp,
-                                                   resT *res_tp,
-                                                   std::size_t n_elems_in_mat,
-                                                   std::size_t n_elems_in_row)
-        : mat(mat_tp), padded_vec(row_tp), res(res_tp), n_elems(n_elems_in_mat),
-          n1(n_elems_in_row)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        /* NOTE: work-group size must be divisible by sub-group size */
-
-        BinaryOperatorT op{};
-        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
-
-        const auto &sg = ndit.get_sub_group();
-        const std::size_t gid = ndit.get_global_linear_id();
-
-        const std::size_t sgSize = sg.get_max_local_range()[0];
-        const std::size_t base = gid - sg.get_local_id()[0];
-
-        if (base + sgSize < n_elems) {
-            auto in1_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&mat[base]);
-
-            auto in2_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&padded_vec[base % n1]);
-
-            auto out_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&res[base]);
-
-            const argT1 mat_el = sub_group_load(sg, in1_multi_ptr);
-            const argT2 vec_el = sub_group_load(sg, in2_multi_ptr);
-
-            resT res_el = op(mat_el, vec_el);
-
-            sub_group_store(sg, res_el, out_multi_ptr);
-        }
-        else {
-            const std::size_t lane_id = sg.get_local_id()[0];
-            for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) {
-                res[k] = op(mat[k], padded_vec[k % n1]);
-            }
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          typename BinaryOperatorT>
-struct BinaryContigRowContigMatrixBroadcastingFunctor
-{
-private:
-    const argT1 *padded_vec;
-    const argT2 *mat;
-    resT *res;
-    std::size_t n_elems;
-    std::size_t n1;
-
-public:
-    BinaryContigRowContigMatrixBroadcastingFunctor(const argT1 *row_tp,
-                                                   const argT2 *mat_tp,
-                                                   resT *res_tp,
-                                                   std::size_t n_elems_in_mat,
-                                                   std::size_t n_elems_in_row)
-        : padded_vec(row_tp), mat(mat_tp), res(res_tp), n_elems(n_elems_in_mat),
-          n1(n_elems_in_row)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        /* NOTE: work-group size must be divisible by sub-group size */
-        BinaryOperatorT op{};
-        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
-
-        const auto &sg = ndit.get_sub_group();
-        std::size_t gid = ndit.get_global_linear_id();
-
-        const std::size_t sgSize = sg.get_max_local_range()[0];
-        const std::size_t base = gid - sg.get_local_id()[0];
-
-        if (base + sgSize < n_elems) {
-            auto in1_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&padded_vec[base % n1]);
-
-            auto in2_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&mat[base]);
-
-            auto out_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&res[base]);
-
-            const argT2 mat_el = sub_group_load(sg, in2_multi_ptr);
-            const argT1 vec_el = sub_group_load(sg, in1_multi_ptr);
-
-            resT res_el = op(vec_el, mat_el);
-
-            sub_group_store(sg, res_el, out_multi_ptr);
-        }
-        else {
-            const std::size_t lane_id = sg.get_local_id()[0];
-            for (std::size_t k = base + lane_id; k < n_elems; k += sgSize) {
-                res[k] = op(padded_vec[k % n1], mat[k]);
-            }
-        }
-    }
-};
-
-// Typedefs for function pointers
-
-typedef sycl::event (*unary_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*unary_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    ssize_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    ssize_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::vector<sycl::event> &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    ssize_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::vector<sycl::event> &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    ssize_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename argTy1,
-          typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    std::uint8_t vs,
-                    std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryContigFunctorT,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    std::uint8_t vs,
-                    std::uint8_t nv>
-          class kernel_name,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u>
-sycl::event binary_contig_impl(sycl::queue &exec_q,
-                               std::size_t nelems,
-                               const char *arg1_p,
-                               ssize_t arg1_offset,
-                               const char *arg2_p,
-                               ssize_t arg2_offset,
-                               char *res_p,
-                               ssize_t res_offset,
-                               const std::vector<sycl::event> &depends = {})
-{
-    const std::size_t n_work_items_needed = nelems / (n_vecs * vec_sz);
-    const std::size_t lws =
-        select_lws(exec_q.get_device(), n_work_items_needed);
-
-    const std::size_t n_groups =
-        ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-    const auto gws_range = sycl::range<1>(n_groups * lws);
-    const auto lws_range = sycl::range<1>(lws);
-
-    using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
-    using BaseKernelName = kernel_name<argTy1, argTy2, resTy, vec_sz, n_vecs>;
-
-    const argTy1 *arg1_tp =
-        reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
-    const argTy2 *arg2_tp =
-        reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        if (is_aligned<required_alignment>(arg1_tp) &&
-            is_aligned<required_alignment>(arg2_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName = BaseKernelName;
-            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
-                                              n_vecs, enable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg1_tp, arg2_tp, res_tp, nelems));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = false;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
-            using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
-                                              n_vecs, disable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg1_tp, arg2_tp, res_tp, nelems));
-        }
-    });
-    return comp_ev;
-}
-
-template <typename argTy1,
-          typename argTy2,
-          template <typename T1, typename T2>
-          class BinaryOutputType,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class BinaryStridedFunctorT,
-          template <typename T1, typename T2, typename T3, typename IndT>
-          class kernel_name>
-sycl::event
-binary_strided_impl(sycl::queue &exec_q,
-                    std::size_t nelems,
-                    int nd,
-                    const ssize_t *shape_and_strides,
-                    const char *arg1_p,
-                    ssize_t arg1_offset,
-                    const char *arg2_p,
-                    ssize_t arg2_offset,
-                    char *res_p,
-                    ssize_t res_offset,
-                    const std::vector<sycl::event> &depends,
-                    const std::vector<sycl::event> &additional_depends)
-{
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        using resTy = typename BinaryOutputType<argTy1, argTy2>::value_type;
-
-        using IndexerT =
-            typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-
-        const IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
-                               shape_and_strides};
-
-        const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
-        const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(res_p);
-
-        using Impl = BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>;
-
-        cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, IndexerT>>(
-            {nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer));
-    });
-    return comp_ev;
-}
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigMatrixContigRowBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
-sycl::event binary_contig_matrix_contig_row_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = op(mat[i,j], vec[j])
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
-    const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
-    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;
-
-    const auto &dev = exec_q.get_device();
-    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
-    // Get device-specific kernel info max_sub_group_size
-    std::size_t max_sgSize =
-        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
-
-    std::size_t n1_padded = n1 + max_sgSize;
-    auto padded_vec_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
-                                                               exec_q);
-    argT2 *padded_vec = padded_vec_owner.get();
-
-    sycl::event make_padded_vec_ev =
-        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
-            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
-
-    // sub-group spans work-items [I, I + sgSize)
-    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sub_group_load( &mat[base]) may load arrays from
-    // different rows of mat. The start corresponds to row (base / n0)
-    // We read sub_group_load(&padded_vec[(base / n0)]).
-    // The vector is padded to ensure that reads are accessible
-
-    const std::size_t lws = 128;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(make_padded_vec_ev);
-
-        auto lwsRange = sycl::range<1>(lws);
-        std::size_t n_elems = n0 * n1;
-        std::size_t n_groups = (n_elems + lws - 1) / lws;
-        auto gwsRange = sycl::range<1>(n_groups * lws);
-
-        using Impl =
-            BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>;
-
-        cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
-            sycl::nd_range<1>(gwsRange, lwsRange),
-            Impl(mat, padded_vec, res, n_elems, n1));
-    });
-
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {comp_ev}, padded_vec_owner);
-
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return comp_ev;
-}
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          template <typename T1, typename T2, typename T3>
-          class BinaryContigRowContigMatrixBroadcastFunctorT,
-          template <typename T1, typename T2, typename T3>
-          class kernel_name>
-sycl::event binary_contig_row_contig_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = op(vec[j], mat[i,j])
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    const argT1 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
-    const argT2 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
-    resT *res = reinterpret_cast<resT *>(res_p) + res_offset;
-
-    const auto &dev = exec_q.get_device();
-    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
-    // Get device-specific kernel info max_sub_group_size
-    std::size_t max_sgSize =
-        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
-
-    std::size_t n1_padded = n1 + max_sgSize;
-    auto padded_vec_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<argT2>(n1_padded,
-                                                               exec_q);
-    argT2 *padded_vec = padded_vec_owner.get();
-
-    sycl::event make_padded_vec_ev =
-        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
-            argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);
-
-    // sub-group spans work-items [I, I + sgSize)
-    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sub_group_load( &mat[base]) may load arrays from
-    // different rows of mat. The start corresponds to row (base / n0)
-    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
-    // padded to ensure that reads are accessible
-
-    const std::size_t lws = 128;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(make_padded_vec_ev);
-
-        auto lwsRange = sycl::range<1>(lws);
-        std::size_t n_elems = n0 * n1;
-        std::size_t n_groups = (n_elems + lws - 1) / lws;
-        auto gwsRange = sycl::range<1>(n_groups * lws);
-
-        using Impl =
-            BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>;
-
-        cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
-            sycl::nd_range<1>(gwsRange, lwsRange),
-            Impl(padded_vec, mat, res, n_elems, n1));
-    });
-
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {comp_ev}, padded_vec_owner);
-
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return comp_ev;
-};
-
-} // namespace elementwise_common
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
deleted file mode 100644
index 37a672d565..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-//=== common_detail.hpp -                                     - *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines common code for elementwise tensor operations.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace elementwise_detail
-{
-
-template <typename T> class populate_padded_vec_krn;
-
-template <typename T>
-sycl::event
-populate_padded_vector(sycl::queue &exec_q,
-                       const T *vec,
-                       std::size_t vec_sz,
-                       T *padded_vec,
-                       size_t padded_vec_sz,
-                       const std::vector<sycl::event> &dependent_events)
-{
-    sycl::event populate_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
-        // ensure vec contains actual data
-        cgh.depends_on(dependent_events);
-
-        sycl::range<1> gRange{padded_vec_sz};
-
-        cgh.parallel_for<class populate_padded_vec_krn<T>>(
-            gRange, [=](sycl::id<1> id) {
-                std::size_t i = id[0];
-                padded_vec[i] = vec[i % vec_sz];
-            });
-    });
-
-    return populate_padded_vec_ev;
-}
-
-} // end of namespace elementwise_detail
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
deleted file mode 100644
index 4c16f8ff8f..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ /dev/null
@@ -1,475 +0,0 @@
-//=== common_inplace.hpp -  Common code for in-place elementwise operations
-//----- *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines common code for in-place elementwise tensor operations.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/alignment.hpp"
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common_detail.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace elementwise_common
-{
-
-using dpctl::tensor::ssize_t;
-using dpctl::tensor::kernels::alignment_utils::
-    disabled_sg_loadstore_wrapper_krn;
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-using dpctl::tensor::sycl_utils::sub_group_load;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-template <typename argT,
-          typename resT,
-          typename BinaryInplaceOperatorT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-struct BinaryInplaceContigFunctor
-{
-private:
-    const argT *rhs = nullptr;
-    resT *lhs = nullptr;
-    std::size_t nelems_;
-
-public:
-    BinaryInplaceContigFunctor(const argT *rhs_tp,
-                               resT *lhs_tp,
-                               const std::size_t n_elems)
-        : rhs(rhs_tp), lhs(lhs_tp), nelems_(n_elems)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        BinaryInplaceOperatorT op{};
-        static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs;
-        /* Each work-item processes vec_sz elements, contiguous in memory */
-        /* NB: Workgroup size must be divisible by sub-group size */
-
-        if constexpr (enable_sg_loadstore &&
-                      BinaryInplaceOperatorT::supports_sg_loadstore::value &&
-                      BinaryInplaceOperatorT::supports_vec::value &&
-                      (vec_sz > 1))
-        {
-            auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto rhs_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&rhs[offset]);
-                    auto lhs_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&lhs[offset]);
-
-                    const sycl::vec<argT, vec_sz> &arg_vec =
-                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
-                    sycl::vec<resT, vec_sz> res_vec =
-                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
-                    op(res_vec, arg_vec);
-
-                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    op(lhs[k], rhs[k]);
-                }
-            }
-        }
-        else if constexpr (enable_sg_loadstore &&
-                           BinaryInplaceOperatorT::supports_sg_loadstore::value)
-        {
-            auto sg = ndit.get_sub_group();
-            std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            std::size_t base =
-                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                sg.get_group_id()[0] * sgSize);
-
-            if (base + elems_per_wi * sgSize < nelems_) {
-#pragma unroll
-                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
-                    const std::size_t offset = base + it * sgSize;
-                    auto rhs_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&rhs[offset]);
-                    auto lhs_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&lhs[offset]);
-
-                    const sycl::vec<argT, vec_sz> arg_vec =
-                        sub_group_load<vec_sz>(sg, rhs_multi_ptr);
-                    sycl::vec<resT, vec_sz> res_vec =
-                        sub_group_load<vec_sz>(sg, lhs_multi_ptr);
-#pragma unroll
-                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
-                        op(res_vec[vec_id], arg_vec[vec_id]);
-                    }
-                    sub_group_store<vec_sz>(sg, res_vec, lhs_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems_; k += sgSize) {
-                    op(lhs[k], rhs[k]);
-                }
-            }
-        }
-        else {
-            const std::size_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-            const std::size_t elems_per_sg = elems_per_wi * sgSize;
-
-            const std::size_t start =
-                (gid / sgSize) * (elems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems_, start + elems_per_sg);
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                op(lhs[offset], rhs[offset]);
-            }
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          typename TwoOffsets_IndexerT,
-          typename BinaryInplaceOperatorT>
-struct BinaryInplaceStridedFunctor
-{
-private:
-    const argT *rhs = nullptr;
-    resT *lhs = nullptr;
-    TwoOffsets_IndexerT two_offsets_indexer_;
-
-public:
-    BinaryInplaceStridedFunctor(const argT *rhs_tp,
-                                resT *lhs_tp,
-                                const TwoOffsets_IndexerT &inp_res_indexer)
-        : rhs(rhs_tp), lhs(lhs_tp), two_offsets_indexer_(inp_res_indexer)
-    {
-    }
-
-    void operator()(sycl::id<1> wid) const
-    {
-        const auto &two_offsets_ =
-            two_offsets_indexer_(static_cast<ssize_t>(wid.get(0)));
-
-        const auto &inp_offset = two_offsets_.get_first_offset();
-        const auto &lhs_offset = two_offsets_.get_second_offset();
-
-        BinaryInplaceOperatorT op{};
-        op(lhs[lhs_offset], rhs[inp_offset]);
-    }
-};
-
-template <typename argT, typename resT, typename BinaryOperatorT>
-struct BinaryInplaceRowMatrixBroadcastingFunctor
-{
-private:
-    const argT *padded_vec;
-    resT *mat;
-    std::size_t n_elems;
-    std::size_t n1;
-
-public:
-    BinaryInplaceRowMatrixBroadcastingFunctor(const argT *row_tp,
-                                              resT *mat_tp,
-                                              std::size_t n_elems_in_mat,
-                                              std::size_t n_elems_in_row)
-        : padded_vec(row_tp), mat(mat_tp), n_elems(n_elems_in_mat),
-          n1(n_elems_in_row)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        /* Workgroup size is expected to be a multiple of sub-group size */
-        BinaryOperatorT op{};
-        static_assert(BinaryOperatorT::supports_sg_loadstore::value);
-
-        auto sg = ndit.get_sub_group();
-        const std::size_t gid = ndit.get_global_linear_id();
-
-        std::uint8_t sgSize = sg.get_max_local_range()[0];
-        std::size_t base = gid - sg.get_local_id()[0];
-
-        if (base + sgSize < n_elems) {
-            auto in_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&padded_vec[base % n1]);
-
-            auto out_multi_ptr = sycl::address_space_cast<
-                sycl::access::address_space::global_space,
-                sycl::access::decorated::yes>(&mat[base]);
-
-            const argT vec_el = sub_group_load(sg, in_multi_ptr);
-            resT mat_el = sub_group_load(sg, out_multi_ptr);
-
-            op(mat_el, vec_el);
-
-            sub_group_store(sg, mat_el, out_multi_ptr);
-        }
-        else {
-            const std::size_t start = base + sg.get_local_id()[0];
-            for (std::size_t k = start; k < n_elems; k += sgSize) {
-                op(mat[k], padded_vec[k % n1]);
-            }
-        }
-    }
-};
-
-// Typedefs for function pointers
-
-typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &,
-    const std::vector<sycl::event> &);
-
-typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::vector<sycl::event> &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    ssize_t,
-    char *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename argTy,
-          typename resTy,
-          template <typename T1,
-                    typename T2,
-                    std::uint8_t vs,
-                    std::uint8_t nv,
-                    bool enable_sg_loadstore>
-          class BinaryInplaceContigFunctorT,
-          template <typename T1, typename T2, std::uint8_t vs, std::uint8_t nv>
-          class kernel_name,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u>
-sycl::event
-binary_inplace_contig_impl(sycl::queue &exec_q,
-                           std::size_t nelems,
-                           const char *rhs_p,
-                           ssize_t rhs_offset,
-                           char *lhs_p,
-                           ssize_t lhs_offset,
-                           const std::vector<sycl::event> &depends = {})
-{
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lws = 128;
-        const std::size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
-
-        const argTy *arg_tp =
-            reinterpret_cast<const argTy *>(rhs_p) + rhs_offset;
-        resTy *res_tp = reinterpret_cast<resTy *>(lhs_p) + lhs_offset;
-
-        if (is_aligned<required_alignment>(arg_tp) &&
-            is_aligned<required_alignment>(res_tp))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
-            using Impl =
-                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                            enable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg_tp, res_tp, nelems));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = true;
-            using InnerKernelName = kernel_name<argTy, resTy, vec_sz, n_vecs>;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
-            using Impl =
-                BinaryInplaceContigFunctorT<argTy, resTy, vec_sz, n_vecs,
-                                            disable_sg_loadstore>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                Impl(arg_tp, res_tp, nelems));
-        }
-    });
-    return comp_ev;
-}
-
-template <typename argTy,
-          typename resTy,
-          template <typename T1, typename T2, typename IndT>
-          class BinaryInplaceStridedFunctorT,
-          template <typename T1, typename T2, typename IndT>
-          class kernel_name>
-sycl::event
-binary_inplace_strided_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            int nd,
-                            const ssize_t *shape_and_strides,
-                            const char *rhs_p,
-                            ssize_t rhs_offset,
-                            char *lhs_p,
-                            ssize_t lhs_offset,
-                            const std::vector<sycl::event> &depends,
-                            const std::vector<sycl::event> &additional_depends)
-{
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.depends_on(additional_depends);
-
-        using IndexerT =
-            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const IndexerT indexer{nd, rhs_offset, lhs_offset, shape_and_strides};
-
-        const argTy *arg_tp = reinterpret_cast<const argTy *>(rhs_p);
-        resTy *res_tp = reinterpret_cast<resTy *>(lhs_p);
-
-        using Impl = BinaryInplaceStridedFunctorT<argTy, resTy, IndexerT>;
-
-        cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
-            {nelems}, Impl(arg_tp, res_tp, indexer));
-    });
-    return comp_ev;
-}
-
-template <typename argT,
-          typename resT,
-          template <typename T1, typename T3>
-          class BinaryInplaceRowMatrixBroadcastFunctorT,
-          template <typename T1, typename T3>
-          class kernel_name>
-sycl::event binary_inplace_row_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    const argT *vec = reinterpret_cast<const argT *>(vec_p) + vec_offset;
-    resT *mat = reinterpret_cast<resT *>(mat_p) + mat_offset;
-
-    const auto &dev = exec_q.get_device();
-    const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
-    // Get device-specific kernel info max_sub_group_size
-    std::size_t max_sgSize =
-        *(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
-
-    std::size_t n1_padded = n1 + max_sgSize;
-    auto padded_vec_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<argT>(n1_padded,
-                                                              exec_q);
-    argT *padded_vec = padded_vec_owner.get();
-
-    sycl::event make_padded_vec_ev =
-        dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
-            argT>(exec_q, vec, n1, padded_vec, n1_padded, depends);
-
-    // sub-group spans work-items [I, I + sgSize)
-    // base = ndit.get_global_linear_id() - sg.get_local_id()[0]
-    // Generically, sub_group_load( &mat[base]) may load arrays from
-    // different rows of mat. The start corresponds to row (base / n0)
-    // We read sub_group_load(&padded_vec[(base / n0)]). The vector is
-    // padded to ensure that reads are accessible
-
-    const std::size_t lws = 128;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(make_padded_vec_ev);
-
-        auto lwsRange = sycl::range<1>(lws);
-        std::size_t n_elems = n0 * n1;
-        std::size_t n_groups = (n_elems + lws - 1) / lws;
-        auto gwsRange = sycl::range<1>(n_groups * lws);
-
-        using Impl = BinaryInplaceRowMatrixBroadcastFunctorT<argT, resT>;
-
-        cgh.parallel_for<class kernel_name<argT, resT>>(
-            sycl::nd_range<1>(gwsRange, lwsRange),
-            Impl(padded_vec, mat, n_elems, n1));
-    });
-
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {comp_ev}, padded_vec_owner);
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return comp_ev;
-}
-
-} // namespace elementwise_common
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
deleted file mode 100644
index 6bf9658fc6..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-//=== conj.hpp -   Unary function CONJ                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of CONJ(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace conj
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct ConjFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using rT = typename argT::value_type;
-
-            return exprm_ns::conj(exprm_ns::complex<rT>(in)); // conj(in);
-        }
-        else {
-            if constexpr (!std::is_same_v<argT, bool>)
-                static_assert(std::is_same_v<resT, argT>);
-            return in;
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using ConjContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           ConjFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using ConjStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, ConjFunctor<argTy, resTy>>;
-
-template <typename T> struct ConjOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct ConjContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class conj_contig_kernel;
-
-template <typename argTy>
-sycl::event conj_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using ConjHS = hyperparam_detail::ConjContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = ConjHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = ConjHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, ConjOutputType, ConjContigFunctor, conj_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct ConjContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!ConjOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = conj_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct ConjTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::conj(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename ConjOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class conj_strided_kernel;
-
-template <typename argTy>
-sycl::event
-conj_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, ConjOutputType, ConjStridedFunctor, conj_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct ConjStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!ConjOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = conj_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace conj
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
deleted file mode 100644
index 5762053821..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-//=== copysign.hpp -   Binary function COPYSIGN         ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace copysign
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
-{
-
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return sycl::copysign(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = sycl::copysign(in1, in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using CopysignContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            CopysignFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    CopysignFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct CopysignOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct CopysignContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class copysign_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event copysign_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg1_p,
-                                 ssize_t arg1_offset,
-                                 const char *arg2_p,
-                                 ssize_t arg2_offset,
-                                 char *res_p,
-                                 ssize_t res_offset,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using CopySignHS =
-        hyperparam_detail::CopysignContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
-        copysign_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct CopysignContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = copysign_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct CopysignTypeMapFactory
-{
-    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename CopysignOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class copysign_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-copysign_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg1_p,
-                      ssize_t arg1_offset,
-                      const char *arg2_p,
-                      ssize_t arg2_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
-        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                 arg1_offset, arg2_p, arg2_offset, res_p,
-                                 res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct CopysignStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = copysign_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace copysign
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
deleted file mode 100644
index 81608346a7..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ /dev/null
@@ -1,300 +0,0 @@
-//=== cos.hpp -   Unary function COS                     ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of COS(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace cos
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct CosFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            realT const &in_re = std::real(in);
-            realT const &in_im = std::imag(in);
-
-            const bool in_re_finite = std::isfinite(in_re);
-            const bool in_im_finite = std::isfinite(in_im);
-
-            /*
-             * Handle the nearly-non-exceptional cases where
-             * real and imaginary parts of input are finite.
-             */
-            if (in_re_finite && in_im_finite) {
-                return exprm_ns::cos(exprm_ns::complex<realT>(in)); // cos(in);
-            }
-
-            /*
-             * since cos(in) = cosh(I * in), for special cases,
-             * we return cosh(I * in).
-             */
-            const realT x = -in_im;
-            const realT y = in_re;
-
-            const bool xfinite = in_im_finite;
-            const bool yfinite = in_re_finite;
-            /*
-             * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as dNaN.
-             *
-             * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             */
-            if (x == realT(0) && !yfinite) {
-                const realT y_m_y = (y - y);
-                const realT res_im = sycl::copysign(realT(0), x * y_m_y);
-                return resT{y_m_y, res_im};
-            }
-
-            /*
-             * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
-             *
-             * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
-             * The sign of 0 in the result is unspecified.
-             */
-            if (y == realT(0) && !xfinite) {
-                const realT res_im = sycl::copysign(realT(0), x) * y;
-                return resT{x * x, res_im};
-            }
-
-            /*
-             * cosh(x +- I Inf) = dNaN + I dNaN.
-             *
-             * cosh(x + I NaN) = d(NaN) + I d(NaN).
-             */
-            if (xfinite && !yfinite) {
-                const realT y_m_y = (y - y);
-                return resT{y_m_y, x * y_m_y};
-            }
-
-            /*
-             * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
-             *
-             * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
-             * The sign of Inf in the result is unspecified.  Choice = always +.
-             *
-             * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
-             */
-            if (std::isinf(x)) {
-                if (!yfinite) {
-                    return resT{x * x, sycl::copysign(q_nan, x)};
-                }
-                return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)};
-            }
-
-            /*
-             * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
-             *
-             * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
-             *
-             * cosh(NaN + I y)    = d(NaN) + I d(NaN).
-             */
-            return resT{(x * x) * q_nan, (x + x) * q_nan};
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::cos(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using CosContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           CosFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using CosStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, CosFunctor<argTy, resTy>>;
-
-template <typename T> struct CosOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct CosContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class cos_contig_kernel;
-
-template <typename argTy>
-sycl::event cos_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using CosHS = hyperparam_detail::CosContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = CosHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = CosHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, CosOutputType, CosContigFunctor, cos_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct CosContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!CosOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cos_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct CosTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::cos(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename CosOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class cos_strided_kernel;
-
-template <typename argTy>
-sycl::event cos_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, CosOutputType, CosStridedFunctor, cos_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct CosStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!CosOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cos_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace cos
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
deleted file mode 100644
index be3e76cb2b..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ /dev/null
@@ -1,290 +0,0 @@
-//=== cosh.hpp -   Unary function COSH                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of COSH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace cosh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct CoshFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            const bool xfinite = std::isfinite(x);
-            const bool yfinite = std::isfinite(y);
-
-            /*
-             * Handle the nearly-non-exceptional cases where
-             * real and imaginary parts of input are finite.
-             */
-            if (xfinite && yfinite) {
-                return exprm_ns::cosh(
-                    exprm_ns::complex<realT>(in)); // cosh(in);
-            }
-
-            /*
-             * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as dNaN.
-             *
-             * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             */
-            if (x == realT(0) && !yfinite) {
-                const realT res_im = sycl::copysign(realT(0), x * q_nan);
-                return resT{q_nan, res_im};
-            }
-
-            /*
-             * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
-             *
-             * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
-             * The sign of 0 in the result is unspecified.
-             */
-            if (y == realT(0) && !xfinite) {
-                const realT res_im = sycl::copysign(realT(0), x) * y;
-                return resT{x * x, res_im};
-            }
-
-            /*
-             * cosh(x +- I Inf) = dNaN + I dNaN.
-             *
-             * cosh(x + I NaN) = d(NaN) + I d(NaN).
-             */
-            if (xfinite && !yfinite) {
-                return resT{q_nan, x * q_nan};
-            }
-
-            /*
-             * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
-             *
-             * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
-             * The sign of Inf in the result is unspecified.  Choice = always +.
-             *
-             * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
-             */
-            if (std::isinf(x)) {
-                if (!yfinite) {
-                    return resT{x * x, x * q_nan};
-                }
-                return resT{(x * x) * sycl::cos(y), x * sycl::sin(y)};
-            }
-
-            /*
-             * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
-             *
-             * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
-             *
-             * cosh(NaN + I y)    = d(NaN) + I d(NaN).
-             */
-            return resT{(x * x) * (y - y), (x + x) * (y - y)};
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::cosh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using CoshContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           CoshFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using CoshStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, CoshFunctor<argTy, resTy>>;
-
-template <typename T> struct CoshOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct CoshContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class cosh_contig_kernel;
-
-template <typename argTy>
-sycl::event cosh_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using CoshHS = hyperparam_detail::CoshContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = CoshHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = CoshHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, CoshOutputType, CoshContigFunctor, cosh_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct CoshContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!CoshOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cosh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct CoshTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::cosh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename CoshOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class cosh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-cosh_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, CoshOutputType, CoshStridedFunctor, cosh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct CoshStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!CoshOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = cosh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace cosh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
deleted file mode 100644
index 682e9f397e..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-//=== equal.hpp -   Binary function EQUAL                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of equality of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace equal
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct EqualFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
-            using realT1 = typename argT1::value_type;
-            using realT2 = typename argT2::value_type;
-
-            return exprm_ns::complex<realT1>(in1) ==
-                   exprm_ns::complex<realT2>(in2);
-        }
-        else {
-            if constexpr (std::is_integral_v<argT1> &&
-                          std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
-                if constexpr (std::is_signed_v<argT1> &&
-                              !std::is_signed_v<argT2>)
-                {
-                    return (in1 < 0) ? false : (static_cast<argT2>(in1) == in2);
-                }
-                else {
-                    if constexpr (!std::is_signed_v<argT1> &&
-                                  std::is_signed_v<argT2>)
-                    {
-                        return (in2 < 0) ? false
-                                         : (in1 == static_cast<argT1>(in2));
-                    }
-                }
-            }
-            else {
-                return (in1 == in2);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = (in1 == in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using EqualContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            EqualFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using EqualStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             EqualFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct EqualOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct EqualContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class equal_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event equal_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg1_p,
-                              ssize_t arg1_offset,
-                              const char *arg2_p,
-                              ssize_t arg2_offset,
-                              char *res_p,
-                              ssize_t res_offset,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using EqualHS =
-        hyperparam_detail::EqualContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = EqualHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = EqualHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, EqualOutputType, EqualContigFunctor,
-        equal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                             arg1_offset, arg2_p, arg2_offset,
-                                             res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct EqualContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!EqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = equal_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct EqualTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()==(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename EqualOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class equal_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-equal_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg1_p,
-                   ssize_t arg1_offset,
-                   const char *arg2_p,
-                   ssize_t arg2_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, EqualOutputType, EqualStridedFunctor,
-        equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                              arg1_offset, arg2_p, arg2_offset, res_p,
-                              res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct EqualStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!EqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = equal_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace equal
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
deleted file mode 100644
index 770229f804..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ /dev/null
@@ -1,258 +0,0 @@
-//=== exp.hpp -   Unary function EXP                     ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of EXP(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace exp
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct ExpFunctor
-{
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-            if (std::isfinite(x)) {
-                if (std::isfinite(y)) {
-                    return exprm_ns::exp(
-                        exprm_ns::complex<realT>(in)); // exp(in);
-                }
-                else {
-                    return resT{q_nan, q_nan};
-                }
-            }
-            else if (std::isnan(x)) {
-                /* x is nan */
-                if (y == realT(0)) {
-                    return resT{in};
-                }
-                else {
-                    return resT{x, q_nan};
-                }
-            }
-            else {
-                if (!sycl::signbit(x)) { /* x is +inf */
-                    if (y == realT(0)) {
-                        return resT{x, y};
-                    }
-                    else if (std::isfinite(y)) {
-                        return resT{x * sycl::cos(y), x * sycl::sin(y)};
-                    }
-                    else {
-                        /* x = +inf, y = +-inf || nan */
-                        return resT{x, q_nan};
-                    }
-                }
-                else { /* x is -inf */
-                    if (std::isfinite(y)) {
-                        realT exp_x = sycl::exp(x);
-                        return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)};
-                    }
-                    else {
-                        /* x = -inf, y = +-inf || nan */
-                        return resT{0, 0};
-                    }
-                }
-            }
-        }
-        else {
-            return sycl::exp(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using ExpContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           ExpFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using ExpStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, ExpFunctor<argTy, resTy>>;
-
-template <typename T> struct ExpOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct ExpContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class exp_contig_kernel;
-
-template <typename argTy>
-sycl::event exp_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using ExpHS = hyperparam_detail::ExpContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = ExpHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = ExpHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, ExpOutputType, ExpContigFunctor, exp_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct ExpContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!ExpOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = exp_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct ExpTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::exp(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename ExpOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class exp_strided_kernel;
-
-template <typename argTy>
-sycl::event exp_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, ExpOutputType, ExpStridedFunctor, exp_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct ExpStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!ExpOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = exp_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace exp
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
deleted file mode 100644
index 44cef5fc9e..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ /dev/null
@@ -1,262 +0,0 @@
-//=== exp2.hpp -   Unary function EXP2                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of EXP2(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace exp2
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct Exp2Functor
-{
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            const argT tmp = in * sycl::log(realT(2));
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(tmp);
-            const realT y = std::imag(tmp);
-            if (std::isfinite(x)) {
-                if (std::isfinite(y)) {
-                    return exprm_ns::exp(exprm_ns::complex<realT>(tmp));
-                }
-                else {
-                    return resT{q_nan, q_nan};
-                }
-            }
-            else if (std::isnan(x)) {
-                /* x is nan */
-                if (y == realT(0)) {
-                    return resT{in};
-                }
-                else {
-                    return resT{x, q_nan};
-                }
-            }
-            else {
-                if (!sycl::signbit(x)) { /* x is +inf */
-                    if (y == realT(0)) {
-                        return resT{x, y};
-                    }
-                    else if (std::isfinite(y)) {
-                        return resT{x * sycl::cos(y), x * sycl::sin(y)};
-                    }
-                    else {
-                        /* x = +inf, y = +-inf || nan */
-                        return resT{x, q_nan};
-                    }
-                }
-                else { /* x is -inf */
-                    if (std::isfinite(y)) {
-                        realT exp_x = sycl::exp(x);
-                        return resT{exp_x * sycl::cos(y), exp_x * sycl::sin(y)};
-                    }
-                    else {
-                        /* x = -inf, y = +-inf || nan */
-                        return resT{0, 0};
-                    }
-                }
-            }
-        }
-        else {
-            return sycl::exp2(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Exp2ContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           Exp2Functor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using Exp2StridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, Exp2Functor<argTy, resTy>>;
-
-template <typename T> struct Exp2OutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct Exp2ContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class exp2_contig_kernel;
-
-template <typename argTy>
-sycl::event exp2_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using Exp2HS = hyperparam_detail::Exp2ContigHyperparameterSet<argTy>;
-
-    static constexpr std::uint8_t vec_sz = Exp2HS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Exp2HS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct Exp2ContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Exp2OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = exp2_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct Exp2TypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::exp2(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Exp2OutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class exp2_strided_kernel;
-
-template <typename argTy>
-sycl::event
-exp2_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct Exp2StridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Exp2OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = exp2_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace exp2
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
deleted file mode 100644
index de0bf515b4..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ /dev/null
@@ -1,273 +0,0 @@
-//=== expm1.hpp -   Unary function EXPM1                   ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of EXPM1(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace expm1
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct Expm1Functor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            // expm1(x + I*y) = expm1(x)*cos(y) - 2*sin(y / 2)^2 +
-            // I*exp(x)*sin(y)
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            // special cases
-            if (std::isinf(x)) {
-                if (x > realT(0)) {
-                    // positive infinity cases
-                    if (!std::isfinite(y)) {
-                        return resT{x, std::numeric_limits<realT>::quiet_NaN()};
-                    }
-                    else if (y == realT(0)) {
-                        return in;
-                    }
-                    else {
-                        return (resT{sycl::copysign(x, sycl::cos(y)),
-                                     sycl::copysign(x, sycl::sin(y))});
-                    }
-                }
-                else {
-                    // negative infinity cases
-                    if (!std::isfinite(y)) {
-                        // copy sign of y to guarantee
-                        // conj(expm1(x)) == expm1(conj(x))
-                        return resT{realT(-1), sycl::copysign(realT(0), y)};
-                    }
-                    else {
-                        return resT{realT(-1),
-                                    sycl::copysign(realT(0), sycl::sin(y))};
-                    }
-                }
-            }
-
-            if (std::isnan(x)) {
-                if (y == realT(0)) {
-                    return in;
-                }
-                else {
-                    return resT{std::numeric_limits<realT>::quiet_NaN(),
-                                std::numeric_limits<realT>::quiet_NaN()};
-                }
-            }
-
-            // x, y finite numbers
-            const realT cosY_val = sycl::cos(y);
-            const realT sinY_val = (y == 0) ? y : sycl::sin(y);
-            const realT sinhalfY_val = (y == 0) ? y : sycl::sin(y / 2);
-
-            const realT res_re =
-                sycl::expm1(x) * cosY_val - 2 * sinhalfY_val * sinhalfY_val;
-            realT res_im = sycl::exp(x) * sinY_val;
-            return resT{res_re, res_im};
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            static_assert(std::is_same_v<argT, resT>);
-            if (in == 0) {
-                return in;
-            }
-            return sycl::expm1(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Expm1ContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           Expm1Functor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using Expm1StridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, Expm1Functor<argTy, resTy>>;
-
-template <typename T> struct Expm1OutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct Expm1ContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class expm1_contig_kernel;
-
-template <typename argTy>
-sycl::event expm1_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using Expm1HS = hyperparam_detail::Expm1ContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = Expm1HS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Expm1HS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, Expm1OutputType, Expm1ContigFunctor, expm1_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct Expm1ContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Expm1OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = expm1_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct Expm1TypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::expm1(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Expm1OutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class expm1_strided_kernel;
-
-template <typename argTy>
-sycl::event
-expm1_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, Expm1OutputType, Expm1StridedFunctor, expm1_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct Expm1StridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Expm1OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = expm1_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace expm1
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
deleted file mode 100644
index b80c1f3b54..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-//=== floor.hpp -   Unary function FLOOR                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of FLOOR(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace floor
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct FloorFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (std::is_integral_v<argT>) {
-            return in;
-        }
-        else {
-            if (in == 0) {
-                return in;
-            }
-            return sycl::floor(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using FloorContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           FloorFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using FloorStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, FloorFunctor<argTy, resTy>>;
-
-template <typename T> struct FloorOutputType
-{
-    using value_type =
-        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
-                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
-                                  td_ns::TypeMapResultEntry<T, sycl::half>,
-                                  td_ns::TypeMapResultEntry<T, float>,
-                                  td_ns::TypeMapResultEntry<T, double>,
-                                  td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct FloorContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class floor_contig_kernel;
-
-template <typename argTy>
-sycl::event floor_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using FloorHS = hyperparam_detail::FloorContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = FloorHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = FloorHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, FloorOutputType, FloorContigFunctor, floor_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct FloorContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct FloorTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::floor(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename FloorOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class floor_strided_kernel;
-
-template <typename argTy>
-sycl::event
-floor_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, FloorOutputType, FloorStridedFunctor, floor_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct FloorStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace floor
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
deleted file mode 100644
index 21b9304e53..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ /dev/null
@@ -1,542 +0,0 @@
-//=== floor_divide.hpp -  Binary function FLOOR_DIVIDE  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of FLOOR_DIVIDE(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace floor_divide
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct FloorDivideFunctor
-{
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
-            if (in2 == argT2(0)) {
-                return resT(0);
-            }
-            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
-                auto div = in1 / in2;
-                auto mod = in1 % in2;
-                auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
-                return (div - corr);
-            }
-            else {
-                return (in1 / in2);
-            }
-        }
-        else {
-            auto div = in1 / in2;
-            return (div == resT(0)) ? div : resT(sycl::floor(div));
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        if constexpr (std::is_integral_v<resT>) {
-            sycl::vec<resT, vec_sz> res;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                if (in2[i] == argT2(0)) {
-                    res[i] = resT(0);
-                }
-                else {
-                    res[i] = in1[i] / in2[i];
-                    if constexpr (std::is_signed_v<resT>) {
-                        auto mod = in1[i] % in2[i];
-                        auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
-                        res[i] -= corr;
-                    }
-                }
-            }
-            return res;
-        }
-        else {
-            auto tmp = in1 / in2;
-            using tmpT = typename decltype(tmp)::element_type;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                if (in2[i] != argT2(0)) {
-                    tmp[i] = sycl::floor(tmp[i]);
-                }
-            }
-            if constexpr (std::is_same_v<resT, tmpT>) {
-                return tmp;
-            }
-            else {
-                using dpctl::tensor::type_utils::vec_cast;
-                return vec_cast<resT, tmpT, vec_sz>(tmp);
-            }
-        }
-    }
-
-private:
-    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using FloorDivideContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    FloorDivideFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using FloorDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    FloorDivideFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct FloorDivideOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct FloorDivideContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class floor_divide_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-floor_divide_contig_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends = {})
-{
-    using FloorDivideHS =
-        hyperparam_detail::FloorDivideContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, FloorDivideOutputType, FloorDivideContigFunctor,
-        floor_divide_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct FloorDivideContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_divide_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct FloorDivideTypeMapFactory
-{
-    /*! @brief get typeid for output type of floor_divide(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename FloorDivideOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class floor_divide_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-floor_divide_strided_impl(sycl::queue &exec_q,
-                          std::size_t nelems,
-                          int nd,
-                          const ssize_t *shape_and_strides,
-                          const char *arg1_p,
-                          ssize_t arg1_offset,
-                          const char *arg2_p,
-                          ssize_t arg2_offset,
-                          char *res_p,
-                          ssize_t res_offset,
-                          const std::vector<sycl::event> &depends,
-                          const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, FloorDivideOutputType, FloorDivideStridedFunctor,
-        floor_divide_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct FloorDivideStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_divide_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct FloorDivideInplaceFunctor
-{
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    void operator()(resT &in1, const argT &in2) const
-    {
-        if constexpr (std::is_integral_v<resT>) {
-            if (in2 == argT(0)) {
-                in1 = 0;
-                return;
-            }
-            if constexpr (std::is_signed_v<resT>) {
-                auto tmp = in1;
-                in1 /= in2;
-                auto mod = tmp % in2;
-                auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
-                in1 -= corr;
-            }
-            else {
-                in1 /= in2;
-            }
-        }
-        else {
-            in1 /= in2;
-            if (in1 == resT(0)) {
-                return;
-            }
-            in1 = sycl::floor(in1);
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &in1,
-                    const sycl::vec<argT, vec_sz> &in2) const
-    {
-        if constexpr (std::is_integral_v<resT>) {
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                if (in2[i] == argT(0)) {
-                    in1[i] = 0;
-                }
-                else {
-                    if constexpr (std::is_signed_v<resT>) {
-                        auto tmp = in1[i];
-                        in1[i] /= in2[i];
-                        auto mod = tmp % in2[i];
-                        auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
-                        in1[i] -= corr;
-                    }
-                    else {
-                        in1[i] /= in2[i];
-                    }
-                }
-            }
-        }
-        else {
-            in1 /= in2;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                if (in2[i] != argT(0)) {
-                    in1[i] = sycl::floor(in1[i]);
-                }
-            }
-        }
-    }
-
-private:
-    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using FloorDivideInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        FloorDivideInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using FloorDivideInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        FloorDivideInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class floor_divide_inplace_contig_kernel;
-
-/* @brief Types supported by in-place floor division */
-template <typename argTy, typename resTy>
-struct FloorDivideInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct FloorDivideInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x //= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (FloorDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-floor_divide_inplace_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg_p,
-                                 ssize_t arg_offset,
-                                 char *res_p,
-                                 ssize_t res_offset,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using FloorDivideHS =
-        hyperparam_detail::FloorDivideContigHyperparameterSet<resTy, argTy>;
-
-    static constexpr std::uint8_t vec_sz = FloorDivideHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = FloorDivideHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, FloorDivideInplaceContigFunctor,
-        floor_divide_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct FloorDivideInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorDivideInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_divide_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class floor_divide_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event floor_divide_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, FloorDivideInplaceStridedFunctor,
-        floor_divide_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct FloorDivideInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!FloorDivideInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = floor_divide_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace floor_divide
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
deleted file mode 100644
index bdde27e175..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ /dev/null
@@ -1,315 +0,0 @@
-//=== greater.hpp -   Binary function GREATER              ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace greater
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct GreaterFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::greater_complex;
-            return greater_complex<argT1>(in1, in2);
-        }
-        else {
-            if constexpr (std::is_integral_v<argT1> &&
-                          std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
-                if constexpr (std::is_signed_v<argT1> &&
-                              !std::is_signed_v<argT2>)
-                {
-                    return (in1 < 0) ? false : (static_cast<argT2>(in1) > in2);
-                }
-                else {
-                    if constexpr (!std::is_signed_v<argT1> &&
-                                  std::is_signed_v<argT2>)
-                    {
-                        return (in2 < 0) ? true
-                                         : (in1 > static_cast<argT1>(in2));
-                    }
-                }
-            }
-            else {
-                return (in1 > in2);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        auto tmp = (in1 > in2);
-
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using GreaterContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            GreaterFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using GreaterStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    GreaterFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct GreaterOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct GreaterContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class greater_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event greater_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg1_p,
-                                ssize_t arg1_offset,
-                                const char *arg2_p,
-                                ssize_t arg2_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using GreaterHS =
-        hyperparam_detail::GreaterContigHyperparameterSet<argTy1, argTy2>;
-
-    static constexpr std::uint8_t vec_sz = GreaterHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = GreaterHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, GreaterOutputType, GreaterContigFunctor,
-        greater_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                               arg1_offset, arg2_p, arg2_offset,
-                                               res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct GreaterContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!GreaterOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = greater_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GreaterTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename GreaterOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class greater_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-greater_strided_impl(sycl::queue &exec_q,
-                     std::size_t nelems,
-                     int nd,
-                     const ssize_t *shape_and_strides,
-                     const char *arg1_p,
-                     ssize_t arg1_offset,
-                     const char *arg2_p,
-                     ssize_t arg2_offset,
-                     char *res_p,
-                     ssize_t res_offset,
-                     const std::vector<sycl::event> &depends,
-                     const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, GreaterOutputType, GreaterStridedFunctor,
-        greater_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                arg1_offset, arg2_p, arg2_offset, res_p,
-                                res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct GreaterStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!GreaterOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = greater_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace greater
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
deleted file mode 100644
index e5599b5a14..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ /dev/null
@@ -1,319 +0,0 @@
-//=== greater_equal.hpp -   Binary function GREATER_EQUAL        ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace greater_equal
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct GreaterEqualFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::greater_equal_complex;
-            return greater_equal_complex<argT1>(in1, in2);
-        }
-        else {
-            if constexpr (std::is_integral_v<argT1> &&
-                          std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
-                if constexpr (std::is_signed_v<argT1> &&
-                              !std::is_signed_v<argT2>)
-                {
-                    return (in1 < 0) ? false : (static_cast<argT2>(in1) >= in2);
-                }
-                else {
-                    if constexpr (!std::is_signed_v<argT1> &&
-                                  std::is_signed_v<argT2>)
-                    {
-                        return (in2 < 0) ? true
-                                         : (in1 >= static_cast<argT1>(in2));
-                    }
-                }
-            }
-            else {
-                return (in1 >= in2);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        auto tmp = (in1 >= in2);
-
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using GreaterEqualContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    GreaterEqualFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using GreaterEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    GreaterEqualFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct GreaterEqualOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct GreaterEqualContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class greater_equal_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-greater_equal_contig_impl(sycl::queue &exec_q,
-                          std::size_t nelems,
-                          const char *arg1_p,
-                          ssize_t arg1_offset,
-                          const char *arg2_p,
-                          ssize_t arg2_offset,
-                          char *res_p,
-                          ssize_t res_offset,
-                          const std::vector<sycl::event> &depends = {})
-{
-    using GreaterEqHS =
-        hyperparam_detail::GreaterEqualContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = GreaterEqHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = GreaterEqHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, GreaterEqualOutputType, GreaterEqualContigFunctor,
-        greater_equal_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct GreaterEqualContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!GreaterEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = greater_equal_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct GreaterEqualTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename GreaterEqualOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class greater_equal_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-greater_equal_strided_impl(sycl::queue &exec_q,
-                           std::size_t nelems,
-                           int nd,
-                           const ssize_t *shape_and_strides,
-                           const char *arg1_p,
-                           ssize_t arg1_offset,
-                           const char *arg2_p,
-                           ssize_t arg2_offset,
-                           char *res_p,
-                           ssize_t res_offset,
-                           const std::vector<sycl::event> &depends,
-                           const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, GreaterEqualOutputType, GreaterEqualStridedFunctor,
-        greater_equal_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct GreaterEqualStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!GreaterEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = greater_equal_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace greater_equal
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
deleted file mode 100644
index 12b67481f0..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ /dev/null
@@ -1,242 +0,0 @@
-//=== HYPOT.hpp -   Binary function HYPOT  ------               *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of HYPOT(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace hypot
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct HypotFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return sycl::hypot(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto res = sycl::hypot(in1, in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(res)::element_type>)
-        {
-            return res;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(res)::element_type, vec_sz>(
-                res);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using HypotContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            HypotFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using HypotStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             HypotFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct HypotOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct HypotContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class hypot_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event hypot_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg1_p,
-                              ssize_t arg1_offset,
-                              const char *arg2_p,
-                              ssize_t arg2_offset,
-                              char *res_p,
-                              ssize_t res_offset,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using HypotHS =
-        hyperparam_detail::HypotContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = HypotHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = HypotHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, HypotOutputType, HypotContigFunctor,
-        hypot_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                             arg1_offset, arg2_p, arg2_offset,
-                                             res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct HypotContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!HypotOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = hypot_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct HypotTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::hypot(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename HypotOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class hypot_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-hypot_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg1_p,
-                   ssize_t arg1_offset,
-                   const char *arg2_p,
-                   ssize_t arg2_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, HypotOutputType, HypotStridedFunctor,
-        hypot_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                              arg1_offset, arg2_p, arg2_offset, res_p,
-                              res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct HypotStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!HypotOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = hypot_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace hypot
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
deleted file mode 100644
index e4d615f87e..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//=== imag.hpp -   Unary function IMAG                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of IMAG(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace imag
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::is_complex_v;
-
-template <typename argT, typename resT> struct ImagFunctor
-{
-
-    // is function constant for given argT
-    using is_constant =
-        typename std::is_same<is_complex<argT>, std::false_type>;
-    // constant value, if constant
-    static constexpr resT constant_value = resT{0};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex_v<argT>) {
-            return std::imag(in);
-        }
-        else {
-            static_assert(std::is_same_v<resT, argT>);
-            return constant_value;
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using ImagContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           ImagFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using ImagStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, ImagFunctor<argTy, resTy>>;
-
-template <typename T> struct ImagOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool>,
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct ImagContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class imag_contig_kernel;
-
-template <typename argTy>
-sycl::event imag_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using ImagHS = hyperparam_detail::ImagContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = ImagHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = ImagHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, ImagOutputType, ImagContigFunctor, imag_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct ImagContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!ImagOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = imag_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct ImagTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::imag(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename ImagOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class imag_strided_kernel;
-
-template <typename argTy>
-sycl::event
-imag_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, ImagOutputType, ImagStridedFunctor, imag_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct ImagStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!ImagOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = imag_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace imag
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
deleted file mode 100644
index 4615f9ce93..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-//=== isfinite.hpp -   Unary function ISFINITE           ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ISFINITE(x)
-/// function that tests whether a tensor element is finite.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace isfinite
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct IsFiniteFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    /*
-    std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value
-    */
-    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
-                                                  std::is_integral<argT>>;
-    static constexpr resT constant_value = true;
-    using supports_vec = typename std::false_type;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            const bool real_isfinite = std::isfinite(std::real(in));
-            const bool imag_isfinite = std::isfinite(std::imag(in));
-            return (real_isfinite && imag_isfinite);
-        }
-        else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
-            return constant_value;
-        }
-        else if constexpr (std::is_same_v<argT, sycl::half>) {
-            return sycl::isfinite(in);
-        }
-        else {
-            return std::isfinite(in);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::isfinite(in);
-
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-
-        return vec_cast<bool, deducedT, vec_sz>(res_vec);
-    }
-};
-
-template <typename argT,
-          typename resT = bool,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using IsFiniteContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           IsFiniteFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using IsFiniteStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, IsFiniteFunctor<argTy, resTy>>;
-
-template <typename argTy> struct IsFiniteOutputType
-{
-    using value_type = bool;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct IsFiniteContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class isfinite_contig_kernel;
-
-template <typename argTy>
-sycl::event isfinite_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg_p,
-                                 char *res_p,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using IsFiniteHS =
-        hyperparam_detail::IsFiniteContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = IsFiniteHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = IsFiniteHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, IsFiniteOutputType, IsFiniteContigFunctor,
-        isfinite_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
-                                                depends);
-}
-
-template <typename fnT, typename T> struct IsFiniteContigFactory
-{
-    fnT get()
-    {
-        fnT fn = isfinite_contig_impl<T>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct IsFiniteTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::isfinite(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename IsFiniteOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class isfinite_strided_kernel;
-
-template <typename argTy>
-sycl::event
-isfinite_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg_p,
-                      ssize_t arg_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, IsFiniteOutputType,
-                                                  IsFiniteStridedFunctor,
-                                                  isfinite_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct IsFiniteStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = isfinite_strided_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace isfinite
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
deleted file mode 100644
index 9291eeeb72..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//=== isinf.hpp -   Unary function ISINF                 ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ISINF(x)
-/// function that tests whether a tensor element is an infinity.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace isinf
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct IsInfFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
-                                                  std::is_integral<argT>>;
-    static constexpr resT constant_value = false;
-    using supports_vec =
-        typename std::disjunction<std::is_same<argT, sycl::half>,
-                                  std::is_floating_point<argT>>;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            const bool real_isinf = std::isinf(std::real(in));
-            const bool imag_isinf = std::isinf(std::imag(in));
-            return (real_isinf || imag_isinf);
-        }
-        else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
-            return constant_value;
-        }
-        else if constexpr (std::is_same_v<argT, sycl::half>) {
-            return sycl::isinf(in);
-        }
-        else {
-            return std::isinf(in);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::isinf(in);
-
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-
-        return vec_cast<bool, deducedT, vec_sz>(res_vec);
-    }
-};
-
-template <typename argT,
-          typename resT = bool,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using IsInfContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           IsInfFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using IsInfStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, IsInfFunctor<argTy, resTy>>;
-
-template <typename argTy> struct IsInfOutputType
-{
-    using value_type = bool;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct IsInfContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class isinf_contig_kernel;
-
-template <typename argTy>
-sycl::event isinf_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using IsInfHS = hyperparam_detail::IsInfContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = IsInfHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = IsInfHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, IsInfOutputType, IsInfContigFunctor, isinf_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct IsInfContigFactory
-{
-    fnT get()
-    {
-        fnT fn = isinf_contig_impl<T>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct IsInfTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::isinf(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename IsInfOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class isinf_strided_kernel;
-
-template <typename argTy>
-sycl::event
-isinf_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, IsInfOutputType, IsInfStridedFunctor, isinf_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct IsInfStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = isinf_strided_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace isinf
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
deleted file mode 100644
index 397037dbae..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//=== isnan.hpp -   Unary function ISNAN                 ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ISNAN(x)
-/// function that tests whether a tensor element is a NaN.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace isnan
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct IsNanFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    /*
-    std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value
-    */
-    using is_constant = typename std::disjunction<std::is_same<argT, bool>,
-                                                  std::is_integral<argT>>;
-    static constexpr resT constant_value = false;
-    using supports_vec = typename std::true_type;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            const bool real_isnan = sycl::isnan(std::real(in));
-            const bool imag_isnan = sycl::isnan(std::imag(in));
-            return (real_isnan || imag_isnan);
-        }
-        else if constexpr (std::is_same<argT, bool>::value ||
-                           std::is_integral<argT>::value)
-        {
-            return constant_value;
-        }
-        else {
-            return sycl::isnan(in);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::isnan(in);
-
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-
-        return vec_cast<bool, deducedT, vec_sz>(res_vec);
-    }
-};
-
-template <typename argT,
-          typename resT = bool,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using IsNanContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           IsNanFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using IsNanStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, IsNanFunctor<argTy, resTy>>;
-
-template <typename argTy> struct IsNanOutputType
-{
-    using value_type = bool;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct IsNanContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class isnan_contig_kernel;
-
-template <typename argTy>
-sycl::event isnan_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using IsNanHS = hyperparam_detail::IsNanContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = IsNanHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = IsNanHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, IsNanOutputType, IsNanContigFunctor, isnan_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct IsNanContigFactory
-{
-    fnT get()
-    {
-        fnT fn = isnan_contig_impl<T>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct IsNanTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::isnan(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename IsNanOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class isnan_strided_kernel;
-
-template <typename argTy>
-sycl::event
-isnan_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, IsNanOutputType, IsNanStridedFunctor, isnan_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct IsNanStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = isnan_strided_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace isnan
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
deleted file mode 100644
index f323e28a93..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ /dev/null
@@ -1,310 +0,0 @@
-//=== less.hpp -   Binary function LESS                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace less
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct LessFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::less_complex;
-            return less_complex<argT1>(in1, in2);
-        }
-        else {
-            if constexpr (std::is_integral_v<argT1> &&
-                          std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
-                if constexpr (std::is_signed_v<argT1> &&
-                              !std::is_signed_v<argT2>)
-                {
-                    return (in1 < 0) ? true : (static_cast<argT2>(in1) < in2);
-                }
-                else {
-                    if constexpr (!std::is_signed_v<argT1> &&
-                                  std::is_signed_v<argT2>)
-                    {
-                        return (in2 < 0) ? false
-                                         : (in1 < static_cast<argT1>(in2));
-                    }
-                }
-            }
-            else {
-                return (in1 < in2);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = (in1 < in2);
-
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LessContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            LessFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LessStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             LessFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LessOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct LessContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class less_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event less_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg1_p,
-                             ssize_t arg1_offset,
-                             const char *arg2_p,
-                             ssize_t arg2_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using LessHS =
-        hyperparam_detail::LessContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LessHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LessHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LessOutputType, LessContigFunctor, less_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                        arg2_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LessContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LessOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = less_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct LessTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LessOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class less_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-less_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg1_p,
-                  ssize_t arg1_offset,
-                  const char *arg2_p,
-                  ssize_t arg2_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LessOutputType, LessStridedFunctor,
-        less_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                             arg1_offset, arg2_p, arg2_offset, res_p,
-                             res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LessStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LessOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = less_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace less
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
deleted file mode 100644
index 8ae6f236fc..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ /dev/null
@@ -1,313 +0,0 @@
-//=== less_equal.hpp -   Binary function LESS_EQUAL            ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace less_equal
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct LessEqualFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::less_equal_complex;
-            return less_equal_complex<argT1>(in1, in2);
-        }
-        else {
-            if constexpr (std::is_integral_v<argT1> &&
-                          std::is_integral_v<argT2> &&
-                          std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-            {
-                if constexpr (std::is_signed_v<argT1> &&
-                              !std::is_signed_v<argT2>)
-                {
-                    return (in1 < 0) ? true : (static_cast<argT2>(in1) <= in2);
-                }
-                else {
-                    if constexpr (!std::is_signed_v<argT1> &&
-                                  std::is_signed_v<argT2>)
-                    {
-                        return (in2 < 0) ? false
-                                         : (in1 <= static_cast<argT1>(in2));
-                    }
-                }
-            }
-            else {
-                return (in1 <= in2);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        auto tmp = (in1 <= in2);
-
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LessEqualContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    LessEqualFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LessEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    LessEqualFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LessEqualOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct LessEqualContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class less_equal_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event less_equal_contig_impl(sycl::queue &exec_q,
-                                   std::size_t nelems,
-                                   const char *arg1_p,
-                                   ssize_t arg1_offset,
-                                   const char *arg2_p,
-                                   ssize_t arg2_offset,
-                                   char *res_p,
-                                   ssize_t res_offset,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    using LessEqHS =
-        hyperparam_detail::LessEqualContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LessEqHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LessEqHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LessEqualOutputType, LessEqualContigFunctor,
-        less_equal_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LessEqualContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LessEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = less_equal_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct LessEqualTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LessEqualOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class less_equal_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-less_equal_strided_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        int nd,
-                        const ssize_t *shape_and_strides,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends,
-                        const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LessEqualOutputType, LessEqualStridedFunctor,
-        less_equal_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LessEqualStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LessEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = less_equal_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace less_equal
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
deleted file mode 100644
index b6a934677d..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ /dev/null
@@ -1,214 +0,0 @@
-//=== log.hpp -   Unary function LOG                     ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of LOG(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace log
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct LogFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            return exprm_ns::log(exprm_ns::complex<realT>(in)); // log(in);
-        }
-        else {
-            return sycl::log(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           LogFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using LogStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, LogFunctor<argTy, resTy>>;
-
-template <typename T> struct LogOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct LogContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class log_contig_kernel;
-
-template <typename argTy>
-sycl::event log_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using LogHS = hyperparam_detail::LogContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = LogHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, LogOutputType, LogContigFunctor, log_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct LogContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct LogTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::log(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class log_strided_kernel;
-
-template <typename argTy>
-sycl::event log_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, LogOutputType, LogStridedFunctor, log_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct LogStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace log
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
deleted file mode 100644
index bf27099e6a..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ /dev/null
@@ -1,234 +0,0 @@
-//=== log10.hpp -   Unary function LOG10                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of LOG10(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace log10
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct Log10Functor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            // return (log(in) / log(realT{10}));
-            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
-                   sycl::log(realT{10});
-        }
-        else {
-            return sycl::log10(in);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::log10(in);
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-        if constexpr (std::is_same_v<resT, deducedT>) {
-            return res_vec;
-        }
-        else {
-            return vec_cast<resT, deducedT, vec_sz>(res_vec);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Log10ContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           Log10Functor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using Log10StridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, Log10Functor<argTy, resTy>>;
-
-template <typename T> struct Log10OutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct Log10ContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class log10_contig_kernel;
-
-template <typename argTy>
-sycl::event log10_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using Log10HS = hyperparam_detail::Log10ContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = Log10HS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Log10HS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, Log10OutputType, Log10ContigFunctor, log10_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct Log10ContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log10OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log10_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct Log10TypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::log10(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Log10OutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class log10_strided_kernel;
-
-template <typename argTy>
-sycl::event
-log10_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, Log10OutputType, Log10StridedFunctor, log10_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct Log10StridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log10OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log10_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace log10
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
deleted file mode 100644
index be14d459ee..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//=== log1p.hpp -   Unary function LOG1P                   ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of LOG1P(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace log1p
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-// TODO: evaluate precision against alternatives
-template <typename argT, typename resT> struct Log1pFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            // log1p(z) = ln((x + 1) + yI)
-            //          = ln(|(x + 1) + yi|)
-            //             + I * atan2(y, x + 1)
-            //          = ln(sqrt((x + 1)^2 + y^2))
-            //             + I *atan2(y, x + 1)
-            //          = log1p(x^2 + 2x + y^2) / 2
-            //             + I * atan2(y, x + 1)
-            using realT = typename argT::value_type;
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            // imaginary part of result
-            const realT res_im = sycl::atan2(y, x + 1);
-
-            if (std::max(sycl::fabs(x), sycl::fabs(y)) < realT{.1}) {
-                const realT v = x * (2 + x) + y * y;
-                return resT{sycl::log1p(v) / 2, res_im};
-            }
-            else {
-                // when not close to zero,
-                // prevent overflow
-                const realT m = sycl::hypot(x + 1, y);
-                return resT{sycl::log(m), res_im};
-            }
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::log1p(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Log1pContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           Log1pFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using Log1pStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, Log1pFunctor<argTy, resTy>>;
-
-template <typename T> struct Log1pOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct Log1pContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class log1p_contig_kernel;
-
-template <typename argTy>
-sycl::event log1p_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using Log1pHS = hyperparam_detail::Log1pContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = Log1pHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Log1pHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, Log1pOutputType, Log1pContigFunctor, log1p_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct Log1pContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log1pOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log1p_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct Log1pTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::log1p(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Log1pOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class log1p_strided_kernel;
-
-template <typename argTy>
-sycl::event
-log1p_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, Log1pOutputType, Log1pStridedFunctor, log1p_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct Log1pStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log1pOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log1p_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace log1p
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
deleted file mode 100644
index 0548c61f9d..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ /dev/null
@@ -1,235 +0,0 @@
-//=== log2.hpp -   Unary function LOG2                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of LOG2(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace log2
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct Log2Functor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            // log(in) / log(realT{2});
-            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
-                   sycl::log(realT{2});
-        }
-        else {
-            return sycl::log2(in);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::log2(in);
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-        if constexpr (std::is_same_v<resT, deducedT>) {
-            return res_vec;
-        }
-        else {
-            return vec_cast<resT, deducedT, vec_sz>(res_vec);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using Log2ContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           Log2Functor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using Log2StridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, Log2Functor<argTy, resTy>>;
-
-template <typename T> struct Log2OutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct Log2ContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class log2_contig_kernel;
-
-template <typename argTy>
-sycl::event log2_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using Log2HS = hyperparam_detail::Log2ContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = Log2HS::vec_sz;
-    static constexpr std::uint8_t n_vecs = Log2HS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, Log2OutputType, Log2ContigFunctor, log2_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct Log2ContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log2OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log2_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct Log2TypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::log2(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename Log2OutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class log2_strided_kernel;
-
-template <typename argTy>
-sycl::event
-log2_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, Log2OutputType, Log2StridedFunctor, log2_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct Log2StridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!Log2OutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = log2_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace log2
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
deleted file mode 100644
index cd1dc11e9c..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-//=== logaddexp.hpp -   Binary function LOGADDEXP                    ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of LOGADDEXP(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace logaddexp
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT1, typename argT2, typename resT> struct LogAddExpFunctor
-{
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using dpctl::tensor::math_utils::logaddexp;
-        return logaddexp<resT>(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        sycl::vec<resT, vec_sz> res;
-        auto diff = in1 - in2; // take advantange of faster vec arithmetic
-
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            if (std::isfinite(diff[i])) {
-                res[i] = std::max<resT>(in1[i], in2[i]) +
-                         impl_finite<resT>(-sycl::fabs(diff[i]));
-            }
-            else {
-                using dpctl::tensor::math_utils::logaddexp;
-                res[i] = logaddexp<resT>(in1[i], in2[i]);
-            }
-        }
-
-        return res;
-    }
-
-private:
-    template <typename T> T impl_finite(T const &in) const
-    {
-        return (in > 0) ? (in + sycl::log1p(sycl::exp(-in)))
-                        : sycl::log1p(sycl::exp(in));
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogAddExpContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    LogAddExpFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LogAddExpStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    LogAddExpFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LogAddExpOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct LogAddExpContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class logaddexp_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event logaddexp_contig_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  const char *arg1_p,
-                                  ssize_t arg1_offset,
-                                  const char *arg2_p,
-                                  ssize_t arg2_offset,
-                                  char *res_p,
-                                  ssize_t res_offset,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    using LogAddExpHS =
-        hyperparam_detail::LogAddExpContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LogAddExpHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogAddExpHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LogAddExpOutputType, LogAddExpContigFunctor,
-        logaddexp_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogAddExpContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogAddExpOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logaddexp_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct LogAddExpTypeMapFactory
-{
-    /*! @brief get typeid for output type of logaddexp(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogAddExpOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class logaddexp_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logaddexp_strided_impl(sycl::queue &exec_q,
-                       std::size_t nelems,
-                       int nd,
-                       const ssize_t *shape_and_strides,
-                       const char *arg1_p,
-                       ssize_t arg1_offset,
-                       const char *arg2_p,
-                       ssize_t arg2_offset,
-                       char *res_p,
-                       ssize_t res_offset,
-                       const std::vector<sycl::event> &depends,
-                       const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LogAddExpOutputType, LogAddExpStridedFunctor,
-        logaddexp_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                  arg1_offset, arg2_p, arg2_offset, res_p,
-                                  res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogAddExpStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogAddExpOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logaddexp_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-class logaddexp_matrix_row_broadcast_sg_krn;
-
-} // namespace logaddexp
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
deleted file mode 100644
index 6b91ff6915..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ /dev/null
@@ -1,287 +0,0 @@
-//=== logical_and.hpp -   Binary function GREATER              ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace logical_and
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct LogicalAndFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using tu_ns::convert_impl;
-
-        return (convert_impl<bool, argT1>(in1) &&
-                convert_impl<bool, argT2>(in2));
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        auto tmp = (in1 && in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    LogicalAndFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    LogicalAndFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LogicalAndOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct LogicalAndContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class logical_and_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logical_and_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using LogicalAndHS =
-        hyperparam_detail::LogicalAndContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor,
-        logical_and_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogicalAndContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_and_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct LogicalAndTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogicalAndOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class logical_and_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logical_and_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LogicalAndOutputType, LogicalAndStridedFunctor,
-        logical_and_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct LogicalAndStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_and_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace logical_and
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
deleted file mode 100644
index d4f9ec671f..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-//=== logical_not.hpp -   Unary function ISNAN                 ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ISNAN(x)
-/// function that tests whether a tensor element is a NaN.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace logical_not
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT, typename resT> struct LogicalNotFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::false_type;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<tu_ns::is_complex<resT>, tu_ns::is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        using tu_ns::convert_impl;
-        return !convert_impl<bool, argT>(in);
-    }
-};
-
-template <typename argT,
-          typename resT = bool,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogicalNotContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           LogicalNotFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using LogicalNotStridedFunctor =
-    elementwise_common::UnaryStridedFunctor<argTy,
-                                            resTy,
-                                            IndexerT,
-                                            LogicalNotFunctor<argTy, resTy>>;
-
-template <typename argTy> struct LogicalNotOutputType
-{
-    using value_type = bool;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct LogicalNotContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class logical_not_contig_kernel;
-
-template <typename argTy>
-sycl::event
-logical_not_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg_p,
-                        char *res_p,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using LogicalNotHS =
-        hyperparam_detail::LogicalNotContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = LogicalNotHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogicalNotHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, LogicalNotOutputType, LogicalNotContigFunctor,
-        logical_not_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
-                                                   depends);
-}
-
-template <typename fnT, typename T> struct LogicalNotContigFactory
-{
-    fnT get()
-    {
-        fnT fn = logical_not_contig_impl<T>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T> struct LogicalNotTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::logical_not(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogicalNotOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3>
-class logical_not_strided_kernel;
-
-template <typename argTy>
-sycl::event
-logical_not_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg_p,
-                         ssize_t arg_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, LogicalNotOutputType,
-                                                  LogicalNotStridedFunctor,
-                                                  logical_not_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct LogicalNotStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = logical_not_strided_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace logical_not
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
deleted file mode 100644
index fdf3d134de..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ /dev/null
@@ -1,283 +0,0 @@
-//=== logical_or.hpp -   Binary function GREATER              ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace logical_or
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct LogicalOrFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using tu_ns::convert_impl;
-
-        return (convert_impl<bool, argT1>(in1) ||
-                convert_impl<bool, argT2>(in2));
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-
-        auto tmp = (in1 || in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    LogicalOrFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    LogicalOrFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LogicalOrOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct LogicalOrContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class logical_or_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event logical_or_contig_impl(sycl::queue &exec_q,
-                                   std::size_t nelems,
-                                   const char *arg1_p,
-                                   ssize_t arg1_offset,
-                                   const char *arg2_p,
-                                   ssize_t arg2_offset,
-                                   char *res_p,
-                                   ssize_t res_offset,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    using LogicalOrHS =
-        hyperparam_detail::LogicalOrContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor,
-        logical_or_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogicalOrContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_or_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct LogicalOrTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogicalOrOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class logical_or_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logical_or_strided_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        int nd,
-                        const ssize_t *shape_and_strides,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends,
-                        const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LogicalOrOutputType, LogicalOrStridedFunctor,
-        logical_or_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogicalOrStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_or_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace logical_or
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
deleted file mode 100644
index b1521a238a..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ /dev/null
@@ -1,288 +0,0 @@
-//=== logical_xor.hpp -   Binary function GREATER              ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain in1 copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of comparison of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace logical_xor
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct LogicalXorFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        using tu_ns::convert_impl;
-
-        return (convert_impl<bool, argT1>(in1) !=
-                convert_impl<bool, argT2>(in2));
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        using tu_ns::vec_cast;
-        auto tmp1 = vec_cast<bool, argT1, vec_sz>(in1);
-        auto tmp2 = vec_cast<bool, argT2, vec_sz>(in2);
-
-        auto tmp = (tmp1 != tmp2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    LogicalXorFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    LogicalXorFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct LogicalXorOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct LogicalXorContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class logical_xor_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logical_xor_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using LogicalXorHS =
-        hyperparam_detail::LogicalXorContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor,
-        logical_xor_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct LogicalXorContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_xor_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct LogicalXorTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()>(x, y), always bool
-     */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename LogicalXorOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class logical_xor_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-logical_xor_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, LogicalXorOutputType, LogicalXorStridedFunctor,
-        logical_xor_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct LogicalXorStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = logical_xor_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace logical_xor
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
deleted file mode 100644
index 6fd8faf648..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ /dev/null
@@ -1,316 +0,0 @@
-//=== maximum.hpp -   Binary function MAXIMUM            ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace maximum
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct MaximumFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::max_complex;
-            return max_complex<argT1>(in1, in2);
-        }
-        else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
-            const bool choose_first = (std::isnan(in1) || (in1 > in2));
-            return (choose_first) ? in1 : in2;
-        }
-        else {
-            return (in1 > in2) ? in1 : in2;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        sycl::vec<resT, vec_sz> res;
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            const auto &v1 = in1[i];
-            const auto &v2 = in2[i];
-            if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
-                const bool choose_first = (std::isnan(v1) || (v1 > v2));
-                res[i] = (choose_first) ? v1 : v2;
-            }
-            else {
-                res[i] = (v1 > v2) ? v1 : v2;
-            }
-        }
-        return res;
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using MaximumContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            MaximumFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using MaximumStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    MaximumFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct MaximumOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct MaximumContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class maximum_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event maximum_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg1_p,
-                                ssize_t arg1_offset,
-                                const char *arg2_p,
-                                ssize_t arg2_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using MaxHS =
-        hyperparam_detail::MaximumContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = MaxHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = MaxHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, MaximumOutputType, MaximumContigFunctor,
-        maximum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                               arg1_offset, arg2_p, arg2_offset,
-                                               res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MaximumContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!MaximumOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = maximum_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct MaximumTypeMapFactory
-{
-    /*! @brief get typeid for output type of maximum(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename MaximumOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class maximum_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-maximum_strided_impl(sycl::queue &exec_q,
-                     std::size_t nelems,
-                     int nd,
-                     const ssize_t *shape_and_strides,
-                     const char *arg1_p,
-                     ssize_t arg1_offset,
-                     const char *arg2_p,
-                     ssize_t arg2_offset,
-                     char *res_p,
-                     ssize_t res_offset,
-                     const std::vector<sycl::event> &depends,
-                     const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, MaximumOutputType, MaximumStridedFunctor,
-        maximum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                arg1_offset, arg2_p, arg2_offset, res_p,
-                                res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MaximumStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!MaximumOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = maximum_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace maximum
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
deleted file mode 100644
index 37b43eb0a0..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ /dev/null
@@ -1,316 +0,0 @@
-//=== minimum.hpp -   Binary function MINIMUM           ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace minimum
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct MinimumFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value ||
-                      tu_ns::is_complex<argT2>::value)
-        {
-            static_assert(std::is_same_v<argT1, argT2>);
-            using dpctl::tensor::math_utils::min_complex;
-            return min_complex<argT1>(in1, in2);
-        }
-        else if constexpr (std::is_floating_point_v<argT1> ||
-                           std::is_same_v<argT1, sycl::half>)
-        {
-            const bool choose_first = sycl::isnan(in1) || (in1 < in2);
-            return (choose_first) ? in1 : in2;
-        }
-        else {
-            return (in1 < in2) ? in1 : in2;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        sycl::vec<resT, vec_sz> res;
-#pragma unroll
-        for (int i = 0; i < vec_sz; ++i) {
-            const auto &v1 = in1[i];
-            const auto &v2 = in2[i];
-            if constexpr (std::is_floating_point_v<argT1> ||
-                          std::is_same_v<argT1, sycl::half>)
-            {
-                const bool choose_first = sycl::isnan(v1) || (v1 < v2);
-                res[i] = (choose_first) ? v1 : v2;
-            }
-            else {
-                res[i] = (v1 < v2) ? v1 : v2;
-            }
-        }
-        return res;
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using MinimumContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            MinimumFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using MinimumStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    MinimumFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct MinimumOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct MinimumContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class minimum_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event minimum_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg1_p,
-                                ssize_t arg1_offset,
-                                const char *arg2_p,
-                                ssize_t arg2_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using MinHS =
-        hyperparam_detail::MinimumContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = MinHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = MinHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, MinimumOutputType, MinimumContigFunctor,
-        minimum_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg1_p,
-                                               arg1_offset, arg2_p, arg2_offset,
-                                               res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MinimumContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!MinimumOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = minimum_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct MinimumTypeMapFactory
-{
-    /*! @brief get typeid for output type of minimum(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename MinimumOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class minimum_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-minimum_strided_impl(sycl::queue &exec_q,
-                     std::size_t nelems,
-                     int nd,
-                     const ssize_t *shape_and_strides,
-                     const char *arg1_p,
-                     ssize_t arg1_offset,
-                     const char *arg2_p,
-                     ssize_t arg2_offset,
-                     char *res_p,
-                     ssize_t res_offset,
-                     const std::vector<sycl::event> &depends,
-                     const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, MinimumOutputType, MinimumStridedFunctor,
-        minimum_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                arg1_offset, arg2_p, arg2_offset, res_p,
-                                res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MinimumStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!MinimumOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = minimum_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace minimum
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
deleted file mode 100644
index 798610f445..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ /dev/null
@@ -1,636 +0,0 @@
-//=== multiply.hpp -   Binary function MUL               ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of MUL(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace multiply
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
-            using realT1 = typename argT1::value_type;
-            using realT2 = typename argT2::value_type;
-
-            return exprm_ns::complex<realT1>(in1) *
-                   exprm_ns::complex<realT2>(in2);
-        }
-        else {
-            return in1 * in2;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = in1 * in2;
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using MultiplyContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            MultiplyFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    MultiplyFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct MultiplyOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct MultiplyContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class multiply_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event multiply_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg1_p,
-                                 ssize_t arg1_offset,
-                                 const char *arg2_p,
-                                 ssize_t arg2_offset,
-                                 char *res_p,
-                                 ssize_t res_offset,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using MulHS =
-        hyperparam_detail::MultiplyContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor,
-        multiply_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MultiplyContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = multiply_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct MultiplyTypeMapFactory
-{
-    /*! @brief get typeid for output type of multiply(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename MultiplyOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class multiply_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-multiply_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg1_p,
-                      ssize_t arg1_offset,
-                      const char *arg2_p,
-                      ssize_t arg2_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor,
-        multiply_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                 arg1_offset, arg2_p, arg2_offset, res_p,
-                                 res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct MultiplyStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = multiply_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-class multiply_matrix_row_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-using MultiplyContigMatrixContigRowBroadcastingFunctor =
-    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        MultiplyFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event multiply_contig_matrix_contig_row_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] * vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
-        argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor,
-        multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
-                                              mat_offset, vec_p, vec_offset,
-                                              res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct MultiplyContigMatrixContigRowBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename MultiplyOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    multiply_contig_matrix_contig_row_broadcast_impl<T1, T2,
-                                                                     resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event multiply_contig_row_contig_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] * vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return multiply_contig_matrix_contig_row_broadcast_impl<argT2, argT1, resT>(
-        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
-        res_offset, depends);
-};
-
-template <typename fnT, typename T1, typename T2>
-struct MultiplyContigRowContigMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename MultiplyOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    multiply_contig_row_contig_matrix_broadcast_impl<T1, T2,
-                                                                     resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT, typename resT> struct MultiplyInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-
-    void operator()(resT &res, const argT &in) { res *= in; }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        res *= in;
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using MultiplyInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        MultiplyInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using MultiplyInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        MultiplyInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class multiply_inplace_contig_kernel;
-
-/* @brief Types supported by in-place multiplication */
-template <typename argTy, typename resTy> struct MultiplyInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    resTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    resTy,
-                                    std::complex<double>>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct MultiplyInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x *= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (MultiplyInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-multiply_inplace_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using MulHS =
-        hyperparam_detail::MultiplyContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, MultiplyInplaceContigFunctor,
-        multiply_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct MultiplyInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = multiply_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class multiply_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event multiply_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, MultiplyInplaceStridedFunctor,
-        multiply_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
-                                         arg_p, arg_offset, res_p, res_offset,
-                                         depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct MultiplyInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = multiply_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT>
-class multiply_inplace_row_matrix_broadcast_sg_krn;
-
-template <typename argT, typename resT>
-using MultiplyInplaceRowMatrixBroadcastingFunctor =
-    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
-        argT,
-        resT,
-        MultiplyInplaceFunctor<argT, resT>>;
-
-template <typename argT, typename resT>
-sycl::event multiply_inplace_row_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
-        argT, resT, MultiplyInplaceRowMatrixBroadcastingFunctor,
-        multiply_inplace_row_matrix_broadcast_sg_krn>(
-        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
-        depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct MultiplyInplaceRowMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn = multiply_inplace_row_matrix_broadcast_impl<T1, T2>;
-                return fn;
-            }
-        }
-    }
-};
-
-} // namespace multiply
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
deleted file mode 100644
index 4a8fc76bcc..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//=== negative.hpp -   Unary function POSITIVE           ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of POSITIVE(x)
-/// function that returns x.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace negative
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct NegativeFunctor
-{
-
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::false_type;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &x) const { return -x; }
-};
-
-template <typename argT,
-          typename resT = argT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using NegativeContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           NegativeFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename T> struct NegativeOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct NegativeContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class negative_contig_kernel;
-
-template <typename argTy>
-sycl::event negative_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg_p,
-                                 char *res_p,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using NegHS = hyperparam_detail::NegativeContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = NegHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = NegHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, NegativeOutputType, NegativeContigFunctor,
-        negative_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
-                                                depends);
-}
-
-template <typename fnT, typename T> struct NegativeContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!NegativeOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = negative_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct NegativeTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::negative(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename NegativeOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename argTy, typename resTy, typename IndexerT>
-using NegativeStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, NegativeFunctor<argTy, resTy>>;
-
-template <typename T1, typename T2, typename T3> class negative_strided_kernel;
-
-template <typename argTy>
-sycl::event
-negative_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg_p,
-                      ssize_t arg_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, NegativeOutputType,
-                                                  NegativeStridedFunctor,
-                                                  negative_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct NegativeStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!NegativeOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = negative_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace negative
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
deleted file mode 100644
index f68e4e8295..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-//=== NEXTAFTER.hpp -   Binary function NEXTAFTER  ------ *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of NEXTAFTER(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace nextafter
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct NextafterFunctor
-{
-
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return sycl::nextafter(in1, in2);
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto res = sycl::nextafter(in1, in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(res)::element_type>)
-        {
-            return res;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(res)::element_type, vec_sz>(
-                res);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    NextafterFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using NextafterStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    NextafterFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct NextafterOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct NextafterContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class nextafter_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event nextafter_contig_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  const char *arg1_p,
-                                  ssize_t arg1_offset,
-                                  const char *arg2_p,
-                                  ssize_t arg2_offset,
-                                  char *res_p,
-                                  ssize_t res_offset,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    using NextafterHS =
-        hyperparam_detail::NextafterContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, NextafterOutputType, NextafterContigFunctor,
-        nextafter_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct NextafterContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = nextafter_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct NextafterTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::nextafter(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename NextafterOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class nextafter_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-nextafter_strided_impl(sycl::queue &exec_q,
-                       std::size_t nelems,
-                       int nd,
-                       const ssize_t *shape_and_strides,
-                       const char *arg1_p,
-                       ssize_t arg1_offset,
-                       const char *arg2_p,
-                       ssize_t arg2_offset,
-                       char *res_p,
-                       ssize_t res_offset,
-                       const std::vector<sycl::event> &depends,
-                       const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, NextafterOutputType, NextafterStridedFunctor,
-        nextafter_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                  arg1_offset, arg2_p, arg2_offset, res_p,
-                                  res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct NextafterStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = nextafter_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace nextafter
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
deleted file mode 100644
index f8f436ec4f..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ /dev/null
@@ -1,297 +0,0 @@
-//=== not_equal.hpp -   Binary function NOT_EQUAL        ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of inequality of
-/// tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace not_equal
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct NotEqualFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::conjunction<
-        std::is_same<argT1, argT2>,
-        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
-                                       tu_ns::is_complex<argT2>>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> && std::is_integral_v<argT2> &&
-                      std::is_signed_v<argT1> != std::is_signed_v<argT2>)
-        {
-            if constexpr (std::is_signed_v<argT1> && !std::is_signed_v<argT2>) {
-                return (in1 < 0) ? true : (static_cast<argT2>(in1) != in2);
-            }
-            else {
-                if constexpr (!std::is_signed_v<argT1> &&
-                              std::is_signed_v<argT2>)
-                {
-                    return (in2 < 0) ? true : (in1 != static_cast<argT1>(in2));
-                }
-            }
-        }
-        else {
-            return (in1 != in2);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = (in1 != in2);
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using NotEqualContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            NotEqualFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    NotEqualFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct NotEqualOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
-        td_ns::
-            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct NotEqualContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class not_equal_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event not_equal_contig_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  const char *arg1_p,
-                                  ssize_t arg1_offset,
-                                  const char *arg2_p,
-                                  ssize_t arg2_offset,
-                                  char *res_p,
-                                  ssize_t res_offset,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    using NotEqHS =
-        hyperparam_detail::NotEqualContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = NotEqHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor,
-        not_equal_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct NotEqualContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = not_equal_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct NotEqualTypeMapFactory
-{
-    /*! @brief get typeid for output type of operator()!=(x, y), always bool */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename NotEqualOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class not_equal_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-not_equal_strided_impl(sycl::queue &exec_q,
-                       std::size_t nelems,
-                       int nd,
-                       const ssize_t *shape_and_strides,
-                       const char *arg1_p,
-                       ssize_t arg1_offset,
-                       const char *arg2_p,
-                       ssize_t arg2_offset,
-                       char *res_p,
-                       ssize_t res_offset,
-                       const std::vector<sycl::event> &depends,
-                       const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, NotEqualOutputType, NotEqualStridedFunctor,
-        not_equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                  arg1_offset, arg2_p, arg2_offset, res_p,
-                                  res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct NotEqualStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = not_equal_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace not_equal
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
deleted file mode 100644
index 9bc9e5782a..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-//=== positive.hpp -   Unary function POSITIVE           ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of POSITIVE(x)
-/// function that returns x.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace positive
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct PositiveFunctor
-{
-
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &x) const { return x; }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = in;
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-        if constexpr (std::is_same_v<resT, deducedT>) {
-            return res_vec;
-        }
-        else {
-            return vec_cast<resT, deducedT, vec_sz>(res_vec);
-        }
-    }
-};
-
-template <typename argT,
-          typename resT = argT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using PositiveContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           PositiveFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename T> struct PositiveOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct PositiveContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class positive_contig_kernel;
-
-template <typename argTy>
-sycl::event positive_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg_p,
-                                 char *res_p,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using PosHS = hyperparam_detail::PositiveContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = PosHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = PosHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, PositiveOutputType, PositiveContigFunctor,
-        positive_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
-                                                depends);
-}
-
-template <typename fnT, typename T> struct PositiveContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!PositiveOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = positive_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct PositiveTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::positive(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename PositiveOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename argTy, typename resTy, typename IndexerT>
-using PositiveStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, PositiveFunctor<argTy, resTy>>;
-
-template <typename T1, typename T2, typename T3> class positive_strided_kernel;
-
-template <typename argTy>
-sycl::event
-positive_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg_p,
-                      ssize_t arg_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, PositiveOutputType,
-                                                  PositiveStridedFunctor,
-                                                  positive_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct PositiveStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!PositiveOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = positive_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace positive
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
deleted file mode 100644
index 326d83f412..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ /dev/null
@@ -1,590 +0,0 @@
-//=== POW.hpp -   Binary function POW                    ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of POW(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace pow
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct PowFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
-            auto tmp1 = in1;
-            auto tmp2 = in2;
-            if constexpr (std::is_signed_v<argT2>) {
-                if (tmp2 < 0) {
-                    // invalid; return 0
-                    return resT(0);
-                }
-            }
-            resT res = 1;
-            if (tmp1 == 1 || tmp2 == 0) {
-                return res;
-            }
-            while (tmp2 > 0) {
-                if (tmp2 & 1) {
-                    res *= tmp1;
-                }
-                tmp2 >>= 1;
-                tmp1 *= tmp1;
-            }
-            return res;
-        }
-        else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
-            using realT1 = typename argT1::value_type;
-            using realT2 = typename argT2::value_type;
-
-            return exprm_ns::pow(exprm_ns::complex<realT1>(in1),
-                                 exprm_ns::complex<realT2>(in2));
-        }
-        else {
-            return sycl::pow(in1, in2);
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
-            sycl::vec<resT, vec_sz> res;
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                auto tmp1 = in1[i];
-                auto tmp2 = in2[i];
-                if constexpr (std::is_signed_v<argT2>) {
-                    if (tmp2 < 0) {
-                        // invalid; yield 0
-                        res[i] = 0;
-                        continue;
-                    }
-                }
-                resT res_tmp = 1;
-                if (tmp1 == 1 || tmp2 == 0) {
-                    res[i] = res_tmp;
-                    continue;
-                }
-                while (tmp2 > 0) {
-                    if (tmp2 & 1) {
-                        res_tmp *= tmp1;
-                    }
-                    tmp2 >>= 1;
-                    tmp1 *= tmp1;
-                }
-                res[i] = res_tmp;
-            }
-            return res;
-        }
-        else {
-            auto res = sycl::pow(in1, in2);
-            if constexpr (std::is_same_v<resT,
-                                         typename decltype(res)::element_type>)
-            {
-                return res;
-            }
-            else {
-                using dpctl::tensor::type_utils::vec_cast;
-
-                return vec_cast<resT, typename decltype(res)::element_type,
-                                vec_sz>(res);
-            }
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using PowContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            PowFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using PowStridedFunctor =
-    elementwise_common::BinaryStridedFunctor<argT1,
-                                             argT2,
-                                             resT,
-                                             IndexerT,
-                                             PowFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct PowOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2> struct PowContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class pow_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event pow_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg1_p,
-                            ssize_t arg1_offset,
-                            const char *arg2_p,
-                            ssize_t arg2_offset,
-                            char *res_p,
-                            ssize_t res_offset,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using PowHS = hyperparam_detail::PowContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
-                        arg2_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct PowContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!PowOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = pow_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct PowTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::pow(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename PowOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class pow_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event pow_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg1_p,
-                             ssize_t arg1_offset,
-                             const char *arg2_p,
-                             ssize_t arg2_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, PowOutputType, PowStridedFunctor, pow_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct PowStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!PowOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = pow_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct PowInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-
-    void operator()(resT &res, const argT &in)
-    {
-        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
-            auto tmp1 = res;
-            auto tmp2 = in;
-            if constexpr (std::is_signed_v<argT>) {
-                if (tmp2 < 0) {
-                    // invalid; return 0
-                    res = 0;
-                    return;
-                }
-            }
-            if (tmp1 == 1) {
-                return;
-            }
-            if (tmp2 == 0) {
-                res = 1;
-                return;
-            }
-            resT res_tmp = 1;
-            while (tmp2 > 0) {
-                if (tmp2 & 1) {
-                    res_tmp *= tmp1;
-                }
-                tmp2 >>= 1;
-                tmp1 *= tmp1;
-            }
-            res = res_tmp;
-        }
-        else if constexpr (tu_ns::is_complex<argT>::value &&
-                           tu_ns::is_complex<resT>::value)
-        {
-            using r_resT = typename resT::value_type;
-            using r_argT = typename argT::value_type;
-
-            res = exprm_ns::pow(exprm_ns::complex<r_resT>(res),
-                                exprm_ns::complex<r_argT>(in));
-        }
-        else {
-            res = sycl::pow(res, in);
-        }
-        return;
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
-#pragma unroll
-            for (int i = 0; i < vec_sz; ++i) {
-                auto tmp1 = res[i];
-                auto tmp2 = in[i];
-                if constexpr (std::is_signed_v<argT>) {
-                    if (tmp2 < 0) {
-                        // invalid; return 0
-                        res[i] = 0;
-                        continue;
-                    }
-                }
-                if (tmp1 == 1) {
-                    continue;
-                }
-                if (tmp2 == 0) {
-                    res[i] = 1;
-                    continue;
-                }
-                resT res_tmp = 1;
-                while (tmp2 > 0) {
-                    if (tmp2 & 1) {
-                        res_tmp *= tmp1;
-                    }
-                    tmp2 >>= 1;
-                    tmp1 *= tmp1;
-                }
-                res[i] = res_tmp;
-            }
-        }
-        else {
-            res = sycl::pow(res, in);
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
-    argT,
-    resT,
-    PowInplaceFunctor<argT, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using PowInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        PowInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class pow_inplace_contig_kernel;
-
-/* @brief Types supported by in-place pow */
-template <typename argTy, typename resTy> struct PowInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    resTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    resTy,
-                                    std::complex<double>>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct PowInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x **= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (PowInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-pow_inplace_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg_p,
-                        ssize_t arg_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using PowHS = hyperparam_detail::PowContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
-                        depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct PowInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = pow_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class pow_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event
-pow_inplace_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg_p,
-                         ssize_t arg_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct PowInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = pow_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace pow
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
deleted file mode 100644
index a9d630e762..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ /dev/null
@@ -1,230 +0,0 @@
-//=== proj.hpp -   Unary function PROJ                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of PROJ(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace proj
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct ProjFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::false_type;
-
-    resT operator()(const argT &in) const
-    {
-        using realT = typename argT::value_type;
-        const realT x = std::real(in);
-        const realT y = std::imag(in);
-
-        if (std::isinf(x)) {
-            return value_at_infinity(y);
-        }
-        else if (std::isinf(y)) {
-            return value_at_infinity(y);
-        }
-        else {
-            return in;
-        }
-    }
-
-private:
-    template <typename T> std::complex<T> value_at_infinity(const T &y) const
-    {
-        const T res_im = sycl::copysign(T(0), y);
-        return std::complex<T>{std::numeric_limits<T>::infinity(), res_im};
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using ProjContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           ProjFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using ProjStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, ProjFunctor<argTy, resTy>>;
-
-template <typename T> struct ProjOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct ProjContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class proj_contig_kernel;
-
-template <typename argTy>
-sycl::event proj_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using ProjHS = hyperparam_detail::ProjContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = ProjHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = ProjHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, ProjOutputType, ProjContigFunctor, proj_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct ProjContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!ProjOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            if constexpr (std::is_same_v<T, std::complex<double>>) {
-                fnT fn = proj_contig_impl<T>;
-                return fn;
-            }
-            else {
-                fnT fn = proj_contig_impl<T>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename fnT, typename T> struct ProjTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::proj(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename ProjOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class proj_strided_kernel;
-
-template <typename argTy>
-sycl::event
-proj_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, ProjOutputType, ProjStridedFunctor, proj_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct ProjStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!ProjOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = proj_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace proj
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
deleted file mode 100644
index a8b5c719a0..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-//=== real.hpp -   Unary function REAL                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of REAL(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace real
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::is_complex_v;
-
-template <typename argT, typename resT> struct RealFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex_v<argT>) {
-            return std::real(in);
-        }
-        else {
-            static_assert(std::is_same_v<resT, argT>);
-            return in;
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using RealContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           RealFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using RealStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, RealFunctor<argTy, resTy>>;
-
-template <typename T> struct RealOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool>,
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, float>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct RealContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class real_contig_kernel;
-
-template <typename argTy>
-sycl::event real_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using RealHS = hyperparam_detail::RealContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = RealHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RealHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, RealOutputType, RealContigFunctor, real_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct RealContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!RealOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = real_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct RealTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::real(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename RealOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class real_strided_kernel;
-
-template <typename argTy>
-sycl::event
-real_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, RealOutputType, RealStridedFunctor, real_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct RealStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!RealOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = real_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace real
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
deleted file mode 100644
index a320aa3181..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp
+++ /dev/null
@@ -1,223 +0,0 @@
-//=== reciprocal.hpp -   Unary function RECIPROCAL                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of RECIPROCAL(x)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace reciprocal
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct ReciprocalFunctor
-{
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-
-            using realT = typename argT::value_type;
-
-            return realT(1) / exprm_ns::complex<realT>(in);
-        }
-        else {
-            return argT(1) / in;
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using ReciprocalContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           ReciprocalFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using ReciprocalStridedFunctor =
-    elementwise_common::UnaryStridedFunctor<argTy,
-                                            resTy,
-                                            IndexerT,
-                                            ReciprocalFunctor<argTy, resTy>>;
-
-template <typename T> struct ReciprocalOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct ReciprocalContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class reciprocal_contig_kernel;
-
-template <typename argTy>
-sycl::event reciprocal_contig_impl(sycl::queue &exec_q,
-                                   std::size_t nelems,
-                                   const char *arg_p,
-                                   char *res_p,
-                                   const std::vector<sycl::event> &depends = {})
-{
-    using RecipHS = hyperparam_detail::ReciprocalContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = RecipHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RecipHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, ReciprocalOutputType, ReciprocalContigFunctor,
-        reciprocal_contig_kernel, vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p,
-                                                  depends);
-}
-
-template <typename fnT, typename T> struct ReciprocalContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!ReciprocalOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = reciprocal_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct ReciprocalTypeMapFactory
-{
-    /*! @brief get typeid for output type of 1 / x */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename ReciprocalOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3>
-class reciprocal_strided_kernel;
-
-template <typename argTy>
-sycl::event
-reciprocal_strided_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        int nd,
-                        const ssize_t *shape_and_strides,
-                        const char *arg_p,
-                        ssize_t arg_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends,
-                        const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, ReciprocalOutputType,
-                                                  ReciprocalStridedFunctor,
-                                                  reciprocal_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct ReciprocalStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!ReciprocalOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = reciprocal_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace reciprocal
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
deleted file mode 100644
index 028b078c6b..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ /dev/null
@@ -1,565 +0,0 @@
-//=== remainder.hpp -   Binary function REMAINDER                ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of the
-/// modulo of tensor elements.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace remainder
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct RemainderFunctor
-{
-    static_assert(std::is_same_v<argT1, argT2>);
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
-            if (in2 == argT2(0)) {
-                return resT(0);
-            }
-            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
-                auto out = (in1 % in2);
-                if (out != 0 && l_xor(in1 < 0, in2 < 0)) {
-                    out += in2;
-                }
-                return out;
-            }
-            else {
-                return (in1 % in2);
-            }
-        }
-        else {
-            auto rem = sycl::fmod(in1, in2);
-            if (rem) {
-                if (l_xor(in2 < 0, rem < 0)) {
-                    rem += in2;
-                }
-            }
-            else {
-                rem = sycl::copysign(resT(0), in2);
-            }
-            return rem;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
-            sycl::vec<resT, vec_sz> rem;
-#pragma unroll
-            for (auto i = 0; i < vec_sz; ++i) {
-                if (in2[i] == argT2(0)) {
-                    rem[i] = resT(0);
-                }
-                else {
-                    rem[i] = in1[i] % in2[i];
-                    if constexpr (std::is_signed_v<argT1> ||
-                                  std::is_signed_v<argT2>)
-                    {
-                        if (rem[i] != 0 && l_xor(in1[i] < 0, in2[i] < 0)) {
-                            rem[i] += in2[i];
-                        }
-                    }
-                }
-            }
-            return rem;
-        }
-        else {
-            auto rem = sycl::fmod(in1, in2);
-            using remT = typename decltype(rem)::element_type;
-#pragma unroll
-            for (auto i = 0; i < vec_sz; ++i) {
-                if (rem[i]) {
-                    if (l_xor(in2[i] < 0, rem[i] < 0)) {
-                        rem[i] += in2[i];
-                    }
-                }
-                else {
-                    rem[i] = sycl::copysign(remT(0), in2[i]);
-                }
-            }
-            if constexpr (std::is_same_v<resT, remT>) {
-                return rem;
-            }
-            else {
-                using dpctl::tensor::type_utils::vec_cast;
-
-                return vec_cast<resT, remT, vec_sz>(rem);
-            }
-        }
-    }
-
-private:
-    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    RemainderFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    RemainderFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct RemainderOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct RemainderContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class remainder_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event remainder_contig_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  const char *arg1_p,
-                                  ssize_t arg1_offset,
-                                  const char *arg2_p,
-                                  ssize_t arg2_offset,
-                                  char *res_p,
-                                  ssize_t res_offset,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    using RemHS =
-        hyperparam_detail::RemainderContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, RemainderOutputType, RemainderContigFunctor,
-        remainder_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct RemainderContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = remainder_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct RemainderTypeMapFactory
-{
-    /*! @brief get typeid for output type of remainder(T x, T y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename RemainderOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class remainder_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-remainder_strided_impl(sycl::queue &exec_q,
-                       std::size_t nelems,
-                       int nd,
-                       const ssize_t *shape_and_strides,
-                       const char *arg1_p,
-                       ssize_t arg1_offset,
-                       const char *arg2_p,
-                       ssize_t arg2_offset,
-                       char *res_p,
-                       ssize_t res_offset,
-                       const std::vector<sycl::event> &depends,
-                       const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, RemainderOutputType, RemainderStridedFunctor,
-        remainder_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                  arg1_offset, arg2_p, arg2_offset, res_p,
-                                  res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct RemainderStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = remainder_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT> struct RemainderInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::true_type;
-    using supports_vec = std::true_type;
-
-    // functor is only well-defined when argT and resT are the same
-    static_assert(std::is_same_v<argT, resT>);
-
-    void operator()(resT &res, const argT &in)
-    {
-        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
-            if (in == argT(0)) {
-                res = 0;
-                return;
-            }
-            if constexpr (std::is_signed_v<argT> || std::is_signed_v<resT>) {
-                auto tmp = res;
-                res %= in;
-                if (res != resT(0) && l_xor(tmp < 0, in < 0)) {
-                    res += in;
-                }
-            }
-            else {
-                res %= in;
-            }
-        }
-        else {
-            res = sycl::fmod(res, in);
-            if (res) {
-                if (l_xor(in < 0, res < 0)) {
-                    res += in;
-                }
-            }
-            else {
-                res = sycl::copysign(resT(0), in);
-            }
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
-#pragma unroll
-            for (auto i = 0; i < vec_sz; ++i) {
-                if (in[i] == argT(0)) {
-                    res[i] = 0;
-                }
-                else {
-                    auto rem = res[i] % in[i];
-                    if constexpr (std::is_signed_v<argT> ||
-                                  std::is_signed_v<resT>)
-                    {
-                        if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) {
-                            rem += in[i];
-                        }
-                    }
-                    res[i] = rem;
-                }
-            }
-        }
-        else {
-            res = sycl::fmod(res, in);
-#pragma unroll
-            for (auto i = 0; i < vec_sz; ++i) {
-                if (res[i]) {
-                    if (l_xor(in[i] < 0, res[i] < 0)) {
-                        res[i] += in[i];
-                    }
-                }
-                else {
-                    res[i] = sycl::copysign(resT(0), in[i]);
-                }
-            }
-        }
-    }
-
-private:
-    bool l_xor(bool b1, bool b2) const { return (b1 != b2); }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using RemainderInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        RemainderInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using RemainderInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        RemainderInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class remainder_inplace_contig_kernel;
-
-/* @brief Types supported by in-place remainder */
-template <typename argTy, typename resTy> struct RemainderInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct RemainderInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x %= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (RemainderInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-remainder_inplace_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              ssize_t arg_offset,
-                              char *res_p,
-                              ssize_t res_offset,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using RemHS =
-        hyperparam_detail::RemainderContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, RemainderInplaceContigFunctor,
-        remainder_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct RemainderInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = remainder_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class remainder_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event remainder_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, RemainderInplaceStridedFunctor,
-        remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
-                                          arg_p, arg_offset, res_p, res_offset,
-                                          depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct RemainderInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = remainder_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-} // namespace remainder
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
deleted file mode 100644
index f83349d6e5..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ /dev/null
@@ -1,231 +0,0 @@
-//=== round.hpp -   Unary function ROUND                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of ROUND(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace round
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct RoundFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-
-        if constexpr (std::is_integral_v<argT>) {
-            return in;
-        }
-        else if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            return resT{round_func<realT>(std::real(in)),
-                        round_func<realT>(std::imag(in))};
-        }
-        else {
-            return round_func<argT>(in);
-        }
-    }
-
-private:
-    template <typename T> T round_func(const T &input) const
-    {
-        return sycl::rint(input);
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using RoundContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           RoundFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using RoundStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, RoundFunctor<argTy, resTy>>;
-
-template <typename T> struct RoundOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct RoundContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class round_contig_kernel;
-
-template <typename argTy>
-sycl::event round_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using RoundHS = hyperparam_detail::RoundContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = RoundHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RoundHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, RoundOutputType, RoundContigFunctor, round_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct RoundContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!RoundOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = round_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct RoundTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::round(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename RoundOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class round_strided_kernel;
-
-template <typename argTy>
-sycl::event
-round_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, RoundOutputType, RoundStridedFunctor, round_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct RoundStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!RoundOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = round_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace round
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
deleted file mode 100644
index ba78e19fa9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ /dev/null
@@ -1,202 +0,0 @@
-//=== rsqrt.hpp -   Unary function RSQRT                   ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of RSQRT(x)
-/// function that computes the reciprocal square root.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace rsqrt
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-template <typename argT, typename resT> struct RsqrtFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::true_type;
-
-    resT operator()(const argT &in) const { return sycl::rsqrt(in); }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using RsqrtContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           RsqrtFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using RsqrtStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, RsqrtFunctor<argTy, resTy>>;
-
-template <typename T> struct RsqrtOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct RsqrtContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class rsqrt_contig_kernel;
-
-template <typename argTy>
-sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using RsqrtHS = hyperparam_detail::RsqrtContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = RsqrtHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = RsqrtHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct RsqrtContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!RsqrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = rsqrt_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct RsqrtTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::rsqrt(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename RsqrtOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class rsqrt_strided_kernel;
-
-template <typename argTy>
-sycl::event
-rsqrt_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct RsqrtStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!RsqrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = rsqrt_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace rsqrt
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
deleted file mode 100644
index 97c3305c7f..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ /dev/null
@@ -1,247 +0,0 @@
-//=== sign.hpp -   Unary function SIGN                   ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SIGN(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "cabs_impl.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace sign
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct SignFunctor
-{
-    static_assert(std::is_same_v<resT, argT>);
-    using is_constant = typename std::false_type;
-    // constexpr resT constant_value = resT{};
-    using supports_vec = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-    using supports_sg_loadstore = std::false_type;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (std::is_integral_v<argT>) {
-            if constexpr (std::is_unsigned_v<argT>) {
-                return resT(0 < in);
-            }
-            else {
-                return sign_impl<argT>(in);
-            }
-        }
-        else {
-            if constexpr (is_complex<argT>::value) {
-                using realT = typename argT::value_type;
-
-                if (in == argT(0)) {
-                    return resT(0);
-                }
-                else {
-                    auto z = exprm_ns::complex<realT>(in);
-                    return (z / detail::cabs(in));
-                }
-            }
-            else {
-                if (std::isnan(in)) {
-                    return std::numeric_limits<resT>::quiet_NaN();
-                }
-                else {
-                    return sign_impl<argT>(in);
-                }
-            }
-        }
-    }
-
-private:
-    template <typename T> T sign_impl(const T &v) const
-    {
-        return (T(0) < v) - (v < T(0));
-    }
-};
-
-template <typename argT,
-          typename resT = argT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SignContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           SignFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename T> struct SignOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SignContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class sign_contig_kernel;
-
-template <typename argTy>
-sycl::event sign_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using SignHS = hyperparam_detail::SignContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SignHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SignHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SignOutputType, SignContigFunctor, sign_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SignContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SignOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sign_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SignTypeMapFactory
-{
-    /*! @brief get typeid for output type of sign(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SignOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SignStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SignFunctor<argTy, resTy>>;
-
-template <typename T1, typename T2, typename T3> class sign_strided_kernel;
-
-template <typename argTy>
-sycl::event
-sign_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, SignOutputType, SignStridedFunctor, sign_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SignStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SignOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sign_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace sign
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
deleted file mode 100644
index 01d25453cd..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ /dev/null
@@ -1,210 +0,0 @@
-//=== signbit.hpp -   Unary function signbit            ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SIGNBIT(x)
-/// function that tests whether the sign bit of the tensor element is set.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace signbit
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct SignbitFunctor
-{
-    static_assert(std::is_same_v<resT, bool>);
-
-    using is_constant = std::false_type;
-    static constexpr resT constant_value = false;
-    using supports_vec = std::true_type;
-    using supports_sg_loadstore = std::true_type;
-
-    resT operator()(const argT &in) const { return std::signbit(in); }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = sycl::signbit(in);
-
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-
-        return vec_cast<resT, deducedT, vec_sz>(res_vec);
-    }
-};
-
-template <typename argT,
-          typename resT = bool,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SignbitContigFunctor =
-    elementwise_common::UnaryContigFunctor<argT,
-                                           resT,
-                                           SignbitFunctor<argT, resT>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SignbitStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SignbitFunctor<argTy, resTy>>;
-
-template <typename argTy> struct SignbitOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<argTy, sycl::half, bool>,
-        td_ns::TypeMapResultEntry<argTy, float, bool>,
-        td_ns::TypeMapResultEntry<argTy, double, bool>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SignbitContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class signbit_contig_kernel;
-
-template <typename argTy>
-sycl::event signbit_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg_p,
-                                char *res_p,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using SignbitHS = hyperparam_detail::SignbitContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SignbitHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SignbitHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SignbitOutputType, SignbitContigFunctor, signbit_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SignbitContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SignbitOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = signbit_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SignbitTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::isinf(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SignbitOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class signbit_strided_kernel;
-
-template <typename argTy>
-sycl::event
-signbit_strided_impl(sycl::queue &exec_q,
-                     std::size_t nelems,
-                     int nd,
-                     const ssize_t *shape_and_strides,
-                     const char *arg_p,
-                     ssize_t arg_offset,
-                     char *res_p,
-                     ssize_t res_offset,
-                     const std::vector<sycl::event> &depends,
-                     const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<argTy, SignbitOutputType,
-                                                  SignbitStridedFunctor,
-                                                  signbit_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SignbitStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SignbitOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = signbit_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace signbit
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
deleted file mode 100644
index eede1e82c9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ /dev/null
@@ -1,322 +0,0 @@
-//=== sin.hpp -   Unary function SIN                     ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SIN(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace sin
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct SinFunctor
-{
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            realT const &in_re = std::real(in);
-            realT const &in_im = std::imag(in);
-
-            const bool in_re_finite = std::isfinite(in_re);
-            const bool in_im_finite = std::isfinite(in_im);
-            /*
-             * Handle the nearly-non-exceptional cases where
-             * real and imaginary parts of input are finite.
-             */
-            if (in_re_finite && in_im_finite) {
-                resT res =
-                    exprm_ns::sin(exprm_ns::complex<realT>(in)); // sin(in);
-                if (in_re == realT(0)) {
-                    res.real(sycl::copysign(realT(0), in_re));
-                }
-                return res;
-            }
-
-            /*
-             * since sin(in) = -I * sinh(I * in), for special cases,
-             * we calculate real and imaginary parts of z = sinh(I * in) and
-             * then return { imag(z) , -real(z) } which is sin(in).
-             */
-            const realT x = -in_im;
-            const realT y = in_re;
-            const bool xfinite = in_im_finite;
-            const bool yfinite = in_re_finite;
-            /*
-             * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as dNaN.
-             *
-             * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             */
-            if (x == realT(0) && !yfinite) {
-                const realT sinh_im = q_nan;
-                const realT sinh_re = sycl::copysign(realT(0), x * sinh_im);
-                return resT{sinh_im, -sinh_re};
-            }
-
-            /*
-             * sinh(+-Inf +- I 0) = +-Inf + I +-0.
-             *
-             * sinh(NaN +- I 0)   = d(NaN) + I +-0.
-             */
-            if (y == realT(0) && !xfinite) {
-                if (std::isnan(x)) {
-                    const realT sinh_re = x;
-                    const realT sinh_im = y;
-                    return resT{sinh_im, -sinh_re};
-                }
-                const realT sinh_re = x;
-                const realT sinh_im = sycl::copysign(realT(0), y);
-                return resT{sinh_im, -sinh_re};
-            }
-
-            /*
-             * sinh(x +- I Inf) = dNaN + I dNaN.
-             *
-             * sinh(x + I NaN) = d(NaN) + I d(NaN).
-             */
-            if (xfinite && !yfinite) {
-                const realT sinh_re = q_nan;
-                const realT sinh_im = x * sinh_re;
-                return resT{sinh_im, -sinh_re};
-            }
-
-            /*
-             * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
-             * The sign of Inf in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             *
-             * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
-             * The sign of Inf in the result is unspecified.
-             * Choice = always - here for sinh to have positive result for
-             * imaginary part of sin.
-             *
-             * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
-             */
-            if (std::isinf(x)) {
-                if (!yfinite) {
-                    const realT sinh_re = -x * x;
-                    const realT sinh_im = x * (y - y);
-                    return resT{sinh_im, -sinh_re};
-                }
-                const realT sinh_re = x * sycl::cos(y);
-                const realT sinh_im =
-                    std::numeric_limits<realT>::infinity() * sycl::sin(y);
-                return resT{sinh_im, -sinh_re};
-            }
-
-            /*
-             * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
-             *
-             * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
-             *
-             * sinh(NaN + I y)    = d(NaN) + I d(NaN).
-             */
-            const realT y_m_y = (y - y);
-            const realT sinh_re = (x * x) * y_m_y;
-            const realT sinh_im = (x + x) * y_m_y;
-            return resT{sinh_im, -sinh_re};
-        }
-        else {
-            static_assert(std::is_same_v<argT, resT>);
-            if (in == 0) {
-                return in;
-            }
-            return sycl::sin(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SinContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           SinFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SinStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SinFunctor<argTy, resTy>>;
-
-template <typename T> struct SinOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SinContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class sin_contig_kernel;
-
-template <typename argTy>
-sycl::event sin_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using SinHS = hyperparam_detail::SinContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SinHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SinHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SinOutputType, SinContigFunctor, sin_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SinContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SinOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sin_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SinTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::sin(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SinOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class sin_strided_kernel;
-
-template <typename argTy>
-sycl::event sin_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, SinOutputType, SinStridedFunctor, sin_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SinStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SinOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sin_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace sin
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
deleted file mode 100644
index e26631d5dc..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-//=== sinh.hpp -   Unary function SINH                  ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SINH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace sinh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct SinhFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-
-            const bool xfinite = std::isfinite(x);
-            const bool yfinite = std::isfinite(y);
-
-            /*
-             * Handle the nearly-non-exceptional cases where
-             * real and imaginary parts of input are finite.
-             */
-            if (xfinite && yfinite) {
-                return exprm_ns::sinh(exprm_ns::complex<realT>(in));
-            }
-            /*
-             * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as dNaN.
-             *
-             * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
-             * The sign of 0 in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             */
-            if (x == realT(0) && !yfinite) {
-                const realT res_re = sycl::copysign(realT(0), x * (y - y));
-                return resT{res_re, y - y};
-            }
-
-            /*
-             * sinh(+-Inf +- I 0) = +-Inf + I +-0.
-             *
-             * sinh(NaN +- I 0)   = d(NaN) + I +-0.
-             */
-            if (y == realT(0) && !xfinite) {
-                if (std::isnan(x)) {
-                    return resT{x, y};
-                }
-                const realT res_im = sycl::copysign(realT(0), y);
-                return resT{x, res_im};
-            }
-
-            /*
-             * sinh(x +- I Inf) = dNaN + I dNaN.
-             *
-             * sinh(x + I NaN) = d(NaN) + I d(NaN).
-             */
-            if (xfinite && !yfinite) {
-                return resT{y - y, x * (y - y)};
-            }
-
-            /*
-             * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
-             * The sign of Inf in the result is unspecified.  Choice = normally
-             * the same as d(NaN).
-             *
-             * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
-             * The sign of Inf in the result is unspecified.  Choice = always +.
-             *
-             * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
-             */
-            if (!xfinite && !std::isnan(x)) {
-                if (!yfinite) {
-                    return resT{x * x, x * (y - y)};
-                }
-                return resT{x * sycl::cos(y),
-                            std::numeric_limits<realT>::infinity() *
-                                sycl::sin(y)};
-            }
-
-            /*
-             * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
-             *
-             * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
-             *
-             * sinh(NaN + I y)    = d(NaN) + I d(NaN).
-             */
-            return resT{(x * x) * (y - y), (x + x) * (y - y)};
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::sinh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SinhContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           SinhFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SinhStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SinhFunctor<argTy, resTy>>;
-
-template <typename T> struct SinhOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SinhContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class sinh_contig_kernel;
-
-template <typename argTy>
-sycl::event sinh_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using SinhHS = hyperparam_detail::SinhContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SinhHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SinhHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SinhOutputType, SinhContigFunctor, sinh_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SinhContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SinhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sinh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SinhTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::sinh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SinhOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class sinh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-sinh_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, SinhOutputType, SinhStridedFunctor, sinh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SinhStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SinhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sinh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace sinh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
deleted file mode 100644
index ed80521acd..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ /dev/null
@@ -1,217 +0,0 @@
-//=== sqrt.hpp -   Unary function SQRT                   ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SQRT(x)
-/// function that compute a square root.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace sqrt
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct SqrtFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-            return exprm_ns::sqrt(exprm_ns::complex<realT>(in));
-        }
-        else {
-            return sycl::sqrt(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SqrtContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           SqrtFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SqrtStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SqrtFunctor<argTy, resTy>>;
-
-template <typename T> struct SqrtOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float, float>,
-        td_ns::TypeMapResultEntry<T, double, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>, std::complex<float>>,
-        td_ns::
-            TypeMapResultEntry<T, std::complex<double>, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SqrtContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class sqrt_contig_kernel;
-
-template <typename argTy>
-sycl::event sqrt_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using SqrtHS = hyperparam_detail::SqrtContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SqrtHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SqrtHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SqrtOutputType, SqrtContigFunctor, sqrt_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SqrtContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SqrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sqrt_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SqrtTypeMapFactory
-{
-    /*! @brief get typeid for output type of std::sqrt(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SqrtOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class sqrt_strided_kernel;
-
-template <typename argTy>
-sycl::event
-sqrt_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, SqrtOutputType, SqrtStridedFunctor, sqrt_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SqrtStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SqrtOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = sqrt_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace sqrt
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
deleted file mode 100644
index 7aa12452cc..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ /dev/null
@@ -1,242 +0,0 @@
-//=== square.hpp -   Unary function SQUARE       ------         *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of SQUARE(x)
-///
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace square
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-using dpctl::tensor::type_utils::vec_cast;
-
-template <typename argT, typename resT> struct SquareFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            auto z = exprm_ns::complex<realT>(in);
-
-            return z * z;
-        }
-        else {
-            return in * in;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT, vec_sz> &in) const
-    {
-        auto const &res_vec = in * in;
-        using deducedT = typename std::remove_cv_t<
-            std::remove_reference_t<decltype(res_vec)>>::element_type;
-        if constexpr (std::is_same_v<resT, deducedT>) {
-            return res_vec;
-        }
-        else {
-            return vec_cast<resT, deducedT, vec_sz>(res_vec);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SquareContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           SquareFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using SquareStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, SquareFunctor<argTy, resTy>>;
-
-template <typename T> struct SquareOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, bool, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint8_t>,
-        td_ns::TypeMapResultEntry<T, std::uint16_t>,
-        td_ns::TypeMapResultEntry<T, std::uint32_t>,
-        td_ns::TypeMapResultEntry<T, std::uint64_t>,
-        td_ns::TypeMapResultEntry<T, std::int8_t>,
-        td_ns::TypeMapResultEntry<T, std::int16_t>,
-        td_ns::TypeMapResultEntry<T, std::int32_t>,
-        td_ns::TypeMapResultEntry<T, std::int64_t>,
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct SquareContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class square_contig_kernel;
-
-template <typename argTy>
-sycl::event square_contig_impl(sycl::queue &exec_q,
-                               std::size_t nelems,
-                               const char *arg_p,
-                               char *res_p,
-                               const std::vector<sycl::event> &depends = {})
-{
-    using SquareHS = hyperparam_detail::SquareContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = SquareHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SquareHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, SquareOutputType, SquareContigFunctor, square_contig_kernel,
-        vec_sz, n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct SquareContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SquareOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = square_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct SquareTypeMapFactory
-{
-    /*! @brief get typeid for output type of x * x */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SquareOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class square_strided_kernel;
-
-template <typename argTy>
-sycl::event
-square_strided_impl(sycl::queue &exec_q,
-                    std::size_t nelems,
-                    int nd,
-                    const ssize_t *shape_and_strides,
-                    const char *arg_p,
-                    ssize_t arg_offset,
-                    char *res_p,
-                    ssize_t res_offset,
-                    const std::vector<sycl::event> &depends,
-                    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, SquareOutputType, SquareStridedFunctor, square_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct SquareStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SquareOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = square_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace square
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
deleted file mode 100644
index ee817c2941..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ /dev/null
@@ -1,634 +0,0 @@
-//=== subtract.hpp -   Binary function SUBTRACT         ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace subtract
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT> struct SubtractFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        return in1 - in2;
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = in1 - in2;
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SubtractContigFunctor =
-    elementwise_common::BinaryContigFunctor<argT1,
-                                            argT2,
-                                            resT,
-                                            SubtractFunctor<argT1, argT2, resT>,
-                                            vec_sz,
-                                            n_vecs,
-                                            enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    SubtractFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct SubtractOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct SubtractContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class subtract_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event subtract_contig_impl(sycl::queue &exec_q,
-                                 std::size_t nelems,
-                                 const char *arg1_p,
-                                 ssize_t arg1_offset,
-                                 const char *arg2_p,
-                                 ssize_t arg2_offset,
-                                 char *res_p,
-                                 ssize_t res_offset,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using SubHS =
-        hyperparam_detail::SubtractContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, SubtractOutputType, SubtractContigFunctor,
-        subtract_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct SubtractContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = subtract_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct SubtractTypeMapFactory
-{
-    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename SubtractOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class subtract_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-subtract_strided_impl(sycl::queue &exec_q,
-                      std::size_t nelems,
-                      int nd,
-                      const ssize_t *shape_and_strides,
-                      const char *arg1_p,
-                      ssize_t arg1_offset,
-                      const char *arg2_p,
-                      ssize_t arg2_offset,
-                      char *res_p,
-                      ssize_t res_offset,
-                      const std::vector<sycl::event> &depends,
-                      const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor,
-        subtract_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
-                                 arg1_offset, arg2_p, arg2_offset, res_p,
-                                 res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct SubtractStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = subtract_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-using SubtractContigMatrixContigRowBroadcastingFunctor =
-    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        SubtractFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-using SubtractContigRowContigMatrixBroadcastingFunctor =
-    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        SubtractFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-class subtract_matrix_row_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-class subtract_row_matrix_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event subtract_contig_matrix_contig_row_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] - vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
-        argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor,
-        subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
-                                              mat_offset, vec_p, vec_offset,
-                                              res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct SubtractContigMatrixContigRowBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename SubtractOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    subtract_contig_matrix_contig_row_broadcast_impl<T1, T2,
-                                                                     resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event subtract_contig_row_contig_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = op(vec[j], mat[i,j])
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl<
-        argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor,
-        subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p,
-                                              vec_offset, mat_p, mat_offset,
-                                              res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct SubtractContigRowContigMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename SubtractOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    subtract_contig_row_contig_matrix_broadcast_impl<T1, T2,
-                                                                     resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT, typename resT> struct SubtractInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-
-    void operator()(resT &res, const argT &in) { res -= in; }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        res -= in;
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using SubtractInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        SubtractInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using SubtractInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        SubtractInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class subtract_inplace_contig_kernel;
-
-/* @brief Types supported by in-place subtraction */
-template <typename argTy, typename resTy> struct SubtractInplaceTypePairSupport
-{
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    resTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    resTy,
-                                    std::complex<double>>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct SubtractInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of x -= y */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (SubtractInplaceTypePairSupport<argT, resT>::is_defined) {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argTy, typename resTy>
-sycl::event
-subtract_inplace_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using SubHS =
-        hyperparam_detail::SubtractContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, SubtractInplaceContigFunctor,
-        subtract_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct SubtractInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = subtract_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class subtract_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event subtract_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, SubtractInplaceStridedFunctor,
-        subtract_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
-                                         arg_p, arg_offset, res_p, res_offset,
-                                         depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct SubtractInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = subtract_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT>
-class subtract_inplace_row_matrix_broadcast_sg_krn;
-
-template <typename argT, typename resT>
-using SubtractInplaceRowMatrixBroadcastingFunctor =
-    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
-        argT,
-        resT,
-        SubtractInplaceFunctor<argT, resT>>;
-
-template <typename argT, typename resT>
-sycl::event subtract_inplace_row_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
-        argT, resT, SubtractInplaceRowMatrixBroadcastingFunctor,
-        subtract_inplace_row_matrix_broadcast_sg_krn>(
-        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
-        depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct SubtractInplaceRowMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn = subtract_inplace_row_matrix_broadcast_impl<T1, T2>;
-                return fn;
-            }
-        }
-    }
-};
-
-} // namespace subtract
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
deleted file mode 100644
index 3b5a1b9e7b..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-//=== sycl_complex.hpp ----------------------------------------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines a macro for defining the SYCL_EXT_ONEAPI_COMPLEX macro
-/// and indirect inclusion of the experimental oneAPI SYCL complex extension
-/// header file.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-
-#define SYCL_EXT_ONEAPI_COMPLEX
-#if __has_include(<sycl/ext/oneapi/experimental/sycl_complex.hpp>)
-#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
-#else
-#include <sycl/ext/oneapi/experimental/complex/complex.hpp>
-#endif
-
-namespace exprm_ns = sycl::ext::oneapi::experimental;
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
deleted file mode 100644
index fc67ca9a25..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ /dev/null
@@ -1,266 +0,0 @@
-//=== tan.hpp -   Unary function TAN                    ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of TAN(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace tan
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct TanFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-            /*
-             * since tan(in) = -I * tanh(I * in), for special cases,
-             * we calculate real and imaginary parts of z = tanh(I * in) and
-             * return { imag(z) , -real(z) } which is tan(in).
-             */
-            const realT x = -std::imag(in);
-            const realT y = std::real(in);
-            /*
-             * tanh(NaN + i 0) = NaN + i 0
-             *
-             * tanh(NaN + i y) = NaN + i NaN        for y != 0
-             *
-             * The imaginary part has the sign of x*sin(2*y), but there's no
-             * special effort to get this right.
-             *
-             * tanh(+-Inf +- i Inf) = +-1 +- 0
-             *
-             * tanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
-             *
-             * The imaginary part of the sign is unspecified.  This special
-             * case is only needed to avoid a spurious invalid exception when
-             * y is infinite.
-             */
-            if (!std::isfinite(x)) {
-                if (std::isnan(x)) {
-                    const realT tanh_re = x;
-                    const realT tanh_im = (y == realT(0) ? y : x * y);
-                    return resT{tanh_im, -tanh_re};
-                }
-                const realT tanh_re = sycl::copysign(realT(1), x);
-                const realT tanh_im = sycl::copysign(
-                    realT(0), std::isinf(y) ? y : sycl::sin(y) * sycl::cos(y));
-                return resT{tanh_im, -tanh_re};
-            }
-            /*
-             * tanh(x + i NAN) = NaN + i NaN for non-zero x
-             * tanh(x +- i Inf) = NaN + i NaN for non-zero x
-             * tanh(0 + i NAN) = 0 + i NaN
-             * tanh(0 +- i Inf) = 0 + i NaN
-             */
-            if (!std::isfinite(y)) {
-                if (x == realT(0)) {
-                    return resT{q_nan, x};
-                }
-                return resT{q_nan, q_nan};
-            }
-            /* ordinary cases */
-            return exprm_ns::tan(exprm_ns::complex<realT>(in)); // tan(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::tan(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using TanContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           TanFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using TanStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, TanFunctor<argTy, resTy>>;
-
-template <typename T> struct TanOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct TanContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class tan_contig_kernel;
-
-template <typename argTy>
-sycl::event tan_contig_impl(sycl::queue &exec_q,
-                            std::size_t nelems,
-                            const char *arg_p,
-                            char *res_p,
-                            const std::vector<sycl::event> &depends = {})
-{
-    using TanHS = hyperparam_detail::TanContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = TanHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = TanHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, TanOutputType, TanContigFunctor, tan_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct TanContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!TanOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = tan_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct TanTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::tan(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename TanOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class tan_strided_kernel;
-
-template <typename argTy>
-sycl::event tan_strided_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             ssize_t arg_offset,
-                             char *res_p,
-                             ssize_t res_offset,
-                             const std::vector<sycl::event> &depends,
-                             const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, TanOutputType, TanStridedFunctor, tan_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct TanStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!TanOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = tan_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace tan
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
deleted file mode 100644
index dda2e2914f..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ /dev/null
@@ -1,261 +0,0 @@
-//=== tanh.hpp -   Unary function TANH                     ------
-//*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of TANH(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace tanh
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct TanhFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (is_complex<argT>::value) {
-            using realT = typename argT::value_type;
-
-            static constexpr realT q_nan =
-                std::numeric_limits<realT>::quiet_NaN();
-
-            const realT x = std::real(in);
-            const realT y = std::imag(in);
-            /*
-             * tanh(NaN + i 0) = NaN + i 0
-             *
-             * tanh(NaN + i y) = NaN + i NaN        for y != 0
-             *
-             * The imaginary part has the sign of x*sin(2*y), but there's no
-             * special effort to get this right.
-             *
-             * tanh(+-Inf +- i Inf) = +-1 +- 0
-             *
-             * tanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
-             *
-             * The imaginary part of the sign is unspecified.  This special
-             * case is only needed to avoid a spurious invalid exception when
-             * y is infinite.
-             */
-            if (!std::isfinite(x)) {
-                if (std::isnan(x)) {
-                    return resT{q_nan, (y == realT(0) ? y : q_nan)};
-                }
-                const realT res_re = sycl::copysign(realT(1), x);
-                const realT res_im = sycl::copysign(
-                    realT(0), std::isinf(y) ? y : sycl::sin(y) * sycl::cos(y));
-                return resT{res_re, res_im};
-            }
-            /*
-             * tanh(x + i NAN) = NaN + i NaN for non-zero x
-             * tanh(x +- i Inf) = NaN + i NaN for non-zero x
-             * tanh(0 + i NAN) = 0 + i NaN
-             * tanh(0 +- i Inf) = 0 + i NaN
-             */
-            if (!std::isfinite(y)) {
-                if (x == realT(0)) {
-                    return resT{x, q_nan};
-                }
-                return resT{q_nan, q_nan};
-            }
-            /* ordinary cases */
-            return exprm_ns::tanh(exprm_ns::complex<realT>(in)); // tanh(in);
-        }
-        else {
-            static_assert(std::is_floating_point_v<argT> ||
-                          std::is_same_v<argT, sycl::half>);
-            return sycl::tanh(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using TanhContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           TanhFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using TanhStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, TanhFunctor<argTy, resTy>>;
-
-template <typename T> struct TanhOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::TypeMapResultEntry<T, sycl::half>,
-        td_ns::TypeMapResultEntry<T, float>,
-        td_ns::TypeMapResultEntry<T, double>,
-        td_ns::TypeMapResultEntry<T, std::complex<float>>,
-        td_ns::TypeMapResultEntry<T, std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct TanhContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class tanh_contig_kernel;
-
-template <typename argTy>
-sycl::event tanh_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const char *arg_p,
-                             char *res_p,
-                             const std::vector<sycl::event> &depends = {})
-{
-    using TanhHS = hyperparam_detail::TanhContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = TanhHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = TanhHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, TanhOutputType, TanhContigFunctor, tanh_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct TanhContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!TanhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = tanh_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct TanhTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::tanh(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename TanhOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class tanh_strided_kernel;
-
-template <typename argTy>
-sycl::event
-tanh_strided_impl(sycl::queue &exec_q,
-                  std::size_t nelems,
-                  int nd,
-                  const ssize_t *shape_and_strides,
-                  const char *arg_p,
-                  ssize_t arg_offset,
-                  char *res_p,
-                  ssize_t res_offset,
-                  const std::vector<sycl::event> &depends,
-                  const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, TanhOutputType, TanhStridedFunctor, tanh_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct TanhStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!TanhOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = tanh_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace tanh
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
deleted file mode 100644
index 1aa3fbd482..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ /dev/null
@@ -1,663 +0,0 @@
-//=== true_divide.hpp -   Binary function DIVIDE         ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2)
-/// function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "sycl_complex.hpp"
-#include "vec_size_util.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace true_divide
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-template <typename argT1, typename argT2, typename resT>
-struct TrueDivideFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-
-    resT operator()(const argT1 &in1, const argT2 &in2) const
-    {
-        if constexpr (tu_ns::is_complex<argT1>::value &&
-                      tu_ns::is_complex<argT2>::value)
-        {
-            using realT1 = typename argT1::value_type;
-            using realT2 = typename argT2::value_type;
-
-            return exprm_ns::complex<realT1>(in1) /
-                   exprm_ns::complex<realT2>(in2);
-        }
-        else if constexpr (tu_ns::is_complex<argT1>::value &&
-                           !tu_ns::is_complex<argT2>::value)
-        {
-            using realT1 = typename argT1::value_type;
-
-            return exprm_ns::complex<realT1>(in1) / in2;
-        }
-        else if constexpr (!tu_ns::is_complex<argT1>::value &&
-                           tu_ns::is_complex<argT2>::value)
-        {
-            using realT2 = typename argT2::value_type;
-
-            return in1 / exprm_ns::complex<realT2>(in2);
-        }
-        else {
-            return in1 / in2;
-        }
-    }
-
-    template <int vec_sz>
-    sycl::vec<resT, vec_sz>
-    operator()(const sycl::vec<argT1, vec_sz> &in1,
-               const sycl::vec<argT2, vec_sz> &in2) const
-    {
-        auto tmp = in1 / in2;
-        if constexpr (std::is_same_v<resT,
-                                     typename decltype(tmp)::element_type>)
-        {
-            return tmp;
-        }
-        else {
-            using dpctl::tensor::type_utils::vec_cast;
-
-            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
-                tmp);
-        }
-    }
-};
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using TrueDivideContigFunctor = elementwise_common::BinaryContigFunctor<
-    argT1,
-    argT2,
-    resT,
-    TrueDivideFunctor<argT1, argT2, resT>,
-    vec_sz,
-    n_vecs,
-    enable_sg_loadstore>;
-
-template <typename argT1, typename argT2, typename resT, typename IndexerT>
-using TrueDivideStridedFunctor = elementwise_common::BinaryStridedFunctor<
-    argT1,
-    argT2,
-    resT,
-    IndexerT,
-    TrueDivideFunctor<argT1, argT2, resT>>;
-
-template <typename T1, typename T2> struct TrueDivideOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        float,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        float,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        double,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        double,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::BinaryContigHyperparameterSetEntry;
-using vsu_ns::ContigHyperparameterSetDefault;
-
-template <typename argTy1, typename argTy2>
-struct TrueDivideContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename argT1,
-          typename argT2,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class true_divide_contig_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-true_divide_contig_impl(sycl::queue &exec_q,
-                        std::size_t nelems,
-                        const char *arg1_p,
-                        ssize_t arg1_offset,
-                        const char *arg2_p,
-                        ssize_t arg2_offset,
-                        char *res_p,
-                        ssize_t res_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    using DivHS =
-        hyperparam_detail::TrueDivideContigHyperparameterSet<argTy1, argTy2>;
-    static constexpr std::uint8_t vec_sz = DivHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = DivHS::n_vecs;
-
-    return elementwise_common::binary_contig_impl<
-        argTy1, argTy2, TrueDivideOutputType, TrueDivideContigFunctor,
-        true_divide_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2> struct TrueDivideContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = true_divide_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideTypeMapFactory
-{
-    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename TrueDivideOutputType<T1, T2>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename resT, typename IndexerT>
-class true_divide_strided_kernel;
-
-template <typename argTy1, typename argTy2>
-sycl::event
-true_divide_strided_impl(sycl::queue &exec_q,
-                         std::size_t nelems,
-                         int nd,
-                         const ssize_t *shape_and_strides,
-                         const char *arg1_p,
-                         ssize_t arg1_offset,
-                         const char *arg2_p,
-                         ssize_t arg2_offset,
-                         char *res_p,
-                         ssize_t res_offset,
-                         const std::vector<sycl::event> &depends,
-                         const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_strided_impl<
-        argTy1, argTy2, TrueDivideOutputType, TrueDivideStridedFunctor,
-        true_divide_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
-        arg2_offset, res_p, res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = true_divide_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-using TrueDivideContigMatrixContigRowBroadcastingFunctor =
-    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        TrueDivideFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-using TrueDivideContigRowContigMatrixBroadcastingFunctor =
-    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
-        argT1,
-        argT2,
-        resT,
-        TrueDivideFunctor<argT1, argT2, resT>>;
-
-template <typename argT1, typename argT2, typename resT>
-class true_divide_matrix_row_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-class true_divide_row_matrix_broadcast_sg_krn;
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event true_divide_contig_matrix_contig_row_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] / vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
-        argT1, argT2, resT, TrueDivideContigMatrixContigRowBroadcastingFunctor,
-        true_divide_matrix_row_broadcast_sg_krn>(
-        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
-        res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideContigMatrixContigRowBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename TrueDivideOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    true_divide_contig_matrix_contig_row_broadcast_impl<T1, T2,
-                                                                        resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT1, typename argT2, typename resT>
-sycl::event true_divide_contig_row_contig_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
-                 //    res[i,j] = mat[i,j] + vec[j]
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl<
-        argT1, argT2, resT, TrueDivideContigRowContigMatrixBroadcastingFunctor,
-        true_divide_row_matrix_broadcast_sg_krn>(
-        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, res_p,
-        res_offset, depends);
-};
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideContigRowContigMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using resT = typename TrueDivideOutputType<T1, T2>::value_type;
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value ||
-                          dpctl::tensor::type_utils::is_complex<resT>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn =
-                    true_divide_contig_row_contig_matrix_broadcast_impl<T1, T2,
-                                                                        resT>;
-                return fn;
-            }
-        }
-    }
-};
-
-template <typename argT, typename resT> struct TrueDivideInplaceFunctor
-{
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
-
-    void operator()(resT &res, const argT &in)
-    {
-        if constexpr (tu_ns::is_complex<resT>::value) {
-            if constexpr (tu_ns::is_complex<argT>::value) {
-                using res_rT = typename resT::value_type;
-                using arg_rT = typename argT::value_type;
-
-                auto res1 = exprm_ns::complex<res_rT>(res);
-                res1 /= exprm_ns::complex<arg_rT>(in);
-                res = res1;
-            }
-            else {
-                using res_rT = typename resT::value_type;
-
-                auto res1 = exprm_ns::complex<res_rT>(res);
-                res1 /= in;
-                res = res1;
-            }
-        }
-        else {
-            res /= in;
-        }
-    }
-
-    template <int vec_sz>
-    void operator()(sycl::vec<resT, vec_sz> &res,
-                    const sycl::vec<argT, vec_sz> &in)
-    {
-        res /= in;
-    }
-};
-
-/* @brief Types supported by in-place divide */
-template <typename argTy, typename resTy>
-struct TrueDivideInplaceTypePairSupport
-{
-
-    /* value if true a kernel for <argTy, resTy> must be instantiated  */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
-        td_ns::TypePairDefinedEntry<argTy, float, resTy, std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    resTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy, double, resTy, std::complex<double>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    resTy,
-                                    std::complex<double>>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename argT, typename resT>
-struct TrueDivideInplaceTypeMapFactory
-{
-    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        if constexpr (TrueDivideInplaceTypePairSupport<argT, resT>::is_defined)
-        {
-            return td_ns::GetTypeid<resT>{}.get();
-        }
-        else {
-            return td_ns::GetTypeid<void>{}.get();
-        }
-    }
-};
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using TrueDivideInplaceContigFunctor =
-    elementwise_common::BinaryInplaceContigFunctor<
-        argT,
-        resT,
-        TrueDivideInplaceFunctor<argT, resT>,
-        vec_sz,
-        n_vecs,
-        enable_sg_loadstore>;
-
-template <typename argT, typename resT, typename IndexerT>
-using TrueDivideInplaceStridedFunctor =
-    elementwise_common::BinaryInplaceStridedFunctor<
-        argT,
-        resT,
-        IndexerT,
-        TrueDivideInplaceFunctor<argT, resT>>;
-
-template <typename argT,
-          typename resT,
-          std::uint8_t vec_sz,
-          std::uint8_t n_vecs>
-class true_divide_inplace_contig_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event
-true_divide_inplace_contig_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const char *arg_p,
-                                ssize_t arg_offset,
-                                char *res_p,
-                                ssize_t res_offset,
-                                const std::vector<sycl::event> &depends = {})
-{
-    using DivHS =
-        hyperparam_detail::TrueDivideContigHyperparameterSet<resTy, argTy>;
-    static constexpr std::uint8_t vec_sz = DivHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = DivHS::vec_sz;
-
-    return elementwise_common::binary_inplace_contig_impl<
-        argTy, resTy, TrueDivideInplaceContigFunctor,
-        true_divide_inplace_contig_kernel, vec_sz, n_vecs>(
-        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideInplaceContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = true_divide_inplace_contig_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename resT, typename argT, typename IndexerT>
-class true_divide_inplace_strided_kernel;
-
-template <typename argTy, typename resTy>
-sycl::event true_divide_inplace_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t nelems,
-    int nd,
-    const ssize_t *shape_and_strides,
-    const char *arg_p,
-    ssize_t arg_offset,
-    char *res_p,
-    ssize_t res_offset,
-    const std::vector<sycl::event> &depends,
-    const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::binary_inplace_strided_impl<
-        argTy, resTy, TrueDivideInplaceStridedFunctor,
-        true_divide_inplace_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideInplaceStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = true_divide_inplace_strided_impl<T1, T2>;
-            return fn;
-        }
-    }
-};
-
-template <typename argT, typename resT>
-class true_divide_inplace_row_matrix_broadcast_sg_krn;
-
-template <typename argT, typename resT>
-using TrueDivideInplaceRowMatrixBroadcastingFunctor =
-    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
-        argT,
-        resT,
-        TrueDivideInplaceFunctor<argT, resT>>;
-
-template <typename argT, typename resT>
-sycl::event true_divide_inplace_row_matrix_broadcast_impl(
-    sycl::queue &exec_q,
-    std::vector<sycl::event> &host_tasks,
-    std::size_t n0,
-    std::size_t n1,
-    const char *vec_p, // typeless pointer to (n1,) contiguous row
-    ssize_t vec_offset,
-    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
-    ssize_t mat_offset,
-    const std::vector<sycl::event> &depends = {})
-{
-    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
-        argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor,
-        true_divide_inplace_row_matrix_broadcast_sg_krn>(
-        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
-        depends);
-}
-
-template <typename fnT, typename T1, typename T2>
-struct TrueDivideInplaceRowMatrixBroadcastFactory
-{
-    fnT get()
-    {
-        if constexpr (!TrueDivideInplaceTypePairSupport<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
-                          dpctl::tensor::type_utils::is_complex<T2>::value)
-            {
-                fnT fn = nullptr;
-                return fn;
-            }
-            else {
-                fnT fn = true_divide_inplace_row_matrix_broadcast_impl<T1, T2>;
-                return fn;
-            }
-        }
-    }
-};
-
-} // namespace true_divide
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
deleted file mode 100644
index 61a37ca1b2..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ /dev/null
@@ -1,218 +0,0 @@
-//=== trunc.hpp -   Unary function TRUNC                ------  *-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for elementwise evaluation of TRUNC(x) function.
-//===---------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "vec_size_util.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-#include "utils/offset_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace trunc
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::type_utils::is_complex;
-
-template <typename argT, typename resT> struct TruncFunctor
-{
-
-    // is function constant for given argT
-    using is_constant = typename std::false_type;
-    // constant value, if constant
-    // constexpr resT constant_value = resT{};
-    // is function defined for sycl::vec
-    using supports_vec = typename std::false_type;
-    // do both argTy and resTy support sugroup store/load operation
-    using supports_sg_loadstore = typename std::negation<
-        std::disjunction<is_complex<resT>, is_complex<argT>>>;
-
-    resT operator()(const argT &in) const
-    {
-        if constexpr (std::is_integral_v<argT>) {
-            return in;
-        }
-        else {
-            return sycl::trunc(in);
-        }
-    }
-};
-
-template <typename argTy,
-          typename resTy = argTy,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-using TruncContigFunctor =
-    elementwise_common::UnaryContigFunctor<argTy,
-                                           resTy,
-                                           TruncFunctor<argTy, resTy>,
-                                           vec_sz,
-                                           n_vecs,
-                                           enable_sg_loadstore>;
-
-template <typename argTy, typename resTy, typename IndexerT>
-using TruncStridedFunctor = elementwise_common::
-    UnaryStridedFunctor<argTy, resTy, IndexerT, TruncFunctor<argTy, resTy>>;
-
-template <typename T> struct TruncOutputType
-{
-    using value_type =
-        typename std::disjunction<td_ns::TypeMapResultEntry<T, bool>,
-                                  td_ns::TypeMapResultEntry<T, std::uint8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::uint64_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int8_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int16_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int32_t>,
-                                  td_ns::TypeMapResultEntry<T, std::int64_t>,
-                                  td_ns::TypeMapResultEntry<T, sycl::half>,
-                                  td_ns::TypeMapResultEntry<T, float>,
-                                  td_ns::TypeMapResultEntry<T, double>,
-                                  td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-namespace hyperparam_detail
-{
-
-namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
-
-using vsu_ns::ContigHyperparameterSetDefault;
-using vsu_ns::UnaryContigHyperparameterSetEntry;
-
-template <typename argTy> struct TruncContigHyperparameterSet
-{
-    using value_type =
-        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
-
-    constexpr static auto vec_sz = value_type::vec_sz;
-    constexpr static auto n_vecs = value_type::n_vecs;
-};
-
-} // end of namespace hyperparam_detail
-
-template <typename T1, typename T2, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class trunc_contig_kernel;
-
-template <typename argTy>
-sycl::event trunc_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              const char *arg_p,
-                              char *res_p,
-                              const std::vector<sycl::event> &depends = {})
-{
-    using TruncHS = hyperparam_detail::TruncContigHyperparameterSet<argTy>;
-    static constexpr std::uint8_t vec_sz = TruncHS::vec_sz;
-    static constexpr std::uint8_t n_vecs = TruncHS::n_vecs;
-
-    return elementwise_common::unary_contig_impl<
-        argTy, TruncOutputType, TruncContigFunctor, trunc_contig_kernel, vec_sz,
-        n_vecs>(exec_q, nelems, arg_p, res_p, depends);
-}
-
-template <typename fnT, typename T> struct TruncContigFactory
-{
-    fnT get()
-    {
-        if constexpr (!TruncOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = trunc_contig_impl<T>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct TruncTypeMapFactory
-{
-    /*! @brief get typeid for output type of sycl::trunc(T x) */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT = typename TruncOutputType<T>::value_type;
-        return td_ns::GetTypeid<rT>{}.get();
-    }
-};
-
-template <typename T1, typename T2, typename T3> class trunc_strided_kernel;
-
-template <typename argTy>
-sycl::event
-trunc_strided_impl(sycl::queue &exec_q,
-                   std::size_t nelems,
-                   int nd,
-                   const ssize_t *shape_and_strides,
-                   const char *arg_p,
-                   ssize_t arg_offset,
-                   char *res_p,
-                   ssize_t res_offset,
-                   const std::vector<sycl::event> &depends,
-                   const std::vector<sycl::event> &additional_depends)
-{
-    return elementwise_common::unary_strided_impl<
-        argTy, TruncOutputType, TruncStridedFunctor, trunc_strided_kernel>(
-        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
-        res_offset, depends, additional_depends);
-}
-
-template <typename fnT, typename T> struct TruncStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (!TruncOutputType<T>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            fnT fn = trunc_strided_impl<T>;
-            return fn;
-        }
-    }
-};
-
-} // namespace trunc
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
deleted file mode 100644
index ae4024cfbe..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-//=== vec_size_utils.hpp -                            -------/ /*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines utilities for selection of hyperparameters for kernels
-/// implementing unary and binary elementwise functions for contiguous inputs
-//===---------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstdint>
-#include <type_traits>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace vec_size_utils
-{
-
-template <typename Ty1,
-          typename ArgTy1,
-          typename Ty2,
-          typename ArgTy2,
-          std::uint8_t vec_sz_v,
-          std::uint8_t n_vecs_v>
-struct BinaryContigHyperparameterSetEntry
-    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
-{
-    static constexpr std::uint8_t vec_sz = vec_sz_v;
-    static constexpr std::uint8_t n_vecs = n_vecs_v;
-};
-
-template <typename Ty,
-          typename ArgTy,
-          std::uint8_t vec_sz_v,
-          std::uint8_t n_vecs_v>
-struct UnaryContigHyperparameterSetEntry : std::is_same<Ty, ArgTy>
-{
-    static constexpr std::uint8_t vec_sz = vec_sz_v;
-    static constexpr std::uint8_t n_vecs = n_vecs_v;
-};
-
-template <std::uint8_t vec_sz_v, std::uint8_t n_vecs_v>
-struct ContigHyperparameterSetDefault : std::true_type
-{
-    static constexpr std::uint8_t vec_sz = vec_sz_v;
-    static constexpr std::uint8_t n_vecs = n_vecs_v;
-};
-
-} // end of namespace vec_size_utils
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
deleted file mode 100644
index 75807d3446..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//=== indexing.hpp -  Implementation of indexing kernels ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for advanced tensor index operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/indexing_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace indexing
-{
-
-using dpctl::tensor::ssize_t;
-
-template <typename ProjectorT,
-          typename OrthogIndexer,
-          typename IndicesIndexer,
-          typename AxesIndexer,
-          typename T,
-          typename indT>
-class TakeFunctor
-{
-private:
-    const char *src_ = nullptr;
-    char *dst_ = nullptr;
-    char **ind_ = nullptr;
-    int k_ = 0;
-    std::size_t ind_nelems_ = 0;
-    const ssize_t *axes_shape_and_strides_ = nullptr;
-    OrthogIndexer orthog_strider;
-    IndicesIndexer ind_strider;
-    AxesIndexer axes_strider;
-
-public:
-    TakeFunctor(const char *src_cp,
-                char *dst_cp,
-                char **ind_cp,
-                int k,
-                std::size_t ind_nelems,
-                const ssize_t *axes_shape_and_strides,
-                const OrthogIndexer &orthog_strider_,
-                const IndicesIndexer &ind_strider_,
-                const AxesIndexer &axes_strider_)
-        : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k),
-          ind_nelems_(ind_nelems),
-          axes_shape_and_strides_(axes_shape_and_strides),
-          orthog_strider(orthog_strider_), ind_strider(ind_strider_),
-          axes_strider(axes_strider_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        const T *src = reinterpret_cast<const T *>(src_);
-        T *dst = reinterpret_cast<T *>(dst_);
-
-        ssize_t i_orthog = id / ind_nelems_;
-        ssize_t i_along = id - (i_orthog * ind_nelems_);
-
-        auto orthog_offsets = orthog_strider(i_orthog);
-
-        ssize_t src_offset = orthog_offsets.get_first_offset();
-        ssize_t dst_offset = orthog_offsets.get_second_offset();
-
-        static constexpr ProjectorT proj{};
-        for (int axis_idx = 0; axis_idx < k_; ++axis_idx) {
-            indT *ind_data = reinterpret_cast<indT *>(ind_[axis_idx]);
-
-            ssize_t ind_offset = ind_strider(i_along, axis_idx);
-            // proj produces an index in the range of the given axis
-            ssize_t projected_idx =
-                proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]);
-            src_offset +=
-                projected_idx * axes_shape_and_strides_[k_ + axis_idx];
-        }
-
-        dst_offset += axes_strider(i_along);
-
-        dst[dst_offset] = src[src_offset];
-    }
-};
-
-template <typename ProjectorT,
-          typename OrthogIndexer,
-          typename IndicesIndexer,
-          typename AxesIndexer,
-          typename T,
-          typename indT>
-class take_kernel;
-
-typedef sycl::event (*take_fn_ptr_t)(sycl::queue &,
-                                     std::size_t,
-                                     std::size_t,
-                                     int,
-                                     int,
-                                     int,
-                                     const ssize_t *,
-                                     const ssize_t *,
-                                     const ssize_t *,
-                                     const char *,
-                                     char *,
-                                     char **,
-                                     ssize_t,
-                                     ssize_t,
-                                     const ssize_t *,
-                                     const std::vector<sycl::event> &);
-
-template <typename ProjectorT, typename Ty, typename indT>
-sycl::event take_impl(sycl::queue &q,
-                      std::size_t orthog_nelems,
-                      std::size_t ind_nelems,
-                      int nd,
-                      int ind_nd,
-                      int k,
-                      const ssize_t *orthog_shape_and_strides,
-                      const ssize_t *axes_shape_and_strides,
-                      const ssize_t *ind_shape_and_strides,
-                      const char *src_p,
-                      char *dst_p,
-                      char **ind_p,
-                      ssize_t src_offset,
-                      ssize_t dst_offset,
-                      const ssize_t *ind_offsets,
-                      const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event take_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using OrthogIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset,
-                                            orthog_shape_and_strides};
-
-        using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset;
-        const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets,
-                                                ind_shape_and_strides};
-
-        using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const AxesIndexerT axes_indexer{ind_nd, 0,
-                                        axes_shape_and_strides + (2 * k)};
-
-        using KernelName =
-            take_kernel<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
-                        AxesIndexerT, Ty, indT>;
-
-        const std::size_t gws = orthog_nelems * ind_nelems;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(gws),
-            TakeFunctor<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
-                        AxesIndexerT, Ty, indT>(
-                src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides,
-                orthog_indexer, indices_indexer, axes_indexer));
-    });
-
-    return take_ev;
-}
-
-template <typename ProjectorT,
-          typename OrthogIndexer,
-          typename IndicesIndexer,
-          typename AxesIndexer,
-          typename T,
-          typename indT>
-class PutFunctor
-{
-private:
-    char *dst_ = nullptr;
-    const char *val_ = nullptr;
-    char **ind_ = nullptr;
-    int k_ = 0;
-    std::size_t ind_nelems_ = 0;
-    const ssize_t *axes_shape_and_strides_ = nullptr;
-    OrthogIndexer orthog_strider;
-    IndicesIndexer ind_strider;
-    AxesIndexer axes_strider;
-
-public:
-    PutFunctor(char *dst_cp,
-               const char *val_cp,
-               char **ind_cp,
-               int k,
-               std::size_t ind_nelems,
-               const ssize_t *axes_shape_and_strides,
-               const OrthogIndexer &orthog_strider_,
-               const IndicesIndexer &ind_strider_,
-               const AxesIndexer &axes_strider_)
-        : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k),
-          ind_nelems_(ind_nelems),
-          axes_shape_and_strides_(axes_shape_and_strides),
-          orthog_strider(orthog_strider_), ind_strider(ind_strider_),
-          axes_strider(axes_strider_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        T *dst = reinterpret_cast<T *>(dst_);
-        const T *val = reinterpret_cast<const T *>(val_);
-
-        ssize_t i_orthog = id / ind_nelems_;
-        ssize_t i_along = id - (i_orthog * ind_nelems_);
-
-        auto orthog_offsets = orthog_strider(i_orthog);
-
-        ssize_t dst_offset = orthog_offsets.get_first_offset();
-        ssize_t val_offset = orthog_offsets.get_second_offset();
-
-        static constexpr ProjectorT proj{};
-        for (int axis_idx = 0; axis_idx < k_; ++axis_idx) {
-            indT *ind_data = reinterpret_cast<indT *>(ind_[axis_idx]);
-
-            ssize_t ind_offset = ind_strider(i_along, axis_idx);
-
-            // proj produces an index in the range of the given axis
-            ssize_t projected_idx =
-                proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]);
-            dst_offset +=
-                projected_idx * axes_shape_and_strides_[k_ + axis_idx];
-        }
-
-        val_offset += axes_strider(i_along);
-
-        dst[dst_offset] = val[val_offset];
-    }
-};
-
-template <typename ProjectorT,
-          typename OrthogIndexer,
-          typename IndicesIndexer,
-          typename AxesIndexer,
-          typename T,
-          typename indT>
-class put_kernel;
-
-typedef sycl::event (*put_fn_ptr_t)(sycl::queue &,
-                                    std::size_t,
-                                    std::size_t,
-                                    int,
-                                    int,
-                                    int,
-                                    const ssize_t *,
-                                    const ssize_t *,
-                                    const ssize_t *,
-                                    char *,
-                                    const char *,
-                                    char **,
-                                    ssize_t,
-                                    ssize_t,
-                                    const ssize_t *,
-                                    const std::vector<sycl::event> &);
-
-template <typename ProjectorT, typename Ty, typename indT>
-sycl::event put_impl(sycl::queue &q,
-                     std::size_t orthog_nelems,
-                     std::size_t ind_nelems,
-                     int nd,
-                     int ind_nd,
-                     int k,
-                     const ssize_t *orthog_shape_and_strides,
-                     const ssize_t *axes_shape_and_strides,
-                     const ssize_t *ind_shape_and_strides,
-                     char *dst_p,
-                     const char *val_p,
-                     char **ind_p,
-                     ssize_t dst_offset,
-                     ssize_t val_offset,
-                     const ssize_t *ind_offsets,
-                     const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::type_utils::validate_type_for_device<Ty>(q);
-
-    sycl::event put_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using OrthogIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset,
-                                            orthog_shape_and_strides};
-
-        using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset;
-        const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets,
-                                                ind_shape_and_strides};
-
-        using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const AxesIndexerT axes_indexer{ind_nd, 0,
-                                        axes_shape_and_strides + (2 * k)};
-
-        using KernelName =
-            put_kernel<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
-                       AxesIndexerT, Ty, indT>;
-
-        const std::size_t gws = orthog_nelems * ind_nelems;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(gws),
-            PutFunctor<ProjectorT, OrthogIndexerT, NthStrideIndexerT,
-                       AxesIndexerT, Ty, indT>(
-                dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides,
-                orthog_indexer, indices_indexer, axes_indexer));
-    });
-
-    return put_ev;
-}
-
-template <typename fnT, typename T, typename indT> struct TakeWrapFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<indT>::value &&
-                      !std::is_same<indT, bool>::value)
-        {
-            using dpctl::tensor::indexing_utils::WrapIndex;
-            fnT fn = take_impl<WrapIndex<indT>, T, indT>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T, typename indT> struct TakeClipFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<indT>::value &&
-                      !std::is_same<indT, bool>::value)
-        {
-            using dpctl::tensor::indexing_utils::ClipIndex;
-            fnT fn = take_impl<ClipIndex<indT>, T, indT>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T, typename indT> struct PutWrapFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<indT>::value &&
-                      !std::is_same<indT, bool>::value)
-        {
-            using dpctl::tensor::indexing_utils::WrapIndex;
-            fnT fn = put_impl<WrapIndex<indT>, T, indT>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T, typename indT> struct PutClipFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_integral<indT>::value &&
-                      !std::is_same<indT, bool>::value)
-        {
-            using dpctl::tensor::indexing_utils::ClipIndex;
-            fnT fn = put_impl<ClipIndex<indT>, T, indT>;
-            return fn;
-        }
-        else {
-            fnT fn = nullptr;
-            return fn;
-        }
-    }
-};
-
-} // namespace indexing
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
deleted file mode 100644
index ceec04c0a1..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp
+++ /dev/null
@@ -1,1401 +0,0 @@
-//=== dot_product.hpp - Implementation of dot product kernels - *-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for the vector dot product.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/reductions.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-template <typename lhsT,
-          typename rhsT,
-          typename outT,
-          typename BatchIndexerT,
-          typename RedIndexerT>
-struct SequentialDotProduct
-{
-private:
-    const lhsT *lhs_ = nullptr;
-    const rhsT *rhs_ = nullptr;
-    outT *out_ = nullptr;
-    BatchIndexerT batch_indexer_;
-    RedIndexerT reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-
-public:
-    SequentialDotProduct(const lhsT *lhs,
-                         const rhsT *rhs,
-                         outT *out,
-                         BatchIndexerT batch_indexer,
-                         RedIndexerT reduced_dims_indexer,
-                         std::size_t reduction_size)
-        : lhs_(lhs), rhs_(rhs), out_(out), batch_indexer_(batch_indexer),
-          reduced_dims_indexer_(reduced_dims_indexer),
-          reduction_max_gid_(reduction_size)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-
-        auto const &batch_offsets = batch_indexer_(id[0]);
-        const ssize_t &lhs_batch_offset = batch_offsets.get_first_offset();
-        const ssize_t &rhs_batch_offset = batch_offsets.get_second_offset();
-        const ssize_t &out_batch_offset = batch_offsets.get_third_offset();
-
-        outT red_val(0);
-        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
-            auto reduction_offsets = reduced_dims_indexer_(m);
-            auto lhs_reduction_offset = reduction_offsets.get_first_offset();
-            auto rhs_reduction_offset = reduction_offsets.get_second_offset();
-
-            using dpctl::tensor::type_utils::convert_impl;
-            red_val += convert_impl<outT, lhsT>(
-                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
-                       convert_impl<outT, rhsT>(
-                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
-        }
-
-        out_[out_batch_offset] = red_val;
-    }
-};
-
-template <typename lhsT,
-          typename rhsT,
-          typename outT,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT>
-struct DotProductFunctor
-{
-private:
-    const lhsT *lhs_ = nullptr;
-    const rhsT *rhs_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOpT reduction_op_;
-    BatchIndexerT batch_indexer_;
-    RedIndexerT reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t batches_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    DotProductFunctor(const lhsT *lhs,
-                      const rhsT *rhs,
-                      outT *res,
-                      const ReductionOpT &reduction_op,
-                      const BatchIndexerT &batch_indexer,
-                      const RedIndexerT &arg_reduced_dims_indexer,
-                      std::size_t reduction_size,
-                      std::size_t iteration_size,
-                      std::size_t reduction_size_per_wi)
-        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
-          batch_indexer_(batch_indexer),
-          reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size), batches_(iteration_size),
-          reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t batch_id = it.get_group(0) % batches_;
-        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
-
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-        // for each input
-
-        const auto &batch_offsets_ = batch_indexer_(batch_id);
-        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
-        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
-        const auto &out_batch_offset = batch_offsets_.get_third_offset();
-
-        outT local_red_val(0);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
-            const auto &lhs_reduction_offset =
-                reduction_offsets_.get_first_offset();
-            const auto &rhs_reduction_offset =
-                reduction_offsets_.get_second_offset();
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val = convert_impl<outT, lhsT>(
-                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
-                       convert_impl<outT, rhsT>(
-                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
-
-            local_red_val += val;
-        }
-
-        auto work_group = it.get_group();
-        outT red_val_over_wg = sycl::reduce_over_group(
-            work_group, local_red_val, outT(0), reduction_op_);
-
-        if (work_group.leader()) {
-            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
-                             sycl::memory_scope::device,
-                             sycl::access::address_space::global_space>
-                res_ref(out_[out_batch_offset]);
-            res_ref += red_val_over_wg;
-        }
-    }
-};
-
-template <typename lhsT,
-          typename rhsT,
-          typename outT,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT,
-          typename SlmT>
-struct DotProductCustomFunctor
-{
-private:
-    const lhsT *lhs_ = nullptr;
-    const rhsT *rhs_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOpT reduction_op_;
-    BatchIndexerT batch_indexer_;
-    RedIndexerT reduced_dims_indexer_;
-    SlmT local_mem_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t batches_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    DotProductCustomFunctor(const lhsT *lhs,
-                            const rhsT *rhs,
-                            outT *res,
-                            const ReductionOpT &reduction_op,
-                            const BatchIndexerT &batch_indexer,
-                            const RedIndexerT &arg_reduced_dims_indexer,
-                            SlmT local_mem,
-                            std::size_t reduction_size,
-                            std::size_t iteration_size,
-                            std::size_t reduction_size_per_wi)
-        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
-          batch_indexer_(batch_indexer),
-          reduced_dims_indexer_(arg_reduced_dims_indexer),
-          local_mem_(local_mem), reduction_max_gid_(reduction_size),
-          batches_(iteration_size), reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t batch_id = it.get_group(0) % batches_;
-        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
-
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-        // for each input
-
-        const auto &batch_offsets_ = batch_indexer_(batch_id);
-        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
-        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
-        const auto &out_batch_offset = batch_offsets_.get_third_offset();
-
-        outT local_red_val(0);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
-            const auto &lhs_reduction_offset =
-                reduction_offsets_.get_first_offset();
-            const auto &rhs_reduction_offset =
-                reduction_offsets_.get_second_offset();
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val = convert_impl<outT, lhsT>(
-                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
-                       convert_impl<outT, rhsT>(
-                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
-
-            local_red_val += val;
-        }
-
-        auto work_group = it.get_group();
-        outT red_val_over_wg = su_ns::custom_reduce_over_group(
-            work_group, local_mem_, local_red_val, reduction_op_);
-
-        if (work_group.leader()) {
-            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
-                             sycl::memory_scope::device,
-                             sycl::access::address_space::global_space>
-                res_ref(out_[out_batch_offset]);
-            res_ref += red_val_over_wg;
-        }
-    }
-};
-
-template <
-    typename lhsTy,
-    typename rhsTy,
-    typename resTy,
-    typename BatchIndexerT,
-    typename RedIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
-sycl::event sequential_dot_product(sycl::queue &exec_q,
-                                   const lhsTy *lhs,
-                                   const rhsTy *rhs,
-                                   resTy *res,
-                                   std::size_t batches,
-                                   std::size_t reduction_nelems,
-                                   const BatchIndexerT &batch_indexer,
-                                   const RedIndexerT &reduction_indexer,
-                                   const std::vector<sycl::event> &depends)
-{
-    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.parallel_for<
-            kernel_name_token<lhsTy, rhsTy, resTy, BatchIndexerT, RedIndexerT>>(
-            sycl::range<1>(batches),
-            SequentialDotProduct<lhsTy, rhsTy, resTy, BatchIndexerT,
-                                 RedIndexerT>(lhs, rhs, res, batch_indexer,
-                                              reduction_indexer,
-                                              reduction_nelems));
-    });
-
-    return dot_ev;
-}
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    typename T4,
-                    typename T5,
-                    typename T6>
-          class kernel_name_token>
-sycl::event submit_atomic_dot_product(sycl::queue &exec_q,
-                                      const lhsTy *lhs,
-                                      const rhsTy *rhs,
-                                      resTy *res,
-                                      std::size_t wg,
-                                      std::size_t batches,
-                                      std::size_t reduction_nelems,
-                                      std::size_t reductions_per_wi,
-                                      std::size_t reduction_groups,
-                                      const BatchIndexerT &batch_indexer,
-                                      const RedIndexerT &reduction_indexer,
-                                      const std::vector<sycl::event> &depends)
-{
-    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        auto globalRange = sycl::range<1>{batches * reduction_groups * wg};
-        auto localRange = sycl::range<1>{wg};
-        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
-
-        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
-            using KernelName =
-                class kernel_name_token<lhsTy, rhsTy, resTy, ReductionOpT,
-                                        BatchIndexerT, RedIndexerT>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange, DotProductFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
-                                           BatchIndexerT, RedIndexerT>(
-                             lhs, rhs, res, ReductionOpT(), batch_indexer,
-                             reduction_indexer, reduction_nelems, batches,
-                             reductions_per_wi));
-        }
-        else {
-            using SlmT = sycl::local_accessor<resTy, 1>;
-            SlmT local_memory = SlmT(localRange, cgh);
-
-            using KernelName = class custom_reduction_wrapper<kernel_name_token<
-                lhsTy, rhsTy, resTy, ReductionOpT, BatchIndexerT, RedIndexerT>>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                DotProductCustomFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
-                                        BatchIndexerT, RedIndexerT, SlmT>(
-                    lhs, rhs, res, ReductionOpT(), batch_indexer,
-                    reduction_indexer, local_memory, reduction_nelems, batches,
-                    reductions_per_wi));
-        }
-    });
-    return dot_ev;
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class dot_product_seq_krn;
-
-template <typename T1, typename T2, typename T3> class dot_product_init_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6>
-class dot_product_krn;
-
-typedef sycl::event (*dot_product_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    const char *,
-    char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event dot_product_impl(sycl::queue &exec_q,
-                             std::size_t batches,
-                             std::size_t reduction_nelems,
-                             const char *lhs_cp,
-                             const char *rhs_cp,
-                             char *res_cp,
-                             int batch_nd,
-                             const ssize_t *batch_shape_and_strides,
-                             ssize_t batch_lhs_offset,
-                             ssize_t batch_rhs_offset,
-                             ssize_t batch_res_offset,
-                             int red_nd,
-                             const ssize_t *reduction_shape_stride,
-                             ssize_t reduction_lhs_offset,
-                             ssize_t reduction_rhs_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset,
-            batch_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
-                                                  reduction_rhs_offset,
-                                                  reduction_shape_stride};
-
-        sycl::event dot_ev =
-            sequential_dot_product<lhsTy, rhsTy, resTy,
-                                   InputOutputBatchIndexerT, ReductionIndexerT,
-                                   dot_product_seq_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
-                inp_out_batch_indexer, reduction_indexer, depends);
-
-        return dot_ev;
-    }
-    else {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            using IndexerT =
-                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-            const ssize_t *const &res_shape = batch_shape_and_strides;
-            const ssize_t *const &res_strides =
-                batch_shape_and_strides + 3 * batch_nd;
-            const IndexerT res_indexer(batch_nd, batch_res_offset, res_shape,
-                                       res_strides);
-            using InitKernelName =
-                class dot_product_init_krn<lhsTy, rhsTy, resTy>;
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<InitKernelName>(
-                sycl::range<1>(batches), [=](sycl::id<1> id) {
-                    auto res_offset = res_indexer(id[0]);
-                    res_tp[res_offset] = 0;
-                });
-        });
-
-        using ReductionOpT = sycl::plus<resTy>;
-
-        using BatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset,
-                                          batch_rhs_offset, batch_res_offset,
-                                          batch_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
-                                                  reduction_rhs_offset,
-                                                  reduction_shape_stride};
-
-        static constexpr std::size_t preferred_reductions_per_wi =
-            4; // determined experimentally
-        std::size_t reductions_per_wi =
-            (reduction_nelems < preferred_reductions_per_wi * wg)
-                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferred_reductions_per_wi;
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-
-        sycl::event dot_ev =
-            submit_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
-                                      BatchIndexerT, ReductionIndexerT,
-                                      dot_product_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
-                reductions_per_wi, reduction_groups, batch_indexer,
-                reduction_indexer, {res_init_ev});
-
-        return dot_ev;
-    }
-}
-
-typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    const char *,
-    char *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event
-dot_product_contig_impl(sycl::queue &exec_q,
-                        std::size_t batches,
-                        std::size_t reduction_nelems,
-                        const char *lhs_cp,
-                        const char *rhs_cp,
-                        char *res_cp,
-                        ssize_t batch_lhs_offset,
-                        ssize_t batch_rhs_offset,
-                        ssize_t batch_res_offset,
-                        ssize_t reduction_lhs_offset,
-                        ssize_t reduction_rhs_offset,
-                        const std::vector<sycl::event> &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp) +
-                          batch_lhs_offset + reduction_lhs_offset;
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp) +
-                          batch_rhs_offset + reduction_rhs_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + batch_res_offset;
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputBatchIndexerT =
-            dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-
-        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
-                                                   /* step */ reduction_nelems};
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
-                                                             NoOpIndexerT{}};
-
-        sycl::event dot_ev =
-            sequential_dot_product<lhsTy, rhsTy, resTy,
-                                   InputOutputBatchIndexerT, ReductionIndexerT,
-                                   dot_product_seq_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
-                inp_out_batch_indexer, reduction_indexer, depends);
-
-        return dot_ev;
-    }
-    else {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            cgh.fill<resTy>(res_tp, resTy(0), batches);
-        });
-
-        using ReductionOpT = sycl::plus<resTy>;
-
-        using InputBatchIndexerT =
-            dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-
-        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
-                                                   /* step */ reduction_nelems};
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
-                                                             NoOpIndexerT{}};
-
-        static constexpr std::size_t preferred_reductions_per_wi =
-            4; // determined experimentally
-        std::size_t reductions_per_wi =
-            (reduction_nelems < preferred_reductions_per_wi * wg)
-                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferred_reductions_per_wi;
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-
-        sycl::event dot_ev =
-            submit_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
-                                      InputOutputBatchIndexerT,
-                                      ReductionIndexerT, dot_product_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
-                reductions_per_wi, reduction_groups, inp_out_batch_indexer,
-                reduction_indexer, {res_init_ev});
-
-        return dot_ev;
-    }
-}
-
-template <typename lhsT,
-          typename rhsT,
-          typename outT,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT>
-struct DotProductNoAtomicFunctor
-{
-private:
-    const lhsT *lhs_ = nullptr;
-    const rhsT *rhs_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOpT reduction_op_;
-    BatchIndexerT batch_indexer_;
-    RedIndexerT reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t batches_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    DotProductNoAtomicFunctor(const lhsT *lhs,
-                              const rhsT *rhs,
-                              outT *res,
-                              const ReductionOpT &reduction_op,
-                              const BatchIndexerT &batch_indexer,
-                              const RedIndexerT &arg_reduced_dims_indexer,
-                              std::size_t reduction_size,
-                              std::size_t iteration_size,
-                              std::size_t reduction_size_per_wi)
-        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
-          batch_indexer_(batch_indexer),
-          reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size), batches_(iteration_size),
-          reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t batch_id = it.get_group(0) % batches_;
-        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
-        const std::size_t n_reduction_groups = it.get_group_range(0) / batches_;
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-        // for each input
-
-        const auto &batch_offsets_ = batch_indexer_(batch_id);
-        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
-        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
-        const auto &out_batch_offset = batch_offsets_.get_third_offset();
-
-        outT local_red_val(0);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
-            const auto &lhs_reduction_offset =
-                reduction_offsets_.get_first_offset();
-            const auto &rhs_reduction_offset =
-                reduction_offsets_.get_second_offset();
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val = convert_impl<outT, lhsT>(
-                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
-                       convert_impl<outT, rhsT>(
-                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
-
-            local_red_val += val;
-        }
-
-        auto work_group = it.get_group();
-
-        using RedOpT = typename std::conditional<std::is_same_v<outT, bool>,
-                                                 sycl::logical_or<outT>,
-                                                 sycl::plus<outT>>::type;
-        outT red_val_over_wg = sycl::reduce_over_group(
-            work_group, local_red_val, outT(0), RedOpT());
-
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            out_[out_batch_offset * n_reduction_groups + reduction_batch_id] =
-                red_val_over_wg;
-        }
-    }
-};
-
-template <typename lhsT,
-          typename rhsT,
-          typename outT,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT,
-          typename SlmT>
-struct DotProductNoAtomicCustomFunctor
-{
-private:
-    const lhsT *lhs_ = nullptr;
-    const rhsT *rhs_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOpT reduction_op_;
-    BatchIndexerT batch_indexer_;
-    RedIndexerT reduced_dims_indexer_;
-    SlmT local_mem_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t batches_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    DotProductNoAtomicCustomFunctor(const lhsT *lhs,
-                                    const rhsT *rhs,
-                                    outT *res,
-                                    const ReductionOpT &reduction_op,
-                                    const BatchIndexerT &batch_indexer,
-                                    const RedIndexerT &arg_reduced_dims_indexer,
-                                    SlmT local_mem,
-                                    std::size_t reduction_size,
-                                    std::size_t iteration_size,
-                                    std::size_t reduction_size_per_wi)
-        : lhs_(lhs), rhs_(rhs), out_(res), reduction_op_(reduction_op),
-          batch_indexer_(batch_indexer),
-          reduced_dims_indexer_(arg_reduced_dims_indexer),
-          local_mem_(local_mem), reduction_max_gid_(reduction_size),
-          batches_(iteration_size), reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t batch_id = it.get_group(0) % batches_;
-        const std::size_t reduction_batch_id = it.get_group(0) / batches_;
-        const std::size_t n_reduction_groups = it.get_group_range(0) / batches_;
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-        // for each input
-
-        const auto &batch_offsets_ = batch_indexer_(batch_id);
-        const auto &lhs_batch_offset = batch_offsets_.get_first_offset();
-        const auto &rhs_batch_offset = batch_offsets_.get_second_offset();
-        const auto &out_batch_offset = batch_offsets_.get_third_offset();
-
-        outT local_red_val(0);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto reduction_offsets_ = reduced_dims_indexer_(arg_reduce_gid);
-            const auto &lhs_reduction_offset =
-                reduction_offsets_.get_first_offset();
-            const auto &rhs_reduction_offset =
-                reduction_offsets_.get_second_offset();
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val = convert_impl<outT, lhsT>(
-                           lhs_[lhs_batch_offset + lhs_reduction_offset]) *
-                       convert_impl<outT, rhsT>(
-                           rhs_[rhs_batch_offset + rhs_reduction_offset]);
-
-            local_red_val += val;
-        }
-
-        auto work_group = it.get_group();
-
-        outT red_val_over_wg = su_ns::custom_reduce_over_group(
-            work_group, local_mem_, local_red_val, reduction_op_);
-
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            out_[out_batch_offset * n_reduction_groups + reduction_batch_id] =
-                red_val_over_wg;
-        }
-    }
-};
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename BatchIndexerT,
-          typename RedIndexerT,
-          template <typename T1,
-                    typename T2,
-                    typename T3,
-                    typename T4,
-                    typename T5,
-                    typename T6>
-          class kernel_name_token>
-sycl::event
-submit_no_atomic_dot_product(sycl::queue &exec_q,
-                             const lhsTy *lhs,
-                             const rhsTy *rhs,
-                             resTy *res,
-                             std::size_t wg,
-                             std::size_t batches,
-                             std::size_t reduction_nelems,
-                             std::size_t reductions_per_wi,
-                             std::size_t reduction_groups,
-                             const BatchIndexerT &batch_indexer,
-                             const RedIndexerT &reduction_indexer,
-                             const std::vector<sycl::event> &depends)
-{
-    sycl::event dot_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        auto globalRange = sycl::range<1>{batches * reduction_groups * wg};
-        auto localRange = sycl::range<1>{wg};
-        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
-
-        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
-            using KernelName =
-                class kernel_name_token<lhsTy, rhsTy, resTy, ReductionOpT,
-                                        BatchIndexerT, RedIndexerT>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                DotProductNoAtomicFunctor<lhsTy, rhsTy, resTy, ReductionOpT,
-                                          BatchIndexerT, RedIndexerT>(
-                    lhs, rhs, res, ReductionOpT(), batch_indexer,
-                    reduction_indexer, reduction_nelems, batches,
-                    reductions_per_wi));
-        }
-        else {
-            using SlmT = sycl::local_accessor<resTy, 1>;
-            SlmT local_memory = SlmT(localRange, cgh);
-
-            using KernelName = class custom_reduction_wrapper<kernel_name_token<
-                lhsTy, rhsTy, resTy, ReductionOpT, BatchIndexerT, RedIndexerT>>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                DotProductNoAtomicCustomFunctor<lhsTy, rhsTy, resTy,
-                                                ReductionOpT, BatchIndexerT,
-                                                RedIndexerT, SlmT>(
-                    lhs, rhs, res, ReductionOpT(), batch_indexer,
-                    reduction_indexer, local_memory, reduction_nelems, batches,
-                    reductions_per_wi));
-        }
-    });
-    return dot_ev;
-}
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6>
-class dot_product_tree_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class dot_product_tree_reduction_krn;
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event dot_product_tree_impl(sycl::queue &exec_q,
-                                  std::size_t batches,
-                                  std::size_t reduction_nelems,
-                                  const char *lhs_cp,
-                                  const char *rhs_cp,
-                                  char *res_cp,
-                                  int batch_nd,
-                                  const ssize_t *batch_shape_and_strides,
-                                  ssize_t batch_lhs_offset,
-                                  ssize_t batch_rhs_offset,
-                                  ssize_t batch_res_offset,
-                                  int red_nd,
-                                  const ssize_t *reduction_shape_stride,
-                                  ssize_t reduction_lhs_offset,
-                                  ssize_t reduction_rhs_offset,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            batch_nd, batch_lhs_offset, batch_rhs_offset, batch_res_offset,
-            batch_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
-                                                  reduction_rhs_offset,
-                                                  reduction_shape_stride};
-
-        sycl::event dot_ev =
-            sequential_dot_product<lhsTy, rhsTy, resTy,
-                                   InputOutputBatchIndexerT, ReductionIndexerT,
-                                   dot_product_seq_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
-                inp_out_batch_indexer, reduction_indexer, depends);
-
-        return dot_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    using ReductionOpT = typename std::conditional<std::is_same_v<resTy, bool>,
-                                                   sycl::logical_or<resTy>,
-                                                   sycl::plus<resTy>>::type;
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        using BatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const BatchIndexerT batch_indexer{batch_nd, batch_lhs_offset,
-                                          batch_rhs_offset, batch_res_offset,
-                                          batch_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_lhs_offset,
-                                                  reduction_rhs_offset,
-                                                  reduction_shape_stride};
-
-        if (batches == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event dot_ev =
-            submit_no_atomic_dot_product<lhsTy, rhsTy, resTy, ReductionOpT,
-                                         BatchIndexerT, ReductionIndexerT,
-                                         dot_product_tree_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
-                reductions_per_wi, reduction_groups, batch_indexer,
-                reduction_indexer, depends);
-
-        return dot_ev;
-    }
-    else {
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        // returns unique_ptr
-        auto partially_reduced_tmp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                batches * (reduction_groups + second_iter_reduction_groups_),
-                exec_q);
-
-        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * batches;
-
-        sycl::event first_reduction_ev;
-        {
-            using LhsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-            using RhsIndexerT =
-                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputBatchIndexerT =
-                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                    LhsIndexerT, RhsIndexerT, ResIndexerT>;
-            using ReductionIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-            const LhsIndexerT lhs_indexer(batch_nd, batch_lhs_offset,
-                                          batch_shape_and_strides);
-            const RhsIndexerT rhs_indexer(
-                batch_nd, batch_rhs_offset, batch_shape_and_strides,
-                batch_shape_and_strides + 2 * batch_nd);
-            static constexpr ResIndexerT noop_tmp_indexer{};
-
-            const InputOutputBatchIndexerT in_out_iter_indexer{
-                lhs_indexer, rhs_indexer, noop_tmp_indexer};
-            const ReductionIndexerT reduction_indexer{
-                red_nd, reduction_lhs_offset, reduction_rhs_offset,
-                reduction_shape_stride};
-
-            first_reduction_ev = submit_no_atomic_dot_product<
-                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
-                ReductionIndexerT, dot_product_tree_krn>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
-                reduction_nelems, preferred_reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ batches,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev =
-                dpctl::tensor::kernels::submit_no_atomic_reduction<
-                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                    ReductionIndexerT, dot_product_tree_reduction_krn>(
-                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            dependent_ev = std::move(partial_reduction_ev);
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ batches,
-                                        /* step */ remaining_reduction_nelems};
-        const ResIndexerT res_iter_indexer{
-            batch_nd, batch_res_offset,
-            /* shape */ batch_shape_and_strides,
-            /* strides */ batch_shape_and_strides + 2 * batch_nd};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            dpctl::tensor::kernels::submit_no_atomic_reduction<
-                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, dot_product_tree_reduction_krn>(
-                exec_q, temp_arg, res_tp, identity_val, wg, batches,
-                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        // transfer ownership of USM allocation to host_task
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
-
-        return cleanup_host_task_event;
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event
-dot_product_contig_tree_impl(sycl::queue &exec_q,
-                             std::size_t batches,
-                             std::size_t reduction_nelems,
-                             const char *lhs_cp,
-                             const char *rhs_cp,
-                             char *res_cp,
-                             ssize_t batch_lhs_offset,
-                             ssize_t batch_rhs_offset,
-                             ssize_t batch_res_offset,
-                             ssize_t reduction_lhs_offset,
-                             ssize_t reduction_rhs_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp) +
-                          batch_lhs_offset + reduction_lhs_offset;
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp) +
-                          batch_rhs_offset + reduction_rhs_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + batch_res_offset;
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputBatchIndexerT =
-            dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-
-        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
-                                                   /* step */ reduction_nelems};
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
-                                                             NoOpIndexerT{}};
-
-        sycl::event dot_ev =
-            sequential_dot_product<lhsTy, rhsTy, resTy,
-                                   InputOutputBatchIndexerT, ReductionIndexerT,
-                                   dot_product_seq_krn>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batches, reduction_nelems,
-                inp_out_batch_indexer, reduction_indexer, depends);
-
-        return dot_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    using ReductionOpT = typename std::conditional<std::is_same_v<resTy, bool>,
-                                                   sycl::logical_or<resTy>,
-                                                   sycl::plus<resTy>>::type;
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        using InputBatchIndexerT =
-            dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputBatchIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-
-        const InputBatchIndexerT inp_batch_indexer{/* size */ batches,
-                                                   /* step */ reduction_nelems};
-        const InputOutputBatchIndexerT inp_out_batch_indexer{
-            inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{NoOpIndexerT{},
-                                                             NoOpIndexerT{}};
-
-        if (batches == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event dot_ev = submit_no_atomic_dot_product<
-            lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
-            ReductionIndexerT, dot_product_tree_krn>(
-            exec_q, lhs_tp, rhs_tp, res_tp, wg, batches, reduction_nelems,
-            reductions_per_wi, reduction_groups, inp_out_batch_indexer,
-            reduction_indexer, depends);
-
-        return dot_ev;
-    }
-    else {
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        // unique_ptr that owns temporary allocation for partial reductions
-        auto partially_reduced_tmp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                batches * (reduction_groups + second_iter_reduction_groups_),
-                exec_q);
-        // get raw pointers
-        resTy *partially_reduced_tmp = partially_reduced_tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * batches;
-
-        sycl::event first_reduction_ev;
-        {
-            using InputBatchIndexerT =
-                dpctl::tensor::offset_utils::Strided1DIndexer;
-            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputBatchIndexerT =
-                dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer<
-                    InputBatchIndexerT, InputBatchIndexerT, NoOpIndexerT>;
-            using ReductionIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    NoOpIndexerT, NoOpIndexerT>;
-
-            const InputBatchIndexerT inp_batch_indexer{
-                /* size */ batches,
-                /* step */ reduction_nelems};
-            const InputOutputBatchIndexerT inp_out_batch_indexer{
-                inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}};
-            static constexpr ReductionIndexerT reduction_indexer{
-                NoOpIndexerT{}, NoOpIndexerT{}};
-
-            first_reduction_ev = submit_no_atomic_dot_product<
-                lhsTy, rhsTy, resTy, ReductionOpT, InputOutputBatchIndexerT,
-                ReductionIndexerT, dot_product_tree_krn>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, wg, batches,
-                reduction_nelems, preferred_reductions_per_wi, reduction_groups,
-                inp_out_batch_indexer, reduction_indexer, depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ batches,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev =
-                dpctl::tensor::kernels::submit_no_atomic_reduction<
-                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                    ReductionIndexerT, dot_product_tree_reduction_krn>(
-                    exec_q, temp_arg, temp2_arg, identity_val, wg, batches,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            dependent_ev = std::move(partial_reduction_ev);
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ batches,
-                                        /* step */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            dpctl::tensor::kernels::submit_no_atomic_reduction<
-                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, dot_product_tree_reduction_krn>(
-                exec_q, temp_arg, res_tp, identity_val, wg, batches,
-                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, partially_reduced_tmp_owner);
-
-        return cleanup_host_task_event;
-    }
-}
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
deleted file mode 100644
index 2b5a42af19..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp
+++ /dev/null
@@ -1,4228 +0,0 @@
-//===  gemm.hpp - Implementation of GEMM kernels --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===---------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for general matrix multiplication (GEMM).
-//===---------------------------------------------------------------------===//
-
-#pragma once
-
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/reductions.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-
-namespace gemm_detail
-{
-
-template <typename T, std::size_t m_groups>
-void scale_gemm_k_parameters(const std::size_t &local_mem_size,
-                             const std::size_t &reserved_slm_size,
-                             const std::size_t delta_k,
-                             std::size_t &n_wi,
-                             std::size_t &delta_n)
-{
-    static constexpr std::size_t slm_elem_size = sizeof(T) * m_groups;
-
-    while (slm_elem_size * (n_wi + delta_n) * delta_k + reserved_slm_size >=
-           local_mem_size)
-    {
-        n_wi = n_wi / 2;
-        delta_n = delta_n / 2;
-        if (delta_n == 0)
-            throw std::runtime_error("Insufficient resources");
-    }
-}
-
-template <typename T, int wi_delta_m>
-void scale_gemm_nm_parameters(const std::size_t &local_mem_size,
-                              const std::size_t &reserved_slm_size,
-                              const std::size_t &wi_delta_n,
-                              std::size_t &wi_delta_k,
-                              std::size_t &wg_delta_n,
-                              std::size_t &wg_delta_m)
-{
-    static constexpr std::size_t slm_A_elem_size = sizeof(T);
-    static constexpr std::size_t slm_B_elem_size = sizeof(T) * wi_delta_m;
-
-    while ((wi_delta_n * wg_delta_n * wi_delta_k * slm_A_elem_size) +
-               (wi_delta_k * wg_delta_m * slm_B_elem_size) +
-               reserved_slm_size >=
-           local_mem_size)
-    {
-        wg_delta_n /= 2;
-        wg_delta_m /= 2;
-        wi_delta_k /= 2;
-        if (wg_delta_n == 0)
-            throw std::runtime_error("Insufficient resources");
-    }
-}
-} // namespace gemm_detail
-
-using dpctl::tensor::sycl_utils::choose_workgroup_size;
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class gemm_seq_reduction_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class gemm_tree_reduction_krn;
-
-template <typename T, typename ReductionOpT>
-sycl::event single_reduction_for_gemm(sycl::queue &exec_q,
-                                      T *tmp_tp,
-                                      T *res_tp,
-                                      T identity_val,
-                                      std::size_t iter_nelems,
-                                      std::size_t reduction_nelems,
-                                      std::size_t reduction_groups,
-                                      std::size_t wg,
-                                      std::size_t max_wg,
-                                      std::size_t preferred_reductions_per_wi,
-                                      std::size_t reductions_per_wi,
-                                      int res_nd,
-                                      ssize_t res_offset,
-                                      const ssize_t *res_shapes_strides,
-                                      const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev;
-    if (reduction_nelems < wg) {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides};
-        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
-                                                          res_iter_indexer};
-        const ReductionIndexerT reduction_indexer{/* size   */ reduction_nelems,
-                                                  /* step   */ iter_nelems};
-
-        red_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            sycl::range<1> iter_range{iter_nelems};
-
-            cgh.parallel_for<class gemm_seq_reduction_krn<
-                T, T, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT>>(
-                iter_range,
-                SequentialReduction<T, T, ReductionOpT, InputOutputIterIndexerT,
-                                    ReductionIndexerT>(
-                    tmp_tp, res_tp, ReductionOpT(), identity_val,
-                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
-        });
-    }
-    else {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const ResIndexerT res_iter_indexer{res_nd, 0, res_shapes_strides};
-        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
-                                                          res_iter_indexer};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
-            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-            gemm_tree_reduction_krn>(
-            exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems,
-            reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, depends);
-    }
-    return red_ev;
-}
-
-template <typename T, typename ReductionOpT>
-sycl::event
-single_reduction_for_gemm_contig(sycl::queue &exec_q,
-                                 T *tmp_tp,
-                                 T *res_tp,
-                                 T identity_val,
-                                 std::size_t iter_nelems,
-                                 std::size_t reduction_nelems,
-                                 std::size_t reduction_groups,
-                                 std::size_t wg,
-                                 std::size_t max_wg,
-                                 std::size_t preferred_reductions_per_wi,
-                                 std::size_t reductions_per_wi,
-                                 const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev;
-    if (reduction_nelems < wg) {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
-            NoOpIndexerT{}, NoOpIndexerT{}};
-        // tmp allocation is a C-contiguous matrix (reduction_nelems,
-        // iter_nelems) and we are reducing by axis 0
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        red_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            sycl::range<1> iter_range{iter_nelems};
-
-            cgh.parallel_for<class gemm_seq_reduction_krn<
-                T, T, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT>>(
-                iter_range,
-                SequentialReduction<T, T, ReductionOpT, InputOutputIterIndexerT,
-                                    ReductionIndexerT>(
-                    tmp_tp, res_tp, ReductionOpT(), identity_val,
-                    in_out_iter_indexer, reduction_indexer, reduction_nelems));
-        });
-    }
-    else {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
-            NoOpIndexerT{}, NoOpIndexerT{}};
-        // tmp allocation is a C-contiguous matrix
-        // (reduction_nelems, iter_nelems). Reducing along axis 0
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        red_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
-            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-            gemm_tree_reduction_krn>(
-            exec_q, tmp_tp, res_tp, identity_val, wg, iter_nelems,
-            reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, depends);
-    }
-    return red_ev;
-}
-
-template <typename T, typename ReductionOpT>
-sycl::event tree_reduction_for_gemm(sycl::queue &exec_q,
-                                    T *partially_reduced_tmp,
-                                    T *partially_reduced_tmp2,
-                                    T *res_tp,
-                                    T identity_val,
-                                    std::size_t iter_nelems,
-                                    std::size_t reduction_nelems,
-                                    std::size_t reduction_groups,
-                                    std::size_t wg,
-                                    std::size_t max_wg,
-                                    std::size_t preferred_reductions_per_wi,
-                                    std::size_t reductions_per_wi,
-                                    int res_nd,
-                                    ssize_t res_offset,
-                                    const ssize_t *res_shape_strides,
-                                    const std::vector<sycl::event> &depends)
-{
-    sycl::event first_reduction_ev;
-    {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        static constexpr InputOutputIterIndexerT in_out_iter_indexer{
-            NoOpIndexerT{}, NoOpIndexerT{}};
-        // partially_reduced_tmp is C-contig matrix with shape
-        // (reduction_nelems, iter_nelems). Reducing along axis 0.
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        first_reduction_ev = dpctl::tensor::kernels::submit_no_atomic_reduction<
-            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-            gemm_tree_reduction_krn>(
-            exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val,
-            wg, iter_nelems, reduction_nelems, reductions_per_wi,
-            reduction_groups, in_out_iter_indexer, reduction_indexer, depends);
-    }
-
-    std::size_t remaining_reduction_nelems = reduction_groups;
-
-    T *temp_arg = partially_reduced_tmp2;
-    T *temp2_arg = partially_reduced_tmp;
-    sycl::event dependent_ev = first_reduction_ev;
-
-    while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) {
-        std::size_t reduction_groups_ = (remaining_reduction_nelems +
-                                         preferred_reductions_per_wi * wg - 1) /
-                                        (preferred_reductions_per_wi * wg);
-        assert(reduction_groups_ > 1);
-
-        // keep reducing
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ reduction_groups_};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        sycl::event partial_reduction_ev =
-            dpctl::tensor::kernels::submit_no_atomic_reduction<
-                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-                gemm_tree_reduction_krn>(
-                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
-                remaining_reduction_nelems, reductions_per_wi,
-                reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                {dependent_ev});
-
-        remaining_reduction_nelems = reduction_groups_;
-        std::swap(temp_arg, temp2_arg);
-        dependent_ev = std::move(partial_reduction_ev);
-    }
-
-    // final reduction to res
-    using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-    using ResIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    using InputOutputIterIndexerT =
-        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<InputIndexerT,
-                                                                ResIndexerT>;
-    using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-    const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                    /* step */ remaining_reduction_nelems};
-    const ResIndexerT res_iter_indexer{
-        /* ndim                */ res_nd,
-        /* offset              */ static_cast<ssize_t>(res_offset),
-        /* packed shape_strides*/ res_shape_strides};
-
-    const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                      res_iter_indexer};
-    static constexpr ReductionIndexerT reduction_indexer{};
-
-    wg = max_wg;
-    reductions_per_wi =
-        std::max<std::size_t>(1, (remaining_reduction_nelems + wg - 1) / wg);
-
-    reduction_groups =
-        (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-        (reductions_per_wi * wg);
-    assert(reduction_groups == 1);
-
-    sycl::event final_reduction_ev =
-        dpctl::tensor::kernels::submit_no_atomic_reduction<
-            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-            gemm_tree_reduction_krn>(
-            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
-            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-    return final_reduction_ev;
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class gemm_reduction_over_group_temps_contig_krn;
-
-template <typename T, typename ReductionOpT>
-sycl::event
-tree_reduction_for_gemm_contig(sycl::queue &exec_q,
-                               T *partially_reduced_tmp,
-                               T *partially_reduced_tmp2,
-                               T *res_tp,
-                               T identity_val,
-                               std::size_t iter_nelems,
-                               std::size_t reduction_nelems,
-                               std::size_t reduction_groups,
-                               std::size_t wg,
-                               std::size_t max_wg,
-                               std::size_t preferred_reductions_per_wi,
-                               std::size_t reductions_per_wi,
-                               const std::vector<sycl::event> &depends)
-{
-    using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    using InputOutputIterIndexerT =
-        dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<NoOpIndexerT,
-                                                                NoOpIndexerT>;
-    using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-    static constexpr InputOutputIterIndexerT in_out_iter_indexer{
-        NoOpIndexerT{}, NoOpIndexerT{}};
-    const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                              /* step */ iter_nelems};
-
-    const sycl::event &first_reduction_ev =
-        dpctl::tensor::kernels::submit_no_atomic_reduction<
-            T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-            gemm_reduction_over_group_temps_contig_krn>(
-            exec_q, partially_reduced_tmp, partially_reduced_tmp2, identity_val,
-            wg, iter_nelems, reduction_nelems, reductions_per_wi,
-            reduction_groups, in_out_iter_indexer, reduction_indexer, depends);
-
-    std::size_t remaining_reduction_nelems = reduction_groups;
-
-    T *temp_arg = partially_reduced_tmp2;
-    T *temp2_arg = partially_reduced_tmp;
-    sycl::event dependent_ev = first_reduction_ev;
-
-    while (remaining_reduction_nelems > preferred_reductions_per_wi * max_wg) {
-        std::size_t reduction_groups_ = (remaining_reduction_nelems +
-                                         preferred_reductions_per_wi * wg - 1) /
-                                        (preferred_reductions_per_wi * wg);
-        assert(reduction_groups_ > 1);
-
-        // keep reducing
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        // n * m = iter_nelems because essentially, this process
-        // creates a stack of reduction_nelems 2D matrices and we reduce
-        // along the stack axis
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ reduction_groups_};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        sycl::event partial_reduction_ev =
-            dpctl::tensor::kernels::submit_no_atomic_reduction<
-                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-                gemm_reduction_over_group_temps_contig_krn>(
-                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
-                remaining_reduction_nelems, reductions_per_wi,
-                reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                {dependent_ev});
-
-        remaining_reduction_nelems = reduction_groups_;
-        std::swap(temp_arg, temp2_arg);
-        dependent_ev = std::move(partial_reduction_ev);
-    }
-
-    // final reduction to res
-    {
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{
-            /* size   */ iter_nelems,
-            /* step   */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            dpctl::tensor::kernels::submit_no_atomic_reduction<
-                T, T, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT,
-                gemm_reduction_over_group_temps_contig_krn>(
-                exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
-                remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        return final_reduction_ev;
-    }
-}
-
-template <typename lhsT,
-          typename rhsT,
-          typename resT,
-          typename LocAccT,
-          typename OuterInnerDimsIndexerT,
-          typename BatchDimsIndexerT,
-          std::size_t m_groups>
-class GemmBatchFunctorThreadK
-{
-private:
-    const lhsT *lhs = nullptr;
-    const rhsT *rhs = nullptr;
-    resT *res = nullptr;
-    LocAccT workspace;
-    LocAccT local_B_block;
-    std::size_t n = 0;
-    std::size_t n_blocks = 0;
-    std::size_t delta_n = 0;
-    std::size_t k = 0;
-    std::size_t k_blocks = 0;
-    std::size_t delta_k = 0;
-    std::size_t n_wi = 0;
-    std::size_t m = 0;
-    std::size_t batch_nelems = 0;
-    BatchDimsIndexerT batch_indexer;
-    OuterInnerDimsIndexerT lhs_indexer;
-    OuterInnerDimsIndexerT rhs_indexer;
-    OuterInnerDimsIndexerT res_indexer;
-
-public:
-    GemmBatchFunctorThreadK(const lhsT *lhs_,
-                            const rhsT *rhs_,
-                            resT *res_,
-                            LocAccT workspace_,
-                            LocAccT local_B_block_,
-                            std::size_t n_,
-                            std::size_t n_blocks_,
-                            std::size_t delta_n_,
-                            std::size_t k_,
-                            std::size_t k_blocks_,
-                            std::size_t delta_k_,
-                            std::size_t n_wi_,
-                            std::size_t m_,
-                            std::size_t batch_nelems_,
-                            const BatchDimsIndexerT &batch_indexer_,
-                            const OuterInnerDimsIndexerT &lhs_indexer_,
-                            const OuterInnerDimsIndexerT &rhs_indexer_,
-                            const OuterInnerDimsIndexerT &res_indexer_)
-        : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_),
-          local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_),
-          delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_),
-          n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_),
-          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
-          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        // for batching:
-        // (current matrix in batch) m_id = global_id / (global_range /
-        // batch_nelems) for lhs, offset = m_id * (n * k) for rhs, offset =
-        // m_id
-        // * (k * m) for res, offset = m_id * (n * m)
-        const std::size_t n_groups_per_batch =
-            it.get_group_range(0) / batch_nelems;
-        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
-        const std::size_t gr_id =
-            it.get_group_linear_id() - m_id * n_groups_per_batch;
-        const std::size_t lid = it.get_local_linear_id();
-
-        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
-
-        const auto &lhs_offset = three_offsets_.get_first_offset();
-        const auto &rhs_offset = three_offsets_.get_second_offset();
-        const auto &res_offset = three_offsets_.get_third_offset();
-
-        // lift gr_id -> (block_i, block_j, block_s)
-        //   block_i moves fastest, then block_s, then block_j
-
-        const std::size_t r_size = (n_blocks * k_blocks);
-        // 0 <= block_j < m_blocks,
-        const std::size_t block_j = gr_id / r_size;
-        // 0 <= block_r < n_blocks * k_blocks
-        const std::size_t block_r = gr_id - block_j * r_size;
-        // 0 <= block_s < k_blocks
-        const std::size_t block_s = block_r / n_blocks;
-        // 0 <= block_i < n_blocks
-        const std::size_t block_i = block_r - block_s * n_blocks;
-
-        // 0 <= local_i < delta_n
-        const std::size_t local_i = lid / (delta_k);
-        // 0 <= local_s < delta_k
-        const std::size_t local_s = lid - local_i * (delta_k);
-
-        std::size_t i = block_i * delta_n + local_i;
-        std::size_t j = m_groups * block_j;
-        std::size_t s = block_s * delta_k * n_wi + local_s;
-
-        using accV_t = typename LocAccT::value_type;
-
-        static constexpr resT identity_ = resT(0);
-        if (local_i == 0) {
-            for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) {
-                const std::size_t sq = s + q;
-                const std::size_t sqmj = sq * m + j;
-
-                if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
-                    local_B_block[local_s + q] =
-                        (sq < k && j < m)
-                            ? static_cast<resT>(
-                                  rhs[rhs_offset + rhs_indexer(sqmj)])
-                            : identity_;
-                }
-                else {
-                    accV_t local_B_vec;
-#pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
-                        local_B_vec[vec_idx] =
-                            (sq < k && j + vec_idx < m)
-                                ? static_cast<resT>(
-                                      rhs[rhs_offset +
-                                          rhs_indexer(sqmj + vec_idx)])
-                                : identity_;
-                    }
-                    local_B_block[local_s + q] = local_B_vec;
-                }
-            }
-        }
-
-        it.barrier(sycl::access::fence_space::local_space);
-
-        std::size_t t_shift = block_s * delta_k * n_wi;
-        std::size_t global_s_offset = i * k + t_shift;
-
-        accV_t private_sum(identity_);
-        static constexpr accV_t vec_identity_(identity_);
-        for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) {
-            private_sum +=
-                ((i < n) && (t + t_shift < k))
-                    ? (static_cast<resT>(
-                           lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) *
-                       local_B_block[t])
-                    : vec_identity_;
-        }
-
-        std::size_t workspace_i_shift = local_i * delta_k;
-        workspace[workspace_i_shift + local_s] = private_sum;
-
-        it.barrier(sycl::access::fence_space::local_space);
-
-        if (local_s == 0 && i < n) {
-            accV_t local_sum(workspace[workspace_i_shift]);
-            for (std::size_t t = 1; t < delta_k; ++t) {
-                local_sum += workspace[workspace_i_shift + t];
-            }
-
-            sycl::atomic_ref<resT, sycl::memory_order::relaxed,
-                             sycl::memory_scope::device,
-                             sycl::access::address_space::global_space>
-                aout0(res[res_offset + res_indexer(i * m + j)]);
-
-            if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
-                aout0 += local_sum;
-            }
-            else {
-                aout0 += local_sum[0];
-
-#pragma unroll
-                for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) {
-                    if (j + vec_id < m) {
-                        sycl::atomic_ref<
-                            resT, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device,
-                            sycl::access::address_space::global_space>
-                            aout1(res[res_offset +
-                                      res_indexer(i * m + j + vec_id)]);
-
-                        aout1 += local_sum[vec_id];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template <typename T1, typename T2, typename T3> class gemm_init_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, std::size_t>
-class gemm_k_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, std::size_t>
-class gemm_nm_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          std::size_t>
-class gemm_batch_k_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          std::size_t>
-class gemm_batch_nm_krn;
-
-namespace gemm_detail
-{
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename BatchIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT>
-sycl::event _gemm_k_impl(sycl::queue &exec_q,
-                         const lhsTy *lhs_tp,
-                         const rhsTy *rhs_tp,
-                         resTy *res_tp,
-                         const std::size_t batch_nelems,
-                         const std::size_t n,
-                         const std::size_t k,
-                         const std::size_t m,
-                         const BatchIndexerT &batch_indexer,
-                         const LhsIndexerT &lhs_indexer,
-                         const RhsIndexerT &rhs_indexer,
-                         const ResIndexerT &res_indexer,
-                         const std::vector<sycl::event> &depends)
-{
-    static constexpr std::size_t m_groups = 4;
-    const std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
-    static_assert(std::is_same_v<LhsIndexerT, ResIndexerT>);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    std::size_t n_blocks = (n + delta_n - 1) / delta_n;
-    std::size_t m_blocks = (m + m_groups - 1) / m_groups;
-    std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k);
-
-    std::size_t lws = delta_n * delta_k;
-
-    auto gRange =
-        sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws);
-    auto lRange = sycl::range<1>(lws);
-
-    auto ndRange = sycl::nd_range<1>(gRange, lRange);
-
-    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using LocAccT = sycl::local_accessor<sycl::vec<resTy, m_groups>, 1>;
-        LocAccT local_B_block(n_wi * delta_k, cgh);
-        LocAccT workspace(delta_n * delta_k, cgh);
-
-        using KernelName =
-            class gemm_batch_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
-                                   BatchIndexerT, m_groups>;
-        cgh.parallel_for<KernelName>(
-            ndRange,
-            GemmBatchFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT, LhsIndexerT,
-                                    BatchIndexerT, m_groups>(
-                lhs_tp, rhs_tp, res_tp, std::move(workspace),
-                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
-                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
-                rhs_indexer, res_indexer));
-    });
-    return gemm_ev;
-}
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename BatchIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT>
-sycl::event _gemm_small_m_impl(sycl::queue &exec_q,
-                               const lhsTy *lhs_tp,
-                               const rhsTy *rhs_tp,
-                               resTy *res_tp,
-                               const std::size_t batch_nelems,
-                               const std::size_t n,
-                               const std::size_t k,
-                               const std::size_t m,
-                               const BatchIndexerT &batch_indexer,
-                               const LhsIndexerT &lhs_indexer,
-                               const RhsIndexerT &rhs_indexer,
-                               const ResIndexerT &res_indexer,
-                               const std::vector<sycl::event> &depends)
-{
-    static constexpr std::size_t m_groups = 1;
-    const std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
-    static_assert(std::is_same_v<LhsIndexerT, ResIndexerT>);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    std::size_t n_blocks = (n + delta_n - 1) / delta_n;
-    std::size_t m_blocks = (m + m_groups - 1) / m_groups;
-    std::size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k);
-
-    std::size_t lws = delta_n * delta_k;
-
-    auto gRange =
-        sycl::range<1>(batch_nelems * n_blocks * m_blocks * k_blocks * lws);
-    auto lRange = sycl::range<1>(lws);
-
-    auto ndRange = sycl::nd_range<1>(gRange, lRange);
-
-    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using LocAccT = sycl::local_accessor<resTy, 1>;
-        LocAccT local_B_block(n_wi * delta_k, cgh);
-        LocAccT workspace(delta_n * delta_k, cgh);
-
-        using KernelName =
-            class gemm_batch_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
-                                   BatchIndexerT, m_groups>;
-        cgh.parallel_for<KernelName>(
-            ndRange,
-            GemmBatchFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT, LhsIndexerT,
-                                    BatchIndexerT, m_groups>(
-                lhs_tp, rhs_tp, res_tp, std::move(workspace),
-                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
-                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
-                rhs_indexer, res_indexer));
-    });
-
-    return gemm_ev;
-}
-
-} // end of namespace gemm_detail
-
-template <typename lhsT,
-          typename rhsT,
-          typename resT,
-          typename LocAccT1,
-          typename LocAccT2,
-          typename BatchDimsIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT,
-          std::uint32_t wi_delta_n,
-          std::uint32_t wi_delta_m_vecs,
-          std::uint32_t m_vec_size>
-class GemmBatchFunctorThreadNM_vecm
-{
-private:
-    const lhsT *lhs = nullptr;
-    const rhsT *rhs = nullptr;
-    resT *res = nullptr;
-    LocAccT1 local_lhs_block;
-    LocAccT2 local_rhs_block;
-    std::size_t batch_nelems;
-    std::size_t n = 0;
-    std::size_t k = 0;
-    std::size_t m = 0;
-    std::size_t n_groups = 0;
-    std::uint32_t wg_delta_n = 0;
-    std::uint32_t wg_delta_m = 0;
-    std::uint32_t wi_delta_k = 0;
-    BatchDimsIndexerT batch_indexer;
-    LhsIndexerT lhs_indexer;
-    RhsIndexerT rhs_indexer;
-    ResIndexerT res_indexer;
-
-public:
-    /*! @brief */
-    GemmBatchFunctorThreadNM_vecm(const lhsT *lhs_,
-                                  const rhsT *rhs_,
-                                  resT *res_,
-                                  LocAccT1 local_lhs_block_,
-                                  LocAccT2 local_rhs_block_,
-                                  std::size_t batch_nelems_,
-                                  std::size_t n_,
-                                  std::size_t k_,
-                                  std::size_t m_,
-                                  std::size_t n_groups_,
-                                  std::size_t wg_delta_n_,
-                                  std::size_t wg_delta_m_,
-                                  std::size_t wi_delta_k_,
-                                  const BatchDimsIndexerT &batch_indexer_,
-                                  const LhsIndexerT &lhs_indexer_,
-                                  const RhsIndexerT &rhs_indexer_,
-                                  const ResIndexerT &res_indexer_)
-        : lhs(lhs_), rhs(rhs_), res(res_), local_lhs_block(local_lhs_block_),
-          local_rhs_block(local_rhs_block_), batch_nelems(batch_nelems_), n(n_),
-          k(k_), m(m_), n_groups(n_groups_), wg_delta_n(wg_delta_n_),
-          wg_delta_m(wg_delta_m_), wi_delta_k(wi_delta_k_),
-          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
-          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        static constexpr resT zero_(0);
-        static constexpr std::uint32_t wi_total_delta_m =
-            wi_delta_m_vecs * m_vec_size;
-
-        const std::size_t gws_per_batch = it.get_group_range(0) / batch_nelems;
-        const std::size_t batch_id = it.get_group_linear_id() / gws_per_batch;
-        const std::size_t gr_id =
-            it.get_group_linear_id() - batch_id * gws_per_batch;
-
-        const auto &three_offsets_ =
-            batch_indexer(static_cast<ssize_t>(batch_id));
-
-        const auto &lhs_offset = three_offsets_.get_first_offset();
-        const auto &rhs_offset = three_offsets_.get_second_offset();
-        const auto &res_offset = three_offsets_.get_third_offset();
-
-        // 0 <= block_j < m_groups
-        const std::size_t block_j = gr_id / n_groups;
-        // 0 <= block_i < n_groups
-        const std::size_t block_i = gr_id - block_j * n_groups;
-
-        // Assumption: lws == wg_delta_n * wg_delta_m
-        const std::uint32_t lid = it.get_local_linear_id();
-        // 0 <= local_j < (lws / wg_delta_n == wg_delta_m)
-        const std::uint32_t local_j = lid / wg_delta_n;
-        // sub-group lanes map to adjacent local_i
-        const std::uint32_t local_i = lid - local_j * wg_delta_n;
-
-        // Coordinates of the block of C the work-group works on
-        std::size_t i = block_i * wg_delta_n * wi_delta_n;
-        std::size_t j = block_j * wg_delta_m * wi_total_delta_m;
-
-        using slmA_t = typename LocAccT1::value_type;
-        using slmB_t = typename LocAccT2::value_type;
-
-        const std::size_t a_st0 = k;
-        const std::size_t a_st1 = 1;
-
-        const std::size_t b_st0 = m;
-        const std::size_t b_st1 = 1;
-
-        const std::size_t c_st0 = m;
-        const std::size_t c_st1 = 1;
-
-        // allocate/initialize private matrix C
-        // size ( wi_total_delta_n, wi_total_delta_m )
-        static constexpr std::uint32_t C_size = wi_delta_n * wi_delta_m_vecs;
-        std::array<slmB_t, C_size> private_C{slmB_t{zero_}};
-
-        for (std::size_t s = 0; s < k; s += wi_delta_k) {
-            // populate local_lhs_block<resT> ( wg_delta_n * wi_delta_n,
-            // wi_delta_k)
-            for (std::uint32_t vid = lid; vid < local_lhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
-                // 0 <= v_i < wg_delta_n * wi_delta_n
-                const std::uint32_t v_i = vid / wi_delta_k;
-                // 0 <= v_s < wi_delta_k
-                const std::uint32_t v_s = vid - v_i * wi_delta_k;
-
-                const std::size_t g_i = i + v_i;
-                const std::size_t g_s = s + v_s;
-
-                const std::uint32_t mapped_vid =
-                    wg_delta_n * wi_delta_n * v_s + v_i;
-                local_lhs_block[mapped_vid] =
-                    (g_i < n && g_s < k)
-                        ? static_cast<resT>(
-                              lhs[lhs_offset +
-                                  lhs_indexer(g_i * a_st0 + g_s * a_st1)])
-                        : zero_;
-            }
-
-            // populate local_rhs_block<vec<resT, m_vec_size>> ( wg_delta_m *
-            // wi_delta_m_vecs, wi_delta_k )
-            for (std::uint32_t vid = lid; vid < local_rhs_block.size();
-                 vid += it.get_local_range()[0])
-            {
-                // 0 <= v_j < wg_delta_m * wi_delta_m_vecs
-                const std::uint32_t v_j = vid / wi_delta_k;
-                // 0 <= v_s < wi_delta_k
-                const std::uint32_t v_s = vid - v_j * wi_delta_k;
-
-                const std::size_t g_j = j + v_j * m_vec_size;
-                const std::size_t g_s = s + v_s;
-                const std::uint32_t mapped_vid =
-                    wg_delta_m * wi_delta_m_vecs * v_s + v_j;
-
-                if constexpr (m_vec_size == 1) {
-                    local_rhs_block[mapped_vid] =
-                        (g_j < m && g_s < k)
-                            ? static_cast<resT>(
-                                  rhs[rhs_offset +
-                                      rhs_indexer(g_s * b_st0 + g_j * b_st1)])
-                            : zero_;
-                }
-                else {
-                    slmB_t vec{};
-#pragma unroll
-                    for (std::uint32_t lane_id = 0; lane_id < m_vec_size;
-                         ++lane_id)
-                    {
-                        const std::size_t g_j1 = g_j + lane_id;
-                        vec[lane_id] = (g_j1 < m && g_s < k)
-                                           ? static_cast<resT>(
-                                                 rhs[rhs_offset +
-                                                     rhs_indexer(g_s * b_st0 +
-                                                                 g_j1 * b_st1)])
-                                           : zero_;
-                    };
-
-                    local_rhs_block[mapped_vid] = vec;
-                }
-            }
-
-            it.barrier(sycl::access::fence_space::local_space);
-
-            const std::uint32_t lo_lhs_st_k = (wg_delta_n * wi_delta_n);
-            const std::uint32_t lo_rhs_rk_k = (wg_delta_m * wi_delta_m_vecs);
-            for (std::uint32_t pr_k = 0; pr_k < wi_delta_k; ++pr_k) {
-                std::array<slmA_t, wi_delta_n> pr_lhs{};
-#pragma unroll
-                for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
-                    pr_lhs[pr_i] =
-                        local_lhs_block[pr_k * lo_lhs_st_k +
-                                        (local_i + pr_i * wg_delta_n)];
-                }
-
-                std::array<slmB_t, wi_delta_m_vecs> pr_rhs{};
-#pragma unroll
-                for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j) {
-                    pr_rhs[pr_j] =
-                        local_rhs_block[pr_k * lo_rhs_rk_k +
-                                        (local_j + pr_j * wg_delta_m)];
-                }
-
-#pragma unroll
-                for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
-#pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
-                        private_C[pr_i * wi_delta_m_vecs + pr_j] +=
-                            pr_lhs[pr_i] * pr_rhs[pr_j];
-                    }
-                }
-            }
-
-            it.barrier(sycl::access::fence_space::local_space);
-        }
-
-        if constexpr (m_vec_size == 1) {
-#pragma unroll
-            for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
-                std::size_t out_i = i + local_i + pr_i * wg_delta_n;
-                if (out_i < n) {
-#pragma unroll
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
-                        const std::size_t out_j =
-                            j + (local_j + pr_j * wg_delta_m) * m_vec_size;
-                        const std::size_t out_flat_id =
-                            out_i * c_st0 + out_j * c_st1;
-                        if (out_j < m) {
-                            res[res_offset + res_indexer(out_flat_id)] =
-                                private_C[pr_i * wi_delta_m_vecs + pr_j];
-                        }
-                    }
-                }
-            }
-        }
-        else {
-#pragma unroll
-            for (std::uint32_t pr_i = 0; pr_i < wi_delta_n; ++pr_i) {
-                std::size_t out_i = i + local_i + pr_i * wg_delta_n;
-                if (out_i < n) {
-                    // could be unrolled
-                    for (std::uint32_t pr_j = 0; pr_j < wi_delta_m_vecs; ++pr_j)
-                    {
-                        std::size_t out_j =
-                            j + (local_j + pr_j * wg_delta_m) * m_vec_size;
-#pragma unroll
-                        for (std::uint32_t lane_id = 0; lane_id < m_vec_size;
-                             ++lane_id)
-                        {
-                            const std::size_t out_flat_id =
-                                out_i * c_st0 + (out_j + lane_id) * c_st1;
-                            if (out_j + lane_id < m) {
-                                res[res_offset + res_indexer(out_flat_id)] =
-                                    private_C[pr_i * wi_delta_m_vecs + pr_j]
-                                             [lane_id];
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-};
-
-struct GemmBatchFunctorThreadNM_vecm_HyperParameters
-{
-private:
-    std::uint32_t wi_delta_n = 2;
-    std::uint32_t wi_delta_m_vecs = 4;
-    std::uint32_t m_vec_size = 1;
-
-public:
-    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters();
-    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters(
-        std::uint32_t wi_delta_n_,
-        std::uint32_t wi_delta_m_vecs_,
-        std::uint32_t m_vec_size_)
-        : wi_delta_n(wi_delta_n_), wi_delta_m_vecs(wi_delta_m_vecs_),
-          m_vec_size(m_vec_size_)
-    {
-    }
-
-    constexpr std::uint32_t get_wi_delta_n() const { return wi_delta_n; }
-    constexpr std::uint32_t get_wi_delta_m_vecs() const
-    {
-        return wi_delta_m_vecs;
-    }
-    constexpr std::uint32_t get_m_vec_size() const { return m_vec_size; }
-};
-
-template <typename resT>
-struct GemmBatchFunctorThreadNM_vecm_HyperParametersSelector
-{
-    constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector() {}
-
-    constexpr GemmBatchFunctorThreadNM_vecm_HyperParameters get() const
-    {
-        if constexpr (sizeof(resT) == 1) {
-            // 1 * 8 * 2 * 4 == 64
-            return GemmBatchFunctorThreadNM_vecm_HyperParameters(8, 2, 4);
-        }
-        else if constexpr (sizeof(resT) == 2) {
-            // 2 * 4 * 2 * 4 == 64
-            return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 2, 4);
-        }
-        else if constexpr (sizeof(resT) == 4) {
-            // 4 * 4 * 1 * 4 == 64
-            return GemmBatchFunctorThreadNM_vecm_HyperParameters(4, 1, 4);
-        }
-        else if constexpr (sizeof(resT) == 8) {
-            // 8 * 2 * 1 * 4 == 64
-            if constexpr (std::is_same_v<resT, std::complex<float>>) {
-                return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 4, 1);
-            }
-            else {
-                return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 1, 4);
-            }
-        }
-        else if constexpr (std::is_same_v<resT, std::complex<double>>) {
-            // 16 * 2 * 2 * 1 == 64
-            return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1);
-        }
-        else {
-            return GemmBatchFunctorThreadNM_vecm_HyperParameters(2, 2, 1);
-        }
-    }
-};
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6,
-          typename T7,
-          std::uint32_t p1,
-          std::uint32_t p2,
-          std::uint32_t p3>
-class gemm_batch_nm_vecm_krn;
-
-namespace gemm_detail
-{
-
-template <typename T, std::uint32_t wi_delta_n, std::uint32_t wi_delta_m>
-std::tuple<std::uint32_t, std::uint32_t>
-get_wg_delta_m_and_wi_delta_k(const std::size_t slm_byte_size,
-                              const std::uint32_t wg_delta_n,
-                              const std::uint32_t suggested_wg_delta_m)
-{
-    std::uint32_t wg_delta_m = suggested_wg_delta_m;
-
-    const std::size_t slm_max_rows =
-        slm_byte_size /
-        ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T));
-
-    std::uint32_t wi_delta_k =
-        (slm_max_rows >= 64)
-            ? 64
-            : 32 * static_cast<std::uint32_t>(slm_max_rows / 32);
-
-    for (std::uint32_t it = 0; !wi_delta_k && (it < 4); ++it) {
-        wg_delta_m /= 2;
-
-        const std::size_t slm_max_rows =
-            slm_byte_size /
-            ((wg_delta_n * wi_delta_n + wg_delta_m * wi_delta_m) * sizeof(T));
-
-        wi_delta_k =
-            (slm_max_rows >= 64)
-                ? 64
-                : ((slm_max_rows >= 32)
-                       ? 32
-                       : (slm_max_rows >= 16 ? 16
-                                             : 8 * static_cast<std::uint32_t>(
-                                                       slm_max_rows / 8)));
-    }
-
-    if (!wi_delta_k) {
-        throw std::runtime_error("Insufficient resources");
-    }
-
-    return std::make_tuple(wg_delta_m, wi_delta_k);
-}
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename BatchIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT>
-sycl::event _gemm_batch_nm_impl(sycl::queue &exec_q,
-                                const lhsTy *lhs_tp,
-                                const rhsTy *rhs_tp,
-                                resTy *res_tp,
-                                const std::size_t batch_nelems,
-                                const std::size_t n,
-                                const std::size_t k,
-                                const std::size_t m,
-                                const BatchIndexerT &batch_indexer,
-                                const LhsIndexerT &lhs_indexer,
-                                const RhsIndexerT &rhs_indexer,
-                                const ResIndexerT &res_indexer,
-                                std::vector<sycl::event> const &depends)
-{
-    static constexpr GemmBatchFunctorThreadNM_vecm_HyperParametersSelector<
-        resTy>
-        selector{};
-    static constexpr auto hyper_params = selector.get();
-
-    static constexpr std::uint32_t wi_delta_n = hyper_params.get_wi_delta_n();
-    static constexpr std::uint32_t wi_delta_m_vecs =
-        hyper_params.get_wi_delta_m_vecs();
-    static constexpr std::uint32_t m_vec_size = hyper_params.get_m_vec_size();
-
-    static constexpr std::uint32_t wi_total_delta_m =
-        wi_delta_m_vecs * m_vec_size;
-
-    using KernelName =
-        class gemm_batch_nm_vecm_krn<lhsTy, rhsTy, resTy, BatchIndexerT,
-                                     LhsIndexerT, RhsIndexerT, ResIndexerT,
-                                     wi_delta_n, wi_delta_m_vecs, m_vec_size>;
-
-    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
-
-    auto const &ctx = exec_q.get_context();
-    auto const &dev = exec_q.get_device();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t max_sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-    const std::size_t k_wg_sz = krn.template get_info<
-        sycl::info::kernel_device_specific::work_group_size>(dev);
-
-    // Limit work-group size
-    static constexpr std::size_t wg_sz_limit(2048);
-    const std::size_t max_wg_sz = std::min(wg_sz_limit, k_wg_sz);
-
-    const std::uint32_t max_subgroups_per_wg =
-        static_cast<std::uint32_t>(max_wg_sz / max_sg_size);
-
-    const std::size_t reserved_slm_byte_size = 512;
-    const std::size_t slm_byte_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-
-    const std::uint32_t wg_delta_n = max_sg_size;
-    std::uint32_t wg_delta_m = 0;
-    std::uint32_t wi_delta_k = 0;
-
-    std::tie(wg_delta_m, wi_delta_k) =
-        get_wg_delta_m_and_wi_delta_k<resTy, wi_delta_n, wi_total_delta_m>(
-            slm_byte_size - reserved_slm_byte_size, wg_delta_n,
-            max_subgroups_per_wg);
-
-    const std::uint32_t lws = wg_delta_n * wg_delta_m;
-
-    const std::size_t n_groups =
-        (n + wg_delta_n * wi_delta_n - 1) / (wg_delta_n * wi_delta_n);
-    const std::size_t m_groups = (m + wg_delta_m * wi_total_delta_m - 1) /
-                                 (wg_delta_m * wi_total_delta_m);
-
-    const std::size_t gws = lws * batch_nelems * n_groups * m_groups;
-
-    sycl::range<1> lRange(lws);
-    sycl::range<1> gRange(gws);
-    sycl::nd_range<1> ndRange(gRange, lRange);
-
-    using slmB_t =
-        typename std::conditional<m_vec_size == 1, resTy,
-                                  sycl::vec<resTy, m_vec_size>>::type;
-
-    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.use_kernel_bundle(kb);
-
-        using LocAccT1 = sycl::local_accessor<resTy, 1>;
-        LocAccT1 local_A_block(wg_delta_n * wi_delta_n * wi_delta_k, cgh);
-
-        using LocAccT2 = sycl::local_accessor<slmB_t, 1>;
-        LocAccT2 local_B_block(wg_delta_m * wi_delta_m_vecs * wi_delta_k, cgh);
-
-        using Impl_FunctorT = GemmBatchFunctorThreadNM_vecm<
-            lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, BatchIndexerT, LhsIndexerT,
-            RhsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m_vecs, m_vec_size>;
-
-        cgh.parallel_for<KernelName>(
-            ndRange, Impl_FunctorT(
-                         lhs_tp, rhs_tp, res_tp, std::move(local_A_block),
-                         std::move(local_B_block), batch_nelems, n, k, m,
-                         n_groups, wg_delta_n, wg_delta_m, wi_delta_k,
-                         batch_indexer, lhs_indexer, rhs_indexer, res_indexer));
-    });
-    return gemm_ev;
-}
-
-} // namespace gemm_detail
-
-typedef sycl::event (*gemm_impl_fn_ptr_t)(
-    sycl::queue &,
-    const char *,    // lhs
-    const char *,    // rhs
-    char *,          // res
-    std::size_t,     // lhs_outer_nelems (n)
-    std::size_t,     // inner_nelems (k)
-    std::size_t,     // rhs_outer_nelems (m)
-    int,             // inner nd
-    int,             // lhs outer nd
-    const ssize_t *, // lhs shape and strides
-    int,             // rhs outer nd
-    const ssize_t *, // rhs shape and strides
-    int,             // res outer nd
-    const ssize_t *, // res shape and strides
-    std::vector<sycl::event> const &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_impl(sycl::queue &exec_q,
-                      const char *lhs_cp,
-                      const char *rhs_cp,
-                      char *res_cp,
-                      std::size_t n,
-                      std::size_t k,
-                      std::size_t m,
-                      int inner_nd,
-                      int lhs_outer_nd,
-                      const ssize_t *lhs_shape_strides,
-                      int rhs_outer_nd,
-                      const ssize_t *rhs_shape_strides,
-                      int res_outer_nd,
-                      const ssize_t *res_shape_strides,
-                      std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    using OuterInnerIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                         lhs_shape_strides);
-    const OuterInnerIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                         rhs_shape_strides);
-    const OuterInnerIndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-            OuterInnerIndexerT, OuterInnerIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-
-    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const IndexerT res_indexer(res_outer_nd, 0, res_shape_strides);
-        using InitKernelName = class gemm_init_krn<lhsTy, rhsTy, resTy>;
-        cgh.parallel_for<InitKernelName>(
-            sycl::range<1>(n * m), [=](sycl::id<1> id) {
-                auto res_offset = res_indexer(id[0]);
-                res_tp[res_offset] = resTy(0);
-            });
-    });
-
-    if (k == 0) {
-        return res_init_ev;
-    }
-
-    if ((max_nm < 64)) {
-        if (m < 4) {
-            return gemm_detail::_gemm_small_m_impl<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-                OuterInnerIndexerT, OuterInnerIndexerT>(
-                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-                {res_init_ev});
-        }
-        return gemm_detail::_gemm_k_impl<lhsTy, rhsTy, resTy, BatchIndexerT,
-                                         OuterInnerIndexerT, OuterInnerIndexerT,
-                                         OuterInnerIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-
-    return gemm_detail::_gemm_batch_nm_impl<
-        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-        OuterInnerIndexerT, OuterInnerIndexerT>(
-        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
-}
-
-typedef sycl::event (*gemm_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    const char *, // lhs
-    const char *, // rhs
-    char *,       // res
-    std::size_t,  // n
-    std::size_t,  // k
-    std::size_t,  // m
-    std::vector<sycl::event> const &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_contig_impl(sycl::queue &exec_q,
-                             const char *lhs_cp,
-                             const char *rhs_cp,
-                             char *res_cp,
-                             std::size_t n,
-                             std::size_t k,
-                             std::size_t m,
-                             std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    using OuterInnerIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr OuterInnerIndexerT lhs_indexer{};
-    static constexpr OuterInnerIndexerT rhs_indexer{};
-    static constexpr OuterInnerIndexerT res_indexer{};
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-            OuterInnerIndexerT, OuterInnerIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-
-    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.fill<resTy>(res_tp, resTy(0), n * m);
-    });
-
-    if (k == 0) {
-        return res_init_ev;
-    }
-
-    if (max_nm < 64) {
-        if (m < 4) {
-            return gemm_detail::_gemm_small_m_impl<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-                OuterInnerIndexerT, OuterInnerIndexerT>(
-                exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-                {res_init_ev});
-        }
-        return gemm_detail::_gemm_k_impl<lhsTy, rhsTy, resTy, BatchIndexerT,
-                                         OuterInnerIndexerT, OuterInnerIndexerT,
-                                         OuterInnerIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-
-    return gemm_detail::_gemm_batch_nm_impl<
-        lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerIndexerT,
-        OuterInnerIndexerT, OuterInnerIndexerT>(
-        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
-}
-
-template <typename T1, typename T2, typename T3> class gemm_batch_init_krn;
-
-typedef sycl::event (*gemm_batch_impl_fn_ptr_t)(
-    sycl::queue &,
-    const char *,    // lhs
-    const char *,    // rhs
-    char *,          // res
-    std::size_t,     // batch nelems
-    std::size_t,     // lhs outer nelems (n)
-    std::size_t,     // inner nelems (k)
-    std::size_t,     // rhs outer nelems (m)
-    int,             // batching nd
-    const ssize_t *, // batch shape strides
-    ssize_t,         // lhs batch offset
-    ssize_t,         // rhs batch offset
-    ssize_t,         // res batch offset
-    int,             // inner dims
-    int,             // lhs outer dims
-    const ssize_t *, // lhs outer and inner shape and strides
-    int,             // rhs outer dims
-    const ssize_t *, // rhs outer and inner shape and strides
-    int,             // res outer dims
-    const ssize_t *, // res outer and inner shape and strides
-    const ssize_t *, // res full shape and strides
-    std::vector<sycl::event> const &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_batch_impl(sycl::queue &exec_q,
-                            const char *lhs_cp,
-                            const char *rhs_cp,
-                            char *res_cp,
-                            std::size_t batch_nelems,
-                            std::size_t n,
-                            std::size_t k,
-                            std::size_t m,
-                            int batch_nd,
-                            const ssize_t *batch_shape_strides,
-                            ssize_t lhs_batch_offset,
-                            ssize_t rhs_batch_offset,
-                            ssize_t res_batch_offset,
-                            int inner_nd,
-                            int lhs_outer_nd,
-                            const ssize_t *lhs_outer_inner_shapes_strides,
-                            int rhs_outer_nd,
-                            const ssize_t *rhs_outer_inner_shapes_strides,
-                            int res_outer_nd,
-                            const ssize_t *res_outer_shapes_strides,
-                            const ssize_t *res_shape_strides,
-                            std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                             lhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                             rhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
-                                             res_outer_shapes_strides);
-    using BatchDimsIndexerT =
-        dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-    const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset,
-                                          rhs_batch_offset, res_batch_offset,
-                                          batch_shape_strides);
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-
-    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const IndexerT res_indexer(batch_nd + res_outer_nd, res_batch_offset,
-                                   res_shape_strides);
-        using InitKernelName = class gemm_batch_init_krn<lhsTy, rhsTy, resTy>;
-        cgh.parallel_for<InitKernelName>(
-            sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) {
-                auto res_offset = res_indexer(id[0]);
-                res_tp[res_offset] = resTy(0);
-            });
-    });
-
-    if (k == 0) {
-        return res_init_ev;
-    }
-
-    if (m < 4) {
-        return gemm_detail::_gemm_small_m_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-    else if (k > n && k > m) {
-        return gemm_detail::_gemm_k_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-    else {
-        return gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-}
-
-typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    const char *, // lhs
-    const char *, // rhs
-    char *,       // res
-    std::size_t,  // batch nelems
-    std::size_t,  // n
-    std::size_t,  // k
-    std::size_t,  // m
-    ssize_t,      // lhs batch offset
-    ssize_t,      // rhs batch offset
-    ssize_t,      // res batch offset
-    std::vector<sycl::event> const &);
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_batch_contig_impl(sycl::queue &exec_q,
-                                   const char *lhs_cp,
-                                   const char *rhs_cp,
-                                   char *res_cp,
-                                   std::size_t batch_nelems,
-                                   std::size_t n,
-                                   std::size_t k,
-                                   std::size_t m,
-                                   ssize_t lhs_batch_offset,
-                                   ssize_t rhs_batch_offset,
-                                   ssize_t res_batch_offset,
-                                   std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp =
-        reinterpret_cast<const lhsTy *>(lhs_cp) + lhs_batch_offset;
-    const rhsTy *rhs_tp =
-        reinterpret_cast<const rhsTy *>(rhs_cp) + rhs_batch_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + res_batch_offset;
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-    using dpctl::tensor::offset_utils::Strided1DIndexer;
-    using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-    using BatchDimsIndexerT =
-        ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                     Strided1DIndexer>;
-
-    const BatchDimsIndexerT batch_indexer(
-        Strided1DIndexer{/* size */ batch_nelems,
-                         /* step */ n * k},
-        Strided1DIndexer{/* size */ batch_nelems,
-                         /* step */ k * m},
-        Strided1DIndexer{/* size */ batch_nelems,
-                         /* step */ n * m});
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-
-    sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        cgh.fill<resTy>(res_tp, resTy(0), n * m * batch_nelems);
-    });
-
-    if (k == 0) {
-        return res_init_ev;
-    }
-
-    if (max_nm < 64) {
-        if (m < 4) {
-            return gemm_detail::_gemm_small_m_impl<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-                {res_init_ev});
-        }
-        return gemm_detail::_gemm_k_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            {res_init_ev});
-    }
-
-    return gemm_detail::_gemm_batch_nm_impl<
-        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-        exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer,
-        lhs_indexer, rhs_indexer, res_indexer, {res_init_ev});
-}
-
-// ========== Gemm Tree
-
-template <typename lhsT,
-          typename rhsT,
-          typename resT,
-          typename LocAccT1,
-          typename LocAccT2,
-          typename OuterInnerDimsIndexerT,
-          typename ResIndexerT,
-          typename BatchDimsIndexerT,
-          int wi_delta_n,
-          int wi_delta_m>
-class GemmBatchNoAtomicFunctorThreadNM
-{
-private:
-    const lhsT *lhs = nullptr;
-    const rhsT *rhs = nullptr;
-    resT *res = nullptr;
-    LocAccT1 local_A_block;
-    LocAccT2 local_B_block;
-    std::size_t n = 0;
-    std::size_t wg_delta_n = 0;
-    std::size_t k = 0;
-    std::size_t k_blocks = 0;
-    std::size_t wi_delta_k = 0;
-    std::size_t m = 0;
-    std::size_t m_blocks = 0;
-    std::size_t wg_delta_m = 0;
-    std::size_t batch_nelems;
-    BatchDimsIndexerT batch_indexer;
-    OuterInnerDimsIndexerT lhs_indexer;
-    OuterInnerDimsIndexerT rhs_indexer;
-    ResIndexerT res_indexer;
-
-public:
-    GemmBatchNoAtomicFunctorThreadNM(const lhsT *lhs_,
-                                     const rhsT *rhs_,
-                                     resT *res_,
-                                     LocAccT1 local_A_block_,
-                                     LocAccT2 local_B_block_,
-                                     std::size_t n_,
-                                     std::size_t wg_delta_n_,
-                                     std::size_t k_,
-                                     std::size_t k_blocks_,
-                                     std::size_t wi_delta_k_,
-                                     std::size_t m_,
-                                     std::size_t m_blocks_,
-                                     std::size_t wg_delta_m_,
-                                     std::size_t batch_nelems_,
-                                     const BatchDimsIndexerT batch_indexer_,
-                                     const OuterInnerDimsIndexerT lhs_indexer_,
-                                     const OuterInnerDimsIndexerT rhs_indexer_,
-                                     const ResIndexerT res_indexer_)
-        : lhs(lhs_), rhs(rhs_), res(res_), local_A_block(local_A_block_),
-          local_B_block(local_B_block_), n(n_), wg_delta_n(wg_delta_n_), k(k_),
-          k_blocks(k_blocks_), wi_delta_k(wi_delta_k_), m(m_),
-          m_blocks(m_blocks_), wg_delta_m(wg_delta_m_),
-          batch_nelems(batch_nelems_), batch_indexer(batch_indexer_),
-          lhs_indexer(lhs_indexer_), rhs_indexer(rhs_indexer_),
-          res_indexer(res_indexer_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t n_groups_per_batch =
-            it.get_group_range(0) / batch_nelems;
-        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
-        const std::size_t gr_id =
-            it.get_group_linear_id() - m_id * n_groups_per_batch;
-
-        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
-
-        // lift group_id to (block_i, block_j, block_s),
-        //    0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s
-        //    < k_blocks
-
-        const auto &lhs_offset = three_offsets_.get_first_offset();
-        const auto &rhs_offset = three_offsets_.get_second_offset();
-        const auto &res_offset = three_offsets_.get_third_offset();
-
-        std::size_t block_i = gr_id / (m_blocks * k_blocks);
-        std::size_t block_r = gr_id - block_i * (m_blocks * k_blocks);
-        std::size_t block_j = block_r / k_blocks;
-        std::size_t block_s = block_r - block_j * k_blocks;
-
-        std::size_t lid = it.get_local_linear_id();
-        std::size_t local_i = lid / wg_delta_m; // 0<= local_i < wg_delta_n
-        std::size_t local_j =
-            lid - local_i * wg_delta_m; // 0<= local_j < wg_delta_m
-
-        // load A block and B blocks into SLM
-
-        std::size_t i = block_i * wi_delta_n * wg_delta_n;
-        std::size_t j = block_j * wi_delta_m * wg_delta_m;
-        std::size_t s = block_s * wi_delta_k;
-
-        const std::int64_t a_st0 = k;
-        const std::int64_t a_st1 = 1;
-
-        const std::int64_t b_st0 = m;
-        const std::int64_t b_st1 = 1;
-
-        const std::int64_t c_st0 = m;
-        const std::int64_t c_st1 = 1;
-
-        std::size_t lws = it.get_local_range(0);
-
-        for (std::size_t vid = lid; vid < local_A_block.size(); vid += lws) {
-            std::size_t v_i =
-                vid / wi_delta_k; // 0<= v_i < wg_delta_n * wi_delta_n
-            std::size_t v_s = vid - v_i * wi_delta_k; // 0<= v_s < wi_delta_k
-
-            std::size_t g_i = i + v_i;
-            std::size_t g_s = s + v_s;
-
-            local_A_block[vid] =
-                (g_i < n && g_s < k)
-                    ? static_cast<resT>(
-                          lhs[lhs_offset +
-                              lhs_indexer(g_i * a_st0 + g_s * a_st1)])
-                    : resT(0);
-        }
-
-        using slmB_t = typename LocAccT2::value_type;
-
-        for (std::size_t vid = lid; vid < local_B_block.size(); vid += lws) {
-            std::size_t v_j = vid / wi_delta_k;       // 0<= v_i < wg_delta_m
-            std::size_t v_s = vid - v_j * wi_delta_k; // 0<= v_s < wi_delta_k
-
-            std::size_t g_j = j + v_j * wi_delta_m;
-            std::size_t g_s = s + v_s;
-
-            if constexpr (wi_delta_m == 1 && std::is_same_v<slmB_t, resT>) {
-                local_B_block[vid] =
-                    (g_j < m && g_s < k)
-                        ? static_cast<resT>(
-                              rhs[rhs_offset +
-                                  rhs_indexer(g_s * b_st0 + g_j * b_st1)])
-                        : resT(0);
-            }
-            else {
-                slmB_t vec{};
-#pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
-                    std::size_t g_j1 = g_j + lane_id;
-                    vec[lane_id] =
-                        (g_j1 < m && g_s < k)
-                            ? static_cast<resT>(
-                                  rhs[rhs_offset +
-                                      rhs_indexer(g_s * b_st0 + g_j1 * b_st1)])
-                            : resT(0);
-                }
-
-                local_B_block[vid] = vec;
-            }
-        }
-
-        it.barrier(sycl::access::fence_space::local_space);
-
-        i += local_i * wi_delta_n;
-        j += local_j * wi_delta_m;
-
-        const std::size_t a_offset = local_i * wi_delta_k * wi_delta_n;
-        const std::size_t b_offset = local_j * wi_delta_k;
-
-        static constexpr resT identity_(0);
-
-        for (std::uint8_t private_i = 0; private_i < wi_delta_n; ++private_i) {
-            const std::size_t a_pr_offset = private_i * wi_delta_k;
-
-            slmB_t local_sum(identity_);
-            for (std::size_t private_s = 0; private_s < wi_delta_k; ++private_s)
-            {
-                local_sum = local_sum +
-                            (local_A_block[a_offset + a_pr_offset + private_s] *
-                             local_B_block[b_offset + private_s]);
-            }
-
-            const std::size_t gl_i = i + private_i;
-
-            if constexpr (wi_delta_m == 1 && std::is_same_v<slmB_t, resT>) {
-                const std::size_t gl_j = j;
-                if (gl_i < n && gl_j < m) {
-                    res[res_offset + res_indexer(gl_i * c_st0 + gl_j * c_st1) +
-                        (block_s * n * m * batch_nelems)] = local_sum;
-                }
-            }
-            else {
-#pragma unroll
-                for (std::uint8_t lane_id = 0; lane_id < wi_delta_m; ++lane_id)
-                {
-                    const std::size_t gl_j = j + lane_id;
-
-                    if (gl_i < n && gl_j < m) {
-                        res[res_offset +
-                            res_indexer(gl_i * c_st0 + gl_j * c_st1) +
-                            (block_s * n * m * batch_nelems)] =
-                            local_sum[lane_id];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template <typename lhsT,
-          typename rhsT,
-          typename resT,
-          typename LocAccT,
-          typename OuterInnerDimsIndexerT,
-          typename ResIndexerT,
-          typename BatchDimsIndexerT,
-          std::size_t m_groups>
-class GemmBatchNoAtomicFunctorThreadK
-{
-private:
-    const lhsT *lhs = nullptr;
-    const rhsT *rhs = nullptr;
-    resT *res = nullptr;
-    LocAccT workspace;
-    LocAccT local_B_block;
-    std::size_t n = 0;
-    std::size_t n_blocks = 0;
-    std::size_t delta_n = 0;
-    std::size_t k = 0;
-    std::size_t k_blocks = 0;
-    std::size_t delta_k = 0;
-    std::size_t n_wi = 0;
-    std::size_t m = 0;
-    std::size_t batch_nelems = 0;
-    BatchDimsIndexerT batch_indexer;
-    OuterInnerDimsIndexerT lhs_indexer;
-    OuterInnerDimsIndexerT rhs_indexer;
-    ResIndexerT res_indexer;
-
-public:
-    GemmBatchNoAtomicFunctorThreadK(const lhsT *lhs_,
-                                    const rhsT *rhs_,
-                                    resT *res_,
-                                    LocAccT workspace_,
-                                    LocAccT local_B_block_,
-                                    std::size_t n_,
-                                    std::size_t n_blocks_,
-                                    std::size_t delta_n_,
-                                    std::size_t k_,
-                                    std::size_t k_blocks_,
-                                    std::size_t delta_k_,
-                                    std::size_t n_wi_,
-                                    std::size_t m_,
-                                    std::size_t batch_nelems_,
-                                    const BatchDimsIndexerT &batch_indexer_,
-                                    const OuterInnerDimsIndexerT &lhs_indexer_,
-                                    const OuterInnerDimsIndexerT &rhs_indexer_,
-                                    const ResIndexerT &res_indexer_)
-        : lhs(lhs_), rhs(rhs_), res(res_), workspace(workspace_),
-          local_B_block(local_B_block_), n(n_), n_blocks(n_blocks_),
-          delta_n(delta_n_), k(k_), k_blocks(k_blocks_), delta_k(delta_k_),
-          n_wi(n_wi_), m(m_), batch_nelems(batch_nelems_),
-          batch_indexer(batch_indexer_), lhs_indexer(lhs_indexer_),
-          rhs_indexer(rhs_indexer_), res_indexer(res_indexer_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t n_groups_per_batch =
-            it.get_group_range(0) / batch_nelems;
-        const std::size_t m_id = it.get_group_linear_id() / n_groups_per_batch;
-        const std::size_t gr_id =
-            it.get_group_linear_id() - m_id * n_groups_per_batch;
-        std::size_t lid = it.get_local_linear_id();
-
-        const auto &three_offsets_ = batch_indexer(static_cast<ssize_t>(m_id));
-        const auto &lhs_offset = three_offsets_.get_first_offset();
-        const auto &rhs_offset = three_offsets_.get_second_offset();
-        const auto &res_offset = three_offsets_.get_third_offset();
-
-        // lift gr_id -> (block_i, block_j, block_s)
-        //   block_i moves fastest, then block_s, then block_j
-
-        const std::size_t r_size = (n_blocks * k_blocks);
-        // 0 <= block_j < m_blocks
-        std::size_t block_j = gr_id / r_size;
-        // 0 <= block_r < n_blocks * k_blocks
-        std::size_t block_r = gr_id - block_j * r_size;
-        // 0 <= block_s < k_blocks
-        std::size_t block_s = block_r / n_blocks;
-        // 0 <= block_i < n_blocks
-        std::size_t block_i = block_r - block_s * n_blocks;
-
-        std::size_t local_i = lid / (delta_k); // 0 <= local_i < delta_n
-        std::size_t local_s =
-            lid - local_i * (delta_k); // 0 <= local_s < delta_k
-
-        std::size_t i = block_i * delta_n + local_i;
-        std::size_t j = m_groups * block_j;
-        std::size_t s = block_s * delta_k * n_wi + local_s;
-
-        using accV_t = typename LocAccT::value_type;
-
-        static constexpr resT identity_ = resT(0);
-        if (local_i == 0) {
-            for (std::size_t q = 0; q < n_wi * delta_k; q += delta_k) {
-                std::size_t sq = s + q;
-                std::size_t sqmj = sq * m + j;
-
-                if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
-                    local_B_block[local_s + q] =
-                        (sq < k && j < m)
-                            ? static_cast<resT>(
-                                  rhs[rhs_offset + rhs_indexer(sqmj)])
-                            : identity_;
-                }
-                else {
-                    accV_t local_B_vec;
-#pragma unroll
-                    for (std::size_t vec_idx = 0; vec_idx < m_groups; ++vec_idx)
-                    {
-                        local_B_vec[vec_idx] =
-                            (sq < k && j + vec_idx < m)
-                                ? static_cast<resT>(
-                                      rhs[rhs_offset +
-                                          rhs_indexer(sqmj + vec_idx)])
-                                : identity_;
-                    }
-                    local_B_block[local_s + q] = local_B_vec;
-                }
-            }
-        }
-
-        it.barrier(sycl::access::fence_space::local_space);
-
-        std::size_t t_shift = block_s * delta_k * n_wi;
-        std::size_t global_s_offset = i * k + t_shift;
-
-        accV_t private_sum(identity_);
-        static constexpr accV_t vec_identity_(identity_);
-        for (std::size_t t = local_s; t < local_B_block.size(); t += delta_k) {
-            private_sum +=
-                ((i < n) && (t + t_shift < k))
-                    ? (static_cast<resT>(
-                           lhs[lhs_offset + lhs_indexer(global_s_offset + t)]) *
-                       local_B_block[t])
-                    : vec_identity_;
-        }
-
-        std::size_t workspace_i_shift = local_i * delta_k;
-        workspace[workspace_i_shift + local_s] = private_sum;
-
-        it.barrier(sycl::access::fence_space::local_space);
-
-        if (local_s == 0 && i < n) {
-            accV_t local_sum(workspace[workspace_i_shift]);
-            for (std::size_t t = 1; t < delta_k; ++t) {
-                local_sum += workspace[workspace_i_shift + t];
-            }
-
-            const std::size_t total_offset =
-                res_offset + (block_s * n * m * batch_nelems);
-
-            if constexpr (m_groups == 1 && std::is_same_v<accV_t, resT>) {
-                res[total_offset + res_indexer(i * m + j)] = local_sum;
-            }
-            else {
-                res[total_offset + res_indexer(i * m + j)] = local_sum[0];
-
-#pragma unroll
-                for (std::size_t vec_id = 1; vec_id < m_groups; ++vec_id) {
-                    if (j + vec_id < m) {
-                        res[total_offset + res_indexer(i * m + j + vec_id)] =
-                            local_sum[vec_id];
-                    }
-                }
-            }
-        }
-    }
-};
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6,
-          std::size_t>
-class gemm_batch_tree_k_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6,
-          std::size_t>
-class gemm_batch_tree_nm_krn;
-
-namespace gemm_detail
-{
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename BatchIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT,
-          std::uint32_t m_groups>
-sycl::event _gemm_tree_k_step(sycl::queue &exec_q,
-                              const lhsTy *lhs_tp,
-                              const rhsTy *rhs_tp,
-                              resTy *res_tp,
-                              const std::size_t batch_nelems,
-                              const std::size_t n,
-                              const std::size_t k,
-                              const std::size_t m,
-                              const std::size_t delta_n,
-                              const std::size_t n_wi,
-                              const std::size_t delta_k,
-                              const BatchIndexerT &batch_indexer,
-                              const LhsIndexerT &lhs_indexer,
-                              const RhsIndexerT &rhs_indexer,
-                              const ResIndexerT &res_indexer,
-                              const std::vector<sycl::event> &depends)
-{
-    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
-
-    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t n_blocks = (n + delta_n - 1) / delta_n;
-        const std::size_t k_blocks =
-            (k + n_wi * delta_k - 1) / (n_wi * delta_k);
-        const std::size_t m_blocks = (m + m_groups - 1) / m_groups;
-
-        const std::size_t lws = delta_n * delta_k;
-        const std::size_t gws =
-            batch_nelems * n_blocks * m_blocks * k_blocks * lws;
-
-        auto gRange = sycl::range<1>(gws);
-        auto lRange = sycl::range<1>(lws);
-        auto ndRange = sycl::nd_range<1>(gRange, lRange);
-
-        using slmB_t =
-            typename std::conditional<m_groups == 1, resTy,
-                                      sycl::vec<resTy, m_groups>>::type;
-
-        using LocAccT = sycl::local_accessor<slmB_t, 1>;
-        LocAccT local_B_block(n_wi * delta_k, cgh);
-        LocAccT workspace(delta_n * delta_k, cgh);
-
-        using KernelName =
-            class gemm_batch_tree_k_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
-                                        ResIndexerT, BatchIndexerT, m_groups>;
-
-        cgh.parallel_for<KernelName>(
-            ndRange,
-            GemmBatchNoAtomicFunctorThreadK<lhsTy, rhsTy, resTy, LocAccT,
-                                            LhsIndexerT, ResIndexerT,
-                                            BatchIndexerT, m_groups>(
-                lhs_tp, rhs_tp, res_tp, std::move(workspace),
-                std::move(local_B_block), n, n_blocks, delta_n, k, k_blocks,
-                delta_k, n_wi, m, batch_nelems, batch_indexer, lhs_indexer,
-                rhs_indexer, res_indexer));
-    });
-    return gemm_ev;
-}
-
-} // end of namespace gemm_detail
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          std::uint32_t m_groups>
-sycl::event
-gemm_batch_tree_k_impl(sycl::queue &exec_q,
-                       const lhsTy *lhs_tp,
-                       const rhsTy *rhs_tp,
-                       resTy *res_tp,
-                       std::size_t batch_nelems,
-                       std::size_t n,
-                       std::size_t k,
-                       std::size_t m,
-                       int batch_nd,
-                       const ssize_t *batch_shape_strides,
-                       ssize_t lhs_batch_offset,
-                       ssize_t rhs_batch_offset,
-                       ssize_t res_batch_offset,
-                       int inner_nd,
-                       int lhs_outer_nd,
-                       const ssize_t *lhs_outer_inner_shapes_strides,
-                       int rhs_outer_nd,
-                       const ssize_t *rhs_outer_inner_shapes_strides,
-                       int res_outer_nd,
-                       const ssize_t *res_outer_shapes_strides,
-                       const ssize_t *res_shape_strides,
-                       std::vector<sycl::event> const &depends)
-{
-    std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    if (k <= (delta_k * n_wi)) {
-        using OuterInnerDimsIndexerT =
-            dpctl::tensor::offset_utils::StridedIndexer;
-        const OuterInnerDimsIndexerT lhs_indexer(
-            inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-        const OuterInnerDimsIndexerT rhs_indexer(
-            inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-        const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
-                                                 res_outer_shapes_strides);
-        using BatchDimsIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        const BatchDimsIndexerT batch_indexer(
-            batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset,
-            batch_shape_strides);
-
-        return gemm_detail::_gemm_tree_k_step<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n,
-            n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = batch_nelems * n * m;
-        std::size_t reduction_nelems =
-            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
-
-        // more than one work-group is needed, requires a
-        // temporary delta_k * n_wi elements processed along k,
-        // so if more to process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 4;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        // max_max_wg prevents running out of resources on CPU
-        static constexpr std::size_t max_max_wg = 2048;
-        std::size_t max_wg = std::min(
-            max_max_wg,
-            dev.get_info<sycl::info::device::max_work_group_size>() / 2);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            const OuterInnerDimsIndexerT lhs_indexer(
-                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-            const OuterInnerDimsIndexerT rhs_indexer(
-                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-            static constexpr TmpIndexerT res_indexer{};
-
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::StridedIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
-                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
-            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
-                                                   batch_shape_strides);
-            const UnpackedStridedIndexer rhs_batch_indexer(
-                batch_nd, rhs_batch_offset, batch_shape_strides,
-                batch_shape_strides + 2 * batch_nd);
-            const Strided1DIndexer tmp_batch_indexer(
-                /* size   */ batch_nelems,
-                /* step   */ n * m);
-            const BatchDimsIndexerT batch_indexer(
-                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, TmpIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n,
-                n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                res_indexer, depends);
-
-            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, tmp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, reduction_groups, wg, max_wg,
-                preferred_reductions_per_wi, reductions_per_wi,
-                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
-                {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (
-                                  /* temp */ reduction_nelems +
-                                  /* first reduction temp */ reduction_groups);
-
-            // get unique_ptr owning the temporary allocation
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-            // get raw USM pointer
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-            ;
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            const OuterInnerDimsIndexerT lhs_indexer(
-                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-            const OuterInnerDimsIndexerT rhs_indexer(
-                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-            static constexpr TmpIndexerT res_indexer{};
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::StridedIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using BatchDimsIndexerT =
-                ThreeOffsets_CombinedIndexer<StridedIndexer, StridedIndexer,
-                                             Strided1DIndexer>;
-            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
-                                                   batch_shape_strides);
-            const StridedIndexer rhs_batch_indexer(
-                batch_nd, rhs_batch_offset, batch_shape_strides + 2 * batch_nd);
-            const Strided1DIndexer tmp_batch_indexer(
-                /* size   */ batch_nelems,
-                /* step   */ n * m);
-            const BatchDimsIndexerT batch_indexer(
-                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, TmpIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
-                k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer,
-                rhs_indexer, res_indexer, depends);
-
-            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
-                identity_val, iter_nelems, reduction_nelems, reduction_groups,
-                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
-                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
-                {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-namespace gemm_detail
-{
-
-template <typename lhsTy,
-          typename rhsTy,
-          typename resTy,
-          typename BatchIndexerT,
-          typename LhsIndexerT,
-          typename RhsIndexerT,
-          typename ResIndexerT,
-          std::uint32_t wi_delta_n,
-          std::uint32_t wi_delta_m>
-sycl::event _gemm_tree_nm_step(sycl::queue &exec_q,
-                               const lhsTy *lhs_tp,
-                               const rhsTy *rhs_tp,
-                               resTy *res_tp,
-                               const std::size_t batch_nelems,
-                               const std::size_t n,
-                               const std::size_t k,
-                               const std::size_t m,
-                               const std::uint32_t wg_delta_n,
-                               const std::uint32_t wg_delta_m,
-                               const std::uint32_t wi_delta_k,
-                               const BatchIndexerT &batch_indexer,
-                               const LhsIndexerT &lhs_indexer,
-                               const RhsIndexerT &rhs_indexer,
-                               const ResIndexerT &res_indexer,
-                               const std::vector<sycl::event> &depends)
-{
-    static_assert(std::is_same_v<LhsIndexerT, RhsIndexerT>);
-
-    sycl::event gemm_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const std::size_t lws = wg_delta_n * wg_delta_m;
-
-        const std::size_t n_blocks =
-            ((n + wi_delta_n * wg_delta_n - 1) / (wi_delta_n * wg_delta_n));
-        const std::size_t k_blocks = ((k + wi_delta_k - 1) / wi_delta_k);
-        const std::size_t m_blocks =
-            ((m + wi_delta_m * wg_delta_m - 1) / (wi_delta_m * wg_delta_m));
-
-        const std::size_t gws =
-            batch_nelems * n_blocks * m_blocks * k_blocks * lws;
-
-        auto gwsRange = sycl::range<1>(gws);
-        auto lwsRange = sycl::range<1>(lws);
-        auto ndRange = sycl::nd_range<1>(gwsRange, lwsRange);
-
-        using slmB_t =
-            typename std::conditional<wi_delta_m == 1, resTy,
-                                      sycl::vec<resTy, wi_delta_m>>::type;
-        using LocAccT1 = sycl::local_accessor<resTy, 1>;
-        using LocAccT2 = sycl::local_accessor<slmB_t, 1>;
-
-        const sycl::range<1> local_A_size((wi_delta_n * wg_delta_n) *
-                                          wi_delta_k);
-        const sycl::range<1> local_B_size(wi_delta_k * wg_delta_m);
-
-        LocAccT1 local_A_block(local_A_size, cgh);
-        LocAccT2 local_B_block(local_B_size, cgh);
-
-        using KernelName =
-            class gemm_batch_tree_nm_krn<lhsTy, rhsTy, resTy, LhsIndexerT,
-                                         ResIndexerT, BatchIndexerT,
-                                         wi_delta_m>;
-        cgh.parallel_for<KernelName>(
-            ndRange, GemmBatchNoAtomicFunctorThreadNM<
-                         lhsTy, rhsTy, resTy, LocAccT1, LocAccT2, LhsIndexerT,
-                         ResIndexerT, BatchIndexerT, wi_delta_n, wi_delta_m>(
-                         lhs_tp, rhs_tp, res_tp, std::move(local_A_block),
-                         std::move(local_B_block), n, wg_delta_n, k, k_blocks,
-                         wi_delta_k, m, m_blocks, wg_delta_m, batch_nelems,
-                         batch_indexer, lhs_indexer, rhs_indexer, res_indexer));
-    });
-    return gemm_ev;
-}
-
-} // end namespace gemm_detail
-
-template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
-sycl::event
-gemm_batch_tree_nm_impl(sycl::queue &exec_q,
-                        const lhsTy *lhs_tp,
-                        const rhsTy *rhs_tp,
-                        resTy *res_tp,
-                        std::size_t batch_nelems,
-                        std::size_t n,
-                        std::size_t k,
-                        std::size_t m,
-                        int batch_nd,
-                        const ssize_t *batch_shape_strides,
-                        ssize_t lhs_batch_offset,
-                        ssize_t rhs_batch_offset,
-                        ssize_t res_batch_offset,
-                        int inner_nd,
-                        int lhs_outer_nd,
-                        const ssize_t *lhs_outer_inner_shapes_strides,
-                        int rhs_outer_nd,
-                        const ssize_t *rhs_outer_inner_shapes_strides,
-                        int res_outer_nd,
-                        const ssize_t *res_outer_shapes_strides,
-                        const ssize_t *res_shape_strides,
-                        std::vector<sycl::event> const &depends)
-{
-    static constexpr int wi_delta_n = 2;
-    std::size_t wg_delta_n(16); // rows of A processed in WG
-    std::size_t wg_delta_m(16); // rows of B processed in WG
-    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
-        local_mem_size, reserved_slm_size, wi_delta_n,
-        wi_delta_k, // modified by reference
-        wg_delta_n, // modified by reference
-        wg_delta_m  // modified by reference
-    );
-
-    // each group processes delta_k * n_wi
-    // items in a column, so no need for allocating
-    // temp memory if only one group is needed
-    if (k <= wi_delta_k) {
-        using OuterInnerDimsIndexerT =
-            dpctl::tensor::offset_utils::StridedIndexer;
-        const OuterInnerDimsIndexerT lhs_indexer(
-            inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-        const OuterInnerDimsIndexerT rhs_indexer(
-            inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-        const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
-                                                 res_outer_shapes_strides);
-        using BatchDimsIndexerT =
-            dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-        const BatchDimsIndexerT batch_indexer(
-            batch_nd, lhs_batch_offset, rhs_batch_offset, res_batch_offset,
-            batch_shape_strides);
-
-        return gemm_detail::_gemm_tree_nm_step<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                        wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                        lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-        std::size_t iter_nelems = batch_nelems * n * m;
-        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
-
-        // more than one work-group is needed, requires a temporary
-        // delta_k * n_wi elements processed along k, so if more to
-        // process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 4;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            const OuterInnerDimsIndexerT lhs_indexer(
-                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-            const OuterInnerDimsIndexerT rhs_indexer(
-                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-            static constexpr TmpIndexerT res_indexer{};
-
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::StridedIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
-                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
-            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
-                                                   batch_shape_strides);
-            const UnpackedStridedIndexer rhs_batch_indexer(
-                batch_nd, rhs_batch_offset, batch_shape_strides,
-                batch_shape_strides + 2 * batch_nd);
-            const Strided1DIndexer tmp_batch_indexer(
-                /* size   */ batch_nelems,
-                /* step   */ n * m);
-            const BatchDimsIndexerT batch_indexer(
-                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>(
-                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, wg_delta_n,
-                wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                res_indexer, depends);
-
-            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, tmp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, reduction_groups, wg, max_wg,
-                preferred_reductions_per_wi, reductions_per_wi,
-                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
-                {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-            ;
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-            using TmpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const OuterInnerDimsIndexerT lhs_indexer(
-                inner_nd + lhs_outer_nd, 0, lhs_outer_inner_shapes_strides);
-            const OuterInnerDimsIndexerT rhs_indexer(
-                inner_nd + rhs_outer_nd, 0, rhs_outer_inner_shapes_strides);
-            static constexpr TmpIndexerT res_indexer{};
-
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::StridedIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-            using BatchDimsIndexerT = ThreeOffsets_CombinedIndexer<
-                StridedIndexer, UnpackedStridedIndexer, Strided1DIndexer>;
-
-            const StridedIndexer lhs_batch_indexer(batch_nd, lhs_batch_offset,
-                                                   batch_shape_strides);
-            const UnpackedStridedIndexer rhs_batch_indexer(
-                batch_nd, rhs_batch_offset, batch_shape_strides,
-                batch_shape_strides + 2 * batch_nd);
-            const Strided1DIndexer tmp_batch_indexer(
-                /* size   */ batch_nelems,
-                /* step   */ n * m);
-            const BatchDimsIndexerT batch_indexer(
-                lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer);
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, TmpIndexerT, wi_delta_n, wi_delta_m>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
-                k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                lhs_indexer, rhs_indexer, res_indexer, depends);
-
-            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
-                identity_val, iter_nelems, reduction_nelems, reduction_groups,
-                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
-                batch_nd + res_outer_nd, res_batch_offset, res_shape_strides,
-                {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_batch_nm_impl(sycl::queue &exec_q,
-                               const lhsTy *lhs_tp,
-                               const rhsTy *rhs_tp,
-                               resTy *res_tp,
-                               std::size_t batch_nelems,
-                               std::size_t n,
-                               std::size_t k,
-                               std::size_t m,
-                               int batch_nd,
-                               const ssize_t *batch_shape_strides,
-                               ssize_t lhs_batch_offset,
-                               ssize_t rhs_batch_offset,
-                               ssize_t res_batch_offset,
-                               int inner_nd,
-                               int lhs_outer_nd,
-                               const ssize_t *lhs_outer_inner_shapes_strides,
-                               int rhs_outer_nd,
-                               const ssize_t *rhs_outer_inner_shapes_strides,
-                               int res_outer_nd,
-                               const ssize_t *res_outer_shapes_strides,
-                               const ssize_t *res_shape_strides,
-                               std::vector<sycl::event> const &depends = {})
-{
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                             lhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                             rhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
-                                             res_outer_shapes_strides);
-
-    using BatchDimsIndexerT =
-        dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;
-    const BatchDimsIndexerT batch_indexer(batch_nd, lhs_batch_offset,
-                                          rhs_batch_offset, res_batch_offset,
-                                          batch_shape_strides);
-
-    sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
-        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-        exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_indexer,
-        lhs_indexer, rhs_indexer, res_indexer, depends);
-
-    return gemm_ev;
-}
-
-template <typename T1, typename T2, typename T3>
-class gemm_batch_tree_empty_krn;
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_batch_tree_impl(sycl::queue &exec_q,
-                                 const char *lhs_cp,
-                                 const char *rhs_cp,
-                                 char *res_cp,
-                                 std::size_t batch_nelems,
-                                 std::size_t n,
-                                 std::size_t k,
-                                 std::size_t m,
-                                 int batch_nd,
-                                 const ssize_t *batch_shape_strides,
-                                 ssize_t lhs_batch_offset,
-                                 ssize_t rhs_batch_offset,
-                                 ssize_t res_batch_offset,
-                                 int inner_nd,
-                                 int lhs_outer_nd,
-                                 const ssize_t *lhs_outer_inner_shapes_strides,
-                                 int rhs_outer_nd,
-                                 const ssize_t *rhs_outer_inner_shapes_strides,
-                                 int res_outer_nd,
-                                 const ssize_t *res_outer_shapes_strides,
-                                 const ssize_t *res_shape_strides,
-                                 std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_batch_nm_impl<lhsTy, rhsTy, resTy>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
-            batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
-            res_batch_offset, inner_nd, lhs_outer_nd,
-            lhs_outer_inner_shapes_strides, rhs_outer_nd,
-            rhs_outer_inner_shapes_strides, res_outer_nd,
-            res_outer_shapes_strides, res_shape_strides, depends);
-    }
-
-    if (k == 0) {
-        sycl::event gemm_batch_no_reduction_ev =
-            exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-
-                using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-                const IndexerT res_indexer(batch_nd + res_outer_nd,
-                                           res_batch_offset, res_shape_strides);
-                using InitKernelName =
-                    class gemm_batch_tree_empty_krn<lhsTy, rhsTy, resTy>;
-                cgh.parallel_for<InitKernelName>(
-                    sycl::range<1>(n * m * batch_nelems), [=](sycl::id<1> id) {
-                        auto res_offset = res_indexer(id[0]);
-                        res_tp[res_offset] = resTy(0);
-                    });
-            });
-        return gemm_batch_no_reduction_ev;
-    }
-
-    if (max_nm < 64) {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            if (m < 4) {
-                static constexpr std::uint32_t m_groups_one = 1;
-                return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy,
-                                              m_groups_one>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                    batch_nd, batch_shape_strides, lhs_batch_offset,
-                    rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd,
-                    lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                    rhs_outer_inner_shapes_strides, res_outer_nd,
-                    res_outer_shapes_strides, res_shape_strides, depends);
-            }
-            else {
-                static constexpr std::uint32_t m_groups_four = 4;
-                return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy,
-                                              m_groups_four>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                    batch_nd, batch_shape_strides, lhs_batch_offset,
-                    rhs_batch_offset, res_batch_offset, inner_nd, lhs_outer_nd,
-                    lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                    rhs_outer_inner_shapes_strides, res_outer_nd,
-                    res_outer_shapes_strides, res_shape_strides, depends);
-            }
-        }
-        else {
-            static constexpr std::uint32_t m_groups_one = 1;
-            return gemm_batch_tree_k_impl<lhsTy, rhsTy, resTy, m_groups_one>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
-                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
-                res_batch_offset, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_outer_nd,
-                res_outer_shapes_strides, res_shape_strides, depends);
-        }
-    }
-    else { // m > 1, n > k or m > k
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            static constexpr std::uint32_t m_groups_four = 4;
-            return gemm_batch_tree_nm_impl<lhsTy, rhsTy, resTy, m_groups_four>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
-                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
-                res_batch_offset, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_outer_nd,
-                res_outer_shapes_strides, res_shape_strides, depends);
-        }
-        else { // m > 1, n > k or m > k, resTy complex
-            static constexpr std::uint32_t m_groups_one = 1;
-            return gemm_batch_tree_nm_impl<lhsTy, rhsTy, resTy, m_groups_one>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, batch_nd,
-                batch_shape_strides, lhs_batch_offset, rhs_batch_offset,
-                res_batch_offset, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_outer_nd,
-                res_outer_shapes_strides, res_shape_strides, depends);
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
-sycl::event
-gemm_batch_contig_tree_k_impl(sycl::queue &exec_q,
-                              const lhsTy *lhs_tp,
-                              const rhsTy *rhs_tp,
-                              resTy *res_tp,
-                              std::size_t batch_nelems,
-                              std::size_t n,
-                              std::size_t k,
-                              std::size_t m,
-                              std::vector<sycl::event> const &depends)
-{
-    std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    if (k <= (delta_k * n_wi)) {
-        using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-        static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-        static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-        using dpctl::tensor::offset_utils::Strided1DIndexer;
-        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-        using BatchDimsIndexerT =
-            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                         Strided1DIndexer>;
-
-        using dpctl::tensor::offset_utils::Strided1DIndexer;
-        const BatchDimsIndexerT batch_indexer(
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * k},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ k * m},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * m});
-
-        return gemm_detail::_gemm_tree_k_step<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, delta_n,
-            n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer, res_indexer,
-            depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = batch_nelems * n * m;
-        std::size_t reduction_nelems =
-            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
-
-        // more than one work-group is needed, requires a
-        // temporary delta_k * n_wi elements processed along k,
-        // so if more to process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 4;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using BatchDimsIndexerT =
-                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                             Strided1DIndexer>;
-
-            const BatchDimsIndexerT batch_indexer(
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * k},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ k * m},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * m});
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m, delta_n,
-                n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                tmp_indexer, depends);
-
-            sycl::event red_ev =
-                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, tmp, res_tp, identity_val, iter_nelems,
-                    reduction_nelems, reduction_groups, wg, max_wg,
-                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using BatchDimsIndexerT =
-                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                             Strided1DIndexer>;
-
-            const BatchDimsIndexerT batch_indexer(
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * k},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ k * m},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * m});
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp, batch_nelems, n,
-                k, m, delta_n, n_wi, delta_k, batch_indexer, lhs_indexer,
-                rhs_indexer, tmp_indexer, depends);
-
-            sycl::event red_ev =
-                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
-                    res_tp, identity_val, iter_nelems, reduction_nelems,
-                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
-                    reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
-sycl::event
-gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q,
-                               const lhsTy *lhs_tp,
-                               const rhsTy *rhs_tp,
-                               resTy *res_tp,
-                               std::size_t batch_nelems,
-                               std::size_t n,
-                               std::size_t k,
-                               std::size_t m,
-                               std::vector<sycl::event> const &depends)
-{
-    static constexpr int wi_delta_n = 2;
-    std::size_t wg_delta_n(16); // rows of A processed in WG
-    std::size_t wg_delta_m(16); // rows of B processed in WG
-    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
-        local_mem_size, reserved_slm_size, wi_delta_n,
-        wi_delta_k, // modified by reference
-        wg_delta_n, // modified by reference
-        wg_delta_m  // modified by reference
-    );
-
-    // each group processes delta_k * n_wi
-    // items in a column, so no need for allocating
-    // temp memory if only one group is needed
-    if (k <= wi_delta_k) {
-        using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-        static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-        static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-        using dpctl::tensor::offset_utils::Strided1DIndexer;
-        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-        using BatchDimsIndexerT =
-            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                         Strided1DIndexer>;
-
-        const BatchDimsIndexerT batch_indexer(
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * k},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ k * m},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * m});
-
-        return gemm_detail::_gemm_tree_nm_step<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                        wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                        lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-        std::size_t iter_nelems = batch_nelems * n * m;
-        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
-
-        // more than one work-group is needed, requires a temporary
-        // delta_k * n_wi elements processed along k, so if more to
-        // process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 4;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-
-            resTy *tmp = tmp_owner.get();
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
-
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using BatchDimsIndexerT =
-                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                             Strided1DIndexer>;
-
-            const BatchDimsIndexerT batch_indexer(
-                Strided1DIndexer{/* size */ batch_nelems,
-                                 /* step */ n * k},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ k * m},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * m});
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-                wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, batch_nelems, n, k, m,
-                            wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                            lhs_indexer, rhs_indexer, tmp_indexer, depends);
-
-            sycl::event red_ev =
-                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, tmp, res_tp, identity_val, iter_nelems,
-                    reduction_nelems, reduction_groups, wg, max_wg,
-                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            using OuterInnerDimsIndexerT =
-                dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-            static constexpr OuterInnerDimsIndexerT tmp_indexer{};
-
-            using dpctl::tensor::offset_utils::Strided1DIndexer;
-            using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-            using BatchDimsIndexerT =
-                ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                             Strided1DIndexer>;
-
-            const BatchDimsIndexerT batch_indexer(
-                Strided1DIndexer{/* size */ batch_nelems,
-                                 /* step */ n * k},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ k * m},
-                Strided1DIndexer{/* size   */ batch_nelems,
-                                 /* step   */ n * m});
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-                wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
-                            batch_nelems, n, k, m, wg_delta_n, wg_delta_m,
-                            wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                            tmp_indexer, depends);
-
-            sycl::event red_ev =
-                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
-                    res_tp, identity_val, iter_nelems, reduction_nelems,
-                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
-                    reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_nm_impl(sycl::queue &exec_q,
-                         const lhsTy *lhs_tp,
-                         const rhsTy *rhs_tp,
-                         resTy *res_tp,
-                         std::size_t n,
-                         std::size_t k,
-                         std::size_t m,
-                         int inner_nd,
-                         int lhs_outer_nd,
-                         const ssize_t *lhs_shape_strides,
-                         int rhs_outer_nd,
-                         const ssize_t *rhs_shape_strides,
-                         int res_outer_nd,
-                         const ssize_t *res_shape_strides,
-                         std::vector<sycl::event> const &depends = {})
-{
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                             lhs_shape_strides);
-    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                             rhs_shape_strides);
-    const OuterInnerDimsIndexerT res_indexer(res_outer_nd, 0,
-                                             res_shape_strides);
-
-    using BatchDimsIndexerT =
-        dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchDimsIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
-        lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-        OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-        exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-        batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-
-    return gemm_ev;
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event
-gemm_batch_nm_contig_impl(sycl::queue &exec_q,
-                          const lhsTy *lhs_tp,
-                          const rhsTy *rhs_tp,
-                          resTy *res_tp,
-                          std::size_t batch_nelems,
-                          std::size_t n,
-                          std::size_t k,
-                          std::size_t m,
-                          std::vector<sycl::event> const &depends = {})
-{
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-    if (batch_nelems == single_batch_nelems) {
-        using BatchDimsIndexerT =
-            dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-        static constexpr BatchDimsIndexerT batch_indexer{};
-
-        sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-
-        return gemm_ev;
-    }
-    else {
-        using dpctl::tensor::offset_utils::Strided1DIndexer;
-        using dpctl::tensor::offset_utils::ThreeOffsets_CombinedIndexer;
-        using BatchDimsIndexerT =
-            ThreeOffsets_CombinedIndexer<Strided1DIndexer, Strided1DIndexer,
-                                         Strided1DIndexer>;
-
-        using dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const BatchDimsIndexerT batch_indexer(
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * k},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ k * m},
-            Strided1DIndexer{/* size   */ batch_nelems,
-                             /* step   */ n * m});
-
-        sycl::event gemm_ev = gemm_detail::_gemm_batch_nm_impl<
-            lhsTy, rhsTy, resTy, BatchDimsIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-            batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-
-        return gemm_ev;
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event
-gemm_batch_contig_tree_impl(sycl::queue &exec_q,
-                            const char *lhs_cp,
-                            const char *rhs_cp,
-                            char *res_cp,
-                            std::size_t batch_nelems,
-                            std::size_t n,
-                            std::size_t k,
-                            std::size_t m,
-                            ssize_t lhs_batch_offset,
-                            ssize_t rhs_batch_offset,
-                            ssize_t res_batch_offset,
-                            std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp =
-        reinterpret_cast<const lhsTy *>(lhs_cp) + lhs_batch_offset;
-    const rhsTy *rhs_tp =
-        reinterpret_cast<const rhsTy *>(rhs_cp) + rhs_batch_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + res_batch_offset;
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_batch_nm_contig_impl<lhsTy, rhsTy, resTy>(
-            exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
-    }
-
-    if (k == 0) {
-        sycl::event gemm_batch_no_reduction_ev =
-            exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-                cgh.fill<resTy>(res_tp, resTy(0), n * m * batch_nelems);
-            });
-        return gemm_batch_no_reduction_ev;
-    }
-
-    if (max_nm < 64) {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            if (m < 4) {
-                return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                    depends);
-            }
-            else {
-                return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m,
-                    depends);
-            }
-        }
-        else {
-            return gemm_batch_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
-        }
-    }
-    else { // m > 1, n > k or m > k
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            return gemm_batch_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
-        }
-        else { // m > 1, n > k or m > k, resTy complex
-            return gemm_batch_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, batch_nelems, n, k, m, depends);
-        }
-    }
-}
-
-// Gemm tree non-batched
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          std::size_t>
-class gemm_tree_nm_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          std::size_t>
-class gemm_tree_k_krn;
-
-template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
-sycl::event gemm_tree_k_impl(sycl::queue &exec_q,
-                             const lhsTy *lhs_tp,
-                             const rhsTy *rhs_tp,
-                             resTy *res_tp,
-                             std::size_t n,
-                             std::size_t k,
-                             std::size_t m,
-                             int inner_nd,
-                             int lhs_outer_nd,
-                             const ssize_t *lhs_outer_inner_shapes_strides,
-                             int rhs_outer_nd,
-                             const ssize_t *rhs_outer_inner_shapes_strides,
-                             int res_nd,
-                             const ssize_t *res_shapes_strides,
-                             const std::vector<sycl::event> &depends)
-{
-    std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                             lhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                             rhs_outer_inner_shapes_strides);
-
-    sycl::event gemm_ev;
-    if (k <= (delta_k * n_wi)) {
-        const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides);
-
-        return gemm_detail::_gemm_tree_k_step<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-            res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = n * m;
-        std::size_t reduction_nelems =
-            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
-
-        // more than one work-groups is needed, requires a temporary
-        // delta_k * n_wi elements processed along k, so if more to
-        // process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr ResIndexerT res_indexer{};
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, ResIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
-                delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                res_indexer, depends);
-
-            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, tmp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, reduction_groups, wg, max_wg,
-                preferred_reductions_per_wi, reductions_per_wi, res_nd, 0,
-                res_shapes_strides, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr ResIndexerT res_indexer{};
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, ResIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
-                single_batch_nelems, n, k, m, delta_n, n_wi, delta_k,
-                batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-
-            // tree_reduction_for_gemm returns sycl::event for reduction
-            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
-                identity_val, iter_nelems, reduction_nelems, reduction_groups,
-                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
-                res_nd, 0, res_shapes_strides, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
-sycl::event gemm_tree_nm_impl(sycl::queue &exec_q,
-                              const lhsTy *lhs_tp,
-                              const rhsTy *rhs_tp,
-                              resTy *res_tp,
-                              std::size_t n,
-                              std::size_t k,
-                              std::size_t m,
-                              int inner_nd,
-                              int lhs_outer_nd,
-                              const ssize_t *lhs_outer_inner_shapes_strides,
-                              int rhs_outer_nd,
-                              const ssize_t *rhs_outer_inner_shapes_strides,
-                              int res_nd,
-                              const ssize_t *res_shapes_strides,
-                              const std::vector<sycl::event> &depends)
-{
-    static constexpr int wi_delta_n = 2;
-    std::size_t wg_delta_n(16); // rows of A processed in WG
-    std::size_t wg_delta_m(16); // rows of B processed in WG
-    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
-        local_mem_size, reserved_slm_size, wi_delta_n,
-        wi_delta_k, // modified by reference
-        wg_delta_n, // modified by reference
-        wg_delta_m  // modified by reference
-    );
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const OuterInnerDimsIndexerT lhs_indexer(inner_nd + lhs_outer_nd, 0,
-                                             lhs_outer_inner_shapes_strides);
-    const OuterInnerDimsIndexerT rhs_indexer(inner_nd + rhs_outer_nd, 0,
-                                             rhs_outer_inner_shapes_strides);
-
-    // each group processes delta_k items in a column,
-    // so no need to allocate temp memory if one group needed
-    if (k <= wi_delta_k) {
-        const OuterInnerDimsIndexerT res_indexer(res_nd, 0, res_shapes_strides);
-
-        return gemm_detail::_gemm_tree_nm_step<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n,
-                        k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                        lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = n * m;
-        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
-
-        // more than one work-groups is needed, requires a temporary
-        // wi_delta_k elements processed along k, so if more to
-        // process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr ResIndexerT res_indexer{};
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>(
-                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
-                wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer,
-                rhs_indexer, res_indexer, depends);
-
-            sycl::event red_ev = single_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, tmp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, reduction_groups, wg, max_wg,
-                preferred_reductions_per_wi, reductions_per_wi, res_nd, 0,
-                res_shapes_strides, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            static constexpr ResIndexerT res_indexer{};
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, ResIndexerT, wi_delta_n, wi_delta_m>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
-                single_batch_nelems, n, k, m, wg_delta_n, wg_delta_m,
-                wi_delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                res_indexer, depends);
-
-            sycl::event red_ev = tree_reduction_for_gemm<resTy, ReductionOpT>(
-                exec_q, partially_reduced_tmp, partially_reduced_tmp2, res_tp,
-                identity_val, iter_nelems, reduction_nelems, reduction_groups,
-                wg, max_wg, preferred_reductions_per_wi, reductions_per_wi,
-                res_nd, 0, res_shapes_strides, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename T1, typename T2, typename T3> class gemm_tree_empty_krn;
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_tree_impl(sycl::queue &exec_q,
-                           const char *lhs_cp,
-                           const char *rhs_cp,
-                           char *res_cp,
-                           std::size_t n,
-                           std::size_t k,
-                           std::size_t m,
-                           int inner_nd,
-                           int lhs_outer_nd,
-                           const ssize_t *lhs_outer_inner_shapes_strides,
-                           int rhs_outer_nd,
-                           const ssize_t *rhs_outer_inner_shapes_strides,
-                           int res_nd,
-                           const ssize_t *res_shapes_strides,
-                           std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        return gemm_nm_impl<lhsTy, rhsTy, resTy>(
-            exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
-            lhs_outer_inner_shapes_strides, rhs_outer_nd,
-            rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-            depends);
-    }
-
-    if (k == 0) {
-        sycl::event gemm_no_reduction_ev =
-            exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-
-                using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-                const IndexerT res_indexer(res_nd, 0, res_shapes_strides);
-                using InitKernelName =
-                    class gemm_tree_empty_krn<lhsTy, rhsTy, resTy>;
-                cgh.parallel_for<InitKernelName>(
-                    sycl::range<1>(n * m), [=](sycl::id<1> id) {
-                        auto res_offset = res_indexer(id[0]);
-                        res_tp[res_offset] = resTy(0);
-                    });
-            });
-        return gemm_no_reduction_ev;
-    }
-
-    if (max_nm < 64) {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            if (m < 4) {
-                return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd,
-                    lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                    rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-                    depends);
-            }
-            else {
-                return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd,
-                    lhs_outer_nd, lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                    rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-                    depends);
-            }
-        }
-        else {
-            return gemm_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-                depends);
-        }
-    }
-    else { // m > 1, n > k or m > k
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            return gemm_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-                depends);
-        }
-        else {
-            return gemm_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, inner_nd, lhs_outer_nd,
-                lhs_outer_inner_shapes_strides, rhs_outer_nd,
-                rhs_outer_inner_shapes_strides, res_nd, res_shapes_strides,
-                depends);
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy, std::size_t m_groups>
-sycl::event gemm_contig_tree_k_impl(sycl::queue &exec_q,
-                                    const lhsTy *lhs_tp,
-                                    const rhsTy *rhs_tp,
-                                    resTy *res_tp,
-                                    std::size_t n,
-                                    std::size_t k,
-                                    std::size_t m,
-                                    std::vector<sycl::event> const &depends)
-{
-    std::size_t delta_k(4);
-    std::size_t n_wi(64);
-    std::size_t delta_n(32);
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_k_parameters<resTy, m_groups>(
-        local_mem_size, reserved_slm_size, delta_k,
-        n_wi,   // modified by reference
-        delta_n // modified by reference
-    );
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    sycl::event gemm_ev;
-    if (k <= (delta_k * n_wi)) {
-        return gemm_detail::_gemm_tree_k_step<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-            res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = n * m;
-        std::size_t reduction_nelems =
-            (k + delta_k * n_wi - 1) / (delta_k * n_wi);
-
-        // more than one work-groups is needed, requires a
-        // temporary delta_k * n_wi elements processed along k,
-        // so if more to process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n, k, m,
-                delta_n, n_wi, delta_k, batch_indexer, lhs_indexer, rhs_indexer,
-                res_indexer, depends);
-
-            sycl::event red_ev =
-                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, tmp, res_tp, identity_val, iter_nelems,
-                    reduction_nelems, reduction_groups, wg, max_wg,
-                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_k_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, m_groups>(
-                exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
-                single_batch_nelems, n, k, m, delta_n, n_wi, delta_k,
-                batch_indexer, lhs_indexer, rhs_indexer, res_indexer, depends);
-
-            // tree_reduction_for_gemm_contig returns sycl::event
-            // for reduction
-            sycl::event red_ev =
-                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
-                    res_tp, identity_val, iter_nelems, reduction_nelems,
-                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
-                    reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy, int wi_delta_m>
-sycl::event gemm_contig_tree_nm_impl(sycl::queue &exec_q,
-                                     const lhsTy *lhs_tp,
-                                     const rhsTy *rhs_tp,
-                                     resTy *res_tp,
-                                     std::size_t n,
-                                     std::size_t k,
-                                     std::size_t m,
-                                     std::vector<sycl::event> const &depends)
-{
-    static constexpr int wi_delta_n = 2;
-    std::size_t wg_delta_n(16); // rows of A processed in WG
-    std::size_t wg_delta_m(16); // rows of B processed in WG
-    std::size_t wi_delta_k(64); // Elements in K dimension processed by WI
-
-    const sycl::device &dev = exec_q.get_device();
-    const std::size_t local_mem_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-    const std::size_t reserved_slm_size = 512;
-
-    gemm_detail::scale_gemm_nm_parameters<resTy, wi_delta_m>(
-        local_mem_size, reserved_slm_size, wi_delta_n,
-        wi_delta_k, // modified by reference
-        wg_delta_n, // modified by reference
-        wg_delta_m  // modified by reference
-    );
-
-    using OuterInnerDimsIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-    static constexpr OuterInnerDimsIndexerT lhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT rhs_indexer{};
-    static constexpr OuterInnerDimsIndexerT res_indexer{};
-
-    using BatchIndexerT = dpctl::tensor::offset_utils::ThreeZeroOffsets_Indexer;
-    static constexpr BatchIndexerT batch_indexer{};
-
-    static constexpr std::size_t single_batch_nelems = 1;
-
-    // each group processes delta_k items in a column,
-    // so no need to allocate temp memory if one group needed
-    if (k <= wi_delta_k) {
-
-        return gemm_detail::_gemm_tree_nm_step<
-            lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-            OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-            wi_delta_m>(exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n,
-                        k, m, wg_delta_n, wg_delta_m, wi_delta_k, batch_indexer,
-                        lhs_indexer, rhs_indexer, res_indexer, depends);
-    }
-    else {
-        using ReductionOpT =
-            typename std::conditional<std::is_same_v<resTy, bool>,
-                                      sycl::logical_or<resTy>,
-                                      sycl::plus<resTy>>::type;
-        static constexpr resTy identity_val =
-            sycl::known_identity<ReductionOpT, resTy>::value;
-
-        std::size_t iter_nelems = n * m;
-        std::size_t reduction_nelems = (k + wi_delta_k - 1) / wi_delta_k;
-
-        // more than one work-groups is needed, requires a temporary
-        // wi_delta_k elements processed along k, so if more to
-        // process use multiple
-        const auto &sg_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-        std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi(preferred_reductions_per_wi);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        std::size_t max_wg = reduction_detail::get_work_group_size(dev);
-
-        if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    iter_nelems * reduction_nelems, exec_q);
-            resTy *tmp = tmp_owner.get();
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-                wi_delta_m>(exec_q, lhs_tp, rhs_tp, tmp, single_batch_nelems, n,
-                            k, m, wg_delta_n, wg_delta_m, wi_delta_k,
-                            batch_indexer, lhs_indexer, rhs_indexer,
-                            res_indexer, depends);
-
-            sycl::event red_ev =
-                single_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, tmp, res_tp, identity_val, iter_nelems,
-                    reduction_nelems, reduction_groups, wg, max_wg,
-                    preferred_reductions_per_wi, reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-            return cleanup_host_task_event;
-        }
-        else {
-            assert(reduction_groups > 1);
-
-            const std::size_t tmp_alloc_size =
-                iter_nelems * (/* temp */ reduction_nelems +
-                               /* first reduction temp */ reduction_groups);
-
-            auto tmp_owner =
-                dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-                    tmp_alloc_size, exec_q);
-            resTy *partially_reduced_tmp = tmp_owner.get();
-            resTy *partially_reduced_tmp2 =
-                partially_reduced_tmp + reduction_nelems * iter_nelems;
-
-            sycl::event gemm_ev = gemm_detail::_gemm_tree_nm_step<
-                lhsTy, rhsTy, resTy, BatchIndexerT, OuterInnerDimsIndexerT,
-                OuterInnerDimsIndexerT, OuterInnerDimsIndexerT, wi_delta_n,
-                wi_delta_m>(exec_q, lhs_tp, rhs_tp, partially_reduced_tmp,
-                            single_batch_nelems, n, k, m, wg_delta_n,
-                            wg_delta_m, wi_delta_k, batch_indexer, lhs_indexer,
-                            rhs_indexer, res_indexer, depends);
-
-            sycl::event red_ev =
-                tree_reduction_for_gemm_contig<resTy, ReductionOpT>(
-                    exec_q, partially_reduced_tmp, partially_reduced_tmp2,
-                    res_tp, identity_val, iter_nelems, reduction_nelems,
-                    reduction_groups, wg, max_wg, preferred_reductions_per_wi,
-                    reductions_per_wi, {gemm_ev});
-
-            sycl::event cleanup_host_task_event =
-                dpctl::tensor::alloc_utils::async_smart_free(exec_q, {red_ev},
-                                                             tmp_owner);
-
-            return cleanup_host_task_event;
-        }
-    }
-}
-
-template <typename lhsTy, typename rhsTy, typename resTy>
-sycl::event gemm_contig_tree_impl(sycl::queue &exec_q,
-                                  const char *lhs_cp,
-                                  const char *rhs_cp,
-                                  char *res_cp,
-                                  std::size_t n,
-                                  std::size_t k,
-                                  std::size_t m,
-                                  std::vector<sycl::event> const &depends = {})
-{
-    const lhsTy *lhs_tp = reinterpret_cast<const lhsTy *>(lhs_cp);
-    const rhsTy *rhs_tp = reinterpret_cast<const rhsTy *>(rhs_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    const std::size_t min_nm = std::min(n, m);
-    const std::size_t max_nm = std::max(n, m);
-
-    if (min_nm > 0 && (max_nm >= ((64 * 1024) / min_nm))) {
-        static constexpr std::size_t single_batch_nelems = 1;
-        return gemm_batch_nm_contig_impl<lhsTy, rhsTy, resTy>(
-            exec_q, lhs_tp, rhs_tp, res_tp, single_batch_nelems, n, k, m,
-            depends);
-    }
-
-    if (k == 0) {
-        sycl::event gemm_no_reduction_ev =
-            exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-                cgh.fill<resTy>(res_tp, resTy(0), n * m);
-            });
-        return gemm_no_reduction_ev;
-    }
-
-    if (max_nm < 64) {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            if (m < 4) {
-                return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
-            }
-            else {
-                return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 4>(
-                    exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
-            }
-        }
-        else {
-            return gemm_contig_tree_k_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
-        }
-    }
-    else { // m > 1, n > k or m > k
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!is_complex<resTy>::value) {
-            return gemm_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 4>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
-        }
-        else {
-            return gemm_contig_tree_nm_impl<lhsTy, rhsTy, resTy, 1>(
-                exec_q, lhs_tp, rhs_tp, res_tp, n, k, m, depends);
-        }
-    }
-}
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
deleted file mode 100644
index 7251e3fad9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ /dev/null
@@ -1,3322 +0,0 @@
-//=== reductions.hpp - Implementation of reduction kernels ------- *-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor reduction along axis.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/math_utils.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace reduction_detail
-{
-
-inline std::size_t get_work_group_size(const sycl::device &d)
-{
-    // prevents running out of resources on CPU
-    return std::min<std::size_t>(
-        2048, d.get_info<sycl::info::device::max_work_group_size>() / 2);
-}
-
-} // namespace reduction_detail
-
-template <typename ReductionOpT, typename T> struct needs_workaround
-{
-    static constexpr bool value =
-        (std::is_same_v<ReductionOpT, sycl::multiplies<T>> &&
-         (std::is_same_v<T, std::int64_t> ||
-          std::is_same_v<T, std::uint64_t>)) ||
-        (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v<T, bool> &&
-         std::is_same_v<ReductionOpT, sycl::logical_or<T>>);
-};
-
-template <typename ReductionOpT, typename T> struct can_use_reduce_over_group
-{
-    static constexpr bool value =
-        sycl::has_known_identity<ReductionOpT, T>::value &&
-        !needs_workaround<ReductionOpT, T>::value;
-};
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT>
-struct SequentialReduction
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    outT identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-
-public:
-    SequentialReduction(const argT *inp,
-                        outT *res,
-                        const ReductionOp &reduction_op,
-                        const outT &identity_val,
-                        const InputOutputIterIndexerT &arg_res_iter_indexer,
-                        const InputRedIndexerT &arg_reduced_dims_indexer,
-                        std::size_t reduction_size)
-        : inp_(inp), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-
-        auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]);
-        const ssize_t &inp_iter_offset =
-            inp_out_iter_offsets_.get_first_offset();
-        const ssize_t &out_iter_offset =
-            inp_out_iter_offsets_.get_second_offset();
-
-        outT red_val(identity_);
-        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
-            const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m);
-            const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val;
-            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
-                val = convert_impl<bool, argT>(inp_[inp_offset]);
-            }
-            else {
-                val = convert_impl<outT, argT>(inp_[inp_offset]);
-            }
-            red_val = reduction_op_(red_val, val);
-        }
-
-        out_[out_iter_offset] = red_val;
-    }
-};
-
-/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */
-
-/*
-  This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8
-  if the device has aspect atomic64 and only with those supported by
-  sycl::atomic_ref
-*/
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT>
-struct ReductionOverGroupWithAtomicFunctor
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    outT identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    ReductionOverGroupWithAtomicFunctor(
-        const argT *data,
-        outT *res,
-        const ReductionOp &reduction_op,
-        const outT &identity_val,
-        const InputOutputIterIndexerT &arg_res_iter_indexer,
-        const InputRedIndexerT &arg_reduced_dims_indexer,
-        std::size_t reduction_size,
-        std::size_t iteration_size,
-        std::size_t reduction_size_per_wi)
-        : inp_(data), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
-          reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        outT local_red_val(identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto inp_reduction_offset =
-                inp_reduced_dims_indexer_(arg_reduce_gid);
-            auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val;
-            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
-                // handle nans
-                val = convert_impl<bool, argT>(inp_[inp_offset]);
-            }
-            else {
-                val = convert_impl<outT, argT>(inp_[inp_offset]);
-            }
-
-            local_red_val = reduction_op_(local_red_val, val);
-        }
-
-        auto work_group = it.get_group();
-        // This only works if reduction_op_ is from small set of operators
-        outT red_val_over_wg;
-        if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
-            red_val_over_wg = static_cast<outT>(
-                sycl::all_of_group(work_group, local_red_val));
-        }
-        else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
-            red_val_over_wg = static_cast<outT>(
-                sycl::any_of_group(work_group, local_red_val));
-        }
-        else {
-            red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val,
-                                                      identity_, reduction_op_);
-        }
-
-        if (work_group.leader()) {
-            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
-                             sycl::memory_scope::device,
-                             sycl::access::address_space::global_space>
-                res_ref(out_[out_iter_offset]);
-            if constexpr (su_ns::IsPlus<outT, ReductionOp>::value) {
-                res_ref += red_val_over_wg;
-            }
-            else if constexpr (su_ns::IsMaximum<outT, ReductionOp>::value) {
-                res_ref.fetch_max(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsMinimum<outT, ReductionOp>::value) {
-                res_ref.fetch_min(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
-                res_ref.fetch_and(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
-                res_ref.fetch_or(red_val_over_wg);
-            }
-            else {
-                outT read_val = res_ref.load();
-                outT new_val{};
-                do {
-                    new_val = reduction_op_(read_val, red_val_over_wg);
-                } while (!res_ref.compare_exchange_strong(read_val, new_val));
-            }
-        }
-    }
-};
-
-/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT,
-          typename SlmT>
-struct CustomReductionOverGroupWithAtomicFunctor
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    outT identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    SlmT local_mem_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    CustomReductionOverGroupWithAtomicFunctor(
-        const argT *data,
-        outT *res,
-        const ReductionOp &reduction_op,
-        const outT &identity_val,
-        const InputOutputIterIndexerT &arg_res_iter_indexer,
-        const InputRedIndexerT &arg_reduced_dims_indexer,
-        SlmT local_mem,
-        std::size_t reduction_size,
-        std::size_t iteration_size,
-        std::size_t reduction_size_per_wi)
-        : inp_(data), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          local_mem_(local_mem), reduction_max_gid_(reduction_size),
-          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        // work-items operate over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        outT local_red_val(identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        std::size_t arg_reduce_gid_max = std::min(
-            reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg);
-
-        for (std::size_t arg_reduce_gid = arg_reduce_gid0;
-             arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg)
-        {
-            auto inp_reduction_offset =
-                inp_reduced_dims_indexer_(arg_reduce_gid);
-            auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-            using dpctl::tensor::type_utils::convert_impl;
-            outT val;
-            if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                          su_ns::IsLogicalOr<outT, ReductionOp>::value)
-            {
-                // handle nans
-                val = convert_impl<bool, argT>(inp_[inp_offset]);
-            }
-            else {
-                val = convert_impl<outT, argT>(inp_[inp_offset]);
-            }
-
-            local_red_val = reduction_op_(local_red_val, val);
-        }
-
-        auto work_group = it.get_group();
-        outT red_val_over_wg = su_ns::custom_reduce_over_group(
-            work_group, local_mem_, local_red_val, reduction_op_);
-
-        if (work_group.leader()) {
-            sycl::atomic_ref<outT, sycl::memory_order::relaxed,
-                             sycl::memory_scope::device,
-                             sycl::access::address_space::global_space>
-                res_ref(out_[out_iter_offset]);
-            // retain these checks in case a reduce_over_group work-around is
-            // needed
-            if constexpr (su_ns::IsSyclPlus<outT, ReductionOp>::value) {
-                res_ref += red_val_over_wg;
-            }
-            else if constexpr (su_ns::IsSyclMaximum<outT, ReductionOp>::value) {
-                res_ref.fetch_max(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsSyclMinimum<outT, ReductionOp>::value) {
-                res_ref.fetch_min(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsSyclLogicalAnd<outT,
-                                                       ReductionOp>::value)
-            {
-                res_ref.fetch_and(red_val_over_wg);
-            }
-            else if constexpr (su_ns::IsSyclLogicalOr<outT, ReductionOp>::value)
-            {
-                res_ref.fetch_or(red_val_over_wg);
-            }
-            else {
-                outT read_val = res_ref.load();
-                outT new_val{};
-                do {
-                    new_val = reduction_op_(read_val, red_val_over_wg);
-                } while (!res_ref.compare_exchange_strong(read_val, new_val));
-            }
-        }
-    }
-};
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT>
-struct ReductionOverGroupNoAtomicFunctor
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    outT identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    ReductionOverGroupNoAtomicFunctor(
-        const argT *data,
-        outT *res,
-        const ReductionOp &reduction_op,
-        const outT &identity_val,
-        const InputOutputIterIndexerT &arg_res_iter_indexer,
-        const InputRedIndexerT &arg_reduced_dims_indexer,
-        std::size_t reduction_size,
-        std::size_t iteration_size,
-        std::size_t reduction_size_per_wi)
-        : inp_(data), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
-          reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-        const std::size_t n_reduction_groups =
-            it.get_group_range(0) / iter_gws_;
-
-        // work-items operates over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        outT local_red_val(identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
-            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                auto inp_reduction_offset =
-                    inp_reduced_dims_indexer_(arg_reduce_gid);
-                auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-                using dpctl::tensor::type_utils::convert_impl;
-                outT val;
-                if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value ||
-                              su_ns::IsLogicalOr<outT, ReductionOp>::value)
-                {
-                    // handle nans
-                    val = convert_impl<bool, argT>(inp_[inp_offset]);
-                }
-                else {
-                    val = convert_impl<outT, argT>(inp_[inp_offset]);
-                }
-
-                local_red_val = reduction_op_(local_red_val, val);
-            }
-        }
-
-        auto work_group = it.get_group();
-        // This only works if reduction_op_ is from small set of operators
-        outT red_val_over_wg;
-        if constexpr (su_ns::IsLogicalAnd<outT, ReductionOp>::value) {
-            red_val_over_wg = sycl::all_of_group(work_group, local_red_val);
-        }
-        else if constexpr (su_ns::IsLogicalOr<outT, ReductionOp>::value) {
-            red_val_over_wg = sycl::any_of_group(work_group, local_red_val);
-        }
-        else {
-            red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val,
-                                                      identity_, reduction_op_);
-        }
-
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
-                red_val_over_wg;
-        }
-    }
-};
-
-/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT,
-          typename SlmT>
-struct CustomReductionOverGroupNoAtomicFunctor
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    outT identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    SlmT local_mem_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    CustomReductionOverGroupNoAtomicFunctor(
-        const argT *data,
-        outT *res,
-        const ReductionOp &reduction_op,
-        const outT &identity_val,
-        const InputOutputIterIndexerT &arg_res_iter_indexer,
-        const InputRedIndexerT &arg_reduced_dims_indexer,
-        SlmT local_mem,
-        std::size_t reduction_size,
-        std::size_t iteration_size,
-        std::size_t reduction_size_per_wi)
-        : inp_(data), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          local_mem_(local_mem), reduction_max_gid_(reduction_size),
-          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-        const std::size_t n_reduction_groups =
-            it.get_group_range(0) / iter_gws_;
-
-        // work-items operates over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        outT local_red_val(identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
-            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                auto inp_reduction_offset =
-                    inp_reduced_dims_indexer_(arg_reduce_gid);
-                auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-                using dpctl::tensor::type_utils::convert_impl;
-                outT val;
-                if constexpr (std::is_same_v<ReductionOp,
-                                             sycl::logical_and<outT>> ||
-                              std::is_same_v<ReductionOp,
-                                             sycl::logical_or<outT>>)
-                {
-                    // handle nans
-                    val = convert_impl<bool, argT>(inp_[inp_offset]);
-                }
-                else {
-                    val = convert_impl<outT, argT>(inp_[inp_offset]);
-                }
-
-                local_red_val = reduction_op_(local_red_val, val);
-            }
-        }
-
-        auto work_group = it.get_group();
-        // This only works if reduction_op_ is from small set of operators
-        outT red_val_over_wg = su_ns::custom_reduce_over_group(
-            work_group, local_mem_, local_red_val, reduction_op_);
-
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
-                red_val_over_wg;
-        }
-    }
-};
-
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
-sycl::event
-sequential_reduction(sycl::queue &exec_q,
-                     const argTy *arg,
-                     resTy *res,
-                     resTy identity_val,
-                     std::size_t iter_nelems,
-                     std::size_t reduction_nelems,
-                     const InputOutputIterIndexerT &in_out_iter_indexer,
-                     const ReductionIndexerT &reduction_indexer,
-                     const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using KernelName =
-            class kernel_name_token<argTy, resTy, ReductionOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT>;
-
-        cgh.parallel_for<KernelName>(
-            sycl::range<1>(iter_nelems),
-            SequentialReduction<argTy, resTy, ReductionOpT,
-                                InputOutputIterIndexerT, ReductionIndexerT>(
-                arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
-                reduction_indexer, reduction_nelems));
-    });
-
-    return red_ev;
-}
-
-template <typename BasedKernelName> class custom_reduction_wrapper;
-
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
-sycl::event
-submit_atomic_reduction(sycl::queue &exec_q,
-                        const argTy *arg,
-                        resTy *res,
-                        resTy identity_val,
-                        std::size_t wg,
-                        std::size_t iter_nelems,
-                        std::size_t reduction_nelems,
-                        std::size_t reductions_per_wi,
-                        std::size_t reduction_groups,
-                        const InputOutputIterIndexerT &in_out_iter_indexer,
-                        const ReductionIndexerT &reduction_indexer,
-                        const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
-        auto localRange = sycl::range<1>{wg};
-        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
-
-        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
-            using KernelName =
-                class kernel_name_token<argTy, resTy, ReductionOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                ReductionOverGroupWithAtomicFunctor<argTy, resTy, ReductionOpT,
-                                                    InputOutputIterIndexerT,
-                                                    ReductionIndexerT>(
-                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
-                    reduction_indexer, reduction_nelems, iter_nelems,
-                    reductions_per_wi));
-        }
-        else {
-            using SlmT = sycl::local_accessor<resTy, 1>;
-            SlmT local_memory = SlmT(localRange, cgh);
-
-            using KernelName = class custom_reduction_wrapper<
-                kernel_name_token<argTy, resTy, ReductionOpT,
-                                  InputOutputIterIndexerT, ReductionIndexerT>>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                CustomReductionOverGroupWithAtomicFunctor<
-                    argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                    ReductionIndexerT, SlmT>(
-                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
-                    reduction_indexer, local_memory, reduction_nelems,
-                    iter_nelems, reductions_per_wi));
-        }
-    });
-    return red_ev;
-}
-
-template <typename T1, typename T2, typename T3>
-class reduction_over_group_with_atomics_init_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class reduction_seq_krn;
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class reduction_over_group_with_atomics_krn;
-
-typedef sycl::event (*reduction_strided_impl_fn_ptr)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-using dpctl::tensor::sycl_utils::choose_workgroup_size;
-
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_over_group_with_atomics_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    int iter_nd,
-    const ssize_t *iter_shape_and_strides,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    int red_nd,
-    const ssize_t *reduction_shape_stride,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-    else {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            using IndexerT =
-                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-            const ssize_t *const &res_shape = iter_shape_and_strides;
-            const ssize_t *const &res_strides =
-                iter_shape_and_strides + 2 * iter_nd;
-            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
-                                       res_strides);
-            using InitKernelName =
-                class reduction_over_group_with_atomics_init_krn<resTy, argTy,
-                                                                 ReductionOpT>;
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<InitKernelName>(
-                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
-                    auto res_offset = res_indexer(id[0]);
-                    res_tp[res_offset] = identity_val;
-                });
-        });
-
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi =
-            (reduction_nelems < preferred_reductions_per_wi * wg)
-                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferred_reductions_per_wi;
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-
-        sycl::event comp_ev =
-            submit_atomic_reduction<argTy, resTy, ReductionOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    reduction_over_group_with_atomics_krn>(
-                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-                reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {res_init_ev});
-
-        return comp_ev;
-    }
-}
-
-// Contig
-
-typedef sycl::event (*reduction_contig_impl_fn_ptr)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-/* @brief Reduce rows in a matrix */
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_axis1_over_group_with_atomics_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIterIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            InputIterIndexerT{/* size */ iter_nelems,
-                              /* step */ reduction_nelems},
-            NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-    else {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(identity_val), iter_nelems, depends);
-
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                RowsIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const RowsIndexerT rows_indexer{/* size */ iter_nelems,
-                                        /* step */ reduction_nelems};
-        static constexpr NoOpIndexerT result_indexer{};
-        const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
-                                                          result_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi =
-            (reduction_nelems < preferred_reductions_per_wi * wg)
-                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferred_reductions_per_wi;
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-
-        sycl::event comp_ev =
-            submit_atomic_reduction<argTy, resTy, ReductionOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    reduction_over_group_with_atomics_krn>(
-                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-                reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {res_init_ev});
-
-        return comp_ev;
-    }
-}
-
-/* @brief Reduce rows in a matrix */
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of cols in a
-                                  // matrix when reducing over cols)
-    std::size_t reduction_nelems, // size of each reduction  (length of cols,
-                                  // i.e. number of rows)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
-                                                          NoOpIndexerT{}};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-    else {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(identity_val), iter_nelems, depends);
-
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = ColsIndexerT;
-
-        static constexpr NoOpIndexerT columns_indexer{};
-        static constexpr NoOpIndexerT result_indexer{};
-        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
-                                                          result_indexer};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        static constexpr std::size_t preferred_reductions_per_wi = 8;
-        std::size_t reductions_per_wi =
-            (reduction_nelems < preferred_reductions_per_wi * wg)
-                ? std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferred_reductions_per_wi;
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-
-        sycl::event comp_ev =
-            submit_atomic_reduction<argTy, resTy, ReductionOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    reduction_over_group_with_atomics_krn>(
-                exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-                reduction_nelems, reductions_per_wi, reduction_groups,
-                in_out_iter_indexer, reduction_indexer, {res_init_ev});
-
-        return comp_ev;
-    }
-}
-
-/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */
-
-template <
-    typename argTy,
-    typename resTy,
-    typename ReductionOpT,
-    typename InputOutputIterIndexerT,
-    typename ReductionIndexerT,
-    template <typename T1, typename T2, typename T3, typename T4, typename T5>
-    class kernel_name_token>
-sycl::event
-submit_no_atomic_reduction(sycl::queue &exec_q,
-                           const argTy *arg,
-                           resTy *res,
-                           resTy identity_val,
-                           std::size_t wg,
-                           std::size_t iter_nelems,
-                           std::size_t reduction_nelems,
-                           std::size_t reductions_per_wi,
-                           std::size_t reduction_groups,
-                           const InputOutputIterIndexerT &in_out_iter_indexer,
-                           const ReductionIndexerT &reduction_indexer,
-                           const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
-        auto localRange = sycl::range<1>{wg};
-        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
-
-        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
-            using KernelName =
-                class kernel_name_token<argTy, resTy, ReductionOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                ReductionOverGroupNoAtomicFunctor<argTy, resTy, ReductionOpT,
-                                                  InputOutputIterIndexerT,
-                                                  ReductionIndexerT>(
-                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
-                    reduction_indexer, reduction_nelems, iter_nelems,
-                    reductions_per_wi));
-        }
-        else {
-            using SlmT = sycl::local_accessor<resTy, 1>;
-            SlmT local_memory = SlmT(localRange, cgh);
-            using KernelName = class custom_reduction_wrapper<
-                kernel_name_token<argTy, resTy, ReductionOpT,
-                                  InputOutputIterIndexerT, ReductionIndexerT>>;
-
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                CustomReductionOverGroupNoAtomicFunctor<
-                    argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                    ReductionIndexerT, SlmT>(
-                    arg, res, ReductionOpT(), identity_val, in_out_iter_indexer,
-                    reduction_indexer, local_memory, reduction_nelems,
-                    iter_nelems, reductions_per_wi));
-        }
-    });
-    return red_ev;
-}
-
-template <typename T1, typename T2, typename T3, typename T4, typename T5>
-class reduction_over_group_temps_krn;
-
-typedef sycl::event (*reduction_strided_impl_fn_ptr)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T1, typename T2, typename T3>
-class reduction_over_group_temps_empty_krn;
-
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_over_group_temps_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    int iter_nd,
-    const ssize_t *iter_shape_and_strides,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    int red_nd,
-    const ssize_t *reduction_shape_stride,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            using IndexerT =
-                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-            const ssize_t *const &res_shape = iter_shape_and_strides;
-            const ssize_t *const &res_strides =
-                iter_shape_and_strides + 2 * iter_nd;
-            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
-                                       res_strides);
-            using InitKernelName =
-                class reduction_over_group_temps_empty_krn<resTy, argTy,
-                                                           ReductionOpT>;
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<InitKernelName>(
-                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
-                    auto res_offset = res_indexer(id[0]);
-                    res_tp[res_offset] = identity_val;
-                });
-        });
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev = submit_no_atomic_reduction<
-            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-            reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-        ;
-
-        sycl::event first_reduction_ev;
-        {
-            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-
-            // Only 2*iter_nd entries describing shape and strides of
-            // iterated dimensions of input array from
-            // iter_shape_and_strides are going to be accessed by
-            // inp_indexer
-            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
-                                            iter_shape_and_strides);
-            static constexpr ResIndexerT noop_tmp_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              noop_tmp_indexer};
-            const ReductionIndexerT reduction_indexer{
-                red_nd, reduction_arg_offset, reduction_shape_stride};
-
-            first_reduction_ev = submit_no_atomic_reduction<
-                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, reduction_over_group_temps_krn>(
-                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
-                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
-                reduction_groups, in_out_iter_indexer, reduction_indexer,
-                depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            sycl::event partial_reduction_ev;
-            {
-                using InputIndexerT =
-                    dpctl::tensor::offset_utils::Strided1DIndexer;
-                using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-                using InputOutputIterIndexerT =
-                    dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                        InputIndexerT, ResIndexerT>;
-                using ReductionIndexerT =
-                    dpctl::tensor::offset_utils::NoOpIndexer;
-
-                const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                                /* step */ reduction_groups_};
-                static constexpr ResIndexerT res_iter_indexer{};
-
-                const InputOutputIterIndexerT in_out_iter_indexer{
-                    inp_indexer, res_iter_indexer};
-                static constexpr ReductionIndexerT reduction_indexer{};
-
-                partial_reduction_ev = submit_no_atomic_reduction<
-                    resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                    ReductionIndexerT, reduction_over_group_temps_krn>(
-                    exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-            }
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            dependent_ev = std::move(partial_reduction_ev);
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        const ResIndexerT res_iter_indexer{
-            iter_nd, iter_res_offset,
-            /* shape */ iter_shape_and_strides,
-            /* strides */ iter_shape_and_strides + 2 * iter_nd};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev = submit_no_atomic_reduction<
-            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
-            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_axis1_over_group_temps_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(identity_val), iter_nelems, depends);
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIterIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            InputIterIndexerT{/* size */ iter_nelems,
-                              /* step */ reduction_nelems},
-            NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-
-        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIterIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            InputIterIndexerT{/* size */ iter_nelems,
-                              /* step */ reduction_nelems},
-            NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev = submit_no_atomic_reduction<
-            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-            reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-
-        sycl::event first_reduction_ev;
-        {
-            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    RowsIndexerT, NoOpIndexerT>;
-            using ReductionIndexerT = NoOpIndexerT;
-
-            const RowsIndexerT rows_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_nelems};
-            static constexpr NoOpIndexerT noop_tmp_indexer{};
-            const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer,
-                                                              noop_tmp_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            first_reduction_ev = submit_no_atomic_reduction<
-                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, reduction_over_group_temps_krn>(
-                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
-                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
-                reduction_groups, in_out_iter_indexer, reduction_indexer,
-                depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
-                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, reduction_over_group_temps_krn>(
-                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
-                remaining_reduction_nelems, preferred_reductions_per_wi,
-                reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            dependent_ev = std::move(partial_reduction_ev);
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev = submit_no_atomic_reduction<
-            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
-            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-template <typename argTy, typename resTy, typename ReductionOpT>
-sycl::event reduction_axis0_over_group_temps_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr resTy identity_val =
-        su_ns::Identity<ReductionOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(identity_val), iter_nelems, depends);
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
-                                                          NoOpIndexerT{}};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        sycl::event comp_ev =
-            sequential_reduction<argTy, resTy, ReductionOpT,
-                                 InputOutputIterIndexerT, ReductionIndexerT,
-                                 reduction_seq_krn>(
-                exec_q, arg_tp, res_tp, identity_val, iter_nelems,
-                reduction_nelems, in_out_iter_indexer, reduction_indexer,
-                depends);
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = ColsIndexerT;
-
-        static constexpr NoOpIndexerT columns_indexer{};
-        static constexpr NoOpIndexerT result_indexer{};
-        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
-                                                          result_indexer};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev = submit_no_atomic_reduction<
-            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems,
-            reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-
-        sycl::event first_reduction_ev;
-        {
-            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    NoOpIndexerT, NoOpIndexerT>;
-            using ReductionIndexerT = ColsIndexerT;
-
-            static constexpr NoOpIndexerT columns_indexer{};
-            static constexpr NoOpIndexerT noop_tmp_indexer{};
-            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
-                                                              noop_tmp_indexer};
-            const ReductionIndexerT reduction_indexer{
-                /* size */ reduction_nelems,
-                /* step */ iter_nelems};
-
-            first_reduction_ev = submit_no_atomic_reduction<
-                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, reduction_over_group_temps_krn>(
-                exec_q, arg_tp, partially_reduced_tmp, identity_val, wg,
-                iter_nelems, reduction_nelems, preferred_reductions_per_wi,
-                reduction_groups, in_out_iter_indexer, reduction_indexer,
-                depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev = submit_no_atomic_reduction<
-                resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, reduction_over_group_temps_krn>(
-                exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems,
-                remaining_reduction_nelems, preferred_reductions_per_wi,
-                reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            dependent_ev = std::move(partial_reduction_ev);
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev = submit_no_atomic_reduction<
-            resTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT, reduction_over_group_temps_krn>(
-            exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems,
-            remaining_reduction_nelems, reductions_per_wi, reduction_groups,
-            in_out_iter_indexer, reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-// Argmax and Argmin
-
-/* Sequential search reduction */
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename IdxReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT>
-struct SequentialSearchReduction
-{
-private:
-    const argT *inp_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    argT identity_;
-    IdxReductionOp idx_reduction_op_;
-    outT idx_identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-
-public:
-    SequentialSearchReduction(
-        const argT *inp,
-        outT *res,
-        const ReductionOp &reduction_op,
-        const argT &identity_val,
-        const IdxReductionOp &idx_reduction_op,
-        const outT &idx_identity_val,
-        const InputOutputIterIndexerT &arg_res_iter_indexer,
-        const InputRedIndexerT &arg_reduced_dims_indexer,
-        std::size_t reduction_size)
-        : inp_(inp), out_(res), reduction_op_(reduction_op),
-          identity_(identity_val), idx_reduction_op_(idx_reduction_op),
-          idx_identity_(idx_identity_val),
-          inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-
-        auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]);
-        const ssize_t &inp_iter_offset =
-            inp_out_iter_offsets_.get_first_offset();
-        const ssize_t &out_iter_offset =
-            inp_out_iter_offsets_.get_second_offset();
-
-        argT red_val(identity_);
-        outT idx_val(idx_identity_);
-        for (std::size_t m = 0; m < reduction_max_gid_; ++m) {
-            const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m);
-            const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset;
-
-            argT val = inp_[inp_offset];
-            if (val == red_val) {
-                idx_val = idx_reduction_op_(idx_val, static_cast<outT>(m));
-            }
-            else {
-                if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
-                    using dpctl::tensor::type_utils::is_complex;
-                    if constexpr (is_complex<argT>::value) {
-                        using dpctl::tensor::math_utils::less_complex;
-                        // less_complex always returns false for NaNs, so check
-                        if (less_complex<argT>(val, red_val) ||
-                            std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                    else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
-                        if (val < red_val || std::isnan(val)) {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                    else {
-                        if (val < red_val) {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                }
-                else if constexpr (su_ns::IsMaximum<argT, ReductionOp>::value) {
-                    using dpctl::tensor::type_utils::is_complex;
-                    if constexpr (is_complex<argT>::value) {
-                        using dpctl::tensor::math_utils::greater_complex;
-                        if (greater_complex<argT>(val, red_val) ||
-                            std::isnan(std::real(val)) ||
-                            std::isnan(std::imag(val)))
-                        {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                    else if constexpr (std::is_floating_point_v<argT> ||
-                                       std::is_same_v<argT, sycl::half>)
-                    {
-                        if (val > red_val || std::isnan(val)) {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                    else {
-                        if (val > red_val) {
-                            red_val = val;
-                            idx_val = static_cast<outT>(m);
-                        }
-                    }
-                }
-            }
-        }
-        out_[out_iter_offset] = idx_val;
-    }
-};
-
-/* = Search reduction using reduce_over_group*/
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename IdxReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT,
-          bool First,
-          bool Last>
-struct SearchReduction
-{
-private:
-    const argT *inp_ = nullptr;
-    argT *vals_ = nullptr;
-    const outT *inds_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    argT identity_;
-    IdxReductionOp idx_reduction_op_;
-    outT idx_identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    SearchReduction(const argT *data,
-                    argT *vals,
-                    const outT *inds,
-                    outT *res,
-                    const ReductionOp &reduction_op,
-                    const argT &identity_val,
-                    const IdxReductionOp &idx_reduction_op,
-                    const outT &idx_identity_val,
-                    const InputOutputIterIndexerT &arg_res_iter_indexer,
-                    const InputRedIndexerT &arg_reduced_dims_indexer,
-                    std::size_t reduction_size,
-                    std::size_t iteration_size,
-                    std::size_t reduction_size_per_wi)
-        : inp_(data), vals_(vals), inds_(inds), out_(res),
-          reduction_op_(reduction_op), identity_(identity_val),
-          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
-          inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          reduction_max_gid_(reduction_size), iter_gws_(iteration_size),
-          reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-        const std::size_t n_reduction_groups =
-            it.get_group_range(0) / iter_gws_;
-
-        // work-items operates over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        argT local_red_val(identity_);
-        outT local_idx(idx_identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
-            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                auto inp_reduction_offset =
-                    inp_reduced_dims_indexer_(arg_reduce_gid);
-                auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-                argT val = inp_[inp_offset];
-                if (val == local_red_val) {
-                    if constexpr (!First) {
-                        local_idx =
-                            idx_reduction_op_(local_idx, inds_[inp_offset]);
-                    }
-                    else {
-                        local_idx = idx_reduction_op_(
-                            local_idx, static_cast<outT>(arg_reduce_gid));
-                    }
-                }
-                else {
-                    if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
-                        if (val < local_red_val) {
-                            local_red_val = val;
-                            if constexpr (!First) {
-                                local_idx = inds_[inp_offset];
-                            }
-                            else {
-                                local_idx = static_cast<outT>(arg_reduce_gid);
-                            }
-                        }
-                    }
-                    else if constexpr (su_ns::IsMaximum<argT,
-                                                        ReductionOp>::value)
-                    {
-                        if (val > local_red_val) {
-                            local_red_val = val;
-                            if constexpr (!First) {
-                                local_idx = inds_[inp_offset];
-                            }
-                            else {
-                                local_idx = static_cast<outT>(arg_reduce_gid);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        auto work_group = it.get_group();
-        // This only works if reduction_op_ is from small set of operators
-        argT red_val_over_wg = sycl::reduce_over_group(
-            work_group, local_red_val, identity_, reduction_op_);
-
-        if constexpr (std::is_integral_v<argT>) {
-            local_idx =
-                (red_val_over_wg == local_red_val) ? local_idx : idx_identity_;
-        }
-        else {
-            local_idx =
-                (red_val_over_wg == local_red_val ||
-                 std::isnan(red_val_over_wg) || std::isnan(local_red_val))
-                    ? local_idx
-                    : idx_identity_;
-        }
-        outT idx_over_wg = sycl::reduce_over_group(
-            work_group, local_idx, idx_identity_, idx_reduction_op_);
-
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            if constexpr (!Last) {
-                // if not the final reduction, write value corresponding to
-                // an index to a temporary
-                vals_[out_iter_offset * n_reduction_groups +
-                      reduction_batch_id] = red_val_over_wg;
-            }
-            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
-                idx_over_wg;
-        }
-    }
-};
-
-/* = Search reduction using custom_reduce_over_group*/
-
-template <typename argT,
-          typename outT,
-          typename ReductionOp,
-          typename IdxReductionOp,
-          typename InputOutputIterIndexerT,
-          typename InputRedIndexerT,
-          typename SlmT,
-          bool First,
-          bool Last>
-struct CustomSearchReduction
-{
-private:
-    const argT *inp_ = nullptr;
-    argT *vals_ = nullptr;
-    const outT *inds_ = nullptr;
-    outT *out_ = nullptr;
-    ReductionOp reduction_op_;
-    argT identity_;
-    IdxReductionOp idx_reduction_op_;
-    outT idx_identity_;
-    InputOutputIterIndexerT inp_out_iter_indexer_;
-    InputRedIndexerT inp_reduced_dims_indexer_;
-    SlmT local_mem_;
-    std::size_t reduction_max_gid_ = 0;
-    std::size_t iter_gws_ = 1;
-    std::size_t reductions_per_wi = 16;
-
-public:
-    CustomSearchReduction(const argT *data,
-                          argT *vals,
-                          outT *inds,
-                          outT *res,
-                          const ReductionOp &reduction_op,
-                          const argT &identity_val,
-                          const IdxReductionOp &idx_reduction_op,
-                          const outT &idx_identity_val,
-                          const InputOutputIterIndexerT &arg_res_iter_indexer,
-                          const InputRedIndexerT &arg_reduced_dims_indexer,
-                          SlmT local_mem,
-                          std::size_t reduction_size,
-                          std::size_t iteration_size,
-                          std::size_t reduction_size_per_wi)
-        : inp_(data), vals_(vals), inds_(inds), out_(res),
-          reduction_op_(reduction_op), identity_(identity_val),
-          idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val),
-          inp_out_iter_indexer_(arg_res_iter_indexer),
-          inp_reduced_dims_indexer_(arg_reduced_dims_indexer),
-          local_mem_(local_mem), reduction_max_gid_(reduction_size),
-          iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> it) const
-    {
-        const std::size_t reduction_lid = it.get_local_id(0);
-        const std::size_t wg =
-            it.get_local_range(0); //   0 <= reduction_lid < wg
-
-        const std::size_t iter_gid = it.get_group(0) % iter_gws_;
-        const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_;
-        const std::size_t n_reduction_groups =
-            it.get_group_range(0) / iter_gws_;
-
-        // work-items operates over input with indices
-        //   inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
-        //   + reduction_lid
-        // for 0 <= m < reductions_per_wi
-
-        const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid);
-        const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset();
-        const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset();
-
-        argT local_red_val(identity_);
-        outT local_idx(idx_identity_);
-        std::size_t arg_reduce_gid0 =
-            reduction_lid + reduction_batch_id * wg * reductions_per_wi;
-        for (std::size_t m = 0; m < reductions_per_wi; ++m) {
-            std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg;
-
-            if (arg_reduce_gid < reduction_max_gid_) {
-                auto inp_reduction_offset =
-                    inp_reduced_dims_indexer_(arg_reduce_gid);
-                auto inp_offset = inp_iter_offset + inp_reduction_offset;
-
-                argT val = inp_[inp_offset];
-                if (val == local_red_val) {
-                    if constexpr (!First) {
-                        local_idx =
-                            idx_reduction_op_(local_idx, inds_[inp_offset]);
-                    }
-                    else {
-                        local_idx = idx_reduction_op_(
-                            local_idx, static_cast<outT>(arg_reduce_gid));
-                    }
-                }
-                else {
-                    if constexpr (su_ns::IsMinimum<argT, ReductionOp>::value) {
-                        using dpctl::tensor::type_utils::is_complex;
-                        if constexpr (is_complex<argT>::value) {
-                            using dpctl::tensor::math_utils::less_complex;
-                            // less_complex always returns false for NaNs, so
-                            // check
-                            if (less_complex<argT>(val, local_red_val) ||
-                                std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                        else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
-                            if (val < local_red_val || std::isnan(val)) {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                        else {
-                            if (val < local_red_val) {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                    }
-                    else if constexpr (su_ns::IsMaximum<argT,
-                                                        ReductionOp>::value)
-                    {
-                        using dpctl::tensor::type_utils::is_complex;
-                        if constexpr (is_complex<argT>::value) {
-                            using dpctl::tensor::math_utils::greater_complex;
-                            if (greater_complex<argT>(val, local_red_val) ||
-                                std::isnan(std::real(val)) ||
-                                std::isnan(std::imag(val)))
-                            {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                        else if constexpr (std::is_floating_point_v<argT> ||
-                                           std::is_same_v<argT, sycl::half>)
-                        {
-                            if (val > local_red_val || std::isnan(val)) {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                        else {
-                            if (val > local_red_val) {
-                                local_red_val = val;
-                                if constexpr (!First) {
-                                    local_idx = inds_[inp_offset];
-                                }
-                                else {
-                                    local_idx =
-                                        static_cast<outT>(arg_reduce_gid);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        auto work_group = it.get_group();
-        // This only works if reduction_op_ is from small set of operators
-        argT red_val_over_wg = su_ns::custom_reduce_over_group(
-            work_group, local_mem_, local_red_val, reduction_op_);
-
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<argT>::value) {
-            // equality does not hold for NaNs, so check here
-            local_idx = (red_val_over_wg == local_red_val ||
-                         std::isnan(std::real(local_red_val)) ||
-                         std::isnan(std::imag(local_red_val)))
-                            ? local_idx
-                            : idx_identity_;
-        }
-        else if constexpr (std::is_floating_point_v<argT> ||
-                           std::is_same_v<argT, sycl::half>)
-        {
-            // equality does not hold for NaNs, so check here
-            local_idx =
-                (red_val_over_wg == local_red_val || std::isnan(local_red_val))
-                    ? local_idx
-                    : idx_identity_;
-        }
-        else {
-            local_idx =
-                red_val_over_wg == local_red_val ? local_idx : idx_identity_;
-        }
-        outT idx_over_wg = sycl::reduce_over_group(
-            work_group, local_idx, idx_identity_, idx_reduction_op_);
-        if (work_group.leader()) {
-            // each group writes to a different memory location
-            if constexpr (!Last) {
-                // if not the final reduction, write value corresponding to
-                // an index to a temporary
-                vals_[out_iter_offset * n_reduction_groups +
-                      reduction_batch_id] = red_val_over_wg;
-            }
-            out_[out_iter_offset * n_reduction_groups + reduction_batch_id] =
-                idx_over_wg;
-        }
-    }
-};
-
-typedef sycl::event (*search_strided_impl_fn_ptr)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6>
-class search_seq_strided_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6>
-class search_seq_contig_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6,
-          bool b1,
-          bool b2>
-class search_over_group_krn;
-
-template <typename T1,
-          typename T2,
-          typename T3,
-          typename T4,
-          typename T5,
-          typename T6,
-          typename T7,
-          bool b1,
-          bool b2>
-class custom_search_over_group_krn;
-
-template <typename T1, typename T2, typename T3> class search_empty_krn;
-
-template <typename argTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename IndexOpT,
-          typename InputOutputIterIndexerT,
-          typename ReductionIndexerT,
-          bool First,
-          bool Last>
-sycl::event
-submit_search_reduction(sycl::queue &exec_q,
-                        const argTy *arg,
-                        argTy *arg_tmp,
-                        resTy *res_tmp,
-                        resTy *res,
-                        argTy identity_val,
-                        resTy idx_identity_val,
-                        std::size_t wg,
-                        std::size_t iter_nelems,
-                        std::size_t reduction_nelems,
-                        std::size_t reductions_per_wi,
-                        std::size_t reduction_groups,
-                        const InputOutputIterIndexerT &in_out_iter_indexer,
-                        const ReductionIndexerT &reduction_indexer,
-                        const std::vector<sycl::event> &depends)
-{
-    sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg};
-        auto localRange = sycl::range<1>{wg};
-        auto ndRange = sycl::nd_range<1>(globalRange, localRange);
-
-        if constexpr (can_use_reduce_over_group<ReductionOpT, resTy>::value) {
-            using KernelName =
-                class search_over_group_krn<argTy, resTy, ReductionOpT,
-                                            IndexOpT, InputOutputIterIndexerT,
-                                            ReductionIndexerT, First, Last>;
-            cgh.parallel_for<KernelName>(
-                ndRange, SearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                         InputOutputIterIndexerT,
-                                         ReductionIndexerT, First, Last>(
-                             arg, arg_tmp, res_tmp, res, ReductionOpT(),
-                             identity_val, IndexOpT(), idx_identity_val,
-                             in_out_iter_indexer, reduction_indexer,
-                             reduction_nelems, iter_nelems, reductions_per_wi));
-        }
-        else {
-            using SlmT = sycl::local_accessor<argTy, 1>;
-            SlmT local_memory = SlmT(localRange, cgh);
-            using KernelName = class custom_search_over_group_krn<
-                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
-                ReductionIndexerT, SlmT, First, Last>;
-            cgh.parallel_for<KernelName>(
-                ndRange,
-                CustomSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                      InputOutputIterIndexerT,
-                                      ReductionIndexerT, SlmT, First, Last>(
-                    arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val,
-                    IndexOpT(), idx_identity_val, in_out_iter_indexer,
-                    reduction_indexer, local_memory, reduction_nelems,
-                    iter_nelems, reductions_per_wi));
-        }
-    });
-    return red_ev;
-}
-
-template <typename argTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename IndexOpT>
-sycl::event search_over_group_temps_strided_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    int iter_nd,
-    const ssize_t *iter_shape_and_strides,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    int red_nd,
-    const ssize_t *reduction_shape_stride,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
-
-    static constexpr argTy identity_val =
-        su_ns::Identity<ReductionOpT, argTy>::value;
-    static constexpr resTy idx_identity_val =
-        su_ns::Identity<IndexOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
-            using IndexerT =
-                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-            const ssize_t *const &res_shape = iter_shape_and_strides;
-            const ssize_t *const &res_strides =
-                iter_shape_and_strides + 2 * iter_nd;
-            const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
-                                       res_strides);
-            using InitKernelName =
-                class search_empty_krn<resTy, argTy, ReductionOpT>;
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<InitKernelName>(
-                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
-                    auto res_offset = res_indexer(id[0]);
-                    res_tp[res_offset] = idx_identity_val;
-                });
-        });
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<class search_seq_strided_krn<
-                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
-                ReductionIndexerT>>(
-                sycl::range<1>(iter_nelems),
-                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                          InputOutputIterIndexerT,
-                                          ReductionIndexerT>(
-                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
-                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
-                    reduction_nelems));
-        });
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 4;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides};
-        const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
-                                                  reduction_shape_stride};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    true, true>(
-                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-
-        auto val_tmp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
-                tmp_alloc_size, exec_q);
-
-        argTy *partially_reduced_vals_tmp = val_tmp_owner.get();
-        argTy *partially_reduced_vals_tmp2 =
-            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
-
-        sycl::event first_reduction_ev;
-        {
-            using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT =
-                dpctl::tensor::offset_utils::StridedIndexer;
-
-            // Only 2*iter_nd entries describing shape and strides of iterated
-            // dimensions of input array from iter_shape_and_strides are going
-            // to be accessed by inp_indexer
-            const InputIndexerT inp_indexer(iter_nd, iter_arg_offset,
-                                            iter_shape_and_strides);
-            static constexpr ResIndexerT noop_tmp_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              noop_tmp_indexer};
-            const ReductionIndexerT reduction_indexer{
-                red_nd, reduction_arg_offset, reduction_shape_stride};
-
-            first_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, true, false>(
-                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
-                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
-                    iter_nelems, reduction_nelems, reductions_per_wi,
-                    reduction_groups, in_out_iter_indexer, reduction_indexer,
-                    depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-
-        argTy *vals_temp_arg = partially_reduced_vals_tmp;
-        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
-
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, false, false>(
-                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
-                    identity_val, idx_identity_val, wg, iter_nelems,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            std::swap(vals_temp_arg, vals_temp2_arg);
-            dependent_ev = partial_reduction_ev;
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        const ResIndexerT res_iter_indexer{
-            iter_nd, iter_res_offset,
-            /* shape */ iter_shape_and_strides,
-            /* strides */ iter_shape_and_strides + 2 * iter_nd};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    false, true>(
-                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-typedef sycl::event (*search_contig_impl_fn_ptr)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename argTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename IndexOpT>
-sycl::event search_axis1_over_group_temps_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr argTy identity_val =
-        su_ns::Identity<ReductionOpT, argTy>::value;
-    static constexpr resTy idx_identity_val =
-        su_ns::Identity<IndexOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(idx_identity_val), iter_nelems, depends);
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIterIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            InputIterIndexerT{/* size */ iter_nelems,
-                              /* step */ reduction_nelems},
-            NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            cgh.parallel_for<class search_seq_contig_krn<
-                argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT,
-                ReductionIndexerT>>(
-                sycl::range<1>(iter_nelems),
-                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                          InputOutputIterIndexerT,
-                                          ReductionIndexerT>(
-                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
-                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
-                    reduction_nelems));
-        });
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-        using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIterIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = NoOpIndexerT;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{
-            InputIterIndexerT{/* size */ iter_nelems,
-                              /* step */ reduction_nelems},
-            NoOpIndexerT{}};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    true, true>(
-                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-
-        auto val_tmp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
-                tmp_alloc_size, exec_q);
-        argTy *partially_reduced_vals_tmp = val_tmp_owner.get();
-        argTy *partially_reduced_vals_tmp2 =
-            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
-
-        sycl::event first_reduction_ev;
-        {
-            using InputIterIndexerT =
-                dpctl::tensor::offset_utils::Strided1DIndexer;
-            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIterIndexerT, NoOpIndexerT>;
-            using ReductionIndexerT = NoOpIndexerT;
-
-            const InputOutputIterIndexerT in_out_iter_indexer{
-                InputIterIndexerT{/* size */ iter_nelems,
-                                  /* step */ reduction_nelems},
-                NoOpIndexerT{}};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            first_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, true, false>(
-                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
-                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
-                    iter_nelems, reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups, in_out_iter_indexer, reduction_indexer,
-                    depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-
-        argTy *vals_temp_arg = partially_reduced_vals_tmp;
-        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
-
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, false, false>(
-                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
-                    identity_val, idx_identity_val, wg, iter_nelems,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            std::swap(vals_temp_arg, vals_temp2_arg);
-            dependent_ev = partial_reduction_ev;
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    false, true>(
-                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-template <typename argTy,
-          typename resTy,
-          typename ReductionOpT,
-          typename IndexOpT>
-sycl::event search_axis0_over_group_temps_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems,      // number of reductions    (num. of rows in a
-                                  // matrix when reducing over rows)
-    std::size_t reduction_nelems, // size of each reduction  (length of rows,
-                                  // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t reduction_arg_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + reduction_arg_offset;
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;
-
-    static constexpr argTy identity_val =
-        su_ns::Identity<ReductionOpT, argTy>::value;
-    static constexpr resTy idx_identity_val =
-        su_ns::Identity<IndexOpT, resTy>::value;
-
-    if (reduction_nelems == 0) {
-        sycl::event res_init_ev = exec_q.fill<resTy>(
-            res_tp, resTy(idx_identity_val), iter_nelems, depends);
-
-        return res_init_ev;
-    }
-
-    const sycl::device &d = exec_q.get_device();
-    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
-
-    if (reduction_nelems < wg) {
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-
-        const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
-                                                          NoOpIndexerT{}};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            using KernelName =
-                class search_seq_contig_krn<argTy, resTy, ReductionOpT,
-                                            IndexOpT, InputOutputIterIndexerT,
-                                            ReductionIndexerT>;
-
-            sycl::range<1> iter_range{iter_nelems};
-
-            cgh.parallel_for<KernelName>(
-                iter_range,
-                SequentialSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                          InputOutputIterIndexerT,
-                                          ReductionIndexerT>(
-                    arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(),
-                    idx_identity_val, in_out_iter_indexer, reduction_indexer,
-                    reduction_nelems));
-        });
-
-        return comp_ev;
-    }
-
-    static constexpr std::size_t preferred_reductions_per_wi = 8;
-    // prevents running out of resources on CPU
-    std::size_t max_wg = reduction_detail::get_work_group_size(d);
-
-    std::size_t reductions_per_wi(preferred_reductions_per_wi);
-    if (reduction_nelems <= preferred_reductions_per_wi * max_wg) {
-        // Perform reduction using one 1 work-group per iteration,
-        // can output directly to res
-        using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                NoOpIndexerT, NoOpIndexerT>;
-        using ReductionIndexerT = ColsIndexerT;
-
-        static constexpr NoOpIndexerT columns_indexer{};
-        static constexpr NoOpIndexerT result_indexer{};
-        const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
-                                                          result_indexer};
-        const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems,
-                                                  /* step */ iter_nelems};
-
-        if (iter_nelems == 1) {
-            // increase GPU occupancy
-            wg = max_wg;
-        }
-        reductions_per_wi =
-            std::max<std::size_t>(1, (reduction_nelems + wg - 1) / wg);
-
-        std::size_t reduction_groups =
-            (reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event comp_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    true, true>(
-                exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, depends);
-
-        return comp_ev;
-    }
-    else {
-        // more than one work-groups is needed, requires a temporary
-        std::size_t reduction_groups =
-            (reduction_nelems + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-        assert(reduction_groups > 1);
-
-        std::size_t second_iter_reduction_groups_ =
-            (reduction_groups + preferred_reductions_per_wi * wg - 1) /
-            (preferred_reductions_per_wi * wg);
-
-        const std::size_t tmp_alloc_size =
-            iter_nelems * (reduction_groups + second_iter_reduction_groups_);
-        auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device<resTy>(
-            tmp_alloc_size, exec_q);
-
-        resTy *partially_reduced_tmp = tmp_owner.get();
-        resTy *partially_reduced_tmp2 =
-            partially_reduced_tmp + reduction_groups * iter_nelems;
-
-        auto vals_tmp_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<argTy>(
-                tmp_alloc_size, exec_q);
-        argTy *partially_reduced_vals_tmp = vals_tmp_owner.get();
-        argTy *partially_reduced_vals_tmp2 =
-            partially_reduced_vals_tmp + reduction_groups * iter_nelems;
-
-        sycl::event first_reduction_ev;
-        {
-            using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    NoOpIndexerT, NoOpIndexerT>;
-            using ReductionIndexerT = ColsIndexerT;
-
-            static constexpr NoOpIndexerT columns_indexer{};
-            static constexpr NoOpIndexerT result_indexer{};
-            const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
-                                                              result_indexer};
-            const ReductionIndexerT reduction_indexer{
-                /* size */ reduction_nelems,
-                /* step */ iter_nelems};
-
-            first_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, true, false>(
-                    exec_q, arg_tp, partially_reduced_vals_tmp, nullptr,
-                    partially_reduced_tmp, identity_val, idx_identity_val, wg,
-                    iter_nelems, reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups, in_out_iter_indexer, reduction_indexer,
-                    depends);
-        }
-
-        std::size_t remaining_reduction_nelems = reduction_groups;
-
-        resTy *temp_arg = partially_reduced_tmp;
-        resTy *temp2_arg = partially_reduced_tmp2;
-
-        argTy *vals_temp_arg = partially_reduced_vals_tmp;
-        argTy *vals_temp2_arg = partially_reduced_vals_tmp2;
-
-        sycl::event dependent_ev = first_reduction_ev;
-
-        while (remaining_reduction_nelems >
-               preferred_reductions_per_wi * max_wg)
-        {
-            std::size_t reduction_groups_ =
-                (remaining_reduction_nelems + preferred_reductions_per_wi * wg -
-                 1) /
-                (preferred_reductions_per_wi * wg);
-            assert(reduction_groups_ > 1);
-
-            // keep reducing
-            using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-            using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-            using InputOutputIterIndexerT =
-                dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                    InputIndexerT, ResIndexerT>;
-            using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-            const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                            /* step */ reduction_groups_};
-            static constexpr ResIndexerT res_iter_indexer{};
-
-            const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                              res_iter_indexer};
-            static constexpr ReductionIndexerT reduction_indexer{};
-
-            sycl::event partial_reduction_ev =
-                submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                        InputOutputIterIndexerT,
-                                        ReductionIndexerT, false, false>(
-                    exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg,
-                    identity_val, idx_identity_val, wg, iter_nelems,
-                    remaining_reduction_nelems, preferred_reductions_per_wi,
-                    reduction_groups_, in_out_iter_indexer, reduction_indexer,
-                    {dependent_ev});
-
-            remaining_reduction_nelems = reduction_groups_;
-            std::swap(temp_arg, temp2_arg);
-            std::swap(vals_temp_arg, vals_temp2_arg);
-            dependent_ev = partial_reduction_ev;
-        }
-
-        // final reduction to res
-        using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-        using InputOutputIterIndexerT =
-            dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
-                InputIndexerT, ResIndexerT>;
-        using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        const InputIndexerT inp_indexer{/* size */ iter_nelems,
-                                        /* step */ remaining_reduction_nelems};
-        static constexpr ResIndexerT res_iter_indexer{};
-
-        const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
-                                                          res_iter_indexer};
-        static constexpr ReductionIndexerT reduction_indexer{};
-
-        wg = max_wg;
-        reductions_per_wi = std::max<std::size_t>(
-            1, (remaining_reduction_nelems + wg - 1) / wg);
-
-        reduction_groups =
-            (remaining_reduction_nelems + reductions_per_wi * wg - 1) /
-            (reductions_per_wi * wg);
-        assert(reduction_groups == 1);
-
-        sycl::event final_reduction_ev =
-            submit_search_reduction<argTy, resTy, ReductionOpT, IndexOpT,
-                                    InputOutputIterIndexerT, ReductionIndexerT,
-                                    false, true>(
-                exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val,
-                idx_identity_val, wg, iter_nelems, remaining_reduction_nelems,
-                reductions_per_wi, reduction_groups, in_out_iter_indexer,
-                reduction_indexer, {dependent_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner);
-
-        // FIXME: do not return host-task event
-        //   Instead collect all host-tasks to a list
-
-        return cleanup_host_task_event;
-    }
-}
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp
deleted file mode 100644
index 2d7b897ace..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-//=== repeat.hpp -  Implementation of repeat kernels ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor repeating operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace repeat
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-template <typename OrthogIndexer,
-          typename SrcAxisIndexer,
-          typename DstAxisIndexer,
-          typename RepIndexer,
-          typename T,
-          typename repT>
-class repeat_by_sequence_kernel;
-
-template <typename OrthogIndexer,
-          typename SrcAxisIndexer,
-          typename DstAxisIndexer,
-          typename RepIndexer,
-          typename T,
-          typename repT>
-class RepeatSequenceFunctor
-{
-private:
-    const T *src = nullptr;
-    T *dst = nullptr;
-    const repT *reps = nullptr;
-    const repT *cumsum = nullptr;
-    std::size_t src_axis_nelems = 1;
-    OrthogIndexer orthog_strider;
-    SrcAxisIndexer src_axis_strider;
-    DstAxisIndexer dst_axis_strider;
-    RepIndexer reps_strider;
-
-public:
-    RepeatSequenceFunctor(const T *src_,
-                          T *dst_,
-                          const repT *reps_,
-                          const repT *cumsum_,
-                          std::size_t src_axis_nelems_,
-                          const OrthogIndexer &orthog_strider_,
-                          const SrcAxisIndexer &src_axis_strider_,
-                          const DstAxisIndexer &dst_axis_strider_,
-                          const RepIndexer &reps_strider_)
-        : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_),
-          src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_),
-          src_axis_strider(src_axis_strider_),
-          dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_)
-    {
-    }
-
-    void operator()(sycl::id<1> idx) const
-    {
-        std::size_t id = idx[0];
-        auto i_orthog = id / src_axis_nelems;
-        auto i_along = id - (i_orthog * src_axis_nelems);
-
-        auto orthog_offsets = orthog_strider(i_orthog);
-        auto src_offset = orthog_offsets.get_first_offset();
-        auto dst_offset = orthog_offsets.get_second_offset();
-
-        auto val = src[src_offset + src_axis_strider(i_along)];
-        auto last = cumsum[i_along];
-        auto first = last - reps[reps_strider(i_along)];
-        for (auto i = first; i < last; ++i) {
-            dst[dst_offset + dst_axis_strider(i)] = val;
-        }
-    }
-};
-
-typedef sycl::event (*repeat_by_sequence_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    const char *,
-    const char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T, typename repT>
-sycl::event
-repeat_by_sequence_impl(sycl::queue &q,
-                        std::size_t orthog_nelems,
-                        std::size_t src_axis_nelems,
-                        const char *src_cp,
-                        char *dst_cp,
-                        const char *reps_cp,
-                        const char *cumsum_cp,
-                        int orthog_nd,
-                        const ssize_t *orthog_src_dst_shape_and_strides,
-                        ssize_t src_offset,
-                        ssize_t dst_offset,
-                        ssize_t src_axis_shape,
-                        ssize_t src_axis_stride,
-                        ssize_t dst_axis_shape,
-                        ssize_t dst_axis_stride,
-                        ssize_t reps_shape,
-                        ssize_t reps_stride,
-                        const std::vector<sycl::event> &depends)
-{
-    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const T *src_tp = reinterpret_cast<const T *>(src_cp);
-        const repT *reps_tp = reinterpret_cast<const repT *>(reps_cp);
-        const repT *cumsum_tp = reinterpret_cast<const repT *>(cumsum_cp);
-        T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-        // orthog ndim indexer
-        const TwoOffsets_StridedIndexer orthog_indexer{
-            orthog_nd, src_offset, dst_offset,
-            orthog_src_dst_shape_and_strides};
-        // indexers along repeated axis
-        const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape,
-                                                /* step */ src_axis_stride};
-        const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape,
-                                                /* step */ dst_axis_stride};
-        // indexer along reps array
-        const Strided1DIndexer reps_indexer{/* size */ reps_shape,
-                                            /* step */ reps_stride};
-
-        const std::size_t gws = orthog_nelems * src_axis_nelems;
-
-        cgh.parallel_for<repeat_by_sequence_kernel<
-            TwoOffsets_StridedIndexer, Strided1DIndexer, Strided1DIndexer,
-            Strided1DIndexer, T, repT>>(
-            sycl::range<1>(gws),
-            RepeatSequenceFunctor<TwoOffsets_StridedIndexer, Strided1DIndexer,
-                                  Strided1DIndexer, Strided1DIndexer, T, repT>(
-                src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems,
-                orthog_indexer, src_axis_indexer, dst_axis_indexer,
-                reps_indexer));
-    });
-
-    return repeat_ev;
-}
-
-template <typename fnT, typename T> struct RepeatSequenceFactory
-{
-    fnT get()
-    {
-        fnT fn = repeat_by_sequence_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    const char *,
-    const char *,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T, typename repT>
-sycl::event repeat_by_sequence_1d_impl(sycl::queue &q,
-                                       std::size_t src_nelems,
-                                       const char *src_cp,
-                                       char *dst_cp,
-                                       const char *reps_cp,
-                                       const char *cumsum_cp,
-                                       int src_nd,
-                                       const ssize_t *src_shape_strides,
-                                       ssize_t dst_shape,
-                                       ssize_t dst_stride,
-                                       ssize_t reps_shape,
-                                       ssize_t reps_stride,
-                                       const std::vector<sycl::event> &depends)
-{
-    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const T *src_tp = reinterpret_cast<const T *>(src_cp);
-        const repT *reps_tp = reinterpret_cast<const repT *>(reps_cp);
-        const repT *cumsum_tp = reinterpret_cast<const repT *>(cumsum_cp);
-        T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-        // orthog ndim indexer
-        static constexpr TwoZeroOffsets_Indexer orthog_indexer{};
-        // indexers along repeated axis
-        const StridedIndexer src_indexer{src_nd, 0, src_shape_strides};
-        const Strided1DIndexer dst_indexer{/* size */ dst_shape,
-                                           /* step */ dst_stride};
-        // indexer along reps array
-        const Strided1DIndexer reps_indexer{/* size */ reps_shape,
-                                            /* step */ reps_stride};
-
-        const std::size_t gws = src_nelems;
-
-        cgh.parallel_for<repeat_by_sequence_kernel<
-            TwoZeroOffsets_Indexer, StridedIndexer, Strided1DIndexer,
-            Strided1DIndexer, T, repT>>(
-            sycl::range<1>(gws),
-            RepeatSequenceFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
-                                  Strided1DIndexer, Strided1DIndexer, T, repT>(
-                src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer,
-                src_indexer, dst_indexer, reps_indexer));
-    });
-
-    return repeat_ev;
-}
-
-template <typename fnT, typename T> struct RepeatSequence1DFactory
-{
-    fnT get()
-    {
-        fnT fn = repeat_by_sequence_1d_impl<T, std::int64_t>;
-        return fn;
-    }
-};
-
-template <typename OrthogIndexer,
-          typename SrcAxisIndexer,
-          typename DstAxisIndexer,
-          typename T>
-class repeat_by_scalar_kernel;
-
-template <typename OrthogIndexer,
-          typename SrcAxisIndexer,
-          typename DstAxisIndexer,
-          typename T>
-class RepeatScalarFunctor
-{
-private:
-    const T *src = nullptr;
-    T *dst = nullptr;
-    ssize_t reps = 1;
-    std::size_t dst_axis_nelems = 0;
-    OrthogIndexer orthog_strider;
-    SrcAxisIndexer src_axis_strider;
-    DstAxisIndexer dst_axis_strider;
-
-public:
-    RepeatScalarFunctor(const T *src_,
-                        T *dst_,
-                        const ssize_t reps_,
-                        std::size_t dst_axis_nelems_,
-                        const OrthogIndexer &orthog_strider_,
-                        const SrcAxisIndexer &src_axis_strider_,
-                        const DstAxisIndexer &dst_axis_strider_)
-        : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_),
-          orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_),
-          dst_axis_strider(dst_axis_strider_)
-    {
-    }
-
-    void operator()(sycl::id<1> idx) const
-    {
-        std::size_t id = idx[0];
-        auto i_orthog = id / dst_axis_nelems;
-        auto i_along = id - (i_orthog * dst_axis_nelems);
-
-        auto orthog_offsets = orthog_strider(i_orthog);
-        auto src_offset = orthog_offsets.get_first_offset();
-        auto dst_offset = orthog_offsets.get_second_offset();
-
-        auto dst_axis_offset = dst_axis_strider(i_along);
-        auto src_axis_offset = src_axis_strider(i_along / reps);
-        dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset];
-    }
-};
-
-typedef sycl::event (*repeat_by_scalar_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    std::size_t,
-    const char *,
-    char *,
-    const ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T>
-sycl::event repeat_by_scalar_impl(sycl::queue &q,
-                                  std::size_t orthog_nelems,
-                                  std::size_t dst_axis_nelems,
-                                  const char *src_cp,
-                                  char *dst_cp,
-                                  const ssize_t reps,
-                                  int orthog_nd,
-                                  const ssize_t *orthog_shape_and_strides,
-                                  ssize_t src_offset,
-                                  ssize_t dst_offset,
-                                  ssize_t src_axis_shape,
-                                  ssize_t src_axis_stride,
-                                  ssize_t dst_axis_shape,
-                                  ssize_t dst_axis_stride,
-                                  const std::vector<sycl::event> &depends)
-{
-    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const T *src_tp = reinterpret_cast<const T *>(src_cp);
-        T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-        // orthog ndim indexer
-        const TwoOffsets_StridedIndexer orthog_indexer{
-            orthog_nd, src_offset, dst_offset, orthog_shape_and_strides};
-        // indexers along repeated axis
-        const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape,
-                                                /* step */ src_axis_stride};
-        const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape,
-                                                /* step */ dst_axis_stride};
-
-        const std::size_t gws = orthog_nelems * dst_axis_nelems;
-
-        cgh.parallel_for<repeat_by_scalar_kernel<
-            TwoOffsets_StridedIndexer, Strided1DIndexer, Strided1DIndexer, T>>(
-            sycl::range<1>(gws),
-            RepeatScalarFunctor<TwoOffsets_StridedIndexer, Strided1DIndexer,
-                                Strided1DIndexer, T>(
-                src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer,
-                src_axis_indexer, dst_axis_indexer));
-    });
-
-    return repeat_ev;
-}
-
-template <typename fnT, typename T> struct RepeatScalarFactory
-{
-    fnT get()
-    {
-        fnT fn = repeat_by_scalar_impl<T>;
-        return fn;
-    }
-};
-
-typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    char *,
-    const ssize_t,
-    int,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T>
-sycl::event repeat_by_scalar_1d_impl(sycl::queue &q,
-                                     std::size_t dst_nelems,
-                                     const char *src_cp,
-                                     char *dst_cp,
-                                     const ssize_t reps,
-                                     int src_nd,
-                                     const ssize_t *src_shape_strides,
-                                     ssize_t dst_shape,
-                                     ssize_t dst_stride,
-                                     const std::vector<sycl::event> &depends)
-{
-    sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const T *src_tp = reinterpret_cast<const T *>(src_cp);
-        T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-        // orthog ndim indexer
-        static constexpr TwoZeroOffsets_Indexer orthog_indexer{};
-        // indexers along repeated axis
-        const StridedIndexer src_indexer(src_nd, 0, src_shape_strides);
-        const Strided1DIndexer dst_indexer{/* size */ dst_shape,
-                                           /* step */ dst_stride};
-
-        const std::size_t gws = dst_nelems;
-
-        cgh.parallel_for<repeat_by_scalar_kernel<
-            TwoZeroOffsets_Indexer, StridedIndexer, Strided1DIndexer, T>>(
-            sycl::range<1>(gws),
-            RepeatScalarFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
-                                Strided1DIndexer, T>(src_tp, dst_tp, reps,
-                                                     dst_nelems, orthog_indexer,
-                                                     src_indexer, dst_indexer));
-    });
-
-    return repeat_ev;
-}
-
-template <typename fnT, typename T> struct RepeatScalar1DFactory
-{
-    fnT get()
-    {
-        fnT fn = repeat_by_scalar_1d_impl<T>;
-        return fn;
-    }
-};
-
-} // namespace repeat
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/isin.hpp
deleted file mode 100644
index c6337c92ca..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/isin.hpp
+++ /dev/null
@@ -1,239 +0,0 @@
-//=== isin.hpp -                                      ---*-C++-*--/===//
-//    Implementation of searching for membership in sorted array
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor membership operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/search_sorted_detail.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/rich_comparisons.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-
-template <typename T,
-          typename HayIndexerT,
-          typename NeedlesIndexerT,
-          typename OutIndexerT>
-struct IsinFunctor
-{
-private:
-    bool invert;
-    const T *hay_tp;
-    const T *needles_tp;
-    bool *out_tp;
-    std::size_t hay_nelems;
-    HayIndexerT hay_indexer;
-    NeedlesIndexerT needles_indexer;
-    OutIndexerT out_indexer;
-
-public:
-    IsinFunctor(const bool invert_,
-                const T *hay_,
-                const T *needles_,
-                bool *out_,
-                const std::size_t hay_nelems_,
-                const HayIndexerT &hay_indexer_,
-                const NeedlesIndexerT &needles_indexer_,
-                const OutIndexerT &out_indexer_)
-        : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_),
-          hay_nelems(hay_nelems_), hay_indexer(hay_indexer_),
-          needles_indexer(needles_indexer_), out_indexer(out_indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        using Compare =
-            typename dpctl::tensor::rich_comparisons::AscendingSorter<T>::type;
-        static constexpr Compare comp{};
-
-        const std::size_t i = id[0];
-        const T needle_v = needles_tp[needles_indexer(i)];
-
-        // position of the needle_v in the hay array
-        std::size_t pos{};
-
-        static constexpr std::size_t zero(0);
-        // search in hay in left-closed interval, give `pos` such that
-        // hay[pos - 1] < needle_v <= hay[pos]
-
-        // lower_bound returns the first pos such that bool(hay[pos] <
-        // needle_v) is false, i.e. needle_v <= hay[pos]
-        pos = search_sorted_detail::lower_bound_indexed_impl(
-            hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
-        bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v);
-        out_tp[out_indexer(i)] = (invert) ? !out : out;
-    }
-};
-
-typedef sycl::event (*isin_contig_impl_fp_ptr_t)(
-    sycl::queue &,
-    const bool,
-    const std::size_t,
-    const std::size_t,
-    const char *,
-    const ssize_t,
-    const char *,
-    const ssize_t,
-    char *,
-    const ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T> class isin_contig_impl_krn;
-
-template <typename T>
-sycl::event isin_contig_impl(sycl::queue &exec_q,
-                             const bool invert,
-                             const std::size_t hay_nelems,
-                             const std::size_t needles_nelems,
-                             const char *hay_cp,
-                             const ssize_t hay_offset,
-                             const char *needles_cp,
-                             const ssize_t needles_offset,
-                             char *out_cp,
-                             const ssize_t out_offset,
-                             const std::vector<sycl::event> &depends)
-{
-    const T *hay_tp = reinterpret_cast<const T *>(hay_cp) + hay_offset;
-    const T *needles_tp =
-        reinterpret_cast<const T *>(needles_cp) + needles_offset;
-
-    bool *out_tp = reinterpret_cast<bool *>(out_cp) + out_offset;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using KernelName = class isin_contig_impl_krn<T>;
-
-        sycl::range<1> gRange(needles_nelems);
-
-        using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        static constexpr TrivialIndexerT hay_indexer{};
-        static constexpr TrivialIndexerT needles_indexer{};
-        static constexpr TrivialIndexerT out_indexer{};
-
-        const auto fnctr =
-            IsinFunctor<T, TrivialIndexerT, TrivialIndexerT, TrivialIndexerT>(
-                invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer,
-                needles_indexer, out_indexer);
-
-        cgh.parallel_for<KernelName>(gRange, fnctr);
-    });
-
-    return comp_ev;
-}
-
-typedef sycl::event (*isin_strided_impl_fp_ptr_t)(
-    sycl::queue &,
-    const bool,
-    const std::size_t,
-    const std::size_t,
-    const char *,
-    const ssize_t,
-    const ssize_t,
-    const char *,
-    const ssize_t,
-    char *,
-    const ssize_t,
-    int,
-    const ssize_t *,
-    const std::vector<sycl::event> &);
-
-template <typename T> class isin_strided_impl_krn;
-
-template <typename T>
-sycl::event isin_strided_impl(
-    sycl::queue &exec_q,
-    const bool invert,
-    const std::size_t hay_nelems,
-    const std::size_t needles_nelems,
-    const char *hay_cp,
-    const ssize_t hay_offset,
-    // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array
-    const ssize_t hay_stride,
-    const char *needles_cp,
-    const ssize_t needles_offset,
-    char *out_cp,
-    const ssize_t out_offset,
-    const int needles_nd,
-    // packed_shape_strides is [needles_shape, needles_strides,
-    // out_strides] has length of 3*needles_nd
-    const ssize_t *packed_shape_strides,
-    const std::vector<sycl::event> &depends)
-{
-    const T *hay_tp = reinterpret_cast<const T *>(hay_cp);
-    const T *needles_tp = reinterpret_cast<const T *>(needles_cp);
-
-    bool *out_tp = reinterpret_cast<bool *>(out_cp);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        sycl::range<1> gRange(needles_nelems);
-
-        using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        const HayIndexerT hay_indexer(
-            /* offset */ hay_offset,
-            /* size   */ hay_nelems,
-            /* step   */ hay_stride);
-
-        using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const ssize_t *needles_shape_strides = packed_shape_strides;
-        const NeedlesIndexerT needles_indexer(needles_nd, needles_offset,
-                                              needles_shape_strides);
-        using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-        const ssize_t *out_shape = packed_shape_strides;
-        const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd;
-        const OutIndexerT out_indexer(needles_nd, out_offset, out_shape,
-                                      out_strides);
-
-        const auto fnctr =
-            IsinFunctor<T, HayIndexerT, NeedlesIndexerT, OutIndexerT>(
-                invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer,
-                needles_indexer, out_indexer);
-        using KernelName = class isin_strided_impl_krn<T>;
-
-        cgh.parallel_for<KernelName>(gRange, fnctr);
-    });
-
-    return comp_ev;
-}
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
deleted file mode 100644
index 504b47c5f9..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp
+++ /dev/null
@@ -1,833 +0,0 @@
-//=== sorting.hpp -  Implementation of sorting kernels       ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor sort/argsort operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cassert>
-#include <cstddef>
-#include <functional>
-#include <iterator>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/search_sorted_detail.hpp"
-#include "kernels/sorting/sort_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-namespace merge_sort_detail
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::kernels::search_sorted_detail;
-
-/*! @brief Merge two contiguous sorted segments */
-template <typename InAcc, typename OutAcc, typename Compare>
-void merge_impl(const std::size_t offset,
-                const InAcc in_acc,
-                OutAcc out_acc,
-                const std::size_t start_1,
-                const std::size_t end_1,
-                const std::size_t end_2,
-                const std::size_t start_out,
-                Compare comp,
-                const std::size_t chunk)
-{
-    const std::size_t start_2 = end_1;
-    // Borders of the sequences to merge within this call
-    const std::size_t local_start_1 = sycl::min(offset + start_1, end_1);
-    const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1);
-    const std::size_t local_start_2 = sycl::min(offset + start_2, end_2);
-    const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2);
-
-    const std::size_t local_size_1 = local_end_1 - local_start_1;
-    const std::size_t local_size_2 = local_end_2 - local_start_2;
-
-    const auto r_item_1 = in_acc[end_1 - 1];
-    const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1;
-
-    // Copy if the sequences are sorted with respect to each other or merge
-    // otherwise
-    if (!comp(l_item_2, r_item_1)) {
-        const std::size_t out_shift_1 = start_out + local_start_1 - start_1;
-        const std::size_t out_shift_2 =
-            start_out + end_1 - start_1 + local_start_2 - start_2;
-
-        for (std::size_t i = 0; i < local_size_1; ++i) {
-            out_acc[out_shift_1 + i] = in_acc[local_start_1 + i];
-        }
-        for (std::size_t i = 0; i < local_size_2; ++i) {
-            out_acc[out_shift_2 + i] = in_acc[local_start_2 + i];
-        }
-    }
-    else if (comp(r_item_1, l_item_2)) {
-        const std::size_t out_shift_1 =
-            start_out + end_2 - start_2 + local_start_1 - start_1;
-        const std::size_t out_shift_2 = start_out + local_start_2 - start_2;
-        for (std::size_t i = 0; i < local_size_1; ++i) {
-            out_acc[out_shift_1 + i] = in_acc[local_start_1 + i];
-        }
-        for (std::size_t i = 0; i < local_size_2; ++i) {
-            out_acc[out_shift_2 + i] = in_acc[local_start_2 + i];
-        }
-    }
-    // Perform merging
-    else {
-
-        // Process 1st sequence
-        if (local_start_1 < local_end_1) {
-            // Reduce the range for searching within the 2nd sequence and handle
-            // bound items find left border in 2nd sequence
-            const auto local_l_item_1 = in_acc[local_start_1];
-            std::size_t l_search_bound_2 =
-                lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp);
-            const std::size_t l_shift_1 = local_start_1 - start_1;
-            const std::size_t l_shift_2 = l_search_bound_2 - start_2;
-
-            out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1;
-
-            std::size_t r_search_bound_2{};
-            // find right border in 2nd sequence
-            if (local_size_1 > 1) {
-                const auto local_r_item_1 = in_acc[local_end_1 - 1];
-                r_search_bound_2 = lower_bound_impl(
-                    in_acc, l_search_bound_2, end_2, local_r_item_1, comp);
-                const auto r_shift_1 = local_end_1 - 1 - start_1;
-                const auto r_shift_2 = r_search_bound_2 - start_2;
-
-                out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1;
-            }
-
-            // Handle intermediate items
-            if (r_search_bound_2 == l_search_bound_2) {
-                const std::size_t shift_2 = l_search_bound_2 - start_2;
-                for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1;
-                     ++idx)
-                {
-                    const auto intermediate_item_1 = in_acc[idx];
-                    const std::size_t shift_1 = idx - start_1;
-                    out_acc[start_out + shift_1 + shift_2] =
-                        intermediate_item_1;
-                }
-            }
-            else {
-                for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1;
-                     ++idx)
-                {
-                    const auto intermediate_item_1 = in_acc[idx];
-                    // we shouldn't seek in whole 2nd sequence. Just for the
-                    // part where the 1st sequence should be
-                    l_search_bound_2 = lower_bound_impl(
-                        in_acc, l_search_bound_2, r_search_bound_2,
-                        intermediate_item_1, comp);
-                    const std::size_t shift_1 = idx - start_1;
-                    const std::size_t shift_2 = l_search_bound_2 - start_2;
-
-                    out_acc[start_out + shift_1 + shift_2] =
-                        intermediate_item_1;
-                }
-            }
-        }
-        // Process 2nd sequence
-        if (local_start_2 < local_end_2) {
-            // Reduce the range for searching within the 1st sequence and handle
-            // bound items find left border in 1st sequence
-            const auto local_l_item_2 = in_acc[local_start_2];
-            std::size_t l_search_bound_1 =
-                upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp);
-            const std::size_t l_shift_1 = l_search_bound_1 - start_1;
-            const std::size_t l_shift_2 = local_start_2 - start_2;
-
-            out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2;
-
-            std::size_t r_search_bound_1{};
-            // find right border in 1st sequence
-            if (local_size_2 > 1) {
-                const auto local_r_item_2 = in_acc[local_end_2 - 1];
-                r_search_bound_1 = upper_bound_impl(
-                    in_acc, l_search_bound_1, end_1, local_r_item_2, comp);
-                const std::size_t r_shift_1 = r_search_bound_1 - start_1;
-                const std::size_t r_shift_2 = local_end_2 - 1 - start_2;
-
-                out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2;
-            }
-
-            // Handle intermediate items
-            if (l_search_bound_1 == r_search_bound_1) {
-                const std::size_t shift_1 = l_search_bound_1 - start_1;
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
-                    const auto intermediate_item_2 = in_acc[idx];
-                    const std::size_t shift_2 = idx - start_2;
-                    out_acc[start_out + shift_1 + shift_2] =
-                        intermediate_item_2;
-                }
-            }
-            else {
-                for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx)
-                {
-                    const auto intermediate_item_2 = in_acc[idx];
-                    // we shouldn't seek in whole 1st sequence. Just for the
-                    // part where the 2nd sequence should be
-                    l_search_bound_1 = upper_bound_impl(
-                        in_acc, l_search_bound_1, r_search_bound_1,
-                        intermediate_item_2, comp);
-                    const std::size_t shift_1 = l_search_bound_1 - start_1;
-                    const std::size_t shift_2 = idx - start_2;
-
-                    out_acc[start_out + shift_1 + shift_2] =
-                        intermediate_item_2;
-                }
-            }
-        }
-    }
-}
-
-template <typename Iter, typename Compare>
-void insertion_sort_impl(Iter &&first,
-                         std::size_t begin,
-                         std::size_t end,
-                         Compare &&comp)
-{
-    for (std::size_t i = begin + 1; i < end; ++i) {
-        const auto val_i = first[i];
-        std::size_t j = i - 1;
-        while ((j + 1 > begin) && (comp(val_i, first[j]))) {
-            first[j + 1] = first[j];
-            --j;
-        }
-        if (j + 1 < i) {
-            first[j + 1] = val_i;
-        }
-    }
-}
-
-template <typename Iter, typename Compare>
-void leaf_sort_impl(Iter &&first,
-                    std::size_t begin,
-                    std::size_t end,
-                    Compare &&comp)
-{
-    return insertion_sort_impl<Iter, Compare>(std::forward<Iter>(first),
-                                              std::move(begin), std::move(end),
-                                              std::forward<Compare>(comp));
-}
-
-template <typename Iter> struct GetValueType
-{
-    using value_type = typename std::iterator_traits<Iter>::value_type;
-};
-
-template <typename ElementType,
-          sycl::access::address_space Space,
-          sycl::access::decorated IsDecorated>
-struct GetValueType<sycl::multi_ptr<ElementType, Space, IsDecorated>>
-{
-    using value_type = ElementType;
-};
-
-template <typename ElementType,
-          int Dim,
-          sycl::access_mode Mode,
-          sycl::target Target,
-          sycl::access::placeholder isPlaceholder>
-struct GetValueType<
-    sycl::accessor<ElementType, Dim, Mode, Target, isPlaceholder>>
-{
-    using value_type = ElementType;
-};
-
-template <typename ElementType, int Dim, typename AllocatorT>
-struct GetValueType<sycl::buffer<ElementType, Dim, AllocatorT>>
-{
-    using value_type = ElementType;
-};
-
-template <typename Iter> struct GetReadOnlyAccess
-{
-    Iter operator()(const Iter &it, sycl::handler &) { return it; }
-};
-
-template <typename ElementType, int Dim, typename AllocatorT>
-struct GetReadOnlyAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
-{
-    auto operator()(const sycl::buffer<ElementType, Dim, AllocatorT> &buf,
-                    sycl::handler &cgh)
-    {
-        sycl::accessor acc(buf, cgh, sycl::read_only);
-        return acc;
-    }
-};
-
-template <typename Iter> struct GetWriteDiscardAccess
-{
-    Iter operator()(Iter it, sycl::handler &) { return it; }
-};
-
-template <typename ElementType, int Dim, typename AllocatorT>
-struct GetWriteDiscardAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
-{
-    auto operator()(sycl::buffer<ElementType, Dim, AllocatorT> &buf,
-                    sycl::handler &cgh)
-    {
-        sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init);
-        return acc;
-    }
-};
-
-template <typename Iter> struct GetReadWriteAccess
-{
-    Iter operator()(Iter &it, sycl::handler &) { return it; }
-};
-
-template <typename ElementType, int Dim, typename AllocatorT>
-struct GetReadWriteAccess<sycl::buffer<ElementType, Dim, AllocatorT>>
-{
-    auto operator()(sycl::buffer<ElementType, Dim, AllocatorT> &buf,
-                    sycl::handler &cgh)
-    {
-        sycl::accessor acc(buf, cgh, sycl::read_write);
-        return acc;
-    }
-};
-
-template <typename T1, typename T2, typename Comp>
-class sort_base_step_contig_krn;
-
-template <typename InpAcc, typename OutAcc, typename Comp>
-sycl::event
-sort_base_step_contig_impl(sycl::queue &q,
-                           const std::size_t iter_nelems,
-                           const std::size_t sort_nelems,
-                           const InpAcc input,
-                           OutAcc output,
-                           const Comp &comp,
-                           const std::size_t conseq_nelems_sorted,
-                           const std::vector<sycl::event> &depends = {})
-{
-
-    using inpT = typename GetValueType<InpAcc>::value_type;
-    using outT = typename GetValueType<OutAcc>::value_type;
-    using KernelName = sort_base_step_contig_krn<inpT, outT, Comp>;
-
-    const std::size_t n_segments =
-        quotient_ceil(sort_nelems, conseq_nelems_sorted);
-
-    sycl::event base_sort = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const sycl::range<1> gRange{iter_nelems * n_segments};
-
-        auto input_acc = GetReadOnlyAccess<InpAcc>{}(input, cgh);
-        auto output_acc = GetWriteDiscardAccess<OutAcc>{}(output, cgh);
-
-        cgh.parallel_for<KernelName>(gRange, [=](sycl::id<1> id) {
-            const std::size_t iter_id = id[0] / n_segments;
-            const std::size_t segment_id = id[0] - iter_id * n_segments;
-
-            const std::size_t iter_offset = iter_id * sort_nelems;
-            const std::size_t beg_id =
-                iter_offset + segment_id * conseq_nelems_sorted;
-            const std::size_t end_id =
-                iter_offset +
-                std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems);
-            for (std::size_t i = beg_id; i < end_id; ++i) {
-                output_acc[i] = input_acc[i];
-            }
-
-            leaf_sort_impl(output_acc, beg_id, end_id, comp);
-        });
-    });
-
-    return base_sort;
-}
-
-template <typename T1, typename T2, typename Comp>
-class sort_over_work_group_contig_krn;
-
-template <typename InpAcc, typename OutAcc, typename Comp>
-sycl::event
-sort_over_work_group_contig_impl(sycl::queue &q,
-                                 std::size_t iter_nelems,
-                                 std::size_t sort_nelems,
-                                 const InpAcc input,
-                                 OutAcc output,
-                                 const Comp &comp,
-                                 std::size_t &nelems_wg_sorts,
-                                 const std::vector<sycl::event> &depends = {})
-{
-    using inpT = typename GetValueType<InpAcc>::value_type;
-    using T = typename GetValueType<OutAcc>::value_type;
-    using KernelName = sort_over_work_group_contig_krn<inpT, T, Comp>;
-
-    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
-
-    auto const &ctx = q.get_context();
-    auto const &dev = q.get_device();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t max_sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-    const std::uint64_t device_local_memory_size =
-        dev.get_info<sycl::info::device::local_mem_size>();
-
-    //  leave 512 bytes of local memory for RT
-    const std::uint64_t safety_margin = 512;
-
-    const std::uint64_t nelems_per_slm =
-        (device_local_memory_size - safety_margin) / (2 * sizeof(T));
-
-    static constexpr std::uint32_t sub_groups_per_work_group = 4;
-    const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;
-
-    const std::size_t lws = sub_groups_per_work_group * max_sg_size;
-
-    nelems_wg_sorts = elems_per_wi * lws;
-
-    if (nelems_wg_sorts > nelems_per_slm) {
-        nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4);
-
-        return sort_base_step_contig_impl<InpAcc, OutAcc, Comp>(
-            q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts,
-            depends);
-    }
-
-    // This assumption permits doing away with using a loop
-    assert(nelems_wg_sorts % lws == 0);
-
-    const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts);
-
-    sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.use_kernel_bundle(kb);
-
-        sycl::range<1> global_range{iter_nelems * n_segments * lws};
-        sycl::range<1> local_range{lws};
-
-        sycl::range<1> slm_range{nelems_wg_sorts};
-        sycl::local_accessor<T, 1> work_space(slm_range, cgh);
-        sycl::local_accessor<T, 1> scratch_space(slm_range, cgh);
-
-        auto input_acc = GetReadOnlyAccess<InpAcc>{}(input, cgh);
-        auto output_acc = GetWriteDiscardAccess<OutAcc>{}(output, cgh);
-
-        sycl::nd_range<1> ndRange(global_range, local_range);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
-            const std::size_t group_id = it.get_group_linear_id();
-            const std::size_t iter_id = group_id / n_segments;
-            const std::size_t segment_id = group_id - iter_id * n_segments;
-            const std::size_t lid = it.get_local_linear_id();
-
-            const std::size_t segment_start_idx = segment_id * nelems_wg_sorts;
-            const std::size_t segment_end_idx =
-                std::min(segment_start_idx + nelems_wg_sorts, sort_nelems);
-            const std::size_t wg_chunk_size =
-                segment_end_idx - segment_start_idx;
-
-            // load input into SLM
-            for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
-                T v = (array_id < sort_nelems)
-                          ? input_acc[iter_id * sort_nelems + array_id]
-                          : T{};
-                work_space[array_id - segment_start_idx] = v;
-            }
-            sycl::group_barrier(it.get_group());
-
-            const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws);
-
-            const std::size_t chunk_start_idx = lid * chunk;
-            const std::size_t chunk_end_idx =
-                sycl::min(chunk_start_idx + chunk, wg_chunk_size);
-
-            leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp);
-
-            sycl::group_barrier(it.get_group());
-
-            bool data_in_temp = false;
-            std::size_t n_chunks_merged = 1;
-
-            // merge chunk while n_chunks_merged * chunk < wg_chunk_size
-            const std::size_t max_chunks_merged =
-                1 + ((wg_chunk_size - 1) / chunk);
-            for (; n_chunks_merged < max_chunks_merged;
-                 data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-            {
-                const std::size_t nelems_sorted_so_far =
-                    n_chunks_merged * chunk;
-                const std::size_t q = (lid / n_chunks_merged);
-                const std::size_t start_1 =
-                    sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size);
-                const std::size_t end_1 =
-                    sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size);
-                const std::size_t end_2 =
-                    sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size);
-                const std::size_t offset = chunk * (lid - q * n_chunks_merged);
-
-                if (data_in_temp) {
-                    merge_impl(offset, scratch_space, work_space, start_1,
-                               end_1, end_2, start_1, comp, chunk);
-                }
-                else {
-                    merge_impl(offset, work_space, scratch_space, start_1,
-                               end_1, end_2, start_1, comp, chunk);
-                }
-                sycl::group_barrier(it.get_group());
-            }
-
-            const auto &out_src = (data_in_temp) ? scratch_space : work_space;
-            for (std::size_t array_id = segment_start_idx + lid;
-                 array_id < segment_end_idx; array_id += lws)
-            {
-                if (array_id < sort_nelems) {
-                    output_acc[iter_id * sort_nelems + array_id] =
-                        out_src[array_id - segment_start_idx];
-                }
-            }
-        });
-    });
-
-    return base_sort_ev;
-}
-
-class vacuous_krn;
-
-inline sycl::event tie_events(sycl::queue &q,
-                              const std::vector<sycl::event> depends)
-{
-    if (depends.empty())
-        return sycl::event();
-    if (depends.size() == 1)
-        return depends[0];
-
-    sycl::event e = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-        using KernelName = vacuous_krn;
-        cgh.single_task<KernelName>([]() {});
-    });
-
-    return e;
-}
-
-template <typename T, typename Comp> class merge_adjacent_blocks_to_temp_krn;
-
-template <typename T, typename Comp> class merge_adjacent_blocks_from_temp_krn;
-
-template <typename Acc, typename Comp>
-sycl::event
-merge_sorted_block_contig_impl(sycl::queue &q,
-                               std::size_t iter_nelems,
-                               std::size_t sort_nelems,
-                               Acc output,
-                               const Comp comp,
-                               std::size_t sorted_block_size,
-                               const std::vector<sycl::event> &depends = {})
-{
-
-    if (sorted_block_size >= sort_nelems)
-        return tie_events(q, depends);
-
-    // experimentally determined value
-    // size of segments worked upon by each work-item during merging
-    const sycl::device &dev = q.get_device();
-    const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4;
-
-    const std::size_t chunk_size =
-        (sorted_block_size < segment_size) ? sorted_block_size : segment_size;
-
-    assert(sorted_block_size % chunk_size == 0);
-
-    using T = typename GetValueType<Acc>::value_type;
-
-    sycl::buffer<T, 1> temp_buf(sycl::range<1>{iter_nelems * sort_nelems});
-    // T *allocated_mem = sycl::malloc_device<T>(iter_nelems * sort_nelems, q);
-
-    bool needs_copy = true;
-    bool used_depends = false;
-
-    sycl::event dep_ev;
-    std::size_t chunks_merged = sorted_block_size / chunk_size;
-
-    assert(!(chunks_merged & (chunks_merged - 1)));
-
-    using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn<T, Comp>;
-    using FromTempKernelName =
-        class merge_adjacent_blocks_from_temp_krn<T, Comp>;
-
-    while (chunks_merged * chunk_size < sort_nelems) {
-        sycl::event local_dep = dep_ev;
-
-        sycl::event merge_ev = q.submit([&](sycl::handler &cgh) {
-            if (used_depends) {
-                cgh.depends_on(local_dep);
-            }
-            else {
-                cgh.depends_on(depends);
-                used_depends = true;
-            }
-
-            const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size);
-
-            if (needs_copy) {
-                sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only,
-                                        sycl::no_init};
-                auto output_acc = GetReadOnlyAccess<Acc>{}(output, cgh);
-                cgh.parallel_for<ToTempKernelName>(
-                    {iter_nelems * n_chunks}, [=](sycl::id<1> wid) {
-                        auto flat_idx = wid[0];
-                        auto iter_idx = flat_idx / n_chunks;
-                        auto idx = flat_idx - n_chunks * iter_idx;
-
-                        const std::size_t idx_mult =
-                            (idx / chunks_merged) * chunks_merged;
-                        const std::size_t idx_rem = (idx - idx_mult);
-                        const std::size_t start_1 =
-                            sycl::min(2 * idx_mult * chunk_size, sort_nelems);
-                        const std::size_t end_1 = sycl::min(
-                            start_1 + chunks_merged * chunk_size, sort_nelems);
-                        const std::size_t end_2 = sycl::min(
-                            end_1 + chunks_merged * chunk_size, sort_nelems);
-                        const std::size_t offset = chunk_size * idx_rem;
-
-                        const std::size_t iter_offset = iter_idx * sort_nelems;
-
-                        merge_impl(offset, output_acc, temp_acc,
-                                   iter_offset + start_1, iter_offset + end_1,
-                                   iter_offset + end_2, iter_offset + start_1,
-                                   comp, chunk_size);
-                    });
-            }
-            else {
-                sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only};
-                auto output_acc = GetWriteDiscardAccess<Acc>{}(output, cgh);
-                cgh.parallel_for<FromTempKernelName>(
-                    {iter_nelems * n_chunks}, [=](sycl::id<1> wid) {
-                        auto flat_idx = wid[0];
-                        auto iter_idx = flat_idx / n_chunks;
-                        auto idx = flat_idx - n_chunks * iter_idx;
-
-                        const std::size_t idx_mult =
-                            (idx / chunks_merged) * chunks_merged;
-                        const std::size_t idx_rem = (idx - idx_mult);
-                        const std::size_t start_1 =
-                            sycl::min(2 * idx_mult * chunk_size, sort_nelems);
-                        const std::size_t end_1 = sycl::min(
-                            start_1 + chunks_merged * chunk_size, sort_nelems);
-                        const std::size_t end_2 = sycl::min(
-                            end_1 + chunks_merged * chunk_size, sort_nelems);
-                        const std::size_t offset = chunk_size * idx_rem;
-
-                        const std::size_t iter_offset = iter_idx * sort_nelems;
-
-                        merge_impl(offset, temp_acc, output_acc,
-                                   iter_offset + start_1, iter_offset + end_1,
-                                   iter_offset + end_2, iter_offset + start_1,
-                                   comp, chunk_size);
-                    });
-            }
-        });
-
-        chunks_merged *= 2;
-        dep_ev = merge_ev;
-
-        if (chunks_merged * chunk_size < sort_nelems) {
-            needs_copy = !needs_copy;
-        }
-    }
-
-    if (needs_copy) {
-        sycl::event copy_ev = q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(dep_ev);
-
-            sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only};
-            auto output_acc = GetWriteDiscardAccess<Acc>{}(output, cgh);
-
-            cgh.copy(temp_acc, output_acc);
-        });
-        dep_ev = copy_ev;
-    }
-
-    return dep_ev;
-}
-
-} // end of namespace merge_sort_detail
-
-template <typename argTy, typename Comp = std::less<argTy>>
-sycl::event stable_sort_axis1_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
-                             // matrix when sorting over rows)
-    std::size_t sort_nelems, // size of each array to sort  (length of rows,
-                             // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t sort_arg_offset,
-    ssize_t sort_res_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + sort_arg_offset;
-    argTy *res_tp =
-        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
-
-    auto comp = Comp{};
-
-    // constant chosen experimentally to ensure monotonicity of
-    // sorting performance, as measured on GPU Max, and Iris Xe
-    constexpr std::size_t sequential_sorting_threshold = 16;
-
-    if (sort_nelems < sequential_sorting_threshold) {
-        // equal work-item sorts entire row
-        sycl::event sequential_sorting_ev =
-            merge_sort_detail::sort_base_step_contig_impl<const argTy *,
-                                                          argTy *, Comp>(
-                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
-                sort_nelems, depends);
-
-        return sequential_sorting_ev;
-    }
-    else {
-        std::size_t sorted_block_size{};
-
-        // Sort segments of the array
-        sycl::event base_sort_ev =
-            merge_sort_detail::sort_over_work_group_contig_impl<const argTy *,
-                                                                argTy *, Comp>(
-                exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
-                sorted_block_size, // modified in place with size of sorted
-                                   // block size
-                depends);
-
-        // Merge segments in parallel until all elements are sorted
-        sycl::event merges_ev =
-            merge_sort_detail::merge_sorted_block_contig_impl<argTy *, Comp>(
-                exec_q, iter_nelems, sort_nelems, res_tp, comp,
-                sorted_block_size, {base_sort_ev});
-
-        return merges_ev;
-    }
-}
-
-template <typename T1, typename T2> class populate_index_data_krn;
-
-template <typename T1, typename T2> class index_map_to_rows_krn;
-
-template <typename IndexT, typename ValueT, typename ValueComp> struct IndexComp
-{
-    IndexComp(const ValueT *data, const ValueComp &comp_op)
-        : ptr(data), value_comp(comp_op)
-    {
-    }
-
-    bool operator()(const IndexT &i1, const IndexT &i2) const
-    {
-        return value_comp(ptr[i1], ptr[i2]);
-    }
-
-private:
-    const ValueT *ptr;
-    ValueComp value_comp;
-};
-
-template <typename argTy,
-          typename IndexTy,
-          typename ValueComp = std::less<argTy>>
-sycl::event stable_argsort_axis1_contig_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a
-                             // matrix when sorting over rows)
-    std::size_t sort_nelems, // size of each array to sort  (length of rows,
-                             // i.e. number of columns)
-    const char *arg_cp,
-    char *res_cp,
-    ssize_t iter_arg_offset,
-    ssize_t iter_res_offset,
-    ssize_t sort_arg_offset,
-    ssize_t sort_res_offset,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + sort_arg_offset;
-    IndexTy *res_tp =
-        reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
-
-    const IndexComp<IndexTy, argTy, ValueComp> index_comp{arg_tp, ValueComp{}};
-
-    static constexpr std::size_t determine_automatically = 0;
-    std::size_t sorted_block_size = determine_automatically;
-
-    const std::size_t total_nelems = iter_nelems * sort_nelems;
-
-    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
-
-    using IotaKernelName = populate_index_data_krn<argTy, IndexTy>;
-
-    sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
-        exec_q, res_tp, total_nelems, depends);
-
-    // Sort segments of the array
-    sycl::event base_sort_ev =
-        merge_sort_detail::sort_over_work_group_contig_impl(
-            exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp,
-            sorted_block_size, // modified in place with size of sorted block
-                               // size
-            {populate_indexed_data_ev});
-
-    // Merge segments in parallel until all elements are sorted
-    sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl(
-        exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size,
-        {base_sort_ev});
-
-    // no need to map back if iter_nelems == 1
-    if (iter_nelems == 1u) {
-        return merges_ev;
-    }
-
-    using MapBackKernelName = index_map_to_rows_krn<argTy, IndexTy>;
-    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
-
-    sycl::event write_out_ev = map_back_impl<MapBackKernelName, IndexTy>(
-        exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev});
-
-    return write_out_ev;
-}
-
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
deleted file mode 100644
index 3f4314ea84..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp
+++ /dev/null
@@ -1,1897 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-// Implementation in this file were adapted from oneDPL's radix sort
-// implementation, license Apache-2.0 WITH LLVM-exception
-
-#pragma once
-
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <stdexcept>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/sort_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-namespace radix_sort_details
-{
-
-template <std::uint32_t, bool, typename... TrailingNames>
-class radix_sort_count_kernel;
-
-template <std::uint32_t, typename... TrailingNames>
-class radix_sort_scan_kernel;
-
-template <std::uint32_t, bool, typename... TrailingNames>
-class radix_sort_reorder_peer_kernel;
-
-template <std::uint32_t, bool, typename... TrailingNames>
-class radix_sort_reorder_kernel;
-
-/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */
-template <typename SizeT,
-          std::enable_if_t<std::is_unsigned_v<SizeT> &&
-                               sizeof(SizeT) == sizeof(std::uint64_t),
-                           int> = 0>
-std::uint32_t ceil_log2(SizeT n)
-{
-    // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b
-    // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1)
-    // ceil_log2(n) == 1 + floor_log2(n-1)
-    if (n <= 1)
-        return std::uint32_t{1};
-
-    std::uint32_t exp{1};
-    --n;
-    if (n >= (SizeT{1} << 32)) {
-        n >>= 32;
-        exp += 32;
-    }
-    if (n >= (SizeT{1} << 16)) {
-        n >>= 16;
-        exp += 16;
-    }
-    if (n >= (SizeT{1} << 8)) {
-        n >>= 8;
-        exp += 8;
-    }
-    if (n >= (SizeT{1} << 4)) {
-        n >>= 4;
-        exp += 4;
-    }
-    if (n >= (SizeT{1} << 2)) {
-        n >>= 2;
-        exp += 2;
-    }
-    if (n >= (SizeT{1} << 1)) {
-        n >>= 1;
-        ++exp;
-    }
-    return exp;
-}
-
-//----------------------------------------------------------
-// bitwise order-preserving conversions to unsigned integers
-//----------------------------------------------------------
-
-template <bool is_ascending> bool order_preserving_cast(bool val)
-{
-    if constexpr (is_ascending)
-        return val;
-    else
-        return !val;
-}
-
-template <bool is_ascending,
-          typename UIntT,
-          std::enable_if_t<std::is_unsigned_v<UIntT>, int> = 0>
-UIntT order_preserving_cast(UIntT val)
-{
-    if constexpr (is_ascending) {
-        return val;
-    }
-    else {
-        // bitwise invert
-        return (~val);
-    }
-}
-
-template <bool is_ascending,
-          typename IntT,
-          std::enable_if_t<std::is_integral_v<IntT> && std::is_signed_v<IntT>,
-                           int> = 0>
-std::make_unsigned_t<IntT> order_preserving_cast(IntT val)
-{
-    using UIntT = std::make_unsigned_t<IntT>;
-    const UIntT uint_val = sycl::bit_cast<UIntT>(val);
-
-    if constexpr (is_ascending) {
-        // ascending_mask: 100..0
-        static constexpr UIntT ascending_mask =
-            (UIntT(1) << std::numeric_limits<IntT>::digits);
-        return (uint_val ^ ascending_mask);
-    }
-    else {
-        // descending_mask: 011..1
-        static constexpr UIntT descending_mask =
-            (std::numeric_limits<UIntT>::max() >> 1);
-        return (uint_val ^ descending_mask);
-    }
-}
-
-template <bool is_ascending> std::uint16_t order_preserving_cast(sycl::half val)
-{
-    using UIntT = std::uint16_t;
-
-    const UIntT uint_val = sycl::bit_cast<UIntT>(
-        (sycl::isnan(val)) ? std::numeric_limits<sycl::half>::quiet_NaN()
-                           : val);
-    UIntT mask;
-
-    // test the sign bit of the original value
-    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15));
-
-    static constexpr UIntT zero_mask = UIntT(0x8000u);
-    static constexpr UIntT nonzero_mask = UIntT(0xFFFFu);
-
-    static constexpr UIntT inv_zero_mask = static_cast<UIntT>(~zero_mask);
-    static constexpr UIntT inv_nonzero_mask = static_cast<UIntT>(~nonzero_mask);
-
-    if constexpr (is_ascending) {
-        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
-    }
-    else {
-        mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask);
-    }
-
-    return (uint_val ^ mask);
-}
-
-template <bool is_ascending,
-          typename FloatT,
-          std::enable_if_t<std::is_floating_point_v<FloatT> &&
-                               sizeof(FloatT) == sizeof(std::uint32_t),
-                           int> = 0>
-std::uint32_t order_preserving_cast(FloatT val)
-{
-    using UIntT = std::uint32_t;
-
-    UIntT uint_val = sycl::bit_cast<UIntT>(
-        (sycl::isnan(val)) ? std::numeric_limits<FloatT>::quiet_NaN() : val);
-
-    UIntT mask;
-
-    // test the sign bit of the original value
-    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31));
-
-    static constexpr UIntT zero_mask = UIntT(0x80000000u);
-    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu);
-
-    if constexpr (is_ascending)
-        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
-    else
-        mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask);
-
-    return (uint_val ^ mask);
-}
-
-template <bool is_ascending,
-          typename FloatT,
-          std::enable_if_t<std::is_floating_point_v<FloatT> &&
-                               sizeof(FloatT) == sizeof(std::uint64_t),
-                           int> = 0>
-std::uint64_t order_preserving_cast(FloatT val)
-{
-    using UIntT = std::uint64_t;
-
-    UIntT uint_val = sycl::bit_cast<UIntT>(
-        (sycl::isnan(val)) ? std::numeric_limits<FloatT>::quiet_NaN() : val);
-    UIntT mask;
-
-    // test the sign bit of the original value
-    const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63));
-
-    static constexpr UIntT zero_mask = UIntT(0x8000000000000000u);
-    static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu);
-
-    if constexpr (is_ascending)
-        mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask;
-    else
-        mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask);
-
-    return (uint_val ^ mask);
-}
-
-//-----------------
-// bucket functions
-//-----------------
-
-template <typename T> constexpr std::size_t number_of_bits_in_type()
-{
-    constexpr std::size_t type_bits =
-        (sizeof(T) * std::numeric_limits<unsigned char>::digits);
-    return type_bits;
-}
-
-// the number of buckets (size of radix bits) in T
-template <typename T>
-constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits)
-{
-    constexpr std::size_t type_bits = number_of_bits_in_type<T>();
-    return (type_bits + radix_bits - 1) / radix_bits;
-}
-
-// get bits value (bucket) in a certain radix position
-template <std::uint32_t radix_mask, typename T>
-std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset)
-{
-    static_assert(std::is_unsigned_v<T>);
-
-    return (val >> radix_offset) & T(radix_mask);
-}
-
-//--------------------------------
-// count kernel (single iteration)
-//--------------------------------
-
-template <typename KernelName,
-          std::uint32_t radix_bits,
-          typename ValueT,
-          typename CountT,
-          typename Proj>
-sycl::event
-radix_sort_count_submit(sycl::queue &exec_q,
-                        std::size_t n_iters,
-                        std::size_t n_segments,
-                        std::size_t wg_size,
-                        std::uint32_t radix_offset,
-                        std::size_t n_values,
-                        ValueT *vals_ptr,
-                        std::size_t n_counts,
-                        CountT *counts_ptr,
-                        const Proj &proj_op,
-                        const bool is_ascending,
-                        const std::vector<sycl::event> &dependency_events)
-{
-    // bin_count = radix_states used for an array storing bucket state counters
-    static constexpr std::uint32_t radix_states =
-        (std::uint32_t(1) << radix_bits);
-    static constexpr std::uint32_t radix_mask = radix_states - 1;
-
-    // iteration space info
-    const std::size_t n = n_values;
-    // each segment is processed by a work-group
-    const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments;
-    const std::size_t no_op_flag_id = n_counts - 1;
-
-    assert(n_counts == (n_segments + 1) * radix_states + 1);
-
-    sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependency_events);
-
-        sycl::local_accessor<CountT, 1> counts_lacc(wg_size * radix_states,
-                                                    cgh);
-
-        sycl::nd_range<1> ndRange(n_iters * n_segments * wg_size, wg_size);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
-            // 0 <= lid < wg_size
-            const std::size_t lid = ndit.get_local_id(0);
-            // 0 <= group_id < n_segments * n_iters
-            const std::size_t group_id = ndit.get_group(0);
-            const std::size_t iter_id = group_id / n_segments;
-            const std::size_t val_iter_offset = iter_id * n;
-            // 0 <= wgr_id < n_segments
-            const std::size_t wgr_id = group_id - iter_id * n_segments;
-
-            const std::size_t seg_start = elems_per_segment * wgr_id;
-
-            // count per work-item: create a private array for storing count
-            // values here bin_count = radix_states
-            std::array<CountT, radix_states> counts_arr = {CountT{0}};
-
-            // count per work-item: count values and write result to private
-            // count array
-            const std::size_t seg_end =
-                sycl::min(seg_start + elems_per_segment, n);
-            if (is_ascending) {
-                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
-                     val_id += wg_size)
-                {
-                    // get the bucket for the bit-ordered input value,
-                    // applying the offset and mask for radix bits
-                    const auto val =
-                        order_preserving_cast</*is_ascending*/ true>(
-                            proj_op(vals_ptr[val_iter_offset + val_id]));
-                    const std::uint32_t bucket_id =
-                        get_bucket_id<radix_mask>(val, radix_offset);
-
-                    // increment counter for this bit bucket
-                    ++counts_arr[bucket_id];
-                }
-            }
-            else {
-                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
-                     val_id += wg_size)
-                {
-                    // get the bucket for the bit-ordered input value,
-                    // applying the offset and mask for radix bits
-                    const auto val =
-                        order_preserving_cast</*is_ascending*/ false>(
-                            proj_op(vals_ptr[val_iter_offset + val_id]));
-                    const std::uint32_t bucket_id =
-                        get_bucket_id<radix_mask>(val, radix_offset);
-
-                    // increment counter for this bit bucket
-                    ++counts_arr[bucket_id];
-                }
-            }
-
-            // count per work-item: write private count array to local count
-            // array counts_lacc is concatenation of private count arrays from
-            // each work-item in the order of their local ids
-            const std::uint32_t count_start_id = radix_states * lid;
-            for (std::uint32_t radix_state_id = 0;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
-                counts_lacc[count_start_id + radix_state_id] =
-                    counts_arr[radix_state_id];
-            }
-
-            sycl::group_barrier(ndit.get_group());
-
-            // count per work-group: reduce till count_lacc[] size > wg_size
-            // all work-items in the work-group do the work.
-            for (std::uint32_t i = 1; i < radix_states; ++i) {
-                // Since we interested in computing total count over work-group
-                // for each radix state, the correct result is only assured if
-                // wg_size >= radix_states
-                counts_lacc[lid] += counts_lacc[wg_size * i + lid];
-            }
-
-            sycl::group_barrier(ndit.get_group());
-
-            // count per work-group: reduce until count_lacc[] size >
-            // radix_states (n_witems /= 2 per iteration)
-            for (std::uint32_t n_witems = (wg_size >> 1);
-                 n_witems >= radix_states; n_witems >>= 1)
-            {
-                if (lid < n_witems)
-                    counts_lacc[lid] += counts_lacc[n_witems + lid];
-
-                sycl::group_barrier(ndit.get_group());
-            }
-
-            const std::size_t iter_counter_offset = iter_id * n_counts;
-
-            // count per work-group: write local count array to global count
-            // array
-            if (lid < radix_states) {
-                // move buckets with the same id to adjacent positions,
-                // thus splitting count array into radix_states regions
-                counts_ptr[iter_counter_offset + (n_segments + 1) * lid +
-                           wgr_id] = counts_lacc[lid];
-            }
-
-            // side work: reset 'no-operation-flag', signaling to skip re-order
-            // phase
-            if (wgr_id == 0 && lid == 0) {
-                CountT &no_op_flag =
-                    counts_ptr[iter_counter_offset + no_op_flag_id];
-                no_op_flag = 0;
-            }
-        });
-    });
-
-    return local_count_ev;
-}
-
-//-----------------------------------------------------------------------
-// radix sort: scan kernel (single iteration)
-//-----------------------------------------------------------------------
-
-template <typename KernelName, std::uint32_t radix_bits, typename CountT>
-sycl::event radix_sort_scan_submit(sycl::queue &exec_q,
-                                   std::size_t n_iters,
-                                   std::size_t n_segments,
-                                   std::size_t wg_size,
-                                   std::size_t n_values,
-                                   std::size_t n_counts,
-                                   CountT *counts_ptr,
-                                   const std::vector<sycl::event> depends)
-{
-    const std::size_t no_op_flag_id = n_counts - 1;
-
-    // Scan produces local offsets using count values.
-    // There are no local offsets for the first segment, but the rest segments
-    // should be scanned with respect to the count value in the first segment
-    // what requires n + 1 positions
-    const std::size_t scan_size = n_segments + 1;
-    wg_size = std::min(scan_size, wg_size);
-
-    static constexpr std::uint32_t radix_states = std::uint32_t(1)
-                                                  << radix_bits;
-
-    // compilation of the kernel prevents out of resources issue, which may
-    // occur due to usage of collective algorithms such as joint_exclusive_scan
-    // even if local memory is not explicitly requested
-    sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
-            const std::size_t group_id = ndit.get_group(0);
-            const std::size_t iter_id = group_id / radix_states;
-            const std::size_t wgr_id = group_id - iter_id * radix_states;
-            // find borders of a region with a specific bucket id
-            auto begin_ptr =
-                counts_ptr + scan_size * wgr_id + iter_id * n_counts;
-
-            sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr,
-                                       begin_ptr + scan_size, begin_ptr,
-                                       CountT(0), sycl::plus<CountT>{});
-
-            const auto lid = ndit.get_local_linear_id();
-
-            // NB: No race condition here, because the condition may ever be
-            // true for only on one WG, one WI.
-            if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values))
-            {
-                // set flag, since all the values got into one
-                // this is optimization, may happy often for
-                // higher radix offsets (all zeros)
-                auto &no_op_flag =
-                    counts_ptr[iter_id * n_counts + no_op_flag_id];
-                no_op_flag = 1;
-            }
-        });
-    });
-
-    return scan_ev;
-}
-
-//-----------------------------------------------------------------------
-// radix sort: group level reorder algorithms
-//-----------------------------------------------------------------------
-
-struct empty_storage
-{
-    template <typename... T> empty_storage(T &&...) {}
-};
-
-// Number with `n` least significant bits of uint32_t
-inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept
-{
-    static constexpr std::uint32_t zero{};
-    static constexpr std::uint32_t all_bits_set = ~zero;
-
-    return ~(all_bits_set << n);
-}
-
-enum class peer_prefix_algo
-{
-    subgroup_ballot,
-    atomic_fetch_or,
-    scan_then_broadcast
-};
-
-template <typename OffsetT, peer_prefix_algo Algo> struct peer_prefix_helper;
-
-template <typename AccT> auto get_accessor_pointer(const AccT &acc)
-{
-    return acc.template get_multi_ptr<sycl::access::decorated::no>().get();
-}
-
-template <typename OffsetT>
-struct peer_prefix_helper<OffsetT, peer_prefix_algo::atomic_fetch_or>
-{
-    using AtomicT = sycl::atomic_ref<std::uint32_t,
-                                     sycl::memory_order::relaxed,
-                                     sycl::memory_scope::work_group,
-                                     sycl::access::address_space::local_space>;
-    using TempStorageT = sycl::local_accessor<std::uint32_t, 1>;
-
-private:
-    sycl::sub_group sgroup;
-    std::uint32_t lid;
-    std::uint32_t item_mask;
-    AtomicT atomic_peer_mask;
-
-public:
-    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc)
-        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
-          item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0])
-    {
-    }
-
-    std::uint32_t peer_contribution(OffsetT &new_offset_id,
-                                    OffsetT offset_prefix,
-                                    bool wi_bit_set) const
-    {
-        // reset mask for each radix state
-        if (lid == 0)
-            atomic_peer_mask.store(std::uint32_t{0});
-        sycl::group_barrier(sgroup);
-
-        const std::uint32_t uint_contrib{wi_bit_set ? std::uint32_t{1}
-                                                    : std::uint32_t{0}};
-
-        // set local id's bit to 1 if the bucket value matches the radix state
-        atomic_peer_mask.fetch_or(uint_contrib << lid);
-        sycl::group_barrier(sgroup);
-        std::uint32_t peer_mask_bits = atomic_peer_mask.load();
-        std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits);
-
-        // get the local offset index from the bits set in the peer mask with
-        // index less than the work item ID
-        peer_mask_bits &= item_mask;
-        new_offset_id |= wi_bit_set
-                             ? (offset_prefix + sycl::popcount(peer_mask_bits))
-                             : OffsetT{0};
-        return sg_total_offset;
-    }
-};
-
-template <typename OffsetT>
-struct peer_prefix_helper<OffsetT, peer_prefix_algo::scan_then_broadcast>
-{
-    using TempStorageT = empty_storage;
-    using ItemType = sycl::nd_item<1>;
-    using SubGroupType = sycl::sub_group;
-
-private:
-    SubGroupType sgroup;
-    std::uint32_t sg_size;
-
-public:
-    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT)
-        : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0])
-    {
-    }
-
-    std::uint32_t peer_contribution(OffsetT &new_offset_id,
-                                    OffsetT offset_prefix,
-                                    bool wi_bit_set) const
-    {
-        const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1}
-                                               : std::uint32_t{0}};
-
-        std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group(
-            sgroup, contrib, sycl::plus<std::uint32_t>{});
-
-        new_offset_id |=
-            (wi_bit_set ? (offset_prefix + sg_item_offset) : OffsetT(0));
-
-        // the last scanned value does not contain number of all copies, thus
-        // adding contribution
-        std::uint32_t sg_total_offset = sycl::group_broadcast(
-            sgroup, sg_item_offset + contrib, sg_size - 1);
-
-        return sg_total_offset;
-    }
-};
-
-template <typename OffsetT>
-struct peer_prefix_helper<OffsetT, peer_prefix_algo::subgroup_ballot>
-{
-private:
-    sycl::sub_group sgroup;
-    std::uint32_t lid;
-    sycl::ext::oneapi::sub_group_mask item_sg_mask;
-
-    sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask,
-                                                   std::uint32_t sg_size)
-    {
-        return sycl::detail::Builder::createSubGroupMask<
-            sycl::ext::oneapi::sub_group_mask>(mask, sg_size);
-    }
-
-public:
-    using TempStorageT = empty_storage;
-
-    peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT)
-        : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()),
-          item_sg_mask(
-              mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range()))
-    {
-    }
-
-    std::uint32_t peer_contribution(OffsetT &new_offset_id,
-                                    OffsetT offset_prefix,
-                                    bool wi_bit_set) const
-    {
-        // set local id's bit to 1 if the bucket value matches the radix state
-        auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set);
-        std::uint32_t peer_mask_bits{};
-
-        peer_mask.extract_bits(peer_mask_bits);
-        std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits);
-
-        // get the local offset index from the bits set in the peer mask with
-        // index less than the work item ID
-        peer_mask &= item_sg_mask;
-        peer_mask.extract_bits(peer_mask_bits);
-
-        new_offset_id |= wi_bit_set
-                             ? (offset_prefix + sycl::popcount(peer_mask_bits))
-                             : OffsetT(0);
-
-        return sg_total_offset;
-    }
-};
-
-template <typename InputT, typename OutputT>
-void copy_func_for_radix_sort(const std::size_t n_segments,
-                              const std::size_t elems_per_segment,
-                              const std::size_t sg_size,
-                              const std::uint32_t lid,
-                              const std::size_t wgr_id,
-                              const InputT *input_ptr,
-                              const std::size_t n_values,
-                              OutputT *output_ptr)
-{
-    // item info
-    const std::size_t seg_start = elems_per_segment * wgr_id;
-
-    std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values);
-
-    // ensure that each work item in a subgroup does the same number of loop
-    // iterations
-    const std::uint16_t tail_size = (seg_end - seg_start) % sg_size;
-    seg_end -= tail_size;
-
-    // find offsets for the same values within a segment and fill the resulting
-    // buffer
-    for (std::size_t val_id = seg_start + lid; val_id < seg_end;
-         val_id += sg_size)
-    {
-        output_ptr[val_id] = std::move(input_ptr[val_id]);
-    }
-
-    if (tail_size > 0 && lid < tail_size) {
-        const std::size_t val_id = seg_end + lid;
-        output_ptr[val_id] = std::move(input_ptr[val_id]);
-    }
-}
-
-//-----------------------------------------------------------------------
-// radix sort: reorder kernel (per iteration)
-//-----------------------------------------------------------------------
-template <typename KernelName,
-          std::uint32_t radix_bits,
-          peer_prefix_algo PeerAlgo,
-          typename InputT,
-          typename OutputT,
-          typename OffsetT,
-          typename ProjT>
-sycl::event
-radix_sort_reorder_submit(sycl::queue &exec_q,
-                          std::size_t n_iters,
-                          std::size_t n_segments,
-                          std::uint32_t radix_offset,
-                          std::size_t n_values,
-                          const InputT *input_ptr,
-                          OutputT *output_ptr,
-                          std::size_t n_offsets,
-                          OffsetT *offset_ptr,
-                          const ProjT &proj_op,
-                          const bool is_ascending,
-                          const std::vector<sycl::event> dependency_events)
-{
-    using ValueT = InputT;
-    using PeerHelper = peer_prefix_helper<OffsetT, PeerAlgo>;
-
-    static constexpr std::uint32_t radix_states = std::uint32_t{1}
-                                                  << radix_bits;
-    static constexpr std::uint32_t radix_mask = radix_states - 1;
-    const std::size_t elems_per_segment =
-        (n_values + n_segments - 1) / n_segments;
-
-    const std::size_t no_op_flag_id = n_offsets - 1;
-
-    const auto &kernel_id = sycl::get_kernel_id<KernelName>();
-
-    auto const &ctx = exec_q.get_context();
-    auto const &dev = exec_q.get_device();
-    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-        ctx, {dev}, {kernel_id});
-
-    auto krn = kb.get_kernel(kernel_id);
-
-    const std::uint32_t sg_size = krn.template get_info<
-        sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-    sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependency_events);
-        cgh.use_kernel_bundle(kb);
-
-        using StorageT = typename PeerHelper::TempStorageT;
-
-        StorageT peer_temp(1, cgh);
-
-        sycl::range<1> lRange{sg_size};
-        sycl::range<1> gRange{n_iters * n_segments * sg_size};
-
-        sycl::nd_range<1> ndRange{gRange, lRange};
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> ndit) {
-            const std::size_t group_id = ndit.get_group(0);
-            const std::size_t iter_id = group_id / n_segments;
-            const std::size_t segment_id = group_id - iter_id * n_segments;
-
-            auto b_offset_ptr = offset_ptr + iter_id * n_offsets;
-            auto b_input_ptr = input_ptr + iter_id * n_values;
-            auto b_output_ptr = output_ptr + iter_id * n_values;
-
-            const std::uint32_t lid = ndit.get_local_id(0);
-
-            auto &no_op_flag = b_offset_ptr[no_op_flag_id];
-            if (no_op_flag) {
-                // no reordering necessary, simply copy
-                copy_func_for_radix_sort<InputT, OutputT>(
-                    n_segments, elems_per_segment, sg_size, lid, segment_id,
-                    b_input_ptr, n_values, b_output_ptr);
-                return;
-            }
-
-            // create a private array for storing offset values
-            // and add total offset and offset for compute unit
-            // for a certain radix state
-            std::array<OffsetT, radix_states> offset_arr{};
-            const std::size_t scan_size = n_segments + 1;
-
-            OffsetT scanned_bin = 0;
-
-            /* find cumulative offset */
-            static constexpr std::uint32_t zero_radix_state_id = 0;
-            offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id];
-
-            for (std::uint32_t radix_state_id = 1;
-                 radix_state_id < radix_states; ++radix_state_id)
-            {
-                const std::uint32_t local_offset_id =
-                    segment_id + scan_size * radix_state_id;
-
-                // scan bins serially
-                const std::size_t last_segment_bucket_id =
-                    radix_state_id * scan_size - 1;
-                scanned_bin += b_offset_ptr[last_segment_bucket_id];
-
-                offset_arr[radix_state_id] =
-                    scanned_bin + b_offset_ptr[local_offset_id];
-            }
-
-            const std::size_t seg_start = elems_per_segment * segment_id;
-            std::size_t seg_end =
-                sycl::min(seg_start + elems_per_segment, n_values);
-            // ensure that each work item in a subgroup does the same number of
-            // loop iterations
-            const std::uint32_t tail_size = (seg_end - seg_start) % sg_size;
-            seg_end -= tail_size;
-
-            const PeerHelper peer_prefix_hlp(ndit, peer_temp);
-
-            // find offsets for the same values within a segment and fill the
-            // resulting buffer
-            if (is_ascending) {
-                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
-                     val_id += sg_size)
-                {
-                    ValueT in_val = std::move(b_input_ptr[val_id]);
-
-                    // get the bucket for the bit-ordered input value, applying
-                    // the offset and mask for radix bits
-                    const auto mapped_val =
-                        order_preserving_cast</*is_ascending*/ true>(
-                            proj_op(in_val));
-                    std::uint32_t bucket_id =
-                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
-
-                    OffsetT new_offset_id = 0;
-                    for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
-                        bool is_current_bucket = (bucket_id == radix_state_id);
-                        std::uint32_t sg_total_offset =
-                            peer_prefix_hlp.peer_contribution(
-                                /* modified by reference */ new_offset_id,
-                                offset_arr[radix_state_id],
-                                /* bit contribution from this work-item */
-                                is_current_bucket);
-                        offset_arr[radix_state_id] += sg_total_offset;
-                    }
-                    b_output_ptr[new_offset_id] = std::move(in_val);
-                }
-            }
-            else {
-                for (std::size_t val_id = seg_start + lid; val_id < seg_end;
-                     val_id += sg_size)
-                {
-                    ValueT in_val = std::move(b_input_ptr[val_id]);
-
-                    // get the bucket for the bit-ordered input value, applying
-                    // the offset and mask for radix bits
-                    const auto mapped_val =
-                        order_preserving_cast</*is_ascending*/ false>(
-                            proj_op(in_val));
-                    std::uint32_t bucket_id =
-                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
-
-                    OffsetT new_offset_id = 0;
-                    for (std::uint32_t radix_state_id = 0;
-                         radix_state_id < radix_states; ++radix_state_id)
-                    {
-                        bool is_current_bucket = (bucket_id == radix_state_id);
-                        std::uint32_t sg_total_offset =
-                            peer_prefix_hlp.peer_contribution(
-                                /* modified by reference */ new_offset_id,
-                                offset_arr[radix_state_id],
-                                /* bit contribution from this work-item */
-                                is_current_bucket);
-                        offset_arr[radix_state_id] += sg_total_offset;
-                    }
-                    b_output_ptr[new_offset_id] = std::move(in_val);
-                }
-            }
-            if (tail_size > 0) {
-                ValueT in_val;
-
-                // default: is greater than any actual radix state
-                std::uint32_t bucket_id = radix_states;
-                if (lid < tail_size) {
-                    in_val = std::move(b_input_ptr[seg_end + lid]);
-
-                    const auto proj_val = proj_op(in_val);
-                    const auto mapped_val =
-                        (is_ascending)
-                            ? order_preserving_cast</*is_ascending*/ true>(
-                                  proj_val)
-                            : order_preserving_cast</*is_ascending*/ false>(
-                                  proj_val);
-                    bucket_id =
-                        get_bucket_id<radix_mask>(mapped_val, radix_offset);
-                }
-
-                OffsetT new_offset_id = 0;
-                for (std::uint32_t radix_state_id = 0;
-                     radix_state_id < radix_states; ++radix_state_id)
-                {
-                    bool is_current_bucket = (bucket_id == radix_state_id);
-                    std::uint32_t sg_total_offset =
-                        peer_prefix_hlp.peer_contribution(
-                            new_offset_id, offset_arr[radix_state_id],
-                            is_current_bucket);
-
-                    offset_arr[radix_state_id] += sg_total_offset;
-                }
-
-                if (lid < tail_size) {
-                    b_output_ptr[new_offset_id] = std::move(in_val);
-                }
-            }
-        });
-    });
-
-    return reorder_ev;
-}
-
-template <typename sizeT>
-sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q,
-                                    sizeT required_slm_bytes_per_wg,
-                                    sizeT wg_size)
-{
-    const auto &dev = exec_q.get_device();
-
-    if (wg_size == 0)
-        wg_size =
-            dev.template get_info<sycl::info::device::max_work_group_size>();
-
-    const auto local_mem_sz =
-        dev.template get_info<sycl::info::device::local_mem_size>();
-
-    return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size);
-}
-
-//-----------------------------------------------------------------------
-// radix sort: one iteration
-//-----------------------------------------------------------------------
-
-template <std::uint32_t radix_bits, bool even>
-struct parallel_radix_sort_iteration_step
-{
-    template <typename... Name>
-    using count_phase = radix_sort_count_kernel<radix_bits, even, Name...>;
-    template <typename... Name>
-    using local_scan_phase = radix_sort_scan_kernel<radix_bits, Name...>;
-    template <typename... Name>
-    using reorder_peer_phase =
-        radix_sort_reorder_peer_kernel<radix_bits, even, Name...>;
-    template <typename... Name>
-    using reorder_phase = radix_sort_reorder_kernel<radix_bits, even, Name...>;
-
-    template <typename InputT,
-              typename OutputT,
-              typename CountT,
-              typename ProjT>
-    static sycl::event submit(sycl::queue &exec_q,
-                              std::size_t n_iters,
-                              std::size_t n_segments,
-                              std::uint32_t radix_iter,
-                              std::size_t n_values,
-                              const InputT *in_ptr,
-                              OutputT *out_ptr,
-                              std::size_t n_counts,
-                              CountT *counts_ptr,
-                              const ProjT &proj_op,
-                              const bool is_ascending,
-                              const std::vector<sycl::event> &dependency_events)
-    {
-        using _RadixCountKernel = count_phase<InputT, OutputT, CountT, ProjT>;
-        using _RadixLocalScanKernel =
-            local_scan_phase<InputT, OutputT, CountT, ProjT>;
-        using _RadixReorderPeerKernel =
-            reorder_peer_phase<InputT, OutputT, CountT, ProjT>;
-        using _RadixReorderKernel =
-            reorder_phase<InputT, OutputT, CountT, ProjT>;
-
-        const auto &supported_sub_group_sizes =
-            exec_q.get_device()
-                .template get_info<sycl::info::device::sub_group_sizes>();
-        const std::size_t max_sg_size =
-            (supported_sub_group_sizes.empty()
-                 ? 0
-                 : supported_sub_group_sizes.back());
-        const std::size_t reorder_sg_size = max_sg_size;
-        const std::size_t scan_wg_size =
-            exec_q.get_device()
-                .template get_info<sycl::info::device::max_work_group_size>();
-
-        static constexpr std::size_t two_mils = (std::size_t(1) << 21);
-        std::size_t count_wg_size =
-            ((max_sg_size > 0) && (n_values > two_mils) ? 128 : max_sg_size);
-
-        static constexpr std::uint32_t radix_states = std::uint32_t(1)
-                                                      << radix_bits;
-
-        // correct count_wg_size according to local memory limit in count phase
-        const auto max_count_wg_size = _slm_adjusted_work_group_size(
-            exec_q, sizeof(CountT) * radix_states, count_wg_size);
-        count_wg_size =
-            static_cast<::std::size_t>((max_count_wg_size / radix_states)) *
-            radix_states;
-
-        // work-group size must be a power of 2 and not less than the number of
-        // states, for scanning to work correctly
-
-        const std::size_t rounded_down_count_wg_size =
-            std::size_t{1} << (number_of_bits_in_type<std::size_t>() -
-                               sycl::clz(count_wg_size) - 1);
-        count_wg_size =
-            sycl::max(rounded_down_count_wg_size, std::size_t(radix_states));
-
-        // Compute the radix position for the given iteration
-        std::uint32_t radix_offset = radix_iter * radix_bits;
-
-        // 1. Count Phase
-        sycl::event count_ev =
-            radix_sort_count_submit<_RadixCountKernel, radix_bits>(
-                exec_q, n_iters, n_segments, count_wg_size, radix_offset,
-                n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending,
-                dependency_events);
-
-        // 2. Scan Phase
-        sycl::event scan_ev =
-            radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>(
-                exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts,
-                counts_ptr, {count_ev});
-
-        // 3. Reorder Phase
-        sycl::event reorder_ev{};
-        // subgroup_ballot-based peer algo uses extract_bits to populate
-        // uint32_t mask and hence relies on sub-group to be 32 or narrower
-        static constexpr std::size_t sg32_v = 32u;
-        static constexpr std::size_t sg16_v = 16u;
-        static constexpr std::size_t sg08_v = 8u;
-        if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size ||
-            sg08_v == reorder_sg_size)
-        {
-            static constexpr auto peer_algorithm =
-                peer_prefix_algo::subgroup_ballot;
-
-            reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel,
-                                                   radix_bits, peer_algorithm>(
-                exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr,
-                out_ptr, n_counts, counts_ptr, proj_op, is_ascending,
-                {scan_ev});
-        }
-        else {
-            static constexpr auto peer_algorithm =
-                peer_prefix_algo::scan_then_broadcast;
-
-            reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel,
-                                                   radix_bits, peer_algorithm>(
-                exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr,
-                out_ptr, n_counts, counts_ptr, proj_op, is_ascending,
-                {scan_ev});
-        }
-
-        return reorder_ev;
-    }
-}; // struct parallel_radix_sort_iteration
-
-template <typename Names, std::uint16_t... Constants>
-class radix_sort_one_wg_krn;
-
-template <typename KernelNameBase,
-          std::uint16_t wg_size = 256,
-          std::uint16_t block_size = 16,
-          std::uint32_t radix = 4,
-          std::uint16_t req_sub_group_size = (block_size < 4 ? 32 : 16)>
-struct subgroup_radix_sort
-{
-private:
-    class use_slm_tag
-    {
-    };
-    class use_global_mem_tag
-    {
-    };
-
-public:
-    template <typename ValueT, typename OutputT, typename ProjT>
-    sycl::event operator()(sycl::queue &exec_q,
-                           std::size_t n_iters,
-                           std::size_t n_to_sort,
-                           ValueT *input_ptr,
-                           OutputT *output_ptr,
-                           ProjT proj_op,
-                           const bool is_ascending,
-                           const std::vector<sycl::event> &depends)
-    {
-        static_assert(std::is_same_v<std::remove_cv_t<ValueT>, OutputT>);
-
-        using _SortKernelLoc =
-            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 0>;
-        using _SortKernelPartGlob =
-            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 1>;
-        using _SortKernelGlob =
-            radix_sort_one_wg_krn<KernelNameBase, wg_size, block_size, 2>;
-
-        static constexpr std::size_t max_concurrent_work_groups = 128U;
-
-        // Choose this to occupy the entire accelerator
-        const std::size_t n_work_groups =
-            std::min<std::size_t>(n_iters, max_concurrent_work_groups);
-
-        // determine which temporary allocation can be accommodated in SLM
-        const auto &SLM_availability =
-            check_slm_size<ValueT>(exec_q, n_to_sort);
-
-        const std::size_t n_batch_size = n_work_groups;
-
-        switch (SLM_availability) {
-        case temp_allocations::both_in_slm:
-        {
-            static constexpr auto storage_for_values = use_slm_tag{};
-            static constexpr auto storage_for_counters = use_slm_tag{};
-
-            return one_group_submitter<_SortKernelLoc>()(
-                exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr,
-                proj_op, is_ascending, storage_for_values, storage_for_counters,
-                depends);
-        }
-        case temp_allocations::counters_in_slm:
-        {
-            static constexpr auto storage_for_values = use_global_mem_tag{};
-            static constexpr auto storage_for_counters = use_slm_tag{};
-
-            return one_group_submitter<_SortKernelPartGlob>()(
-                exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr,
-                proj_op, is_ascending, storage_for_values, storage_for_counters,
-                depends);
-        }
-        default:
-        {
-            static constexpr auto storage_for_values = use_global_mem_tag{};
-            static constexpr auto storage_for_counters = use_global_mem_tag{};
-
-            return one_group_submitter<_SortKernelGlob>()(
-                exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr,
-                proj_op, is_ascending, storage_for_values, storage_for_counters,
-                depends);
-        }
-        }
-    }
-
-private:
-    template <typename KeyT, typename> class TempBuf;
-
-    template <typename KeyT> class TempBuf<KeyT, use_slm_tag>
-    {
-        std::size_t buf_size;
-
-    public:
-        TempBuf(std::size_t, std::size_t n) : buf_size(n) {}
-        auto get_acc(sycl::handler &cgh)
-        {
-            return sycl::local_accessor<KeyT>(buf_size, cgh);
-        }
-
-        std::size_t get_iter_stride() const { return std::size_t{0}; }
-    };
-
-    template <typename KeyT> class TempBuf<KeyT, use_global_mem_tag>
-    {
-        sycl::buffer<KeyT> buf;
-        std::size_t iter_stride;
-
-    public:
-        TempBuf(std::size_t n_iters, std::size_t n)
-            : buf(n_iters * n), iter_stride(n)
-        {
-        }
-        auto get_acc(sycl::handler &cgh)
-        {
-            return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init);
-        }
-        std::size_t get_iter_stride() const { return iter_stride; }
-    };
-
-    static_assert(wg_size <= 1024);
-    static constexpr std::uint16_t bin_count = (1 << radix);
-    static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1;
-
-    enum class temp_allocations
-    {
-        both_in_slm,
-        counters_in_slm,
-        both_in_global_mem
-    };
-
-    template <typename T, typename SizeT>
-    temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n)
-    {
-        // the kernel is designed for data size <= 64K
-        assert(n <= (SizeT(1) << 16));
-
-        static constexpr auto req_slm_size_counters =
-            counter_buf_sz * sizeof(std::uint16_t);
-
-        const auto &dev = exec_q.get_device();
-
-        // Pessimistically only use half of the memory to take into account
-        // a SYCL group algorithm might use a portion of SLM
-        const std::size_t max_slm_size =
-            dev.template get_info<sycl::info::device::local_mem_size>() / 2;
-
-        const auto n_uniform = 1 << ceil_log2(n);
-        const auto req_slm_size_val = sizeof(T) * n_uniform;
-
-        return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size)
-                   ?
-                   // the values and the counters are placed in SLM
-                   temp_allocations::both_in_slm
-                   : (req_slm_size_counters <= max_slm_size)
-                         ?
-                         // the counters are placed in SLM, the values - in the
-                         // global memory
-                         temp_allocations::counters_in_slm
-                         :
-                         // the values and the counters are placed in the global
-                         // memory
-                         temp_allocations::both_in_global_mem;
-    }
-
-    template <typename KernelName> struct one_group_submitter
-    {
-        template <typename InputT,
-                  typename OutputT,
-                  typename ProjT,
-                  typename SLM_value_tag,
-                  typename SLM_counter_tag>
-        sycl::event operator()(sycl::queue &exec_q,
-                               std::size_t n_iters,
-                               std::size_t n_batch_size,
-                               std::size_t n_values,
-                               InputT *input_arr,
-                               OutputT *output_arr,
-                               const ProjT &proj_op,
-                               const bool is_ascending,
-                               SLM_value_tag,
-                               SLM_counter_tag,
-                               const std::vector<sycl::event> &depends)
-        {
-            assert(!(n_values >> 16));
-
-            assert(n_values <= static_cast<std::size_t>(block_size) *
-                                   static_cast<std::size_t>(wg_size));
-
-            const std::uint16_t n = static_cast<std::uint16_t>(n_values);
-            static_assert(std::is_same_v<std::remove_cv_t<InputT>, OutputT>);
-
-            using ValueT = OutputT;
-
-            using KeyT = std::invoke_result_t<ProjT, ValueT>;
-
-            TempBuf<ValueT, SLM_value_tag> buf_val(
-                n_batch_size, static_cast<std::size_t>(block_size * wg_size));
-            TempBuf<std::uint16_t, SLM_counter_tag> buf_count(
-                n_batch_size, static_cast<std::size_t>(counter_buf_sz));
-
-            sycl::range<1> lRange{wg_size};
-
-            sycl::event sort_ev;
-            std::vector<sycl::event> deps{depends};
-
-            const std::size_t n_batches =
-                (n_iters + n_batch_size - 1) / n_batch_size;
-
-            const auto &kernel_id = sycl::get_kernel_id<KernelName>();
-
-            auto const &ctx = exec_q.get_context();
-            auto const &dev = exec_q.get_device();
-            auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-                ctx, {dev}, {kernel_id});
-
-            const auto &krn = kb.get_kernel(kernel_id);
-
-            const std::uint32_t krn_sg_size = krn.template get_info<
-                sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-
-            // due to a bug in CPU device implementation, an additional
-            // synchronization is necessary for short sub-group sizes
-            const bool work_around_needed =
-                exec_q.get_device().has(sycl::aspect::cpu) &&
-                (krn_sg_size < 16);
-
-            for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) {
-
-                const std::size_t block_start = batch_id * n_batch_size;
-
-                // input_arr/output_arr each has shape (n_iters, n)
-                InputT *this_input_arr = input_arr + block_start * n_values;
-                OutputT *this_output_arr = output_arr + block_start * n_values;
-
-                const std::size_t block_end =
-                    std::min<std::size_t>(block_start + n_batch_size, n_iters);
-
-                sycl::range<1> gRange{(block_end - block_start) * wg_size};
-                sycl::nd_range ndRange{gRange, lRange};
-
-                sort_ev = exec_q.submit([&](sycl::handler &cgh) {
-                    cgh.depends_on(deps);
-                    cgh.use_kernel_bundle(kb);
-
-                    // allocation to use for value exchanges
-                    auto exchange_acc = buf_val.get_acc(cgh);
-                    const std::size_t exchange_acc_iter_stride =
-                        buf_val.get_iter_stride();
-
-                    // allocation for counters
-                    auto counter_acc = buf_count.get_acc(cgh);
-                    const std::size_t counter_acc_iter_stride =
-                        buf_count.get_iter_stride();
-
-                    cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1>
-                                                                  ndit) {
-                        ValueT values[block_size];
-
-                        const std::size_t iter_id = ndit.get_group(0);
-                        const std::size_t iter_val_offset =
-                            iter_id * static_cast<std::size_t>(n);
-                        const std::size_t iter_counter_offset =
-                            iter_id * counter_acc_iter_stride;
-                        const std::size_t iter_exchange_offset =
-                            iter_id * exchange_acc_iter_stride;
-
-                        std::uint16_t wi = ndit.get_local_linear_id();
-                        std::uint16_t begin_bit = 0;
-
-                        static constexpr std::uint16_t end_bit =
-                            number_of_bits_in_type<KeyT>();
-
-                        // copy from input array into values
-#pragma unroll
-                        for (std::uint16_t i = 0; i < block_size; ++i) {
-                            const std::uint16_t id = wi * block_size + i;
-                            values[i] =
-                                (id < n) ? this_input_arr[iter_val_offset + id]
-                                         : ValueT{};
-                        }
-
-                        while (true) {
-                            // indices for indirect access in the "re-order"
-                            // phase
-                            std::uint16_t indices[block_size];
-                            {
-                                // pointers to bucket's counters
-                                std::uint16_t *counters[block_size];
-
-                                // counting phase
-                                auto pcounter =
-                                    get_accessor_pointer(counter_acc) +
-                                    (wi + iter_counter_offset);
-
-                                // initialize counters
-#pragma unroll
-                                for (std::uint16_t i = 0; i < bin_count; ++i)
-                                    pcounter[i * wg_size] = std::uint16_t{0};
-
-                                sycl::group_barrier(ndit.get_group());
-
-                                if (is_ascending) {
-#pragma unroll
-                                    for (std::uint16_t i = 0; i < block_size;
-                                         ++i)
-                                    {
-                                        const std::uint16_t id =
-                                            wi * block_size + i;
-                                        static constexpr std::uint16_t
-                                            bin_mask = bin_count - 1;
-
-                                        // points to the padded element, i.e. id
-                                        // is in-range
-                                        static constexpr std::uint16_t
-                                            default_out_of_range_bin_id =
-                                                bin_mask;
-
-                                        const std::uint16_t bin =
-                                            (id < n)
-                                                ? get_bucket_id<bin_mask>(
-                                                      order_preserving_cast<
-                                                          /* is_ascending */
-                                                          true>(
-                                                          proj_op(values[i])),
-                                                      begin_bit)
-                                                : default_out_of_range_bin_id;
-
-                                        // counting and local offset calculation
-                                        counters[i] = &pcounter[bin * wg_size];
-                                        indices[i] = *counters[i];
-                                        *counters[i] = indices[i] + 1;
-
-                                        if (work_around_needed) {
-                                            sycl::group_barrier(
-                                                ndit.get_group());
-                                        }
-                                    }
-                                }
-                                else {
-#pragma unroll
-                                    for (std::uint16_t i = 0; i < block_size;
-                                         ++i)
-                                    {
-                                        const std::uint16_t id =
-                                            wi * block_size + i;
-                                        static constexpr std::uint16_t
-                                            bin_mask = bin_count - 1;
-
-                                        // points to the padded element, i.e. id
-                                        // is in-range
-                                        static constexpr std::uint16_t
-                                            default_out_of_range_bin_id =
-                                                bin_mask;
-
-                                        const std::uint16_t bin =
-                                            (id < n)
-                                                ? get_bucket_id<bin_mask>(
-                                                      order_preserving_cast<
-                                                          /* is_ascending */
-                                                          false>(
-                                                          proj_op(values[i])),
-                                                      begin_bit)
-                                                : default_out_of_range_bin_id;
-
-                                        // counting and local offset calculation
-                                        counters[i] = &pcounter[bin * wg_size];
-                                        indices[i] = *counters[i];
-                                        *counters[i] = indices[i] + 1;
-
-                                        if (work_around_needed) {
-                                            sycl::group_barrier(
-                                                ndit.get_group());
-                                        }
-                                    }
-                                }
-
-                                sycl::group_barrier(ndit.get_group());
-
-                                // exclusive scan phase
-                                {
-
-                                    // scan contiguous numbers
-                                    std::uint16_t bin_sum[bin_count];
-                                    const std::size_t counter_offset0 =
-                                        iter_counter_offset + wi * bin_count;
-                                    bin_sum[0] = counter_acc[counter_offset0];
-
-#pragma unroll
-                                    for (std::uint16_t i = 1; i < bin_count;
-                                         ++i)
-                                        bin_sum[i] =
-                                            bin_sum[i - 1] +
-                                            counter_acc[counter_offset0 + i];
-
-                                    sycl::group_barrier(ndit.get_group());
-
-                                    // exclusive scan local sum
-                                    std::uint16_t sum_scan =
-                                        sycl::exclusive_scan_over_group(
-                                            ndit.get_group(),
-                                            bin_sum[bin_count - 1],
-                                            sycl::plus<std::uint16_t>());
-
-// add to local sum, generate exclusive scan result
-#pragma unroll
-                                    for (std::uint16_t i = 0; i < bin_count;
-                                         ++i)
-                                        counter_acc[counter_offset0 + i + 1] =
-                                            sum_scan + bin_sum[i];
-
-                                    if (wi == 0)
-                                        counter_acc[iter_counter_offset + 0] =
-                                            std::uint32_t{0};
-
-                                    sycl::group_barrier(ndit.get_group());
-                                }
-
-#pragma unroll
-                                for (std::uint16_t i = 0; i < block_size; ++i) {
-                                    // a global index is a local offset plus a
-                                    // global base index
-                                    indices[i] += *counters[i];
-                                }
-
-                                sycl::group_barrier(ndit.get_group());
-                            }
-
-                            begin_bit += radix;
-
-                            // "re-order" phase
-                            sycl::group_barrier(ndit.get_group());
-                            if (begin_bit >= end_bit) {
-                                // the last iteration - writing out the result
-#pragma unroll
-                                for (std::uint16_t i = 0; i < block_size; ++i) {
-                                    const std::uint16_t r = indices[i];
-                                    if (r < n) {
-                                        this_output_arr[iter_val_offset + r] =
-                                            values[i];
-                                    }
-                                }
-
-                                return;
-                            }
-
-                            // data exchange
-#pragma unroll
-                            for (std::uint16_t i = 0; i < block_size; ++i) {
-                                const std::uint16_t r = indices[i];
-                                if (r < n)
-                                    exchange_acc[iter_exchange_offset + r] =
-                                        values[i];
-                            }
-
-                            sycl::group_barrier(ndit.get_group());
-
-#pragma unroll
-                            for (std::uint16_t i = 0; i < block_size; ++i) {
-                                const std::uint16_t id = wi * block_size + i;
-                                if (id < n)
-                                    values[i] =
-                                        exchange_acc[iter_exchange_offset + id];
-                            }
-
-                            sycl::group_barrier(ndit.get_group());
-                        }
-                    });
-                });
-
-                deps = {sort_ev};
-            }
-
-            return sort_ev;
-        }
-    };
-};
-
-template <typename ValueT, typename ProjT> struct OneWorkGroupRadixSortKernel;
-
-//-----------------------------------------------------------------------
-// radix sort: main function
-//-----------------------------------------------------------------------
-template <typename ValueT, typename ProjT>
-sycl::event parallel_radix_sort_impl(sycl::queue &exec_q,
-                                     std::size_t n_iters,
-                                     std::size_t n_to_sort,
-                                     const ValueT *input_arr,
-                                     ValueT *output_arr,
-                                     const ProjT &proj_op,
-                                     const bool is_ascending,
-                                     const std::vector<sycl::event> &depends)
-{
-    assert(n_to_sort > 1);
-
-    using KeyT = std::remove_cv_t<
-        std::remove_reference_t<std::invoke_result_t<ProjT, ValueT>>>;
-
-    // radix bits represent number of processed bits in each value during one
-    // iteration
-    static constexpr std::uint32_t radix_bits = 4;
-
-    sycl::event sort_ev{};
-
-    const auto &dev = exec_q.get_device();
-    const auto max_wg_size =
-        dev.template get_info<sycl::info::device::max_work_group_size>();
-
-    static constexpr std::uint16_t ref_wg_size = 64;
-    if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) {
-        using _RadixSortKernel = OneWorkGroupRadixSortKernel<ValueT, ProjT>;
-
-        if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) {
-            // wg_size * block_size == 64 * 1 * 1 == 64
-            static constexpr std::uint16_t wg_size = ref_wg_size;
-            static constexpr std::uint16_t block_size = 1;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) {
-            // wg_size * block_size == 64 * 2 * 1 == 128
-            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
-            static constexpr std::uint16_t block_size = 1;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) {
-            // wg_size * block_size == 64 * 2 * 2 == 256
-            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
-            static constexpr std::uint16_t block_size = 2;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) {
-            // wg_size * block_size == 64 * 2 * 4 == 512
-            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
-            static constexpr std::uint16_t block_size = 4;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) {
-            // wg_size * block_size == 64 * 2 * 8 == 1024
-            static constexpr std::uint16_t wg_size = ref_wg_size * 2;
-            static constexpr std::uint16_t block_size = 8;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) {
-            // wg_size * block_size == 64 * 4 * 8 == 2048
-            static constexpr std::uint16_t wg_size = ref_wg_size * 4;
-            static constexpr std::uint16_t block_size = 8;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) {
-            // wg_size * block_size == 64 * 4 * 16 == 4096
-            static constexpr std::uint16_t wg_size = ref_wg_size * 4;
-            static constexpr std::uint16_t block_size = 16;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) {
-            // wg_size * block_size == 64 * 8 * 16 == 8192
-            static constexpr std::uint16_t wg_size = ref_wg_size * 8;
-            static constexpr std::uint16_t block_size = 16;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-        else {
-            // wg_size * block_size == 64 * 8 * 32 == 16384
-            static constexpr std::uint16_t wg_size = ref_wg_size * 8;
-            static constexpr std::uint16_t block_size = 32;
-
-            sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size,
-                                          radix_bits>{}(
-                exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op,
-                is_ascending, depends);
-        }
-    }
-    else {
-        static constexpr std::uint32_t radix_iters =
-            number_of_buckets_in_type<KeyT>(radix_bits);
-        static constexpr std::uint32_t radix_states = std::uint32_t(1)
-                                                      << radix_bits;
-
-        static constexpr std::size_t bound_512k = (std::size_t(1) << 19);
-        static constexpr std::size_t bound_2m = (std::size_t(1) << 21);
-
-        const auto wg_sz_k = (n_to_sort < bound_512k)  ? 8
-                             : (n_to_sort <= bound_2m) ? 4
-                                                       : 1;
-        const std::size_t wg_size = max_wg_size / wg_sz_k;
-
-        const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size;
-
-        // Additional radix_states elements are used for getting local offsets
-        // from count values + no_op flag; 'No operation' flag specifies whether
-        // to skip re-order phase if the all keys are the same (lie in one bin)
-        const std::size_t n_counts =
-            (n_segments + 1) * radix_states + 1 /*no_op flag*/;
-
-        using CountT = std::uint32_t;
-
-        // memory for storing count and offset values
-        auto count_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<CountT>(
-                n_iters * n_counts, exec_q);
-
-        CountT *count_ptr = count_owner.get();
-
-        static constexpr std::uint32_t zero_radix_iter{0};
-
-        if constexpr (std::is_same_v<KeyT, bool>) {
-
-            sort_ev = parallel_radix_sort_iteration_step<
-                radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments,
-                                                   zero_radix_iter, n_to_sort,
-                                                   input_arr, output_arr,
-                                                   n_counts, count_ptr, proj_op,
-                                                   is_ascending, depends);
-
-            sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {sort_ev}, count_owner);
-
-            return sort_ev;
-        }
-
-        auto tmp_arr_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<ValueT>(
-                n_iters * n_to_sort, exec_q);
-
-        ValueT *tmp_arr = tmp_arr_owner.get();
-
-        // iterations per each bucket
-        assert("Number of iterations must be even" && radix_iters % 2 == 0);
-        assert(radix_iters > 0);
-
-        sort_ev = parallel_radix_sort_iteration_step<
-            radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments,
-                                               zero_radix_iter, n_to_sort,
-                                               input_arr, tmp_arr, n_counts,
-                                               count_ptr, proj_op, is_ascending,
-                                               depends);
-
-        for (std::uint32_t radix_iter = 1; radix_iter < radix_iters;
-             ++radix_iter)
-        {
-            if (radix_iter % 2 == 0) {
-                sort_ev = parallel_radix_sort_iteration_step<
-                    radix_bits,
-                    /*even=*/true>::submit(exec_q, n_iters, n_segments,
-                                           radix_iter, n_to_sort, output_arr,
-                                           tmp_arr, n_counts, count_ptr,
-                                           proj_op, is_ascending, {sort_ev});
-            }
-            else {
-                sort_ev = parallel_radix_sort_iteration_step<
-                    radix_bits,
-                    /*even=*/false>::submit(exec_q, n_iters, n_segments,
-                                            radix_iter, n_to_sort, tmp_arr,
-                                            output_arr, n_counts, count_ptr,
-                                            proj_op, is_ascending, {sort_ev});
-            }
-        }
-
-        sort_ev = dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {sort_ev}, tmp_arr_owner, count_owner);
-    }
-
-    return sort_ev;
-}
-
-struct IdentityProj
-{
-    constexpr IdentityProj() {}
-
-    template <typename T> constexpr T operator()(T val) const { return val; }
-};
-
-template <typename ValueT, typename IndexT> struct ValueProj
-{
-    constexpr ValueProj() {}
-
-    constexpr ValueT operator()(const std::pair<ValueT, IndexT> &pair) const
-    {
-        return pair.first;
-    }
-};
-
-template <typename IndexT, typename ValueT, typename ProjT> struct IndexedProj
-{
-    IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {}
-
-    IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op)
-        : ptr(arg_ptr), value_projector(proj_op)
-    {
-    }
-
-    auto operator()(IndexT i) const { return value_projector(ptr[i]); }
-
-private:
-    const ValueT *ptr;
-    ProjT value_projector;
-};
-
-} // end of namespace radix_sort_details
-
-using dpctl::tensor::ssize_t;
-
-template <typename argTy>
-sycl::event
-radix_sort_axis1_contig_impl(sycl::queue &exec_q,
-                             const bool sort_ascending,
-                             // number of sub-arrays to sort (num. of rows in a
-                             // matrix when sorting over rows)
-                             std::size_t iter_nelems,
-                             // size of each array to sort  (length of rows,
-                             // i.e. number of columns)
-                             std::size_t sort_nelems,
-                             const char *arg_cp,
-                             char *res_cp,
-                             ssize_t iter_arg_offset,
-                             ssize_t iter_res_offset,
-                             ssize_t sort_arg_offset,
-                             ssize_t sort_res_offset,
-                             const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + sort_arg_offset;
-    argTy *res_tp =
-        reinterpret_cast<argTy *>(res_cp) + iter_res_offset + sort_res_offset;
-
-    using Proj = radix_sort_details::IdentityProj;
-    static constexpr Proj proj_op{};
-
-    sycl::event radix_sort_ev =
-        radix_sort_details::parallel_radix_sort_impl<argTy, Proj>(
-            exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op,
-            sort_ascending, depends);
-
-    return radix_sort_ev;
-}
-
-template <typename ValueT, typename IndexT>
-class radix_argsort_index_write_out_krn;
-
-template <typename ValueT, typename IndexT> class radix_argsort_iota_krn;
-
-template <typename argTy, typename IndexTy>
-sycl::event
-radix_argsort_axis1_contig_impl(sycl::queue &exec_q,
-                                const bool sort_ascending,
-                                // number of sub-arrays to sort (num. of
-                                // rows in a matrix when sorting over rows)
-                                std::size_t iter_nelems,
-                                // size of each array to sort  (length of
-                                // rows, i.e. number of columns)
-                                std::size_t sort_nelems,
-                                const char *arg_cp,
-                                char *res_cp,
-                                ssize_t iter_arg_offset,
-                                ssize_t iter_res_offset,
-                                ssize_t sort_arg_offset,
-                                ssize_t sort_res_offset,
-                                const std::vector<sycl::event> &depends)
-{
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
-                          iter_arg_offset + sort_arg_offset;
-    IndexTy *res_tp =
-        reinterpret_cast<IndexTy *>(res_cp) + iter_res_offset + sort_res_offset;
-
-    const std::size_t total_nelems = iter_nelems * sort_nelems;
-    auto workspace_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(total_nelems,
-                                                                 exec_q);
-
-    // get raw USM pointer
-    IndexTy *workspace = workspace_owner.get();
-
-    using IdentityProjT = radix_sort_details::IdentityProj;
-    using IndexedProjT =
-        radix_sort_details::IndexedProj<IndexTy, argTy, IdentityProjT>;
-    const IndexedProjT proj_op{arg_tp};
-
-    using IotaKernelName = radix_argsort_iota_krn<argTy, IndexTy>;
-
-    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
-
-    sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
-        exec_q, workspace, total_nelems, depends);
-
-    sycl::event radix_sort_ev =
-        radix_sort_details::parallel_radix_sort_impl<IndexTy, IndexedProjT>(
-            exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op,
-            sort_ascending, {iota_ev});
-
-    using MapBackKernelName = radix_argsort_index_write_out_krn<argTy, IndexTy>;
-    using dpctl::tensor::kernels::sort_utils_detail::map_back_impl;
-
-    sycl::event dep = radix_sort_ev;
-
-    // no need to perform map_back ( id % sort_nelems)
-    //   if total_nelems == sort_nelems
-    if (iter_nelems > 1u) {
-        dep = map_back_impl<MapBackKernelName, IndexTy>(
-            exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep});
-    }
-
-    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {dep}, workspace_owner);
-
-    return cleanup_ev;
-}
-
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
deleted file mode 100644
index b96e3dea82..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//=== searchsorted.hpp -                                      ---*-C++-*--/===//
-//    Implementation of searching in sorted array
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor sort/argsort operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-namespace search_sorted_detail
-{
-
-template <typename T> T quotient_ceil(T n, T m) { return (n + m - 1) / m; }
-
-template <typename Acc, typename Value, typename Compare>
-std::size_t lower_bound_impl(const Acc acc,
-                             const std::size_t first,
-                             const std::size_t last,
-                             const Value &value,
-                             const Compare &comp)
-{
-    std::size_t n = last - first;
-    std::size_t cur = n, start = first;
-    std::size_t it;
-    while (n > 0) {
-        it = start;
-        cur = n / 2;
-        it += cur;
-        if (comp(acc[it], value)) {
-            n -= cur + 1, start = ++it;
-        }
-        else
-            n = cur;
-    }
-    return start;
-}
-
-template <typename Acc, typename Value, typename Compare>
-std::size_t upper_bound_impl(const Acc acc,
-                             const std::size_t first,
-                             const std::size_t last,
-                             const Value &value,
-                             const Compare &comp)
-{
-    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
-    return lower_bound_impl(acc, first, last, value, op_comp);
-}
-
-template <typename Acc, typename Value, typename Compare, typename IndexerT>
-std::size_t lower_bound_indexed_impl(const Acc acc,
-                                     std::size_t first,
-                                     std::size_t last,
-                                     const Value &value,
-                                     const Compare &comp,
-                                     const IndexerT &acc_indexer)
-{
-    std::size_t n = last - first;
-    std::size_t cur = n, start = first;
-    std::size_t it;
-    while (n > 0) {
-        it = start;
-        cur = n / 2;
-        it += cur;
-        if (comp(acc[acc_indexer(it)], value)) {
-            n -= cur + 1, start = ++it;
-        }
-        else
-            n = cur;
-    }
-    return start;
-}
-
-template <typename Acc, typename Value, typename Compare, typename IndexerT>
-std::size_t upper_bound_indexed_impl(const Acc acc,
-                                     const std::size_t first,
-                                     const std::size_t last,
-                                     const Value &value,
-                                     const Compare &comp,
-                                     const IndexerT &acc_indexer)
-{
-    const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); };
-    return lower_bound_indexed_impl(acc, first, last, value, op_comp,
-                                    acc_indexer);
-}
-
-} // namespace search_sorted_detail
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
deleted file mode 100644
index 6042f23e03..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/searchsorted.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-//=== searchsorted.hpp -                                      ---*-C++-*--/===//
-//    Implementation of searching in sorted array
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor sort/argsort operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/search_sorted_detail.hpp"
-#include "utils/offset_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-
-template <typename argTy,
-          typename indTy,
-          bool left_side,
-          typename HayIndexerT,
-          typename NeedlesIndexerT,
-          typename PositionsIndexerT,
-          typename Compare>
-struct SearchSortedFunctor
-{
-private:
-    const argTy *hay_tp;
-    const argTy *needles_tp;
-    indTy *positions_tp;
-    std::size_t hay_nelems;
-    HayIndexerT hay_indexer;
-    NeedlesIndexerT needles_indexer;
-    PositionsIndexerT positions_indexer;
-
-public:
-    SearchSortedFunctor(const argTy *hay_,
-                        const argTy *needles_,
-                        indTy *positions_,
-                        const std::size_t hay_nelems_,
-                        const HayIndexerT &hay_indexer_,
-                        const NeedlesIndexerT &needles_indexer_,
-                        const PositionsIndexerT &positions_indexer_)
-        : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_),
-          hay_nelems(hay_nelems_), hay_indexer(hay_indexer_),
-          needles_indexer(needles_indexer_),
-          positions_indexer(positions_indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        const Compare comp{};
-
-        const std::size_t i = id[0];
-        const argTy needle_v = needles_tp[needles_indexer(i)];
-
-        // position of the needle_v in the hay array
-        indTy pos{};
-
-        static constexpr std::size_t zero(0);
-        if constexpr (left_side) {
-            // search in hay in left-closed interval, give `pos` such that
-            // hay[pos - 1] < needle_v <= hay[pos]
-
-            // lower_bound returns the first pos such that bool(hay[pos] <
-            // needle_v) is false, i.e. needle_v <= hay[pos]
-            pos = search_sorted_detail::lower_bound_indexed_impl(
-                hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
-        }
-        else {
-            // search in hay in right-closed interval: hay[pos - 1] <= needle_v
-            // < hay[pos]
-
-            // upper_bound returns the first pos such that bool(needle_v <
-            // hay[pos]) is true, i.e. needle_v < hay[pos]
-            pos = search_sorted_detail::upper_bound_indexed_impl(
-                hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer);
-        }
-
-        positions_tp[positions_indexer(i)] = pos;
-    }
-};
-
-typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)(
-    sycl::queue &,
-    const std::size_t,
-    const std::size_t,
-    const char *,
-    const ssize_t,
-    const char *,
-    const ssize_t,
-    char *,
-    const ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T1, typename T2, bool left_closed>
-class searchsorted_contig_impl_krn;
-
-template <typename argTy, typename indTy, bool left_closed, typename Compare>
-sycl::event searchsorted_contig_impl(sycl::queue &exec_q,
-                                     const std::size_t hay_nelems,
-                                     const std::size_t needles_nelems,
-                                     const char *hay_cp,
-                                     const ssize_t hay_offset,
-                                     const char *needles_cp,
-                                     const ssize_t needles_offset,
-                                     char *positions_cp,
-                                     const ssize_t positions_offset,
-                                     const std::vector<sycl::event> &depends)
-{
-    const argTy *hay_tp = reinterpret_cast<const argTy *>(hay_cp) + hay_offset;
-    const argTy *needles_tp =
-        reinterpret_cast<const argTy *>(needles_cp) + needles_offset;
-
-    indTy *positions_tp =
-        reinterpret_cast<indTy *>(positions_cp) + positions_offset;
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using KernelName =
-            class searchsorted_contig_impl_krn<argTy, indTy, left_closed>;
-
-        sycl::range<1> gRange(needles_nelems);
-
-        using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
-
-        static constexpr TrivialIndexerT hay_indexer{};
-        static constexpr TrivialIndexerT needles_indexer{};
-        static constexpr TrivialIndexerT positions_indexer{};
-
-        const auto fnctr =
-            SearchSortedFunctor<argTy, indTy, left_closed, TrivialIndexerT,
-                                TrivialIndexerT, TrivialIndexerT, Compare>(
-                hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer,
-                needles_indexer, positions_indexer);
-
-        cgh.parallel_for<KernelName>(gRange, fnctr);
-    });
-
-    return comp_ev;
-}
-
-typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)(
-    sycl::queue &,
-    const std::size_t,
-    const std::size_t,
-    const char *,
-    const ssize_t,
-    const ssize_t,
-    const char *,
-    const ssize_t,
-    char *,
-    const ssize_t,
-    int,
-    const ssize_t *,
-    const std::vector<sycl::event> &);
-
-template <typename T1, typename T2, bool left_closed>
-class searchsorted_strided_impl_krn;
-
-template <typename argTy, typename indTy, bool left_closed, typename Compare>
-sycl::event searchsorted_strided_impl(
-    sycl::queue &exec_q,
-    const std::size_t hay_nelems,
-    const std::size_t needles_nelems,
-    const char *hay_cp,
-    const ssize_t hay_offset,
-    // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array
-    const ssize_t hay_stride,
-    const char *needles_cp,
-    const ssize_t needles_offset,
-    char *positions_cp,
-    const ssize_t positions_offset,
-    const int needles_nd,
-    // packed_shape_strides is [needles_shape, needles_strides,
-    // positions_strides] has length of 3*needles_nd
-    const ssize_t *packed_shape_strides,
-    const std::vector<sycl::event> &depends)
-{
-    const argTy *hay_tp = reinterpret_cast<const argTy *>(hay_cp);
-    const argTy *needles_tp = reinterpret_cast<const argTy *>(needles_cp);
-
-    indTy *positions_tp = reinterpret_cast<indTy *>(positions_cp);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        sycl::range<1> gRange(needles_nelems);
-
-        using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
-        const HayIndexerT hay_indexer(
-            /* offset */ hay_offset,
-            /* size   */ hay_nelems,
-            /* step   */ hay_stride);
-
-        using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-        const ssize_t *needles_shape_strides = packed_shape_strides;
-        const NeedlesIndexerT needles_indexer(needles_nd, needles_offset,
-                                              needles_shape_strides);
-        using PositionsIndexerT =
-            dpctl::tensor::offset_utils::UnpackedStridedIndexer;
-
-        const ssize_t *positions_shape = packed_shape_strides;
-        const ssize_t *positions_strides =
-            packed_shape_strides + 2 * needles_nd;
-        const PositionsIndexerT positions_indexer(
-            needles_nd, positions_offset, positions_shape, positions_strides);
-
-        const auto fnctr =
-            SearchSortedFunctor<argTy, indTy, left_closed, HayIndexerT,
-                                NeedlesIndexerT, PositionsIndexerT, Compare>(
-                hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer,
-                needles_indexer, positions_indexer);
-        using KernelName =
-            class searchsorted_strided_impl_krn<argTy, indTy, left_closed>;
-
-        cgh.parallel_for<KernelName>(gRange, fnctr);
-    });
-
-    return comp_ev;
-}
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
deleted file mode 100644
index 4e5ce16632..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-using dpctl::tensor::ssize_t;
-
-typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &,
-                                            std::size_t,
-                                            std::size_t,
-                                            const char *,
-                                            char *,
-                                            ssize_t,
-                                            ssize_t,
-                                            ssize_t,
-                                            ssize_t,
-                                            const std::vector<sycl::event> &);
-
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
deleted file mode 100644
index f38ece27c5..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//=== sorting.hpp -  Implementation of sorting kernels       ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor sort/argsort operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace sort_utils_detail
-{
-
-namespace syclexp = sycl::ext::oneapi::experimental;
-
-template <class KernelName, typename T>
-sycl::event iota_impl(sycl::queue &exec_q,
-                      T *data,
-                      std::size_t nelems,
-                      const std::vector<sycl::event> &dependent_events)
-{
-    static constexpr std::uint32_t lws = 256;
-    static constexpr std::uint32_t n_wi = 4;
-    const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws);
-
-    sycl::range<1> gRange{n_groups * lws};
-    sycl::range<1> lRange{lws};
-    sycl::nd_range<1> ndRange{gRange, lRange};
-
-    sycl::event e = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependent_events);
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
-            const std::size_t gid = it.get_global_linear_id();
-            const auto &sg = it.get_sub_group();
-            const std::uint32_t lane_id = sg.get_local_id()[0];
-
-            const std::size_t offset = (gid - lane_id) * n_wi;
-            const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
-
-            std::array<T, n_wi> stripe{};
-#pragma unroll
-            for (std::uint32_t i = 0; i < n_wi; ++i) {
-                stripe[i] = T(offset + lane_id + i * max_sgSize);
-            }
-
-            if (offset + n_wi * max_sgSize < nelems) {
-                static constexpr auto group_ls_props = syclexp::properties{
-                    syclexp::data_placement_striped
-                    // , syclexp::full_group
-                };
-
-                auto out_multi_ptr = sycl::address_space_cast<
-                    sycl::access::address_space::global_space,
-                    sycl::access::decorated::yes>(&data[offset]);
-
-                syclexp::group_store(sg, sycl::span<T, n_wi>{&stripe[0], n_wi},
-                                     out_multi_ptr, group_ls_props);
-            }
-            else {
-                for (std::size_t idx = offset + lane_id; idx < nelems;
-                     idx += max_sgSize)
-                {
-                    data[idx] = T(idx);
-                }
-            }
-        });
-    });
-
-    return e;
-}
-
-template <class KernelName, typename IndexTy>
-sycl::event map_back_impl(sycl::queue &exec_q,
-                          std::size_t nelems,
-                          const IndexTy *flat_index_data,
-                          IndexTy *reduced_index_data,
-                          std::size_t row_size,
-                          const std::vector<sycl::event> &dependent_events)
-{
-    static constexpr std::uint32_t lws = 64;
-    static constexpr std::uint32_t n_wi = 4;
-    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
-
-    sycl::range<1> lRange{lws};
-    sycl::range<1> gRange{n_groups * lws};
-    sycl::nd_range<1> ndRange{gRange, lRange};
-
-    sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(dependent_events);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
-            const std::size_t gid = it.get_global_linear_id();
-            const auto &sg = it.get_sub_group();
-            const std::uint32_t lane_id = sg.get_local_id()[0];
-            const std::uint32_t sg_size = sg.get_max_local_range()[0];
-
-            const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
-
-#pragma unroll
-            for (std::uint32_t i = 0; i < n_wi; ++i) {
-                const std::size_t data_id = start_id + i * sg_size;
-
-                if (data_id < nelems) {
-                    const IndexTy linear_index = flat_index_data[data_id];
-                    reduced_index_data[data_id] = (linear_index % row_size);
-                }
-            }
-        });
-    });
-
-    return map_back_ev;
-}
-
-} // end of namespace sort_utils_detail
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
deleted file mode 100644
index 558cdfc167..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp
+++ /dev/null
@@ -1,505 +0,0 @@
-//=== topk.hpp -  Implementation of topk kernels       ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for tensor topk operation.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <array>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <stdexcept>
-#include <vector>
-
-#include <sycl/ext/oneapi/sub_group_mask.hpp>
-#include <sycl/sycl.hpp>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/merge_sort.hpp"
-#include "kernels/sorting/radix_sort.hpp"
-#include "kernels/sorting/search_sorted_detail.hpp"
-#include "kernels/sorting/sort_utils.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-
-namespace topk_detail
-{
-
-void scale_topk_params(const std::uint64_t nelems_per_slm,
-                       const std::size_t sub_groups_per_work_group,
-                       const std::uint32_t elems_per_wi,
-                       const std::vector<std::size_t> &sg_sizes,
-                       std::size_t &lws,
-                       std::size_t &nelems_wg_sorts)
-{
-    for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) {
-        auto sg_size = *it;
-        lws = sub_groups_per_work_group * sg_size;
-        nelems_wg_sorts = elems_per_wi * lws;
-        if (nelems_wg_sorts < nelems_per_slm) {
-            return;
-        }
-    }
-    // should never reach
-    throw std::runtime_error("Could not construct top k kernel parameters");
-}
-
-template <class KernelName, typename argTy, typename IndexTy>
-sycl::event write_out_impl(sycl::queue &exec_q,
-                           std::size_t iter_nelems,
-                           std::size_t k,
-                           const argTy *arg_tp,
-                           const IndexTy *index_data,
-                           std::size_t iter_index_stride,
-                           std::size_t axis_nelems,
-                           argTy *vals_tp,
-                           IndexTy *inds_tp,
-                           const std::vector<sycl::event> &depends)
-{
-    static constexpr std::uint32_t lws = 64;
-    static constexpr std::uint32_t n_wi = 4;
-    const std::size_t nelems = iter_nelems * k;
-    const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws);
-
-    sycl::range<1> lRange{lws};
-    sycl::range<1> gRange{n_groups * lws};
-    sycl::nd_range<1> ndRange{gRange, lRange};
-
-    sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.parallel_for<KernelName>(ndRange, [=](sycl::nd_item<1> it) {
-            const std::size_t gid = it.get_global_linear_id();
-            const auto &sg = it.get_sub_group();
-            const std::uint32_t lane_id = sg.get_local_id()[0];
-            const std::uint32_t sg_size = sg.get_max_local_range()[0];
-
-            const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
-
-#pragma unroll
-            for (std::uint32_t i = 0; i < n_wi; ++i) {
-                const std::size_t data_id = start_id + i * sg_size;
-
-                if (data_id < nelems) {
-                    const std::size_t iter_id = data_id / k;
-
-                    /*
-                    const std::size_t axis_gid = data_id - (iter_gid * k);
-                    const std::size_t src_idx = iter_gid * iter_index_stride +
-                    axis_gid;
-                    */
-                    const std::size_t src_idx =
-                        data_id + iter_id * (iter_index_stride - k);
-
-                    const IndexTy res_ind = index_data[src_idx];
-                    const argTy v = arg_tp[res_ind];
-
-                    const std::size_t dst_idx = data_id;
-                    vals_tp[dst_idx] = v;
-                    inds_tp[dst_idx] = (res_ind % axis_nelems);
-                }
-            }
-        });
-    });
-
-    return write_out_ev;
-}
-
-} // namespace topk_detail
-
-template <typename T1, typename T2> class topk_populate_index_data_krn;
-
-template <typename T1, typename T2> class topk_full_merge_map_back_krn;
-
-template <typename argTy, typename IndexTy, typename CompT>
-sycl::event
-topk_full_merge_sort_impl(sycl::queue &exec_q,
-                          std::size_t iter_nelems, // number of sub-arrays
-                          std::size_t axis_nelems, // size of each sub-array
-                          std::size_t k,
-                          const argTy *arg_tp,
-                          argTy *vals_tp,
-                          IndexTy *inds_tp,
-                          const CompT &comp,
-                          const std::vector<sycl::event> &depends)
-{
-    auto index_data_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
-            iter_nelems * axis_nelems, exec_q);
-    // extract USM pointer
-    IndexTy *index_data = index_data_owner.get();
-
-    using IotaKernelName = topk_populate_index_data_krn<argTy, IndexTy>;
-
-    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
-
-    sycl::event populate_indexed_data_ev = iota_impl<IotaKernelName, IndexTy>(
-        exec_q, index_data, iter_nelems * axis_nelems, depends);
-
-    std::size_t sorted_block_size;
-    // Sort segments of the array
-    sycl::event base_sort_ev =
-        merge_sort_detail::sort_over_work_group_contig_impl(
-            exec_q, iter_nelems, axis_nelems, index_data, index_data, comp,
-            sorted_block_size, // modified in place with size of sorted block
-                               // size
-            {populate_indexed_data_ev});
-
-    // Merge segments in parallel until all elements are sorted
-    sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl(
-        exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size,
-        {base_sort_ev});
-
-    using WriteOutKernelName = topk_full_merge_map_back_krn<argTy, IndexTy>;
-
-    sycl::event write_out_ev =
-        topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
-            exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems,
-            axis_nelems, vals_tp, inds_tp, {merges_ev});
-
-    sycl::event cleanup_host_task_event =
-        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev},
-                                                     index_data_owner);
-
-    return cleanup_host_task_event;
-};
-
-template <typename T1, typename T2> class topk_partial_merge_map_back_krn;
-
-template <typename T1, typename T2, typename Comp>
-class topk_over_work_group_krn;
-
-template <typename argTy,
-          typename IndexTy,
-          typename ValueComp = std::less<argTy>>
-sycl::event topk_merge_impl(
-    sycl::queue &exec_q,
-    std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows
-                             // in a matrix when sorting over rows)
-    std::size_t axis_nelems, // size of each array to sort  (length of
-                             // rows, i.e. number of columns)
-    std::size_t k,
-    const char *arg_cp,
-    char *vals_cp,
-    char *inds_cp,
-    const std::vector<sycl::event> &depends)
-{
-    if (axis_nelems < k) {
-        throw std::runtime_error("Invalid sort axis size for value of k");
-    }
-
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    argTy *vals_tp = reinterpret_cast<argTy *>(vals_cp);
-    IndexTy *inds_tp = reinterpret_cast<IndexTy *>(inds_cp);
-
-    using dpctl::tensor::kernels::IndexComp;
-    const IndexComp<IndexTy, argTy, ValueComp> index_comp{arg_tp, ValueComp{}};
-
-    if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) {
-        return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k,
-                                         arg_tp, vals_tp, inds_tp, index_comp,
-                                         depends);
-    }
-    else {
-        using PartialKernelName =
-            topk_over_work_group_krn<IndexTy, IndexTy, ValueComp>;
-
-        const auto &kernel_id = sycl::get_kernel_id<PartialKernelName>();
-
-        auto const &ctx = exec_q.get_context();
-        auto const &dev = exec_q.get_device();
-
-        auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
-            ctx, {dev}, {kernel_id});
-
-        auto krn = kb.get_kernel(kernel_id);
-
-        const std::uint32_t max_sg_size = krn.template get_info<
-            sycl::info::kernel_device_specific::max_sub_group_size>(dev);
-        const std::uint64_t device_local_memory_size =
-            dev.get_info<sycl::info::device::local_mem_size>();
-
-        //  leave 512 bytes of local memory for RT
-        const std::uint64_t safety_margin = 512;
-
-        const std::uint64_t nelems_per_slm =
-            (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy));
-
-        static constexpr std::uint32_t sub_groups_per_work_group = 4;
-        const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2;
-
-        std::size_t lws = sub_groups_per_work_group * max_sg_size;
-
-        std::size_t sorted_block_size = elems_per_wi * lws;
-        if (sorted_block_size > nelems_per_slm) {
-            const std::vector<std::size_t> sg_sizes =
-                dev.get_info<sycl::info::device::sub_group_sizes>();
-            topk_detail::scale_topk_params(
-                nelems_per_slm, sub_groups_per_work_group, elems_per_wi,
-                sg_sizes,
-                lws,              // modified by reference
-                sorted_block_size // modified by reference
-            );
-        }
-
-        // This assumption permits doing away with using a loop
-        assert(sorted_block_size % lws == 0);
-
-        using search_sorted_detail::quotient_ceil;
-        const std::size_t n_segments =
-            quotient_ceil<std::size_t>(axis_nelems, sorted_block_size);
-
-        // round k up for the later merge kernel if necessary
-        const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4;
-        std::size_t k_rounded =
-            (k < round_k_to)
-                ? k
-                : quotient_ceil<std::size_t>(k, round_k_to) * round_k_to;
-
-        // get length of tail for alloc size
-        auto rem = axis_nelems % sorted_block_size;
-        auto alloc_len = (rem && rem < k_rounded)
-                             ? rem + k_rounded * (n_segments - 1)
-                             : k_rounded * n_segments;
-
-        // if allocation would be sufficiently large or k is larger than
-        // elements processed, use full sort
-        if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size ||
-            alloc_len >= axis_nelems / 2)
-        {
-            return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems,
-                                             k, arg_tp, vals_tp, inds_tp,
-                                             index_comp, depends);
-        }
-
-        auto index_data_owner =
-            dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
-                iter_nelems * alloc_len, exec_q);
-        // get raw USM pointer
-        IndexTy *index_data = index_data_owner.get();
-
-        // no need to populate index data: SLM will be populated with default
-        // values
-
-        sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            cgh.use_kernel_bundle(kb);
-
-            sycl::range<1> global_range{iter_nelems * n_segments * lws};
-            sycl::range<1> local_range{lws};
-
-            sycl::range<1> slm_range{sorted_block_size};
-            sycl::local_accessor<IndexTy, 1> work_space(slm_range, cgh);
-            sycl::local_accessor<IndexTy, 1> scratch_space(slm_range, cgh);
-
-            sycl::nd_range<1> ndRange(global_range, local_range);
-
-            cgh.parallel_for<PartialKernelName>(
-                ndRange, [=](sycl::nd_item<1> it) {
-                    const std::size_t group_id = it.get_group_linear_id();
-                    const std::size_t iter_id = group_id / n_segments;
-                    const std::size_t segment_id =
-                        group_id - iter_id * n_segments;
-                    const std::size_t lid = it.get_local_linear_id();
-
-                    const std::size_t segment_start_idx =
-                        segment_id * sorted_block_size;
-                    const std::size_t segment_end_idx = std::min<std::size_t>(
-                        segment_start_idx + sorted_block_size, axis_nelems);
-                    const std::size_t wg_chunk_size =
-                        segment_end_idx - segment_start_idx;
-
-                    // load input into SLM
-                    for (std::size_t array_id = segment_start_idx + lid;
-                         array_id < segment_end_idx; array_id += lws)
-                    {
-                        IndexTy v = (array_id < axis_nelems)
-                                        ? iter_id * axis_nelems + array_id
-                                        : IndexTy{};
-                        work_space[array_id - segment_start_idx] = v;
-                    }
-                    sycl::group_barrier(it.get_group());
-
-                    const std::size_t chunk =
-                        quotient_ceil<std::size_t>(sorted_block_size, lws);
-
-                    const std::size_t chunk_start_idx = lid * chunk;
-                    const std::size_t chunk_end_idx =
-                        sycl::min(chunk_start_idx + chunk, wg_chunk_size);
-
-                    merge_sort_detail::leaf_sort_impl(
-                        work_space, chunk_start_idx, chunk_end_idx, index_comp);
-
-                    sycl::group_barrier(it.get_group());
-
-                    bool data_in_temp = false;
-                    std::size_t n_chunks_merged = 1;
-
-                    // merge chunk while n_chunks_merged * chunk < wg_chunk_size
-                    const std::size_t max_chunks_merged =
-                        1 + ((wg_chunk_size - 1) / chunk);
-                    for (; n_chunks_merged < max_chunks_merged;
-                         data_in_temp = !data_in_temp, n_chunks_merged *= 2)
-                    {
-                        const std::size_t nelems_sorted_so_far =
-                            n_chunks_merged * chunk;
-                        const std::size_t q = (lid / n_chunks_merged);
-                        const std::size_t start_1 = sycl::min(
-                            2 * nelems_sorted_so_far * q, wg_chunk_size);
-                        const std::size_t end_1 = sycl::min(
-                            start_1 + nelems_sorted_so_far, wg_chunk_size);
-                        const std::size_t end_2 = sycl::min(
-                            end_1 + nelems_sorted_so_far, wg_chunk_size);
-                        const std::size_t offset =
-                            chunk * (lid - q * n_chunks_merged);
-
-                        if (data_in_temp) {
-                            merge_sort_detail::merge_impl(
-                                offset, scratch_space, work_space, start_1,
-                                end_1, end_2, start_1, index_comp, chunk);
-                        }
-                        else {
-                            merge_sort_detail::merge_impl(
-                                offset, work_space, scratch_space, start_1,
-                                end_1, end_2, start_1, index_comp, chunk);
-                        }
-                        sycl::group_barrier(it.get_group());
-                    }
-
-                    // output assumed to be structured as (iter_nelems,
-                    // alloc_len)
-                    const std::size_t k_segment_start_idx =
-                        segment_id * k_rounded;
-                    const std::size_t k_segment_end_idx = std::min<std::size_t>(
-                        k_segment_start_idx + k_rounded, alloc_len);
-                    const auto &out_src =
-                        (data_in_temp) ? scratch_space : work_space;
-                    for (std::size_t array_id = k_segment_start_idx + lid;
-                         array_id < k_segment_end_idx; array_id += lws)
-                    {
-                        if (lid < k_rounded) {
-                            index_data[iter_id * alloc_len + array_id] =
-                                out_src[array_id - k_segment_start_idx];
-                        }
-                    }
-                });
-        });
-
-        // Merge segments in parallel until all elements are sorted
-        sycl::event merges_ev =
-            merge_sort_detail::merge_sorted_block_contig_impl(
-                exec_q, iter_nelems, alloc_len, index_data, index_comp,
-                k_rounded, {base_sort_ev});
-
-        // Write out top k of the merge-sorted memory
-        using WriteOutKernelName =
-            topk_partial_merge_map_back_krn<argTy, IndexTy>;
-
-        sycl::event write_topk_ev =
-            topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
-                exec_q, iter_nelems, k, arg_tp, index_data, alloc_len,
-                axis_nelems, vals_tp, inds_tp, {merges_ev});
-
-        sycl::event cleanup_host_task_event =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {write_topk_ev}, index_data_owner);
-
-        return cleanup_host_task_event;
-    }
-}
-
-template <typename T1, typename T2> class topk_iota_krn;
-
-template <typename T1, typename T2> class topk_radix_map_back_krn;
-
-template <typename argTy, typename IndexTy>
-sycl::event topk_radix_impl(sycl::queue &exec_q,
-                            std::size_t iter_nelems, // number of sub-arrays
-                            std::size_t axis_nelems, // size of each sub-array
-                            std::size_t k,
-                            bool ascending,
-                            const char *arg_cp,
-                            char *vals_cp,
-                            char *inds_cp,
-                            const std::vector<sycl::event> &depends)
-{
-    if (axis_nelems < k) {
-        throw std::runtime_error("Invalid sort axis size for value of k");
-    }
-
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    argTy *vals_tp = reinterpret_cast<argTy *>(vals_cp);
-    IndexTy *inds_tp = reinterpret_cast<IndexTy *>(inds_cp);
-
-    const std::size_t total_nelems = iter_nelems * axis_nelems;
-    const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64;
-    auto workspace_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<IndexTy>(
-            padded_total_nelems + total_nelems, exec_q);
-
-    // get raw USM pointer
-    IndexTy *workspace = workspace_owner.get();
-    IndexTy *tmp_tp = workspace + padded_total_nelems;
-
-    using IdentityProjT = radix_sort_details::IdentityProj;
-    using IndexedProjT =
-        radix_sort_details::IndexedProj<IndexTy, argTy, IdentityProjT>;
-    const IndexedProjT proj_op{arg_tp};
-
-    using IotaKernelName = topk_iota_krn<argTy, IndexTy>;
-
-    using dpctl::tensor::kernels::sort_utils_detail::iota_impl;
-
-    sycl::event iota_ev = iota_impl<IotaKernelName, IndexTy>(
-        exec_q, workspace, total_nelems, depends);
-
-    sycl::event radix_sort_ev =
-        radix_sort_details::parallel_radix_sort_impl<IndexTy, IndexedProjT>(
-            exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op,
-            ascending, {iota_ev});
-
-    // Write out top k of the temporary
-    using WriteOutKernelName = topk_radix_map_back_krn<argTy, IndexTy>;
-
-    sycl::event write_topk_ev =
-        topk_detail::write_out_impl<WriteOutKernelName, argTy, IndexTy>(
-            exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems,
-            vals_tp, inds_tp, {radix_sort_ev});
-
-    sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {write_topk_ev}, workspace_owner);
-
-    return cleanup_ev;
-}
-
-} // end of namespace kernels
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
deleted file mode 100644
index e8309b359c..0000000000
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ /dev/null
@@ -1,335 +0,0 @@
-//=== where.hpp -  Implementation of where kernels ---*-C++-*--/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines kernels for dpctl.tensor.where.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "dpctl_tensor_types.hpp"
-#include "kernels/alignment.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace kernels
-{
-namespace search
-{
-
-using dpctl::tensor::ssize_t;
-using namespace dpctl::tensor::offset_utils;
-
-using dpctl::tensor::kernels::alignment_utils::
-    disabled_sg_loadstore_wrapper_krn;
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-using dpctl::tensor::sycl_utils::sub_group_load;
-using dpctl::tensor::sycl_utils::sub_group_store;
-
-template <typename T, typename condT, typename IndexerT>
-class where_strided_kernel;
-template <typename T, typename condT, std::uint8_t vec_sz, std::uint8_t n_vecs>
-class where_contig_kernel;
-
-template <typename T,
-          typename condT,
-          std::uint8_t vec_sz = 4u,
-          std::uint8_t n_vecs = 2u,
-          bool enable_sg_loadstore = true>
-class WhereContigFunctor
-{
-private:
-    std::size_t nelems = 0;
-    const condT *cond_p = nullptr;
-    const T *x1_p = nullptr;
-    const T *x2_p = nullptr;
-    T *dst_p = nullptr;
-
-public:
-    WhereContigFunctor(std::size_t nelems_,
-                       const condT *cond_p_,
-                       const T *x1_p_,
-                       const T *x2_p_,
-                       T *dst_p_)
-        : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_),
-          dst_p(dst_p_)
-    {
-    }
-
-    void operator()(sycl::nd_item<1> ndit) const
-    {
-        static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
-
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (!enable_sg_loadstore || is_complex<condT>::value ||
-                      is_complex<T>::value)
-        {
-            const std::uint16_t sgSize =
-                ndit.get_sub_group().get_local_range()[0];
-            const std::size_t gid = ndit.get_global_linear_id();
-
-            const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi;
-            const std::size_t start =
-                (gid / sgSize) * (nelems_per_sg - sgSize) + gid;
-            const std::size_t end = std::min(nelems, start + nelems_per_sg);
-            for (std::size_t offset = start; offset < end; offset += sgSize) {
-                using dpctl::tensor::type_utils::convert_impl;
-                const bool check = convert_impl<bool, condT>(cond_p[offset]);
-                dst_p[offset] = check ? x1_p[offset] : x2_p[offset];
-            }
-        }
-        else {
-            auto sg = ndit.get_sub_group();
-            const std::uint16_t sgSize = sg.get_max_local_range()[0];
-
-            const std::size_t base =
-                nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
-                                 sg.get_group_id()[0] * sgSize);
-
-            if (base + nelems_per_wi * sgSize < nelems) {
-                sycl::vec<T, vec_sz> dst_vec;
-
-#pragma unroll
-                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
-                    const std::size_t idx = base + it * sgSize;
-                    auto x1_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&x1_p[idx]);
-                    auto x2_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&x2_p[idx]);
-                    auto cond_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&cond_p[idx]);
-                    auto dst_multi_ptr = sycl::address_space_cast<
-                        sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(&dst_p[idx]);
-
-                    const sycl::vec<T, vec_sz> x1_vec =
-                        sub_group_load<vec_sz>(sg, x1_multi_ptr);
-                    const sycl::vec<T, vec_sz> x2_vec =
-                        sub_group_load<vec_sz>(sg, x2_multi_ptr);
-                    const sycl::vec<condT, vec_sz> cond_vec =
-                        sub_group_load<vec_sz>(sg, cond_multi_ptr);
-#pragma unroll
-                    for (std::uint8_t k = 0; k < vec_sz; ++k) {
-                        dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k];
-                    }
-                    sub_group_store<vec_sz>(sg, dst_vec, dst_multi_ptr);
-                }
-            }
-            else {
-                const std::size_t lane_id = sg.get_local_id()[0];
-                for (std::size_t k = base + lane_id; k < nelems; k += sgSize) {
-                    dst_p[k] = cond_p[k] ? x1_p[k] : x2_p[k];
-                }
-            }
-        }
-    }
-};
-
-typedef sycl::event (*where_contig_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    const char *,
-    const char *,
-    const char *,
-    char *,
-    const std::vector<sycl::event> &);
-
-template <typename T, typename condT>
-sycl::event where_contig_impl(sycl::queue &q,
-                              std::size_t nelems,
-                              const char *cond_cp,
-                              const char *x1_cp,
-                              const char *x2_cp,
-                              char *dst_cp,
-                              const std::vector<sycl::event> &depends)
-{
-    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
-    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
-    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
-    T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        std::size_t lws = 64;
-        static constexpr std::uint8_t vec_sz = 4u;
-        static constexpr std::uint8_t n_vecs = 2u;
-        const std::size_t n_groups =
-            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
-        const auto gws_range = sycl::range<1>(n_groups * lws);
-        const auto lws_range = sycl::range<1>(lws);
-
-        if (is_aligned<required_alignment>(cond_cp) &&
-            is_aligned<required_alignment>(x1_cp) &&
-            is_aligned<required_alignment>(x2_cp) &&
-            is_aligned<required_alignment>(dst_cp))
-        {
-            static constexpr bool enable_sg_loadstore = true;
-            using KernelName = where_contig_kernel<T, condT, vec_sz, n_vecs>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                WhereContigFunctor<T, condT, vec_sz, n_vecs,
-                                   enable_sg_loadstore>(nelems, cond_tp, x1_tp,
-                                                        x2_tp, dst_tp));
-        }
-        else {
-            static constexpr bool disable_sg_loadstore = false;
-            using InnerKernelName =
-                where_contig_kernel<T, condT, vec_sz, n_vecs>;
-            using KernelName =
-                disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
-
-            cgh.parallel_for<KernelName>(
-                sycl::nd_range<1>(gws_range, lws_range),
-                WhereContigFunctor<T, condT, vec_sz, n_vecs,
-                                   disable_sg_loadstore>(nelems, cond_tp, x1_tp,
-                                                         x2_tp, dst_tp));
-        }
-    });
-
-    return where_ev;
-}
-
-template <typename T, typename condT, typename IndexerT>
-class WhereStridedFunctor
-{
-private:
-    const T *x1_p = nullptr;
-    const T *x2_p = nullptr;
-    T *dst_p = nullptr;
-    const condT *cond_p = nullptr;
-    IndexerT indexer;
-
-public:
-    WhereStridedFunctor(const condT *cond_p_,
-                        const T *x1_p_,
-                        const T *x2_p_,
-                        T *dst_p_,
-                        const IndexerT &indexer_)
-        : x1_p(x1_p_), x2_p(x2_p_), dst_p(dst_p_), cond_p(cond_p_),
-          indexer(indexer_)
-    {
-    }
-
-    void operator()(sycl::id<1> id) const
-    {
-        std::size_t gid = id[0];
-        auto offsets = indexer(static_cast<ssize_t>(gid));
-
-        using dpctl::tensor::type_utils::convert_impl;
-        bool check =
-            convert_impl<bool, condT>(cond_p[offsets.get_first_offset()]);
-
-        dst_p[offsets.get_fourth_offset()] =
-            check ? x1_p[offsets.get_second_offset()]
-                  : x2_p[offsets.get_third_offset()];
-    }
-};
-
-typedef sycl::event (*where_strided_impl_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const char *,
-    const char *,
-    const char *,
-    char *,
-    const ssize_t *,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T, typename condT>
-sycl::event where_strided_impl(sycl::queue &q,
-                               std::size_t nelems,
-                               int nd,
-                               const char *cond_cp,
-                               const char *x1_cp,
-                               const char *x2_cp,
-                               char *dst_cp,
-                               const ssize_t *shape_strides,
-                               ssize_t x1_offset,
-                               ssize_t x2_offset,
-                               ssize_t cond_offset,
-                               ssize_t dst_offset,
-                               const std::vector<sycl::event> &depends)
-{
-    const condT *cond_tp = reinterpret_cast<const condT *>(cond_cp);
-    const T *x1_tp = reinterpret_cast<const T *>(x1_cp);
-    const T *x2_tp = reinterpret_cast<const T *>(x2_cp);
-    T *dst_tp = reinterpret_cast<T *>(dst_cp);
-
-    sycl::event where_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        const FourOffsets_StridedIndexer indexer{
-            nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides};
-
-        cgh.parallel_for<
-            where_strided_kernel<T, condT, FourOffsets_StridedIndexer>>(
-            sycl::range<1>(nelems),
-            WhereStridedFunctor<T, condT, FourOffsets_StridedIndexer>(
-                cond_tp, x1_tp, x2_tp, dst_tp, indexer));
-    });
-
-    return where_ev;
-}
-
-template <typename fnT, typename T, typename condT> struct WhereStridedFactory
-{
-    fnT get()
-    {
-        fnT fn = where_strided_impl<T, condT>;
-        return fn;
-    }
-};
-
-template <typename fnT, typename T, typename condT> struct WhereContigFactory
-{
-    fnT get()
-    {
-        fnT fn = where_contig_impl<T, condT>;
-        return fn;
-    }
-};
-
-} // namespace search
-} // namespace kernels
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
deleted file mode 100644
index 2ff6c0abe7..0000000000
--- a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//===----- indexing_utils.hpp - Utilities for indexing modes  -----*-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines utilities for handling out-of-bounds integer indices in
-/// kernels that involve indexing operations, such as take, put, or advanced
-/// tensor integer indexing.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "kernels/dpctl_tensor_types.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace indexing_utils
-{
-
-using dpctl::tensor::ssize_t;
-
-/*
- * ssize_t for indices is a design choice, dpctl::tensor::usm_ndarray
- * uses py::ssize_t for shapes and strides internally and Python uses
- * py_ssize_t for sizes of e.g. lists.
- */
-
-template <typename IndT> struct WrapIndex
-{
-    static_assert(std::is_integral_v<IndT>);
-
-    ssize_t operator()(ssize_t max_item, IndT ind) const
-    {
-        ssize_t projected;
-        static constexpr ssize_t unit(1);
-        max_item = sycl::max(max_item, unit);
-
-        static constexpr std::uintmax_t ind_max =
-            std::numeric_limits<IndT>::max();
-        static constexpr std::uintmax_t ssize_max =
-            std::numeric_limits<ssize_t>::max();
-
-        if constexpr (std::is_signed_v<IndT>) {
-            static constexpr std::intmax_t ind_min =
-                std::numeric_limits<IndT>::min();
-            static constexpr std::intmax_t ssize_min =
-                std::numeric_limits<ssize_t>::min();
-
-            if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
-                const ssize_t ind_ = static_cast<ssize_t>(ind);
-                const ssize_t lb = -max_item;
-                const ssize_t ub = max_item - 1;
-                projected = sycl::clamp(ind_, lb, ub);
-            }
-            else {
-                const IndT lb = static_cast<IndT>(-max_item);
-                const IndT ub = static_cast<IndT>(max_item - 1);
-                projected = static_cast<ssize_t>(sycl::clamp(ind, lb, ub));
-            }
-            return (projected < 0) ? projected + max_item : projected;
-        }
-        else {
-            if constexpr (ind_max <= ssize_max) {
-                const ssize_t ind_ = static_cast<ssize_t>(ind);
-                const ssize_t ub = max_item - 1;
-                projected = sycl::min(ind_, ub);
-            }
-            else {
-                const IndT ub = static_cast<IndT>(max_item - 1);
-                projected = static_cast<ssize_t>(sycl::min(ind, ub));
-            }
-            return projected;
-        }
-    }
-};
-
-template <typename IndT> struct ClipIndex
-{
-    static_assert(std::is_integral_v<IndT>);
-
-    ssize_t operator()(ssize_t max_item, IndT ind) const
-    {
-        ssize_t projected;
-        static constexpr ssize_t unit(1);
-        max_item = sycl::max<ssize_t>(max_item, unit);
-
-        static constexpr std::uintmax_t ind_max =
-            std::numeric_limits<IndT>::max();
-        static constexpr std::uintmax_t ssize_max =
-            std::numeric_limits<ssize_t>::max();
-        if constexpr (std::is_signed_v<IndT>) {
-            static constexpr std::intmax_t ind_min =
-                std::numeric_limits<IndT>::min();
-            static constexpr std::intmax_t ssize_min =
-                std::numeric_limits<ssize_t>::min();
-
-            if constexpr (ind_max <= ssize_max && ind_min >= ssize_min) {
-                const ssize_t ind_ = static_cast<ssize_t>(ind);
-                static constexpr ssize_t lb(0);
-                const ssize_t ub = max_item - 1;
-                projected = sycl::clamp(ind_, lb, ub);
-            }
-            else {
-                static constexpr IndT lb(0);
-                const IndT ub = static_cast<IndT>(max_item - 1);
-                projected = static_cast<std::size_t>(sycl::clamp(ind, lb, ub));
-            }
-        }
-        else {
-            if constexpr (ind_max <= ssize_max) {
-                const ssize_t ind_ = static_cast<ssize_t>(ind);
-                const ssize_t ub = max_item - 1;
-                projected = sycl::min(ind_, ub);
-            }
-            else {
-                const IndT ub = static_cast<IndT>(max_item - 1);
-                projected = static_cast<ssize_t>(sycl::min(ind, ub));
-            }
-        }
-        return projected;
-    }
-};
-
-} // namespace indexing_utils
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp
deleted file mode 100644
index 92b3a3dc56..0000000000
--- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp
+++ /dev/null
@@ -1,138 +0,0 @@
-//===------- math_utils.hpp - Implementation of math utils  -------*-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines math utility functions.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <cmath>
-#include <complex>
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace math_utils
-{
-
-template <typename T> bool less_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    return (real1 == real2)
-               ? (imag1 < imag2)
-               : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2));
-}
-
-template <typename T> bool greater_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    return (real1 == real2)
-               ? (imag1 > imag2)
-               : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2));
-}
-
-template <typename T> bool less_equal_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    return (real1 == real2)
-               ? (imag1 <= imag2)
-               : (real1 < real2 && !std::isnan(imag1) && !std::isnan(imag2));
-}
-
-template <typename T> bool greater_equal_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    return (real1 == real2)
-               ? (imag1 >= imag2)
-               : (real1 > real2 && !std::isnan(imag1) && !std::isnan(imag2));
-}
-
-template <typename T> T max_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    bool isnan_imag1 = std::isnan(imag1);
-    bool gt = (real1 == real2)
-                  ? (imag1 > imag2)
-                  : (real1 > real2 && !isnan_imag1 && !std::isnan(imag2));
-    return (std::isnan(real1) || isnan_imag1 || gt) ? x1 : x2;
-}
-
-template <typename T> T min_complex(const T &x1, const T &x2)
-{
-    using realT = typename T::value_type;
-    realT real1 = std::real(x1);
-    realT real2 = std::real(x2);
-    realT imag1 = std::imag(x1);
-    realT imag2 = std::imag(x2);
-
-    bool isnan_imag1 = std::isnan(imag1);
-    bool lt = (real1 == real2)
-                  ? (imag1 < imag2)
-                  : (real1 < real2 && !isnan_imag1 && !std::isnan(imag2));
-    return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2;
-}
-
-template <typename T> T logaddexp(T x, T y)
-{
-    if (x == y) { // handle signed infinities
-        const T log2 = sycl::log(T(2));
-        return x + log2;
-    }
-    else {
-        const T tmp = x - y;
-        static constexpr T zero(0);
-
-        return (tmp > zero)
-                   ? (x + sycl::log1p(sycl::exp(-tmp)))
-                   : ((tmp <= zero) ? y + sycl::log1p(sycl::exp(tmp))
-                                    : std::numeric_limits<T>::quiet_NaN());
-    }
-}
-
-} // namespace math_utils
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
deleted file mode 100644
index af8d1a4874..0000000000
--- a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- memory_overlap.hpp - Array memory overlap determination   ---*-C++-*-//
-//                                                                        ===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines utility to determine whether two arrays have memory
-/// overlap.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-/* @brief check for overlap of memory regions behind arrays.
-
-Presently assume that array occupies all bytes between smallest and largest
-displaced elements.
-
-TODO: Write proper Frobenius solver to account for holes, e.g.
-   overlap( x_contig[::2], x_contig[1::2]) should give False,
-   while this implementation gives True.
-*/
-namespace dpctl
-{
-namespace tensor
-{
-namespace overlap
-{
-
-struct MemoryOverlap
-{
-
-    bool operator()(dpctl::tensor::usm_ndarray ar1,
-                    dpctl::tensor::usm_ndarray ar2) const
-    {
-        const char *ar1_data = ar1.get_data();
-
-        const auto &ar1_offsets = ar1.get_minmax_offsets();
-        py::ssize_t ar1_elem_size =
-            static_cast<py::ssize_t>(ar1.get_elemsize());
-
-        const char *ar2_data = ar2.get_data();
-        const auto &ar2_offsets = ar2.get_minmax_offsets();
-        py::ssize_t ar2_elem_size =
-            static_cast<py::ssize_t>(ar2.get_elemsize());
-
-        /* Memory of array1 extends from  */
-        /*    [ar1_data + ar1_offsets.first * ar1_elem_size, ar1_data +
-         * ar1_offsets.second * ar1_elem_size + ar1_elem_size] */
-        /* Memory of array2 extends from */
-        /*    [ar2_data + ar2_offsets.first * ar2_elem_size, ar2_data +
-         * ar2_offsets.second * ar2_elem_size + ar2_elem_size] */
-
-        /* Intervals [x0, x1] and [y0, y1] do not overlap if (x0 <= x1) && (y0
-         * <= y1)
-         * && (x1 <=y0 || y1 <= x0 ) */
-        /* Given that x0 <= x1 and y0 <= y1 are true by construction, the
-         * condition for overlap us (x1 > y0) && (y1 > x0) */
-
-        /*  Applying:
-            (ar1_data + ar1_offsets.second * ar1_elem_size + ar1_elem_size >
-        ar2_data
-        + ar2_offsets.first * ar2_elem_size) && (ar2_data + ar2_offsets.second *
-        ar2_elem_size + ar2_elem_size > ar1_data + ar1_offsets.first *
-        ar1_elem_size)
-        */
-
-        auto byte_distance = static_cast<py::ssize_t>(ar2_data - ar1_data);
-
-        py::ssize_t x1_minus_y0 =
-            (-byte_distance +
-             (ar1_elem_size + (ar1_offsets.second * ar1_elem_size) -
-              (ar2_offsets.first * ar2_elem_size)));
-
-        py::ssize_t y1_minus_x0 =
-            (byte_distance +
-             (ar2_elem_size + (ar2_offsets.second * ar2_elem_size) -
-              (ar1_offsets.first * ar1_elem_size)));
-
-        bool memory_overlap = (x1_minus_y0 > 0) && (y1_minus_x0 > 0);
-
-        return memory_overlap;
-    }
-};
-
-struct SameLogicalTensors
-{
-    bool operator()(dpctl::tensor::usm_ndarray ar1,
-                    dpctl::tensor::usm_ndarray ar2) const
-    {
-        // Same ndim
-        int nd1 = ar1.get_ndim();
-        if (nd1 != ar2.get_ndim())
-            return false;
-
-        // Same dtype
-        int tn1 = ar1.get_typenum();
-        if (tn1 != ar2.get_typenum())
-            return false;
-
-        // Same pointer
-        const char *ar1_data = ar1.get_data();
-        const char *ar2_data = ar2.get_data();
-
-        if (ar1_data != ar2_data)
-            return false;
-
-        // Same shape and strides
-        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
-        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
-
-        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
-            return false;
-
-        // Same shape and strides
-        auto const &ar1_strides = ar1.get_strides_vector();
-        auto const &ar2_strides = ar2.get_strides_vector();
-
-        auto ar1_beg_it = std::begin(ar1_strides);
-        auto ar1_end_it = std::end(ar1_strides);
-
-        auto ar2_beg_it = std::begin(ar2_strides);
-
-        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
-            return false;
-
-        // all checks passed: arrays are logical views
-        // into the same memory
-        return true;
-    }
-};
-
-} // namespace overlap
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
deleted file mode 100644
index 84222d83ac..0000000000
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ /dev/null
@@ -1,781 +0,0 @@
-//===-- offset_utils.hpp - Indexer classes for strided iteration  ---*-C++-*-//
-//                                                                        ===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines Indexer callable operator to compute element offset in
-/// an array addressed by gloabl_id.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <algorithm>
-#include <cstddef>
-#include <memory> // for std::make_shared, std::unique_ptr
-#include <tuple>
-#include <utility> // for std::move, std::forward
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "utils/strided_iters.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace offset_utils
-{
-
-namespace detail
-{
-
-struct sink_t
-{
-    sink_t(){};
-    template <class T> sink_t(T &&){};
-};
-
-template <class V> std::size_t __accumulate_size(std::size_t &s, V &&v)
-{
-    return s += v.size();
-}
-
-template <class V, class U> sink_t __appender(V &lhs, U &&rhs)
-{
-    lhs.insert(lhs.end(), rhs.begin(), rhs.end());
-    return {};
-}
-
-template <typename T, typename A, typename... Vs>
-std::vector<T, A> concat(std::vector<T, A> lhs, Vs &&...vs)
-{
-    std::size_t s = lhs.size();
-    {
-        // limited scope ensures array is freed
-        [[maybe_unused]] sink_t tmp[] = {__accumulate_size(s, vs)..., 0};
-    }
-    lhs.reserve(s);
-    {
-        // array of no-data objects ensures ordering of calls to the appender
-        [[maybe_unused]] sink_t tmp[] = {
-            __appender(lhs, std::forward<Vs>(vs))..., 0};
-    }
-
-    return std::move(lhs); // prevent return-value optimization
-}
-
-} // namespace detail
-
-template <typename indT, typename... Vs>
-std::tuple<std::unique_ptr<indT, dpctl::tensor::alloc_utils::USMDeleter>,
-           std::size_t,
-           sycl::event>
-device_allocate_and_pack(sycl::queue &q,
-                         std::vector<sycl::event> &host_task_events,
-                         Vs &&...vs)
-{
-
-    using dpctl::tensor::alloc_utils::usm_host_allocator;
-
-    // memory transfer optimization, use USM-host for temporary speeds up
-    // transfer to device, especially on dGPUs
-    using usm_host_allocatorT = usm_host_allocator<indT>;
-    using shT = std::vector<indT, usm_host_allocatorT>;
-
-    usm_host_allocatorT usm_host_alloc(q);
-    shT empty{0, usm_host_alloc};
-    shT packed_shape_strides = detail::concat(std::move(empty), vs...);
-
-    auto packed_shape_strides_owner =
-        std::make_shared<shT>(std::move(packed_shape_strides));
-
-    auto sz = packed_shape_strides_owner->size();
-    auto shape_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<indT>(sz, q);
-    indT *shape_strides = shape_strides_owner.get();
-
-    sycl::event copy_ev =
-        q.copy<indT>(packed_shape_strides_owner->data(), shape_strides, sz);
-
-    sycl::event cleanup_host_task_ev = q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(copy_ev);
-        cgh.host_task([packed_shape_strides_owner =
-                           std::move(packed_shape_strides_owner)] {
-            // increment shared pointer ref-count to keep it alive
-            // till copy operation completes;
-        });
-    });
-    host_task_events.push_back(cleanup_host_task_ev);
-
-    return std::make_tuple(std::move(shape_strides_owner), sz, copy_ev);
-}
-
-struct NoOpIndexer
-{
-    constexpr NoOpIndexer() {}
-    constexpr std::size_t operator()(std::size_t gid) const { return gid; }
-};
-
-using dpctl::tensor::ssize_t;
-
-/* @brief Indexer with shape and strides arrays of same size are packed */
-struct StridedIndexer
-{
-    StridedIndexer(int _nd,
-                   ssize_t _offset,
-                   ssize_t const *_packed_shape_strides)
-        : nd(_nd), starting_offset(_offset),
-          shape_strides(_packed_shape_strides)
-    {
-    }
-
-    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
-
-    ssize_t operator()(std::size_t gid) const
-    {
-        return compute_offset(static_cast<ssize_t>(gid));
-    }
-
-private:
-    int nd;
-    ssize_t starting_offset;
-    ssize_t const *shape_strides;
-
-    ssize_t compute_offset(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd);
-        ssize_t relative_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape_strides,      // shape ptr
-            shape_strides + nd, // strides ptr
-            relative_offset);
-        return starting_offset + relative_offset;
-    }
-};
-
-// ensure that indexer is device copyable
-static_assert(sycl::is_device_copyable_v<StridedIndexer>);
-
-/* @brief Indexer with shape, strides provided separately */
-struct UnpackedStridedIndexer
-{
-    UnpackedStridedIndexer(int _nd,
-                           ssize_t _offset,
-                           ssize_t const *_shape,
-                           ssize_t const *_strides)
-        : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides)
-    {
-    }
-
-    ssize_t operator()(ssize_t gid) const { return compute_offset(gid); }
-
-    ssize_t operator()(std::size_t gid) const
-    {
-        return compute_offset(static_cast<ssize_t>(gid));
-    }
-
-private:
-    int nd;
-    ssize_t starting_offset;
-    ssize_t const *shape;
-    ssize_t const *strides;
-
-    ssize_t compute_offset(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd);
-        ssize_t relative_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape,   // shape ptr
-            strides, // strides ptr
-            relative_offset);
-        return starting_offset + relative_offset;
-    }
-};
-
-// ensure that indexer is device copyable
-static_assert(sycl::is_device_copyable_v<UnpackedStridedIndexer>);
-
-struct Strided1DIndexer
-{
-    Strided1DIndexer(std::size_t _size) : offset{}, size(_size), step(1) {}
-    Strided1DIndexer(ssize_t _size)
-        : offset{}, size(static_cast<std::size_t>(_size)), step(1)
-    {
-    }
-    Strided1DIndexer(std::size_t _size, ssize_t _step)
-        : offset{}, size(_size), step(_step)
-    {
-    }
-    Strided1DIndexer(std::size_t _size, std::size_t _step)
-        : offset{}, size(_size), step(static_cast<ssize_t>(_step))
-    {
-    }
-    Strided1DIndexer(ssize_t _size, ssize_t _step)
-        : offset{}, size(static_cast<std::size_t>(_size)), step(_step)
-    {
-    }
-    Strided1DIndexer(ssize_t _offset, std::size_t _size, ssize_t _step)
-        : offset(_offset), size(_size), step(_step)
-    {
-    }
-    Strided1DIndexer(ssize_t _offset, std::size_t _size, std::size_t _step)
-        : offset(_offset), size(_size), step(static_cast<ssize_t>(_step))
-    {
-    }
-    Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
-        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
-    {
-    }
-
-    ssize_t operator()(std::size_t gid) const
-    {
-        // ensure 0 <= gid < size
-        return offset + std::min<std::size_t>(gid, size - 1) * step;
-    }
-
-private:
-    ssize_t offset = 0;
-    std::size_t size = 1;
-    ssize_t step = 1;
-};
-
-static_assert(sycl::is_device_copyable_v<Strided1DIndexer>);
-
-struct Strided1DCyclicIndexer
-{
-    Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step)
-        : offset(_offset), size(static_cast<std::size_t>(_size)), step(_step)
-    {
-    }
-
-    ssize_t operator()(std::size_t gid) const
-    {
-        return offset + (gid % size) * step;
-    }
-
-private:
-    ssize_t offset = 0;
-    std::size_t size = 1;
-    ssize_t step = 1;
-};
-
-static_assert(sycl::is_device_copyable_v<Strided1DCyclicIndexer>);
-
-template <typename displacementT> struct TwoOffsets
-{
-    constexpr TwoOffsets() : first_offset(0), second_offset(0) {}
-    constexpr TwoOffsets(const displacementT &first_offset_,
-                         const displacementT &second_offset_)
-        : first_offset(first_offset_), second_offset(second_offset_)
-    {
-    }
-
-    constexpr displacementT get_first_offset() const { return first_offset; }
-    constexpr displacementT get_second_offset() const { return second_offset; }
-
-private:
-    displacementT first_offset = 0;
-    displacementT second_offset = 0;
-};
-
-struct TwoOffsets_StridedIndexer
-{
-    TwoOffsets_StridedIndexer(int common_nd,
-                              ssize_t first_offset_,
-                              ssize_t second_offset_,
-                              ssize_t const *_packed_shape_strides)
-        : nd(common_nd), starting_first_offset(first_offset_),
-          starting_second_offset(second_offset_),
-          shape_strides(_packed_shape_strides)
-    {
-    }
-
-    TwoOffsets<ssize_t> operator()(ssize_t gid) const
-    {
-        return compute_offsets(gid);
-    }
-
-    TwoOffsets<ssize_t> operator()(std::size_t gid) const
-    {
-        return compute_offsets(static_cast<ssize_t>(gid));
-    }
-
-private:
-    int nd;
-    ssize_t starting_first_offset;
-    ssize_t starting_second_offset;
-    ssize_t const *shape_strides;
-
-    TwoOffsets<ssize_t> compute_offsets(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd);
-        ssize_t relative_first_offset(0);
-        ssize_t relative_second_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape_strides,          // shape ptr
-            shape_strides + nd,     // strides ptr
-            shape_strides + 2 * nd, // strides ptr
-            relative_first_offset, relative_second_offset);
-        return TwoOffsets<ssize_t>(
-            starting_first_offset + relative_first_offset,
-            starting_second_offset + relative_second_offset);
-    }
-};
-
-struct TwoZeroOffsets_Indexer
-{
-    constexpr TwoZeroOffsets_Indexer() {}
-
-    constexpr TwoOffsets<ssize_t> operator()(ssize_t) const
-    {
-        return TwoOffsets<ssize_t>();
-    }
-};
-
-static_assert(sycl::is_device_copyable_v<TwoZeroOffsets_Indexer>);
-
-template <typename FirstIndexerT, typename SecondIndexerT>
-struct TwoOffsets_CombinedIndexer
-{
-private:
-    FirstIndexerT first_indexer_;
-    SecondIndexerT second_indexer_;
-
-public:
-    constexpr TwoOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
-                                         const SecondIndexerT &second_indexer)
-        : first_indexer_(first_indexer), second_indexer_(second_indexer)
-    {
-    }
-
-    constexpr TwoOffsets<ssize_t> operator()(ssize_t gid) const
-    {
-        return TwoOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid));
-    }
-};
-
-template <typename displacementT> struct ThreeOffsets
-{
-    constexpr ThreeOffsets()
-        : first_offset(0), second_offset(0), third_offset(0)
-    {
-    }
-    constexpr ThreeOffsets(const displacementT &first_offset_,
-                           const displacementT &second_offset_,
-                           const displacementT &third_offset_)
-        : first_offset(first_offset_), second_offset(second_offset_),
-          third_offset(third_offset_)
-    {
-    }
-
-    constexpr displacementT get_first_offset() const { return first_offset; }
-    constexpr displacementT get_second_offset() const { return second_offset; }
-    constexpr displacementT get_third_offset() const { return third_offset; }
-
-private:
-    displacementT first_offset = 0;
-    displacementT second_offset = 0;
-    displacementT third_offset = 0;
-};
-
-struct ThreeOffsets_StridedIndexer
-{
-    ThreeOffsets_StridedIndexer(int common_nd,
-                                ssize_t first_offset_,
-                                ssize_t second_offset_,
-                                ssize_t third_offset_,
-                                ssize_t const *_packed_shape_strides)
-        : nd(common_nd), starting_first_offset(first_offset_),
-          starting_second_offset(second_offset_),
-          starting_third_offset(third_offset_),
-          shape_strides(_packed_shape_strides)
-    {
-    }
-
-    ThreeOffsets<ssize_t> operator()(ssize_t gid) const
-    {
-        return compute_offsets(gid);
-    }
-
-    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
-    {
-        return compute_offsets(static_cast<ssize_t>(gid));
-    }
-
-private:
-    int nd;
-    ssize_t starting_first_offset;
-    ssize_t starting_second_offset;
-    ssize_t starting_third_offset;
-    ssize_t const *shape_strides;
-
-    ThreeOffsets<ssize_t> compute_offsets(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd);
-        ssize_t relative_first_offset(0);
-        ssize_t relative_second_offset(0);
-        ssize_t relative_third_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape_strides,          // shape ptr
-            shape_strides + nd,     // strides ptr
-            shape_strides + 2 * nd, // strides ptr
-            shape_strides + 3 * nd, // strides ptr
-            relative_first_offset, relative_second_offset,
-            relative_third_offset);
-        return ThreeOffsets<ssize_t>(
-            starting_first_offset + relative_first_offset,
-            starting_second_offset + relative_second_offset,
-            starting_third_offset + relative_third_offset);
-    }
-};
-
-static_assert(sycl::is_device_copyable_v<ThreeOffsets_StridedIndexer>);
-
-struct ThreeZeroOffsets_Indexer
-{
-    constexpr ThreeZeroOffsets_Indexer() {}
-
-    constexpr ThreeOffsets<ssize_t> operator()(ssize_t) const
-    {
-        return ThreeOffsets<ssize_t>();
-    }
-
-    constexpr ThreeOffsets<ssize_t> operator()(std::size_t) const
-    {
-        return ThreeOffsets<ssize_t>();
-    }
-};
-
-static_assert(sycl::is_device_copyable_v<ThreeZeroOffsets_Indexer>);
-
-template <typename FirstIndexerT,
-          typename SecondIndexerT,
-          typename ThirdIndexerT>
-struct ThreeOffsets_CombinedIndexer
-{
-private:
-    FirstIndexerT first_indexer_;
-    SecondIndexerT second_indexer_;
-    ThirdIndexerT third_indexer_;
-
-public:
-    constexpr ThreeOffsets_CombinedIndexer(const FirstIndexerT &first_indexer,
-                                           const SecondIndexerT &second_indexer,
-                                           const ThirdIndexerT &third_indexer)
-        : first_indexer_(first_indexer), second_indexer_(second_indexer),
-          third_indexer_(third_indexer)
-    {
-    }
-
-    constexpr ThreeOffsets<ssize_t> operator()(ssize_t gid) const
-    {
-        return ThreeOffsets<ssize_t>(first_indexer_(gid), second_indexer_(gid),
-                                     third_indexer_(gid));
-    }
-};
-
-template <typename displacementT> struct FourOffsets
-{
-    constexpr FourOffsets()
-        : first_offset(0), second_offset(0), third_offset(0), fourth_offset(0)
-    {
-    }
-    constexpr FourOffsets(const displacementT &first_offset_,
-                          const displacementT &second_offset_,
-                          const displacementT &third_offset_,
-                          const displacementT &fourth_offset_)
-        : first_offset(first_offset_), second_offset(second_offset_),
-          third_offset(third_offset_), fourth_offset(fourth_offset_)
-    {
-    }
-
-    constexpr displacementT get_first_offset() const { return first_offset; }
-    constexpr displacementT get_second_offset() const { return second_offset; }
-    constexpr displacementT get_third_offset() const { return third_offset; }
-    constexpr displacementT get_fourth_offset() const { return fourth_offset; }
-
-private:
-    displacementT first_offset = 0;
-    displacementT second_offset = 0;
-    displacementT third_offset = 0;
-    displacementT fourth_offset = 0;
-};
-
-struct FourOffsets_StridedIndexer
-{
-    constexpr FourOffsets_StridedIndexer(int common_nd,
-                                         ssize_t first_offset_,
-                                         ssize_t second_offset_,
-                                         ssize_t third_offset_,
-                                         ssize_t fourth_offset_,
-                                         ssize_t const *_packed_shape_strides)
-        : nd(common_nd), starting_first_offset(first_offset_),
-          starting_second_offset(second_offset_),
-          starting_third_offset(third_offset_),
-          starting_fourth_offset(fourth_offset_),
-          shape_strides(_packed_shape_strides)
-    {
-    }
-
-    constexpr FourOffsets<ssize_t> operator()(ssize_t gid) const
-    {
-        return compute_offsets(gid);
-    }
-
-    constexpr FourOffsets<ssize_t> operator()(std::size_t gid) const
-    {
-        return compute_offsets(static_cast<ssize_t>(gid));
-    }
-
-private:
-    int nd;
-    ssize_t starting_first_offset;
-    ssize_t starting_second_offset;
-    ssize_t starting_third_offset;
-    ssize_t starting_fourth_offset;
-    ssize_t const *shape_strides;
-
-    FourOffsets<ssize_t> compute_offsets(ssize_t gid) const
-    {
-        using dpctl::tensor::strides::CIndexer_vector;
-
-        CIndexer_vector _ind(nd);
-        ssize_t relative_first_offset(0);
-        ssize_t relative_second_offset(0);
-        ssize_t relative_third_offset(0);
-        ssize_t relative_fourth_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid,
-            shape_strides,          // shape ptr
-            shape_strides + nd,     // strides ptr
-            shape_strides + 2 * nd, // strides ptr
-            shape_strides + 3 * nd, // strides ptr
-            shape_strides + 4 * nd, // strides ptr
-            relative_first_offset, relative_second_offset,
-            relative_third_offset, relative_fourth_offset);
-        return FourOffsets<ssize_t>(
-            starting_first_offset + relative_first_offset,
-            starting_second_offset + relative_second_offset,
-            starting_third_offset + relative_third_offset,
-            starting_fourth_offset + relative_fourth_offset);
-    }
-};
-
-static_assert(sycl::is_device_copyable_v<FourOffsets_StridedIndexer>);
-
-struct FourZeroOffsets_Indexer
-{
-    constexpr FourZeroOffsets_Indexer() {}
-
-    constexpr FourOffsets<ssize_t> operator()(ssize_t) const
-    {
-        return FourOffsets<ssize_t>();
-    }
-};
-
-static_assert(sycl::is_device_copyable_v<FourZeroOffsets_Indexer>);
-
-struct NthStrideOffset
-{
-    NthStrideOffset(int common_nd,
-                    ssize_t const *_offsets,
-                    ssize_t const *_packed_shape_strides)
-        : _ind(common_nd), nd(common_nd), offsets(_offsets),
-          shape_strides(_packed_shape_strides)
-    {
-    }
-
-    std::size_t operator()(ssize_t gid, int n) const
-    {
-        ssize_t relative_offset(0);
-        _ind.get_displacement<const ssize_t *, const ssize_t *>(
-            gid, shape_strides, shape_strides + ((n + 1) * nd),
-            relative_offset);
-
-        return relative_offset + offsets[n];
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_vector<ssize_t> _ind;
-
-    int nd;
-    ssize_t const *offsets;
-    ssize_t const *shape_strides;
-};
-
-static_assert(sycl::is_device_copyable_v<NthStrideOffset>);
-
-template <int nd> struct FixedDimStridedIndexer
-{
-    FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
-                           const std::array<ssize_t, nd> &_strides,
-                           ssize_t _offset)
-        : _ind(_shape), strides(_strides), starting_offset(_offset)
-    {
-    }
-    std::size_t operator()(std::size_t gid) const
-    {
-        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
-            std::move(_ind));
-        local_indexer.set(gid);
-        auto mi = local_indexer.get();
-
-        ssize_t relative_offset = 0;
-
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset += mi[i] * strides[i];
-        }
-        return starting_offset + relative_offset;
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
-
-    std::array<ssize_t, nd> strides;
-    ssize_t starting_offset;
-};
-
-static_assert(sycl::is_device_copyable_v<FixedDimStridedIndexer<1>>);
-
-template <int nd> struct TwoOffsets_FixedDimStridedIndexer
-{
-    TwoOffsets_FixedDimStridedIndexer(const std::array<ssize_t, nd> &_shape,
-                                      const std::array<ssize_t, nd> &_strides1,
-                                      const std::array<ssize_t, nd> &_strides2,
-                                      ssize_t _offset1,
-                                      ssize_t _offset2)
-        : _ind(_shape), strides1(_strides1), strides2(_strides2),
-          starting_offset1(_offset1), starting_offset2(_offset2)
-    {
-    }
-
-    TwoOffsets<ssize_t> operator()(std::size_t gid) const
-    {
-        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
-            std::move(_ind));
-        local_indexer.set(gid);
-        auto mi = local_indexer.get();
-
-        ssize_t relative_offset1 = 0;
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset1 += mi[i] * strides1[i];
-        }
-
-        ssize_t relative_offset2 = 0;
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset2 += mi[i] * strides2[i];
-        }
-
-        return TwoOffsets<ssize_t>(starting_offset1 + relative_offset1,
-                                   starting_offset2 + relative_offset2);
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
-
-    std::array<ssize_t, nd> strides1;
-    std::array<ssize_t, nd> strides2;
-    ssize_t starting_offset1;
-    ssize_t starting_offset2;
-};
-
-static_assert(sycl::is_device_copyable_v<TwoOffsets_FixedDimStridedIndexer<1>>);
-
-template <int nd> struct ThreeOffsets_FixedDimStridedIndexer
-{
-    ThreeOffsets_FixedDimStridedIndexer(
-        const std::array<ssize_t, nd> &_shape,
-        const std::array<ssize_t, nd> &_strides1,
-        const std::array<ssize_t, nd> &_strides2,
-        const std::array<ssize_t, nd> &_strides3,
-        ssize_t _offset1,
-        ssize_t _offset2,
-        ssize_t _offset3)
-        : _ind(_shape), strides1(_strides1), strides2(_strides2),
-          strides3(_strides3), starting_offset1(_offset1),
-          starting_offset2(_offset2), starting_offset3(_offset3)
-    {
-    }
-
-    ThreeOffsets<ssize_t> operator()(std::size_t gid) const
-    {
-        dpctl::tensor::strides::CIndexer_array<nd, ssize_t> local_indexer(
-            std::move(_ind));
-        local_indexer.set(gid);
-        auto mi = local_indexer.get();
-
-        ssize_t relative_offset1 = 0;
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset1 += mi[i] * strides1[i];
-        }
-
-        ssize_t relative_offset2 = 0;
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset2 += mi[i] * strides2[i];
-        }
-
-        ssize_t relative_offset3 = 0;
-#pragma unroll
-        for (int i = 0; i < nd; ++i) {
-            relative_offset3 += mi[i] * strides3[i];
-        }
-
-        return ThreeOffsets<ssize_t>(starting_offset1 + relative_offset1,
-                                     starting_offset2 + relative_offset2,
-                                     starting_offset3 + relative_offset3);
-    }
-
-private:
-    dpctl::tensor::strides::CIndexer_array<nd, ssize_t> _ind;
-
-    std::array<ssize_t, nd> strides1;
-    std::array<ssize_t, nd> strides2;
-    std::array<ssize_t, nd> strides3;
-    ssize_t starting_offset1;
-    ssize_t starting_offset2;
-    ssize_t starting_offset3;
-};
-
-static_assert(
-    sycl::is_device_copyable_v<ThreeOffsets_FixedDimStridedIndexer<1>>);
-
-} // namespace offset_utils
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl/tensor/libtensor/include/utils/output_validation.hpp
deleted file mode 100644
index 77658a4575..0000000000
--- a/dpctl/tensor/libtensor/include/utils/output_validation.hpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//===- output_validation.hpp - Utilities for output array validation
-//-*-C++-*===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines utilities for determining if an array is a valid output
-/// array.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <stdexcept>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-
-namespace tensor
-{
-
-namespace validation
-{
-
-/*! @brief Raises a value error if an array is read-only.
-
-    This should be called with an array before writing.*/
-struct CheckWritable
-{
-    static void throw_if_not_writable(const dpctl::tensor::usm_ndarray &arr)
-    {
-        if (!arr.is_writable()) {
-            throw py::value_error("output array is read-only.");
-        }
-        return;
-    }
-};
-
-/*! @brief Raises a value error if an array's memory is not sufficiently ample
-    to accommodate an input number of elements.
-
-    This should be called with an array before writing.*/
-struct AmpleMemory
-{
-    template <typename T>
-    static void throw_if_not_ample(const dpctl::tensor::usm_ndarray &arr,
-                                   T nelems)
-    {
-        auto arr_offsets = arr.get_minmax_offsets();
-        T range = static_cast<T>(arr_offsets.second - arr_offsets.first);
-        if (range + 1 < nelems) {
-            throw py::value_error("Memory addressed by the output array is not "
-                                  "sufficiently ample.");
-        }
-        return;
-    }
-};
-
-} // namespace validation
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpctl/tensor/libtensor/include/utils/rich_comparisons.hpp
deleted file mode 100644
index beaa1b5dd7..0000000000
--- a/dpctl/tensor/libtensor/include/utils/rich_comparisons.hpp
+++ /dev/null
@@ -1,137 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cmath>
-#include <complex>
-#include <type_traits>
-
-#include "sycl/sycl.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace rich_comparisons
-{
-
-namespace detail
-{
-template <typename fpT> struct ExtendedRealFPLess
-{
-    /* [R, nan] */
-    bool operator()(const fpT v1, const fpT v2) const
-    {
-        return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2)));
-    }
-};
-
-template <typename fpT> struct ExtendedRealFPGreater
-{
-    bool operator()(const fpT v1, const fpT v2) const
-    {
-        return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1)));
-    }
-};
-
-template <typename cT> struct ExtendedComplexFPLess
-{
-    /* [(R, R), (R, nan), (nan, R), (nan, nan)] */
-
-    bool operator()(const cT &v1, const cT &v2) const
-    {
-        using realT = typename cT::value_type;
-
-        const realT real1 = std::real(v1);
-        const realT real2 = std::real(v2);
-
-        const bool r1_nan = std::isnan(real1);
-        const bool r2_nan = std::isnan(real2);
-
-        const realT imag1 = std::imag(v1);
-        const realT imag2 = std::imag(v2);
-
-        const bool i1_nan = std::isnan(imag1);
-        const bool i2_nan = std::isnan(imag2);
-
-        const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0);
-        const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 1 : 0);
-
-        const bool res =
-            !(r1_nan && i1_nan) &&
-            ((idx1 < idx2) ||
-             ((idx1 == idx2) &&
-              ((r1_nan && !i1_nan && (imag1 < imag2)) ||
-               (!r1_nan && i1_nan && (real1 < real2)) ||
-               (!r1_nan && !i1_nan &&
-                ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2)))))));
-
-        return res;
-    }
-};
-
-template <typename cT> struct ExtendedComplexFPGreater
-{
-    bool operator()(const cT &v1, const cT &v2) const
-    {
-        auto less_ = ExtendedComplexFPLess<cT>{};
-        return less_(v2, v1);
-    }
-};
-
-template <typename T>
-inline constexpr bool is_fp_v =
-    (std::is_same_v<T, sycl::half> || std::is_same_v<T, float> ||
-     std::is_same_v<T, double>);
-
-} // end of namespace detail
-
-template <typename argTy> struct AscendingSorter
-{
-    using type = std::conditional_t<detail::is_fp_v<argTy>,
-                                    detail::ExtendedRealFPLess<argTy>,
-                                    std::less<argTy>>;
-};
-
-template <typename T> struct AscendingSorter<std::complex<T>>
-{
-    using type = detail::ExtendedComplexFPLess<std::complex<T>>;
-};
-
-template <typename argTy> struct DescendingSorter
-{
-    using type = std::conditional_t<detail::is_fp_v<argTy>,
-                                    detail::ExtendedRealFPGreater<argTy>,
-                                    std::greater<argTy>>;
-};
-
-template <typename T> struct DescendingSorter<std::complex<T>>
-{
-    using type = detail::ExtendedComplexFPGreater<std::complex<T>>;
-};
-
-} // end of namespace rich_comparisons
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
deleted file mode 100644
index 3b5ff9e5aa..0000000000
--- a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp
+++ /dev/null
@@ -1,985 +0,0 @@
-//===-- strided_iters.cpp - CIndexer classes for strided iteration ---*-C++-*-
-//===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines CIndexer_array, and CIndexer_vector classes, as well
-/// iteration space simplifiers.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <algorithm> // sort
-#include <array>
-#include <cstddef>
-#include <numeric> // std::iota
-#include <tuple>
-#include <vector>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace strides
-{
-
-/* An N-dimensional array can be stored in a single
- * contiguous chunk of memory by contiguously laying
- * array elements in lexicographinc order of their
- * array indices. Such a layout is called C-contiguous.
- *
- * E.g. for (2, 3, 2) array `a` with zero-based indexing convention
- * the C-array's elements are
- * { a[0,0,0], a[0,0,1], a[0,1,0], a[0,1,1], a[0,2,0], a[0,2,1],
- *   a[1,0,0], a[1,0,1], a[1,1,0], a[1,1,1], a[1,2,0], a[1,2,1] }
- *
- * Indexer maps zero-based index in C-array to a multi-index
- * for the purpose of computing element displacement in the
- * strided array, i.e. in the above example for k = 5, the displacement
- * is (s0*0 + s1*2 + s2*1), and for k = 7 it is (s0*1 + s1*0 + s2*1)
- * for N-dimensional array with strides (s0, s1, s2).
- *
- * Cindexer_vector need not know array rank `dim` at compile time.
- * Shape and strides are stored in std::vector, which are not trivially
- * copyable.
- *
- * For the class to be trivially copyable for offloading displacement
- * computation methods take accessor/pointer arguments of type T for
- * shape and stride and modify displacement argument passed by reference.
- */
-template <typename indT = std::ptrdiff_t> class CIndexer_vector
-{
-    static_assert(std::is_integral<indT>::value, "Integral type is required");
-    static_assert(std::is_signed<indT>::value,
-                  "Signed integral type is required");
-    int nd;
-
-public:
-    CIndexer_vector(int dim) : nd(dim) {}
-
-    template <class ShapeTy> indT size(const ShapeTy &shape) const
-    {
-        indT s = static_cast<indT>(1);
-        for (int i = 0; i < nd; ++i) {
-            s *= shape[i];
-        }
-        return s;
-    }
-
-    template <class ShapeTy, class StridesTy>
-    void get_displacement(const indT i,
-                          const ShapeTy &shape,
-                          const StridesTy &stride,
-                          indT &disp) const
-    {
-        if (nd == 1) {
-            disp = i * stride[0];
-            return;
-        }
-
-        indT i_ = i;
-        indT d = 0;
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            d += r * stride[dim];
-            i_ = q;
-        }
-        disp = d + i_ * stride[0];
-    }
-
-    template <class ShapeTy, class StridesTy>
-    void get_displacement(const indT i,
-                          const ShapeTy &shape,
-                          const StridesTy &stride1,
-                          const StridesTy &stride2,
-                          indT &disp1,
-                          indT &disp2) const
-    {
-        if (nd == 1) {
-            disp1 = i * stride1[0];
-            disp2 = i * stride2[0];
-            return;
-        }
-
-        indT i_ = i;
-        indT d1 = 0, d2 = 0;
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            i_ = q;
-            d1 += r * stride1[dim];
-            d2 += r * stride2[dim];
-        }
-        disp1 = d1 + i_ * stride1[0];
-        disp2 = d2 + i_ * stride2[0];
-        return;
-    }
-
-    template <class ShapeTy, class StridesTy>
-    void get_displacement(const indT i,
-                          const ShapeTy &shape,
-                          const StridesTy &stride1,
-                          const StridesTy &stride2,
-                          const StridesTy &stride3,
-                          indT &disp1,
-                          indT &disp2,
-                          indT &disp3) const
-    {
-        if (nd == 1) {
-            disp1 = i * stride1[0];
-            disp2 = i * stride2[0];
-            disp3 = i * stride3[0];
-            return;
-        }
-
-        indT i_ = i;
-        indT d1 = 0, d2 = 0, d3 = 0;
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            i_ = q;
-            d1 += r * stride1[dim];
-            d2 += r * stride2[dim];
-            d3 += r * stride3[dim];
-        };
-        disp1 = d1 + i_ * stride1[0];
-        disp2 = d2 + i_ * stride2[0];
-        disp3 = d3 + i_ * stride3[0];
-        return;
-    }
-
-    template <class ShapeTy, class StridesTy>
-    void get_displacement(const indT i,
-                          const ShapeTy &shape,
-                          const StridesTy &stride1,
-                          const StridesTy &stride2,
-                          const StridesTy &stride3,
-                          const StridesTy &stride4,
-                          indT &disp1,
-                          indT &disp2,
-                          indT &disp3,
-                          indT &disp4) const
-    {
-        if (nd == 1) {
-            disp1 = i * stride1[0];
-            disp2 = i * stride2[0];
-            disp3 = i * stride3[0];
-            disp4 = i * stride4[0];
-            return;
-        }
-
-        indT i_ = i;
-        indT d1 = 0, d2 = 0, d3 = 0, d4 = 0;
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            i_ = q;
-            d1 += r * stride1[dim];
-            d2 += r * stride2[dim];
-            d3 += r * stride3[dim];
-            d4 += r * stride4[dim];
-        }
-        disp1 = d1 + i_ * stride1[0];
-        disp2 = d2 + i_ * stride2[0];
-        disp3 = d3 + i_ * stride3[0];
-        disp4 = d4 + i_ * stride4[0];
-        return;
-    }
-
-    template <class ShapeTy, class StridesTy, int nstrides>
-    void get_displacement(const indT i,
-                          const ShapeTy &shape,
-                          const std::array<StridesTy, nstrides> &strides,
-                          std::array<indT, nstrides> &disps) const
-    {
-        if (nd == 1) {
-            for (int k = 0; k < nstrides; ++k) {
-                disps[k] = i * strides[k][0];
-            }
-            return;
-        }
-
-        indT i_ = i;
-        std::array<indT, nstrides> ds;
-        for (int k = 0; k < nstrides; ++k) {
-            ds[k] = 0;
-        }
-
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            for (int k = 0; k < nstrides; ++k) {
-                ds[k] += r * strides[k][dim];
-            }
-            i_ = q;
-        };
-        for (int k = 0; k < nstrides; ++k) {
-            disps[k] = ds[k] + i_ * strides[k][0];
-        }
-        return;
-    }
-
-    template <class ShapeTy, class StridesTy>
-    void get_left_rolled_displacement(const indT i,
-                                      const ShapeTy &shape,
-                                      const StridesTy &stride,
-                                      const StridesTy &shifts,
-                                      indT &disp) const
-    {
-        indT i_ = i;
-        indT d(0);
-        for (int dim = nd; --dim > 0;) {
-            const indT si = shape[dim];
-            const indT q = i_ / si;
-            const indT r = (i_ - q * si);
-            // assumes si > shifts[dim] >= 0
-            const indT shifted_r =
-                (r < shifts[dim] ? r + si - shifts[dim] : r - shifts[dim]);
-            d += shifted_r * stride[dim];
-            i_ = q;
-        }
-        const indT shifted_r =
-            (i_ < shifts[0] ? i_ + shape[0] - shifts[0] : i_ - shifts[0]);
-        disp = d + shifted_r * stride[0];
-    }
-};
-
-/*
- * CIndexer is for arrays whose array-rank is known at compile time.
- * Statically allocated shape and multi_index arrays are members of
- * the class instance, and it remains trivially copyable.
- *
- * Method `set(k)` populates work-item private array multi_index, which
- * can be accessed using `get()` to compute the displacement as needed.
- */
-
-template <int _ndim, typename indT = std::ptrdiff_t> class CIndexer_array
-{
-    static constexpr int ndim = _ndim;
-
-    static_assert(std::is_integral<indT>::value, "Integral type is required");
-    static_assert(std::is_signed<indT>::value,
-                  "Signed integral type is required");
-    static_assert(ndim > 0, "Dimensionality must be positive");
-
-private:
-    typedef std::array<indT, ndim> index_t;
-
-    indT elem_count;
-    index_t shape;
-    index_t multi_index;
-
-public:
-    CIndexer_array() : elem_count(0), shape{}, multi_index{} {}
-
-    explicit CIndexer_array(const index_t &input_shape)
-        : elem_count(0), shape{}, multi_index{}
-    {
-        indT s(1);
-        for (int i = 0; i < ndim; ++i) {
-            shape[i] = input_shape[i];
-            s *= input_shape[i];
-        }
-        elem_count = s;
-    }
-
-    indT size() const { return elem_count; }
-    indT rank() const { return ndim; }
-
-    void set(const indT i)
-    {
-        if (ndim == 1) {
-            multi_index[0] = i;
-            return;
-        }
-
-        indT i_ = i;
-#pragma unroll
-        for (int dim = ndim; --dim > 0;) {
-            indT si = shape[dim];
-            indT q = i_ / si;
-            multi_index[dim] = i_ - q * si;
-            i_ = q;
-        }
-        multi_index[0] = i_;
-    }
-
-    const index_t &get() const { return multi_index; }
-};
-
-/*
-    For purposes of iterating over elements of array with
-    `shape` and `strides` given as pointers
-    `simplify_iteration_strides(nd, shape_ptr, strides_ptr, disp)`
-    may modify memory and returns new length of these arrays.
-
-    The new shape and new strides, as well as the offset
-    `(new_shape, new_strides, disp)` are such that iterating over
-    them will traverse the same elements, possibly in
-    different order.
-
-    ..Example: python
-        import itertools
-        # for some array Y over whose elements we iterate
-        csh, cst, cp = contract_iter(Y.shape, Y.strides)
-        def pointers_set(sh, st, p):
-            citers = itertools.product(*map(lambda s: range(s), sh))
-            dot = lambda st, it: sum(st[k]*it[k] for k in range(len(st)))
-            return set(p + dot(st, it) for it in citers)
-        ps1 = pointers_set(csh, cst, cp)
-        ps2 = pointers_set(Y.shape, Y.strides, 0)
-        assert ps1 == ps2
-
- */
-template <class ShapeTy, class StridesTy>
-int simplify_iteration_stride(const int nd,
-                              ShapeTy *shape,
-                              StridesTy *strides,
-                              StridesTy &disp)
-{
-    disp = StridesTy(0);
-    if (nd < 2)
-        return nd;
-
-    std::vector<int> pos(nd);
-    std::iota(pos.begin(), pos.end(), 0);
-
-    std::stable_sort(
-        pos.begin(), pos.end(), [&strides, &shape](int i1, int i2) {
-            auto abs_str1 = (strides[i1] < 0) ? -strides[i1] : strides[i1];
-            auto abs_str2 = (strides[i2] < 0) ? -strides[i2] : strides[i2];
-            return (abs_str1 > abs_str2) ||
-                   (abs_str1 == abs_str2 && shape[i1] > shape[i2]);
-        });
-
-    std::vector<ShapeTy> shape_w;
-    std::vector<StridesTy> strides_w;
-    int nd_ = nd;
-    shape_w.reserve(nd_);
-    strides_w.reserve(nd_);
-
-    for (int i = 0; i < nd; ++i) {
-        auto p = pos[i];
-        auto sh_p = shape[p];
-        auto str_p = strides[p];
-        shape_w.push_back(sh_p);
-        if (str_p < 0) {
-            disp += str_p * (sh_p - 1);
-            str_p = -str_p;
-        }
-        strides_w.push_back(str_p);
-    }
-
-    {
-        bool changed;
-        do {
-            changed = false;
-            for (int i = 0; i + 1 < nd_; ++i) {
-                StridesTy step = strides_w[i + 1];
-                StridesTy jump = strides_w[i] - (shape_w[i + 1] - 1) * step;
-                if (jump == step) {
-                    changed = true;
-                    for (int k = i; k + 1 < nd_; ++k) {
-                        strides_w[k] = strides_w[k + 1];
-                    }
-                    shape_w[i] *= shape_w[i + 1];
-                    for (int k = i + 1; k + 1 < nd_; ++k) {
-                        shape_w[k] = shape_w[k + 1];
-                    }
-                    --nd_;
-                }
-            }
-        } while (changed);
-    }
-
-    for (int i = 0; i < nd_; ++i) {
-        shape[i] = shape_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides[i] = strides_w[i];
-    }
-
-    return nd_;
-}
-
-/*
-    For purposes of iterating over pairs of elements of two arrays
-    with  `shape` and strides `strides1`, `strides2` given as pointers
-    `simplify_iteration_two_strides(nd, shape_ptr, strides1_ptr,
-    strides2_ptr, disp1, disp2)`
-    may modify memory and returns new length of these arrays.
-
-    The new shape and new strides, as well as the offset
-    `(new_shape, new_strides1, disp1, new_stride2, disp2)` are such that
-    iterating over them will traverse the same set of pairs of elements,
-    possibly in a different order.
- */
-template <class ShapeTy, class StridesTy>
-int simplify_iteration_two_strides(const int nd,
-                                   ShapeTy *shape,
-                                   StridesTy *strides1,
-                                   StridesTy *strides2,
-                                   StridesTy &disp1,
-                                   StridesTy &disp2)
-{
-    disp1 = StridesTy(0);
-    disp2 = StridesTy(0);
-    if (nd < 2)
-        return nd;
-
-    std::vector<int> pos(nd);
-    std::iota(pos.begin(), pos.end(), 0);
-
-    std::stable_sort(
-        pos.begin(), pos.end(), [&strides1, &strides2, &shape](int i1, int i2) {
-            auto abs_str1_i1 =
-                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
-            auto abs_str1_i2 =
-                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
-            auto abs_str2_i1 =
-                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
-            auto abs_str2_i2 =
-                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
-            return (abs_str2_i1 > abs_str2_i2) ||
-                   (abs_str2_i1 == abs_str2_i2 &&
-                    (abs_str1_i1 > abs_str1_i2 ||
-                     (abs_str1_i1 == abs_str1_i2 && shape[i1] > shape[i2])));
-        });
-
-    std::vector<ShapeTy> shape_w;
-    std::vector<StridesTy> strides1_w;
-    std::vector<StridesTy> strides2_w;
-
-    bool contractable = true;
-    for (int i = 0; i < nd; ++i) {
-        auto p = pos[i];
-        auto sh_p = shape[p];
-        auto str1_p = strides1[p];
-        auto str2_p = strides2[p];
-        shape_w.push_back(sh_p);
-        if (str1_p <= 0 && str2_p <= 0 && std::min(str1_p, str2_p) < 0) {
-            disp1 += str1_p * (sh_p - 1);
-            str1_p = -str1_p;
-            disp2 += str2_p * (sh_p - 1);
-            str2_p = -str2_p;
-        }
-        if (str1_p < 0 || str2_p < 0) {
-            contractable = false;
-        }
-        strides1_w.push_back(str1_p);
-        strides2_w.push_back(str2_p);
-    }
-
-    int nd_ = nd;
-    while (contractable) {
-        bool changed = false;
-        for (int i = 0; i + 1 < nd_; ++i) {
-            StridesTy str1 = strides1_w[i + 1];
-            StridesTy str2 = strides2_w[i + 1];
-            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
-            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
-
-            if (jump1 == str1 && jump2 == str2) {
-                changed = true;
-                shape_w[i] *= shape_w[i + 1];
-                for (int j = i; j < nd_; ++j) {
-                    strides1_w[j] = strides1_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides2_w[j] = strides2_w[j + 1];
-                }
-                for (int j = i + 1; j + 1 < nd_; ++j) {
-                    shape_w[j] = shape_w[j + 1];
-                }
-                --nd_;
-                break;
-            }
-        }
-        if (!changed)
-            break;
-    }
-    for (int i = 0; i < nd_; ++i) {
-        shape[i] = shape_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides1[i] = strides1_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides2[i] = strides2_w[i];
-    }
-
-    return nd_;
-}
-
-template <typename T, class Error, typename vecT = std::vector<T>>
-std::tuple<vecT, vecT, T> contract_iter(const vecT &shape, const vecT &strides)
-{
-    const std::size_t dim = shape.size();
-    if (dim != strides.size()) {
-        throw Error("Shape and strides must be of equal size.");
-    }
-    vecT out_shape = shape;
-    vecT out_strides = strides;
-    T disp(0);
-
-    int nd = simplify_iteration_stride(dim, out_shape.data(),
-                                       out_strides.data(), disp);
-    out_shape.resize(nd);
-    out_strides.resize(nd);
-    return std::make_tuple(out_shape, out_strides, disp);
-}
-
-template <typename T, class Error, typename vecT = std::vector<T>>
-std::tuple<vecT, vecT, T, vecT, T>
-contract_iter2(const vecT &shape, const vecT &strides1, const vecT &strides2)
-{
-    const std::size_t dim = shape.size();
-    if (dim != strides1.size() || dim != strides2.size()) {
-        throw Error("Shape and strides must be of equal size.");
-    }
-    vecT out_shape = shape;
-    vecT out_strides1 = strides1;
-    vecT out_strides2 = strides2;
-    T disp1(0);
-    T disp2(0);
-
-    int nd = simplify_iteration_two_strides(dim, out_shape.data(),
-                                            out_strides1.data(),
-                                            out_strides2.data(), disp1, disp2);
-    out_shape.resize(nd);
-    out_strides1.resize(nd);
-    out_strides2.resize(nd);
-    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2);
-}
-
-/*
-    For purposes of iterating over pairs of elements of three arrays
-    with  `shape` and strides `strides1`, `strides2`, `strides3` given as
-    pointers `simplify_iteration_three_strides(nd, shape_ptr, strides1_ptr,
-    strides2_ptr, strides3_ptr, disp1, disp2, disp3)`
-    may modify memory and returns new length of these arrays.
-
-    The new shape and new strides, as well as the offset
-    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3)`
-    are such that iterating over them will traverse the same set of tuples of
-    elements, possibly in a different order.
- */
-template <class ShapeTy, class StridesTy>
-int simplify_iteration_three_strides(const int nd,
-                                     ShapeTy *shape,
-                                     StridesTy *strides1,
-                                     StridesTy *strides2,
-                                     StridesTy *strides3,
-                                     StridesTy &disp1,
-                                     StridesTy &disp2,
-                                     StridesTy &disp3)
-{
-    disp1 = StridesTy(0);
-    disp2 = StridesTy(0);
-    if (nd < 2)
-        return nd;
-
-    std::vector<int> pos(nd);
-    std::iota(pos.begin(), pos.end(), 0);
-
-    std::stable_sort(pos.begin(), pos.end(),
-                     [&strides1, &strides2, &strides3, &shape](int i1, int i2) {
-                         auto abs_str1_i1 =
-                             (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
-                         auto abs_str1_i2 =
-                             (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
-                         auto abs_str2_i1 =
-                             (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
-                         auto abs_str2_i2 =
-                             (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
-                         auto abs_str3_i1 =
-                             (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
-                         auto abs_str3_i2 =
-                             (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
-                         return (abs_str3_i1 > abs_str3_i2) ||
-                                ((abs_str3_i1 == abs_str3_i2) &&
-                                 ((abs_str2_i1 > abs_str2_i2) ||
-                                  ((abs_str2_i1 == abs_str2_i2) &&
-                                   ((abs_str1_i1 > abs_str1_i2) ||
-                                    ((abs_str1_i1 == abs_str1_i2) &&
-                                     (shape[i1] > shape[i2]))))));
-                     });
-
-    std::vector<ShapeTy> shape_w;
-    std::vector<StridesTy> strides1_w;
-    std::vector<StridesTy> strides2_w;
-    std::vector<StridesTy> strides3_w;
-
-    bool contractable = true;
-    for (int i = 0; i < nd; ++i) {
-        auto p = pos[i];
-        auto sh_p = shape[p];
-        auto str1_p = strides1[p];
-        auto str2_p = strides2[p];
-        auto str3_p = strides3[p];
-        shape_w.push_back(sh_p);
-        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p}) < 0)
-        {
-            disp1 += str1_p * (sh_p - 1);
-            str1_p = -str1_p;
-            disp2 += str2_p * (sh_p - 1);
-            str2_p = -str2_p;
-            disp3 += str3_p * (sh_p - 1);
-            str3_p = -str3_p;
-        }
-        if (str1_p < 0 || str2_p < 0 || str3_p < 0) {
-            contractable = false;
-        }
-        strides1_w.push_back(str1_p);
-        strides2_w.push_back(str2_p);
-        strides3_w.push_back(str3_p);
-    }
-    int nd_ = nd;
-    while (contractable) {
-        bool changed = false;
-        for (int i = 0; i + 1 < nd_; ++i) {
-            StridesTy str1 = strides1_w[i + 1];
-            StridesTy str2 = strides2_w[i + 1];
-            StridesTy str3 = strides3_w[i + 1];
-            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
-            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
-            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
-
-            if (jump1 == str1 && jump2 == str2 && jump3 == str3) {
-                changed = true;
-                shape_w[i] *= shape_w[i + 1];
-                for (int j = i; j < nd_; ++j) {
-                    strides1_w[j] = strides1_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides2_w[j] = strides2_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides3_w[j] = strides3_w[j + 1];
-                }
-                for (int j = i + 1; j + 1 < nd_; ++j) {
-                    shape_w[j] = shape_w[j + 1];
-                }
-                --nd_;
-                break;
-            }
-        }
-        if (!changed)
-            break;
-    }
-    for (int i = 0; i < nd_; ++i) {
-        shape[i] = shape_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides1[i] = strides1_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides2[i] = strides2_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides3[i] = strides3_w[i];
-    }
-
-    return nd_;
-}
-
-template <typename T, class Error, typename vecT = std::vector<T>>
-std::tuple<vecT, vecT, T, vecT, T, vecT, T> contract_iter3(const vecT &shape,
-                                                           const vecT &strides1,
-                                                           const vecT &strides2,
-                                                           const vecT &strides3)
-{
-    const std::size_t dim = shape.size();
-    if (dim != strides1.size() || dim != strides2.size() ||
-        dim != strides3.size())
-    {
-        throw Error("Shape and strides must be of equal size.");
-    }
-    vecT out_shape = shape;
-    vecT out_strides1 = strides1;
-    vecT out_strides2 = strides2;
-    vecT out_strides3 = strides3;
-    T disp1(0);
-    T disp2(0);
-    T disp3(0);
-
-    int nd = simplify_iteration_three_strides(
-        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
-        out_strides3.data(), disp1, disp2, disp3);
-    out_shape.resize(nd);
-    out_strides1.resize(nd);
-    out_strides2.resize(nd);
-    out_strides3.resize(nd);
-    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
-                           out_strides3, disp3);
-}
-
-/*
-    For purposes of iterating over pairs of elements of four arrays
-    with  `shape` and strides `strides1`, `strides2`, `strides3`,
-    `strides4` given as pointers `simplify_iteration_four_strides(nd,
-    shape_ptr, strides1_ptr, strides2_ptr, strides3_ptr, strides4_ptr,
-    disp1, disp2, disp3, disp4)` may modify memory and returns new
-    length of these arrays.
-
-    The new shape and new strides, as well as the offset
-    `(new_shape, new_strides1, disp1, new_stride2, disp2, new_stride3, disp3,
-    new_stride4, disp4)` are such that iterating over them will traverse the
-    same set of tuples of elements, possibly in a different order.
- */
-template <class ShapeTy, class StridesTy>
-int simplify_iteration_four_strides(const int nd,
-                                    ShapeTy *shape,
-                                    StridesTy *strides1,
-                                    StridesTy *strides2,
-                                    StridesTy *strides3,
-                                    StridesTy *strides4,
-                                    StridesTy &disp1,
-                                    StridesTy &disp2,
-                                    StridesTy &disp3,
-                                    StridesTy &disp4)
-{
-    disp1 = StridesTy(0);
-    disp2 = StridesTy(0);
-    if (nd < 2)
-        return nd;
-
-    std::vector<int> pos(nd);
-    std::iota(pos.begin(), pos.end(), 0);
-
-    std::stable_sort(
-        pos.begin(), pos.end(),
-        [&strides1, &strides2, &strides3, &strides4, &shape](int i1, int i2) {
-            auto abs_str1_i1 =
-                (strides1[i1] < 0) ? -strides1[i1] : strides1[i1];
-            auto abs_str1_i2 =
-                (strides1[i2] < 0) ? -strides1[i2] : strides1[i2];
-            auto abs_str2_i1 =
-                (strides2[i1] < 0) ? -strides2[i1] : strides2[i1];
-            auto abs_str2_i2 =
-                (strides2[i2] < 0) ? -strides2[i2] : strides2[i2];
-            auto abs_str3_i1 =
-                (strides3[i1] < 0) ? -strides3[i1] : strides3[i1];
-            auto abs_str3_i2 =
-                (strides3[i2] < 0) ? -strides3[i2] : strides3[i2];
-            auto abs_str4_i1 =
-                (strides4[i1] < 0) ? -strides4[i1] : strides4[i1];
-            auto abs_str4_i2 =
-                (strides4[i2] < 0) ? -strides4[i2] : strides4[i2];
-            return (abs_str4_i1 > abs_str4_i2) ||
-                   ((abs_str4_i1 == abs_str4_i2) &&
-                    ((abs_str3_i1 > abs_str3_i2) ||
-                     ((abs_str3_i1 == abs_str3_i2) &&
-                      ((abs_str2_i1 > abs_str2_i2) ||
-                       ((abs_str2_i1 == abs_str2_i2) &&
-                        ((abs_str1_i1 > abs_str1_i2) ||
-                         ((abs_str1_i1 == abs_str1_i2) &&
-                          (shape[i1] > shape[i2]))))))));
-        });
-
-    std::vector<ShapeTy> shape_w;
-    std::vector<StridesTy> strides1_w;
-    std::vector<StridesTy> strides2_w;
-    std::vector<StridesTy> strides3_w;
-    std::vector<StridesTy> strides4_w;
-
-    bool contractable = true;
-    for (int i = 0; i < nd; ++i) {
-        auto p = pos[i];
-        auto sh_p = shape[p];
-        auto str1_p = strides1[p];
-        auto str2_p = strides2[p];
-        auto str3_p = strides3[p];
-        auto str4_p = strides4[p];
-        shape_w.push_back(sh_p);
-        if (str1_p <= 0 && str2_p <= 0 && str3_p <= 0 && str4_p <= 0 &&
-            std::min({str1_p, str2_p, str3_p, str4_p}) < 0)
-        {
-            disp1 += str1_p * (sh_p - 1);
-            str1_p = -str1_p;
-            disp2 += str2_p * (sh_p - 1);
-            str2_p = -str2_p;
-            disp3 += str3_p * (sh_p - 1);
-            str3_p = -str3_p;
-            disp4 += str4_p * (sh_p - 1);
-            str4_p = -str4_p;
-        }
-        if (str1_p < 0 || str2_p < 0 || str3_p < 0 || str4_p < 0) {
-            contractable = false;
-        }
-        strides1_w.push_back(str1_p);
-        strides2_w.push_back(str2_p);
-        strides3_w.push_back(str3_p);
-        strides4_w.push_back(str4_p);
-    }
-    int nd_ = nd;
-    while (contractable) {
-        bool changed = false;
-        for (int i = 0; i + 1 < nd_; ++i) {
-            StridesTy str1 = strides1_w[i + 1];
-            StridesTy str2 = strides2_w[i + 1];
-            StridesTy str3 = strides3_w[i + 1];
-            StridesTy str4 = strides4_w[i + 1];
-            StridesTy jump1 = strides1_w[i] - (shape_w[i + 1] - 1) * str1;
-            StridesTy jump2 = strides2_w[i] - (shape_w[i + 1] - 1) * str2;
-            StridesTy jump3 = strides3_w[i] - (shape_w[i + 1] - 1) * str3;
-            StridesTy jump4 = strides4_w[i] - (shape_w[i + 1] - 1) * str4;
-
-            if (jump1 == str1 && jump2 == str2 && jump3 == str3 &&
-                jump4 == str4)
-            {
-                changed = true;
-                shape_w[i] *= shape_w[i + 1];
-                for (int j = i; j < nd_; ++j) {
-                    strides1_w[j] = strides1_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides2_w[j] = strides2_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides3_w[j] = strides3_w[j + 1];
-                }
-                for (int j = i; j < nd_; ++j) {
-                    strides4_w[j] = strides4_w[j + 1];
-                }
-                for (int j = i + 1; j + 1 < nd_; ++j) {
-                    shape_w[j] = shape_w[j + 1];
-                }
-                --nd_;
-                break;
-            }
-        }
-        if (!changed)
-            break;
-    }
-    for (int i = 0; i < nd_; ++i) {
-        shape[i] = shape_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides1[i] = strides1_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides2[i] = strides2_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides3[i] = strides3_w[i];
-    }
-    for (int i = 0; i < nd_; ++i) {
-        strides4[i] = strides4_w[i];
-    }
-
-    return nd_;
-}
-
-template <typename T, class Error, typename vecT = std::vector<T>>
-std::tuple<vecT, vecT, T, vecT, T, vecT, T, vecT, T>
-contract_iter4(const vecT &shape,
-               const vecT &strides1,
-               const vecT &strides2,
-               const vecT &strides3,
-               const vecT &strides4)
-{
-    const std::size_t dim = shape.size();
-    if (dim != strides1.size() || dim != strides2.size() ||
-        dim != strides3.size() || dim != strides4.size())
-    {
-        throw Error("Shape and strides must be of equal size.");
-    }
-    vecT out_shape = shape;
-    vecT out_strides1 = strides1;
-    vecT out_strides2 = strides2;
-    vecT out_strides3 = strides3;
-    vecT out_strides4 = strides4;
-    T disp1(0);
-    T disp2(0);
-    T disp3(0);
-    T disp4(0);
-
-    int nd = simplify_iteration_four_strides(
-        dim, out_shape.data(), out_strides1.data(), out_strides2.data(),
-        out_strides3.data(), out_strides4.data(), disp1, disp2, disp3, disp4);
-    out_shape.resize(nd);
-    out_strides1.resize(nd);
-    out_strides2.resize(nd);
-    out_strides3.resize(nd);
-    out_strides4.resize(nd);
-    return std::make_tuple(out_shape, out_strides1, disp1, out_strides2, disp2,
-                           out_strides3, disp3, out_strides4, disp4);
-}
-
-/*
-    For purposes of iterating over elements of an array with  `shape` and
-    strides `strides` given as pointers `compact_iteration(nd, shape, strides)`
-    may modify memory and returns the new length of the array.
-
-    The new shape and new strides `(new_shape, new_strides)` are such that
-    iterating over them will traverse the same elements in the same order,
-    possibly with reduced dimensionality.
- */
-template <class ShapeTy, class StridesTy>
-int compact_iteration(const int nd, ShapeTy *shape, StridesTy *strides)
-{
-    if (nd < 2)
-        return nd;
-
-    bool contractable = true;
-    for (int i = 0; i < nd; ++i) {
-        if (strides[i] < 0) {
-            contractable = false;
-        }
-    }
-
-    int nd_ = nd;
-    while (contractable) {
-        bool changed = false;
-        for (int i = 0; i + 1 < nd_; ++i) {
-            StridesTy str = strides[i + 1];
-            StridesTy jump = strides[i] - (shape[i + 1] - 1) * str;
-
-            if (jump == str) {
-                changed = true;
-                shape[i] *= shape[i + 1];
-                for (int j = i; j < nd_; ++j) {
-                    strides[j] = strides[j + 1];
-                }
-                for (int j = i + 1; j + 1 < nd_; ++j) {
-                    shape[j] = shape[j + 1];
-                }
-                --nd_;
-                break;
-            }
-        }
-        if (!changed)
-            break;
-    }
-
-    return nd_;
-}
-
-} // namespace strides
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
deleted file mode 100644
index 90a6e204ad..0000000000
--- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp
+++ /dev/null
@@ -1,220 +0,0 @@
-//===-- sycl_alloc_utils.cpp - Allocation utilities ---*-C++-*- ----------====//
-//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines CIndexer_array, and CIndexer_vector classes, as well
-/// iteration space simplifiers.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>     // for std::size_t
-#include <exception>   // for std::exception
-#include <iostream>    // for std::cerr
-#include <memory>      // for std::unique_ptr
-#include <stdexcept>   // for std::runtime_error
-#include <type_traits> // for std::true_type, std::false_type
-#include <utility>     // for std::move
-#include <vector>
-
-#include "sycl/sycl.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace alloc_utils
-{
-
-template <typename T>
-class usm_host_allocator : public sycl::usm_allocator<T, sycl::usm::alloc::host>
-{
-public:
-    using baseT = sycl::usm_allocator<T, sycl::usm::alloc::host>;
-    using baseT::baseT;
-
-    template <typename U> struct rebind
-    {
-        typedef usm_host_allocator<U> other;
-    };
-
-    void deallocate(T *ptr, std::size_t n)
-    {
-        try {
-            baseT::deallocate(ptr, n);
-        } catch (const std::exception &e) {
-            std::cerr
-                << "Exception caught in `usm_host_allocator::deallocate`: "
-                << e.what() << std::endl;
-        }
-    }
-};
-
-template <typename T>
-void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept
-{
-    try {
-        sycl::free(ptr, ctx);
-    } catch (const std::exception &e) {
-        std::cerr << "Call to sycl::free caught exception: " << e.what()
-                  << std::endl;
-    }
-}
-
-template <typename T>
-void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept
-{
-    sycl_free_noexcept(ptr, q.get_context());
-}
-
-class USMDeleter
-{
-private:
-    sycl::context ctx_;
-
-public:
-    USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {}
-    USMDeleter(const sycl::context &ctx) : ctx_(ctx) {}
-
-    template <typename T> void operator()(T *ptr) const
-    {
-        sycl_free_noexcept(ptr, ctx_);
-    }
-};
-
-template <typename T>
-std::unique_ptr<T, USMDeleter>
-smart_malloc(std::size_t count,
-             const sycl::queue &q,
-             sycl::usm::alloc kind,
-             const sycl::property_list &propList = {})
-{
-    T *ptr = sycl::malloc<T>(count, q, kind, propList);
-    if (nullptr == ptr) {
-        throw std::runtime_error("Unable to allocate device_memory");
-    }
-
-    auto usm_deleter = USMDeleter(q);
-    return std::unique_ptr<T, USMDeleter>(ptr, usm_deleter);
-}
-
-template <typename T>
-std::unique_ptr<T, USMDeleter>
-smart_malloc_device(std::size_t count,
-                    const sycl::queue &q,
-                    const sycl::property_list &propList = {})
-{
-    return smart_malloc<T>(count, q, sycl::usm::alloc::device, propList);
-}
-
-template <typename T>
-std::unique_ptr<T, USMDeleter>
-smart_malloc_shared(std::size_t count,
-                    const sycl::queue &q,
-                    const sycl::property_list &propList = {})
-{
-    return smart_malloc<T>(count, q, sycl::usm::alloc::shared, propList);
-}
-
-template <typename T>
-std::unique_ptr<T, USMDeleter>
-smart_malloc_host(std::size_t count,
-                  const sycl::queue &q,
-                  const sycl::property_list &propList = {})
-{
-    return smart_malloc<T>(count, q, sycl::usm::alloc::host, propList);
-}
-
-namespace detail
-{
-template <typename T> struct valid_smart_ptr : public std::false_type
-{
-};
-
-template <typename ValT, typename DelT>
-struct valid_smart_ptr<std::unique_ptr<ValT, DelT> &>
-    : public std::is_same<DelT, USMDeleter>
-{
-};
-
-template <typename ValT, typename DelT>
-struct valid_smart_ptr<std::unique_ptr<ValT, DelT>>
-    : public std::is_same<DelT, USMDeleter>
-{
-};
-
-// base case
-template <typename... Rest> struct all_valid_smart_ptrs
-{
-    static constexpr bool value = true;
-};
-
-template <typename Arg, typename... RestArgs>
-struct all_valid_smart_ptrs<Arg, RestArgs...>
-{
-    static constexpr bool value = valid_smart_ptr<Arg>::value &&
-                                  (all_valid_smart_ptrs<RestArgs...>::value);
-};
-} // end of namespace detail
-
-/*! @brief Submit host_task and transfer ownership from smart pointers to it */
-template <typename... UniquePtrTs>
-sycl::event async_smart_free(sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends,
-                             UniquePtrTs &&...unique_pointers)
-{
-    static constexpr std::size_t n = sizeof...(UniquePtrTs);
-    static_assert(
-        n > 0, "async_smart_free requires at least one smart pointer argument");
-
-    static_assert(
-        detail::all_valid_smart_ptrs<UniquePtrTs...>::value,
-        "async_smart_free requires unique_ptr created with smart_malloc");
-
-    std::vector<void *> ptrs;
-    ptrs.reserve(n);
-    (ptrs.push_back(reinterpret_cast<void *>(unique_pointers.get())), ...);
-
-    std::vector<USMDeleter> dels;
-    dels.reserve(n);
-    (dels.emplace_back(unique_pointers.get_deleter()), ...);
-
-    sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.host_task([ptrs = std::move(ptrs), dels = std::move(dels)]() {
-            for (std::size_t i = 0; i < ptrs.size(); ++i) {
-                dels[i](ptrs[i]);
-            }
-        });
-    });
-
-    // Upon successful submission of host_task, USM allocations are owned
-    // by the host_task. Release smart pointer ownership to avoid double
-    // deallocation
-    (unique_pointers.release(), ...);
-
-    return ht_e;
-}
-
-} // end of namespace alloc_utils
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
deleted file mode 100644
index f78193e614..0000000000
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ /dev/null
@@ -1,646 +0,0 @@
-//=== sycl_utils.hpp - Implementation of utilities         ------- *-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines utilities used for kernel submission.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "math_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace sycl_utils
-{
-namespace detail
-{
-
-template <typename...> struct TypeList;
-
-template <typename Head, typename... Tail> struct TypeList<Head, Tail...>
-{
-    using head = Head;
-    using tail = TypeList<Tail...>;
-};
-
-using NullTypeList = TypeList<>;
-template <typename T>
-struct IsNullTypeList : std::conditional_t<std::is_same_v<T, NullTypeList>,
-                                           std::true_type,
-                                           std::false_type>
-{
-};
-
-// recursively check if type is contained in given TypeList
-template <typename T, typename TList>
-struct IsContained
-    : std::conditional_t<
-          std::is_same_v<typename TList::head, std::remove_cv_t<T>>,
-          std::true_type,
-          IsContained<T, typename TList::tail>>
-{
-};
-
-template <> struct TypeList<>
-{
-};
-
-// std::false_type when last case has been checked for membership
-template <typename T> struct IsContained<T, NullTypeList> : std::false_type
-{
-};
-
-template <class T> struct IsComplex : std::false_type
-{
-};
-template <class T> struct IsComplex<std::complex<T>> : std::true_type
-{
-};
-
-} // namespace detail
-
-template <typename T>
-using sycl_ops = detail::TypeList<sycl::plus<T>,
-                                  sycl::bit_or<T>,
-                                  sycl::bit_xor<T>,
-                                  sycl::bit_and<T>,
-                                  sycl::maximum<T>,
-                                  sycl::minimum<T>,
-                                  sycl::multiplies<T>>;
-
-template <typename T, typename Op> struct IsSyclOp
-{
-    static constexpr bool value =
-        detail::IsContained<Op, sycl_ops<std::remove_const_t<T>>>::value ||
-        detail::IsContained<Op, sycl_ops<std::add_const_t<T>>>::value;
-};
-
-/*! @brief Find the smallest multiple of supported sub-group size larger than
- * nelems */
-template <std::size_t f = 4>
-std::size_t choose_workgroup_size(const std::size_t nelems,
-                                  const std::vector<std::size_t> &sg_sizes)
-{
-    std::vector<std::size_t> wg_choices;
-    wg_choices.reserve(f * sg_sizes.size());
-
-    for (const auto &sg_size : sg_sizes) {
-#pragma unroll
-        for (std::size_t i = 1; i <= f; ++i) {
-            wg_choices.push_back(sg_size * i);
-        }
-    }
-    std::sort(std::begin(wg_choices), std::end(wg_choices));
-
-    std::size_t wg = 1;
-    for (std::size_t i = 0; i < wg_choices.size(); ++i) {
-        if (wg_choices[i] == wg) {
-            continue;
-        }
-        wg = wg_choices[i];
-        std::size_t n_groups = ((nelems + wg - 1) / wg);
-        if (n_groups == 1)
-            break;
-    }
-
-    return wg;
-}
-
-namespace detail
-{
-
-template <typename LocAccT, typename OpT>
-void _fold(LocAccT &local_mem_acc,
-           const std::uint32_t lid,
-           const std::uint32_t cutoff,
-           const std::uint32_t step,
-           const OpT &op)
-{
-    if (lid < cutoff) {
-        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
-    }
-}
-
-template <typename LocAccT, typename OpT>
-void _fold(LocAccT &local_mem_acc,
-           const std::uint32_t lid,
-           const std::uint32_t step,
-           const OpT &op)
-{
-    if (lid < step) {
-        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
-    }
-}
-
-} // end of namespace detail
-
-template <typename T, typename GroupT, typename LocAccT, typename OpT>
-T custom_reduce_over_group(const GroupT &wg,
-                           LocAccT local_mem_acc,
-                           const T &local_val,
-                           const OpT &op)
-{
-    // value experimentally tuned to achieve best runtime on Iris Xe,
-    // Arc A140V integrated Intel GPUs, and discrete Intel Max GPU.
-    static constexpr std::uint32_t low_sz = 8u;
-    // maximal work-group size
-    static constexpr std::uint32_t high_sz = 1024u;
-    const std::uint32_t wgs = wg.get_local_linear_range();
-    const std::uint32_t lid = wg.get_local_linear_id();
-
-    local_mem_acc[lid] = local_val;
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
-    std::uint32_t n_witems = wgs;
-    if (wgs & (wgs - 1)) {
-        // wgs is not a power of 2
-#pragma unroll
-        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
-            if (n_witems >= sz) {
-                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
-                detail::_fold(local_mem_acc, lid, n_witems - n_witems_,
-                              n_witems_, op);
-                sycl::group_barrier(wg, sycl::memory_scope::work_group);
-                n_witems = n_witems_;
-            }
-        }
-    }
-    else {
-        // wgs is a power of 2
-#pragma unroll
-        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
-            if (n_witems >= sz) {
-                n_witems >>= 1;
-                detail::_fold(local_mem_acc, lid, n_witems, op);
-                sycl::group_barrier(wg, sycl::memory_scope::work_group);
-            }
-        }
-    }
-
-    T red_val_over_wg = local_mem_acc[0];
-    if (wg.leader()) {
-        for (std::uint32_t i = 1; i < n_witems; ++i) {
-            red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
-        }
-    }
-
-    return sycl::group_broadcast(wg, red_val_over_wg, 0);
-}
-
-template <typename GroupT,
-          typename SubGroupT,
-          typename LocAccT,
-          typename T,
-          typename OpT>
-T custom_inclusive_scan_over_group(GroupT &&wg,
-                                   SubGroupT &&sg,
-                                   LocAccT &&local_mem_acc,
-                                   const T &local_val,
-                                   const T &identity,
-                                   OpT &&op)
-{
-    const std::uint32_t local_id = wg.get_local_id(0);
-    const std::uint32_t wgs = wg.get_local_range(0);
-
-    const std::uint32_t lane_id = sg.get_local_id()[0];
-    const std::uint32_t sgSize = sg.get_local_range()[0];
-
-    T scan_val = local_val;
-    for (std::uint32_t step = 1; step < sgSize; step *= 2) {
-        const bool advanced_lane = (lane_id >= step);
-        const std::uint32_t src_lane_id =
-            (advanced_lane ? lane_id - step : lane_id);
-        const T modifier = sycl::select_from_group(sg, scan_val, src_lane_id);
-        if (advanced_lane) {
-            scan_val = op(scan_val, modifier);
-        }
-    }
-
-    local_mem_acc[local_id] = scan_val;
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
-    const std::uint32_t max_sgSize = sg.get_max_local_range()[0];
-    const std::uint32_t sgr_id = sg.get_group_id()[0];
-
-    // now scan
-    const std::uint32_t n_aggregates = 1 + ((wgs - 1) / max_sgSize);
-    const bool large_wg = (n_aggregates > max_sgSize);
-    if (large_wg) {
-        if (wg.leader()) {
-            T _scan_val = identity;
-            for (std::uint32_t i = 1; i <= n_aggregates - max_sgSize; ++i) {
-                _scan_val = op(local_mem_acc[i * max_sgSize - 1], _scan_val);
-                local_mem_acc[i * max_sgSize - 1] = _scan_val;
-            }
-        }
-        sycl::group_barrier(wg, sycl::memory_scope::work_group);
-    }
-
-    if (sgr_id == 0) {
-        const std::uint32_t offset =
-            (large_wg) ? n_aggregates - max_sgSize : 0u;
-        const bool in_range = (lane_id < n_aggregates);
-        const bool in_bounds = in_range && (lane_id > 0 || large_wg);
-
-        T __scan_val = (in_bounds)
-                           ? local_mem_acc[(offset + lane_id) * max_sgSize - 1]
-                           : identity;
-        for (std::uint32_t step = 1; step < sgSize; step *= 2) {
-            const bool advanced_lane = (lane_id >= step);
-            const std::uint32_t src_lane_id =
-                (advanced_lane ? lane_id - step : lane_id);
-            const T modifier =
-                sycl::select_from_group(sg, __scan_val, src_lane_id);
-            if (advanced_lane && in_range) {
-                __scan_val = op(__scan_val, modifier);
-            }
-        }
-        if (in_bounds) {
-            local_mem_acc[(offset + lane_id) * max_sgSize - 1] = __scan_val;
-        }
-    }
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
-    if (sgr_id > 0) {
-        const T modifier = local_mem_acc[sgr_id * max_sgSize - 1];
-        scan_val = op(scan_val, modifier);
-    }
-
-    // ensure all work-items finished reading from SLM
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
-    return scan_val;
-}
-
-// Reduction functors
-
-// Maximum
-
-template <typename T> struct Maximum
-{
-    T operator()(const T &x, const T &y) const
-    {
-        if constexpr (detail::IsComplex<T>::value) {
-            using dpctl::tensor::math_utils::max_complex;
-            return max_complex<T>(x, y);
-        }
-        else if constexpr (std::is_floating_point_v<T> ||
-                           std::is_same_v<T, sycl::half>)
-        {
-            return (std::isnan(x) || x > y) ? x : y;
-        }
-        else if constexpr (std::is_same_v<T, bool>) {
-            return x || y;
-        }
-        else {
-            return (x > y) ? x : y;
-        }
-    }
-};
-
-// Minimum
-
-template <typename T> struct Minimum
-{
-    T operator()(const T &x, const T &y) const
-    {
-        if constexpr (detail::IsComplex<T>::value) {
-            using dpctl::tensor::math_utils::min_complex;
-            return min_complex<T>(x, y);
-        }
-        else if constexpr (std::is_floating_point_v<T> ||
-                           std::is_same_v<T, sycl::half>)
-        {
-            return (std::isnan(x) || x < y) ? x : y;
-        }
-        else if constexpr (std::is_same_v<T, bool>) {
-            return x && y;
-        }
-        else {
-            return (x < y) ? x : y;
-        }
-    }
-};
-
-// Define identities and operator checking structs
-
-template <typename Op, typename T, typename = void> struct GetIdentity
-{
-};
-
-// Maximum
-
-template <typename T, class Op>
-using IsMaximum = std::bool_constant<std::is_same_v<Op, sycl::maximum<T>> ||
-                                     std::is_same_v<Op, Maximum<T>>>;
-
-template <typename T, class Op>
-using IsSyclMaximum = std::bool_constant<std::is_same_v<Op, sycl::maximum<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsMaximum<T, Op>::value>>
-{
-    static constexpr T value =
-        static_cast<T>(std::numeric_limits<T>::has_infinity
-                           ? static_cast<T>(-std::numeric_limits<T>::infinity())
-                           : std::numeric_limits<T>::lowest());
-};
-
-template <typename Op>
-struct GetIdentity<Op, bool, std::enable_if_t<IsMaximum<bool, Op>::value>>
-{
-    static constexpr bool value = false;
-};
-
-template <typename Op, typename T>
-struct GetIdentity<Op,
-                   std::complex<T>,
-                   std::enable_if_t<IsMaximum<std::complex<T>, Op>::value>>
-{
-    static constexpr std::complex<T> value{-std::numeric_limits<T>::infinity(),
-                                           -std::numeric_limits<T>::infinity()};
-};
-
-// Minimum
-
-template <typename T, class Op>
-using IsMinimum = std::bool_constant<std::is_same_v<Op, sycl::minimum<T>> ||
-                                     std::is_same_v<Op, Minimum<T>>>;
-
-template <typename T, class Op>
-using IsSyclMinimum = std::bool_constant<std::is_same_v<Op, sycl::minimum<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsMinimum<T, Op>::value>>
-{
-    static constexpr T value =
-        static_cast<T>(std::numeric_limits<T>::has_infinity
-                           ? static_cast<T>(std::numeric_limits<T>::infinity())
-                           : std::numeric_limits<T>::max());
-};
-
-template <typename Op>
-struct GetIdentity<Op, bool, std::enable_if_t<IsMinimum<bool, Op>::value>>
-{
-    static constexpr bool value = true;
-};
-
-template <typename Op, typename T>
-struct GetIdentity<Op,
-                   std::complex<T>,
-                   std::enable_if_t<IsMinimum<std::complex<T>, Op>::value>>
-{
-    static constexpr std::complex<T> value{std::numeric_limits<T>::infinity(),
-                                           std::numeric_limits<T>::infinity()};
-};
-
-// Plus
-
-template <typename T, class Op>
-using IsPlus = std::bool_constant<std::is_same_v<Op, sycl::plus<T>> ||
-                                  std::is_same_v<Op, std::plus<T>>>;
-
-template <typename T, class Op>
-using IsSyclPlus = std::bool_constant<std::is_same_v<Op, sycl::plus<T>>>;
-
-// Multiplies
-
-template <typename T, class Op>
-using IsMultiplies =
-    std::bool_constant<std::is_same_v<Op, sycl::multiplies<T>> ||
-                       std::is_same_v<Op, std::multiplies<T>>>;
-
-template <typename T, class Op>
-using IsSyclMultiplies =
-    std::bool_constant<std::is_same_v<Op, sycl::multiplies<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsMultiplies<T, Op>::value>>
-{
-    static constexpr T value = static_cast<T>(1);
-};
-
-// LogSumExp
-
-template <typename T> struct LogSumExp
-{
-    T operator()(const T &x, const T &y) const
-    {
-        using dpctl::tensor::math_utils::logaddexp;
-        return logaddexp<T>(x, y);
-    }
-};
-
-template <typename T, class Op>
-using IsLogSumExp = std::bool_constant<std::is_same_v<Op, LogSumExp<T>>>;
-
-// only defined for types with infinity
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsLogSumExp<T, Op>::value>>
-{
-    static constexpr T value = -std::numeric_limits<T>::infinity();
-};
-
-// Hypot
-
-template <typename T> struct Hypot
-{
-    T operator()(const T &x, const T &y) const { return sycl::hypot(x, y); }
-};
-
-template <typename T, class Op>
-using IsHypot = std::bool_constant<std::is_same_v<Op, Hypot<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsHypot<T, Op>::value>>
-{
-    static constexpr T value = 0;
-};
-
-// Logical_And
-
-template <typename T, class Op>
-using IsLogicalAnd =
-    std::bool_constant<std::is_same_v<Op, sycl::logical_and<T>> ||
-                       std::is_same_v<Op, std::logical_and<T>>>;
-
-template <typename T, class Op>
-using IsSyclLogicalAnd =
-    std::bool_constant<std::is_same_v<Op, sycl::logical_and<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsLogicalAnd<T, Op>::value>>
-{
-    static constexpr T value = static_cast<T>(1);
-};
-
-// Logical_Or
-
-template <typename T, class Op>
-using IsLogicalOr =
-    std::bool_constant<std::is_same_v<Op, sycl::logical_or<T>> ||
-                       std::is_same_v<Op, std::logical_or<T>>>;
-
-template <typename T, class Op>
-using IsSyclLogicalOr =
-    std::bool_constant<std::is_same_v<Op, sycl::logical_or<T>>>;
-
-template <typename Op, typename T>
-struct GetIdentity<Op, T, std::enable_if_t<IsLogicalOr<T, Op>::value>>
-{
-    static constexpr T value = static_cast<T>(0);
-};
-
-// Identity
-
-template <typename Op, typename T, typename = void> struct Identity
-{
-};
-
-template <typename Op, typename T>
-using UseBuiltInIdentity =
-    std::conjunction<IsSyclOp<T, Op>, sycl::has_known_identity<Op, T>>;
-
-template <typename Op, typename T>
-struct Identity<Op, T, std::enable_if_t<!UseBuiltInIdentity<Op, T>::value>>
-{
-    static constexpr T value = GetIdentity<Op, T>::value;
-};
-
-template <typename Op, typename T>
-struct Identity<Op, T, std::enable_if_t<UseBuiltInIdentity<Op, T>::value>>
-{
-    static constexpr T value = sycl::known_identity<Op, T>::value;
-};
-
-// Sub-group load/store
-
-#ifndef USE_GROUP_LOAD_STORE
-#if defined(SYCL_EXT_ONEAPI_GROUP_LOAD_STORE) &&                               \
-    SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
-#define USE_GROUP_LOAD_STORE 1
-#else
-#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u)
-#define USE_GROUP_LOAD_STORE 1
-#else
-#define USE_GROUP_LOAD_STORE 0
-#endif
-#endif
-#endif
-
-#if (USE_GROUP_LOAD_STORE)
-namespace ls_ns = sycl::ext::oneapi::experimental;
-#endif
-
-template <std::uint8_t vec_sz,
-          sycl::access::address_space Space,
-          sycl::access::decorated DecorateAddress,
-          typename ElementType>
-auto sub_group_load(const sycl::sub_group &sg,
-                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if (USE_GROUP_LOAD_STORE)
-    using ValueT = typename std::remove_cv_t<ElementType>;
-    sycl::vec<ValueT, vec_sz> x{};
-    static constexpr auto striped =
-        ls_ns::properties{ls_ns::data_placement_striped};
-    ls_ns::group_load(sg, m_ptr, x, striped);
-    return x;
-#else
-    return sg.load<vec_sz>(m_ptr);
-#endif
-}
-
-template <sycl::access::address_space Space,
-          sycl::access::decorated DecorateAddress,
-          typename ElementType>
-auto sub_group_load(const sycl::sub_group &sg,
-                    sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if (USE_GROUP_LOAD_STORE)
-    using ValueT = typename std::remove_cv_t<ElementType>;
-    ValueT x{};
-    static constexpr auto striped =
-        ls_ns::properties{ls_ns::data_placement_striped};
-    ls_ns::group_load(sg, m_ptr, x, striped);
-    return x;
-#else
-    return sg.load(m_ptr);
-#endif
-}
-
-template <std::uint8_t vec_sz,
-          sycl::access::address_space Space,
-          sycl::access::decorated DecorateAddress,
-          typename VecT,
-          typename ElementType>
-std::enable_if_t<
-    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
-    void>
-sub_group_store(const sycl::sub_group &sg,
-                const sycl::vec<VecT, vec_sz> &val,
-                sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if (USE_GROUP_LOAD_STORE)
-    static_assert(std::is_same_v<VecT, ElementType>);
-    static constexpr auto striped =
-        ls_ns::properties{ls_ns::data_placement_striped};
-    ls_ns::group_store(sg, val, m_ptr, striped);
-    return;
-#else
-    sg.store<vec_sz>(m_ptr, val);
-    return;
-#endif
-}
-
-template <sycl::access::address_space Space,
-          sycl::access::decorated DecorateAddress,
-          typename VecT,
-          typename ElementType>
-std::enable_if_t<
-    std::is_same_v<std::remove_cv_t<ElementType>, std::remove_cv_t<VecT>>,
-    void>
-sub_group_store(const sycl::sub_group &sg,
-                const VecT &val,
-                sycl::multi_ptr<ElementType, Space, DecorateAddress> m_ptr)
-{
-#if (USE_GROUP_LOAD_STORE)
-    static constexpr auto striped =
-        ls_ns::properties{ls_ns::data_placement_striped};
-    ls_ns::group_store(sg, val, m_ptr, striped);
-    return;
-#else
-    sg.store(m_ptr, val);
-    return;
-#endif
-}
-
-} // namespace sycl_utils
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
deleted file mode 100644
index 29e0559887..0000000000
--- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
+++ /dev/null
@@ -1,133 +0,0 @@
-//===--type_dispatch.cpp - Type-dispatch table building utils ----*-C++-*- ===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines class to implement dispatch tables for pair of types
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <stdexcept>
-
-#include "dpctl4pybind11.hpp"
-#include "type_dispatch_building.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-
-namespace type_dispatch
-{
-
-struct usm_ndarray_types
-{
-
-    int typenum_to_lookup_id(int typenum) const
-    {
-        using typenum_t = ::dpctl::tensor::type_dispatch::typenum_t;
-        auto const &api = ::dpctl::detail::dpctl_capi::get();
-
-        if (typenum == api.UAR_DOUBLE_) {
-            return static_cast<int>(typenum_t::DOUBLE);
-        }
-        else if (typenum == api.UAR_INT64_) {
-            return static_cast<int>(typenum_t::INT64);
-        }
-        else if (typenum == api.UAR_INT32_) {
-            return static_cast<int>(typenum_t::INT32);
-        }
-        else if (typenum == api.UAR_BOOL_) {
-            return static_cast<int>(typenum_t::BOOL);
-        }
-        else if (typenum == api.UAR_CDOUBLE_) {
-            return static_cast<int>(typenum_t::CDOUBLE);
-        }
-        else if (typenum == api.UAR_FLOAT_) {
-            return static_cast<int>(typenum_t::FLOAT);
-        }
-        else if (typenum == api.UAR_INT16_) {
-            return static_cast<int>(typenum_t::INT16);
-        }
-        else if (typenum == api.UAR_INT8_) {
-            return static_cast<int>(typenum_t::INT8);
-        }
-        else if (typenum == api.UAR_UINT64_) {
-            return static_cast<int>(typenum_t::UINT64);
-        }
-        else if (typenum == api.UAR_UINT32_) {
-            return static_cast<int>(typenum_t::UINT32);
-        }
-        else if (typenum == api.UAR_UINT16_) {
-            return static_cast<int>(typenum_t::UINT16);
-        }
-        else if (typenum == api.UAR_UINT8_) {
-            return static_cast<int>(typenum_t::UINT8);
-        }
-        else if (typenum == api.UAR_CFLOAT_) {
-            return static_cast<int>(typenum_t::CFLOAT);
-        }
-        else if (typenum == api.UAR_HALF_) {
-            return static_cast<int>(typenum_t::HALF);
-        }
-        else if (typenum == api.UAR_INT_ || typenum == api.UAR_UINT_) {
-            switch (sizeof(int)) {
-            case sizeof(std::int32_t):
-                return ((typenum == api.UAR_INT_)
-                            ? static_cast<int>(typenum_t::INT32)
-                            : static_cast<int>(typenum_t::UINT32));
-            case sizeof(std::int64_t):
-                return ((typenum == api.UAR_INT_)
-                            ? static_cast<int>(typenum_t::INT64)
-                            : static_cast<int>(typenum_t::UINT64));
-            default:
-                throw_unrecognized_typenum_error(typenum);
-            }
-        }
-        else if (typenum == api.UAR_LONGLONG_ || typenum == api.UAR_ULONGLONG_)
-        {
-            switch (sizeof(long long)) {
-            case sizeof(std::int64_t):
-                return ((typenum == api.UAR_LONGLONG_)
-                            ? static_cast<int>(typenum_t::INT64)
-                            : static_cast<int>(typenum_t::UINT64));
-            default:
-                throw_unrecognized_typenum_error(typenum);
-            }
-        }
-        else {
-            throw_unrecognized_typenum_error(typenum);
-        }
-        // return code signalling error, should never be reached
-        assert(false);
-        return -1;
-    }
-
-private:
-    void throw_unrecognized_typenum_error(int typenum) const
-    {
-        throw std::runtime_error("Unrecognized typenum " +
-                                 std::to_string(typenum) + " encountered.");
-    }
-};
-
-} // namespace type_dispatch
-
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
deleted file mode 100644
index b1308184fc..0000000000
--- a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-//===--type_dispatch_building.cpp - Type-dispatch table building utils -*-C++-*-
-//===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines class to implement dispatch tables for pair of types
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-
-namespace type_dispatch
-{
-
-enum class typenum_t : int
-{
-    BOOL = 0,
-    INT8, // 1
-    UINT8,
-    INT16,
-    UINT16,
-    INT32, // 5
-    UINT32,
-    INT64,
-    UINT64,
-    HALF,
-    FLOAT, // 10
-    DOUBLE,
-    CFLOAT,
-    CDOUBLE, // 13
-};
-inline constexpr int num_types = 14; // number of elements in typenum_t
-
-template <typename funcPtrT,
-          template <typename fnT, typename D, typename S>
-          typename factory,
-          int _num_types>
-class DispatchTableBuilder
-{
-private:
-    template <typename dstTy>
-    const std::vector<funcPtrT> row_per_dst_type() const
-    {
-        std::vector<funcPtrT> per_dstTy = {
-            factory<funcPtrT, dstTy, bool>{}.get(),
-            factory<funcPtrT, dstTy, std::int8_t>{}.get(),
-            factory<funcPtrT, dstTy, std::uint8_t>{}.get(),
-            factory<funcPtrT, dstTy, std::int16_t>{}.get(),
-            factory<funcPtrT, dstTy, std::uint16_t>{}.get(),
-            factory<funcPtrT, dstTy, std::int32_t>{}.get(),
-            factory<funcPtrT, dstTy, std::uint32_t>{}.get(),
-            factory<funcPtrT, dstTy, std::int64_t>{}.get(),
-            factory<funcPtrT, dstTy, std::uint64_t>{}.get(),
-            factory<funcPtrT, dstTy, sycl::half>{}.get(),
-            factory<funcPtrT, dstTy, float>{}.get(),
-            factory<funcPtrT, dstTy, double>{}.get(),
-            factory<funcPtrT, dstTy, std::complex<float>>{}.get(),
-            factory<funcPtrT, dstTy, std::complex<double>>{}.get()};
-        assert(per_dstTy.size() == _num_types);
-        return per_dstTy;
-    }
-
-public:
-    DispatchTableBuilder() = default;
-    ~DispatchTableBuilder() = default;
-
-    void populate_dispatch_table(funcPtrT table[][_num_types]) const
-    {
-        const auto map_by_dst_type = {row_per_dst_type<bool>(),
-                                      row_per_dst_type<std::int8_t>(),
-                                      row_per_dst_type<std::uint8_t>(),
-                                      row_per_dst_type<std::int16_t>(),
-                                      row_per_dst_type<std::uint16_t>(),
-                                      row_per_dst_type<std::int32_t>(),
-                                      row_per_dst_type<std::uint32_t>(),
-                                      row_per_dst_type<std::int64_t>(),
-                                      row_per_dst_type<std::uint64_t>(),
-                                      row_per_dst_type<sycl::half>(),
-                                      row_per_dst_type<float>(),
-                                      row_per_dst_type<double>(),
-                                      row_per_dst_type<std::complex<float>>(),
-                                      row_per_dst_type<std::complex<double>>()};
-        assert(map_by_dst_type.size() == _num_types);
-        int dst_id = 0;
-        for (const auto &row : map_by_dst_type) {
-            int src_id = 0;
-            for (const auto &fn_ptr : row) {
-                table[dst_id][src_id] = fn_ptr;
-                ++src_id;
-            }
-            ++dst_id;
-        }
-    }
-};
-
-template <typename funcPtrT,
-          template <typename fnT, typename T>
-          typename factory,
-          int _num_types>
-class DispatchVectorBuilder
-{
-private:
-    template <typename Ty> const funcPtrT func_per_type() const
-    {
-        funcPtrT f = factory<funcPtrT, Ty>{}.get();
-        return f;
-    }
-
-public:
-    DispatchVectorBuilder() = default;
-    ~DispatchVectorBuilder() = default;
-
-    void populate_dispatch_vector(funcPtrT vector[]) const
-    {
-        const auto fn_map_by_type = {func_per_type<bool>(),
-                                     func_per_type<std::int8_t>(),
-                                     func_per_type<std::uint8_t>(),
-                                     func_per_type<std::int16_t>(),
-                                     func_per_type<std::uint16_t>(),
-                                     func_per_type<std::int32_t>(),
-                                     func_per_type<std::uint32_t>(),
-                                     func_per_type<std::int64_t>(),
-                                     func_per_type<std::uint64_t>(),
-                                     func_per_type<sycl::half>(),
-                                     func_per_type<float>(),
-                                     func_per_type<double>(),
-                                     func_per_type<std::complex<float>>(),
-                                     func_per_type<std::complex<double>>()};
-        assert(fn_map_by_type.size() == _num_types);
-        int ty_id = 0;
-        for (const auto &fn : fn_map_by_type) {
-            vector[ty_id] = fn;
-            ++ty_id;
-        }
-    }
-};
-
-/*! @brief struct to define result_type typename for Ty == ArgTy */
-template <typename Ty, typename ArgTy, typename ResTy = ArgTy>
-struct TypeMapResultEntry : std::is_same<Ty, ArgTy>
-{
-    using result_type = ResTy;
-};
-
-/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 ==
- * ArgTy2 */
-template <typename Ty1,
-          typename ArgTy1,
-          typename Ty2,
-          typename ArgTy2,
-          typename ResTy>
-struct BinaryTypeMapResultEntry
-    : std::conjunction<std::is_same<Ty1, ArgTy1>, std::is_same<Ty2, ArgTy2>>
-{
-    using result_type = ResTy;
-};
-
-/*! @brief fall-through struct with specified result_type, usually void */
-template <typename Ty = void> struct DefaultResultEntry : std::true_type
-{
-    using result_type = Ty;
-};
-
-/*! @brief Utility struct to convert C++ type into typeid integer */
-template <typename T> struct GetTypeid
-{
-    int get()
-    {
-        if constexpr (std::is_same_v<T, bool>) {
-            return static_cast<int>(typenum_t::BOOL);
-        }
-        else if constexpr (std::is_same_v<T, std::int8_t>) {
-            return static_cast<int>(typenum_t::INT8);
-        }
-        else if constexpr (std::is_same_v<T, std::uint8_t>) {
-            return static_cast<int>(typenum_t::UINT8);
-        }
-        else if constexpr (std::is_same_v<T, std::int16_t>) {
-            return static_cast<int>(typenum_t::INT16);
-        }
-        else if constexpr (std::is_same_v<T, std::uint16_t>) {
-            return static_cast<int>(typenum_t::UINT16);
-        }
-        else if constexpr (std::is_same_v<T, std::int32_t>) {
-            return static_cast<int>(typenum_t::INT32);
-        }
-        else if constexpr (std::is_same_v<T, std::uint32_t>) {
-            return static_cast<int>(typenum_t::UINT32);
-        }
-        else if constexpr (std::is_same_v<T, std::int64_t>) {
-            return static_cast<int>(typenum_t::INT64);
-        }
-        else if constexpr (std::is_same_v<T, std::uint64_t>) {
-            return static_cast<int>(typenum_t::UINT64);
-        }
-        else if constexpr (std::is_same_v<T, sycl::half>) {
-            return static_cast<int>(typenum_t::HALF);
-        }
-        else if constexpr (std::is_same_v<T, float>) {
-            return static_cast<int>(typenum_t::FLOAT);
-        }
-        else if constexpr (std::is_same_v<T, double>) {
-            return static_cast<int>(typenum_t::DOUBLE);
-        }
-        else if constexpr (std::is_same_v<T, std::complex<float>>) {
-            return static_cast<int>(typenum_t::CFLOAT);
-        }
-        else if constexpr (std::is_same_v<T, std::complex<double>>) {
-            return static_cast<int>(typenum_t::CDOUBLE);
-        }
-        else if constexpr (std::is_same_v<T, void>) { // special token
-            return -1;
-        }
-
-        assert(("Unsupported type T", false));
-        return -2;
-    }
-};
-
-/*! @brief Class to generate vector of null function pointers */
-template <typename FunPtrT> struct NullPtrVector
-{
-
-    using value_type = FunPtrT;
-    using const_reference = value_type const &;
-
-    NullPtrVector() : val(nullptr) {}
-
-    const_reference operator[](int) const { return val; }
-
-private:
-    value_type val;
-};
-
-/*! @brief Class to generate table of null function pointers */
-template <typename FunPtrT> struct NullPtrTable
-{
-    using value_type = NullPtrVector<FunPtrT>;
-    using const_reference = value_type const &;
-
-    NullPtrTable() : val() {}
-
-    const_reference operator[](int) const { return val; }
-
-private:
-    value_type val;
-};
-
-template <typename Ty1, typename ArgTy, typename Ty2, typename outTy>
-struct TypePairDefinedEntry
-    : std::conjunction<std::is_same<Ty1, ArgTy>, std::is_same<Ty2, outTy>>
-{
-    static constexpr bool is_defined = true;
-};
-
-struct NotDefinedEntry : std::true_type
-{
-    static constexpr bool is_defined = false;
-};
-
-} // namespace type_dispatch
-
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
deleted file mode 100644
index 4921659166..0000000000
--- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//===------ type_utils.hpp - Implementation of types utils  ----*-C++-*/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions for value casting.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <utility>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace type_utils
-{
-
-template <typename T, typename = void>
-struct is_complex : public std::false_type
-{
-};
-
-template <typename T>
-struct is_complex<
-    T,
-    std::enable_if_t<std::is_same_v<std::remove_cv_t<T>, std::complex<float>> ||
-                     std::is_same_v<std::remove_cv_t<T>, std::complex<double>>>>
-    : public std::true_type
-{
-};
-
-template <typename T> inline constexpr bool is_complex_v = is_complex<T>::value;
-
-template <typename dstTy, typename srcTy> dstTy convert_impl(const srcTy &v)
-{
-    if constexpr (std::is_same_v<dstTy, srcTy>) {
-        return v;
-    }
-    else if constexpr (std::is_same_v<dstTy, bool>) {
-        if constexpr (is_complex_v<srcTy>) {
-            // bool(complex_v) ==
-            //     (complex_v.real() != 0) && (complex_v.imag() !=0)
-            return (convert_impl<bool, typename srcTy::value_type>(v.real()) ||
-                    convert_impl<bool, typename srcTy::value_type>(v.imag()));
-        }
-        else {
-            return static_cast<dstTy>(v != srcTy{0});
-        }
-    }
-    else if constexpr (std::is_same_v<srcTy, bool>) {
-        // C++ interprets a byte of storage behind bool by only
-        // testing is least significant bit, leading to both
-        // 0x00 and 0x02 interpreted as False, while 0x01 and 0xFF
-        // interpreted as True. NumPy's interpretation of underlying
-        // storage is different: any bit set is interpreted as True,
-        // no bits set as False, see gh-2121
-        const std::uint8_t &u = sycl::bit_cast<std::uint8_t>(v);
-        if constexpr (is_complex_v<dstTy>) {
-            return (u == 0) ? dstTy{} : dstTy{1, 0};
-        }
-        else {
-            return (u == 0) ? dstTy{} : dstTy{1};
-        }
-    }
-    else if constexpr (is_complex_v<srcTy> && !is_complex_v<dstTy>) {
-        // real_t(complex_v) == real_t(complex_v.real())
-        return convert_impl<dstTy, typename srcTy::value_type>(v.real());
-    }
-    else if constexpr (!std::is_integral_v<srcTy> &&
-                       !std::is_same_v<dstTy, bool> &&
-                       std::is_integral_v<dstTy> && std::is_unsigned_v<dstTy>)
-    {
-        // first cast to signed variant, the cast to unsigned one
-        using signedT = typename std::make_signed_t<dstTy>;
-        return static_cast<dstTy>(convert_impl<signedT, srcTy>(v));
-    }
-    else {
-        return static_cast<dstTy>(v);
-    }
-}
-
-template <typename T> void validate_type_for_device(const sycl::device &d)
-{
-    if constexpr (std::is_same_v<T, double>) {
-        if (!d.has(sycl::aspect::fp64)) {
-            throw std::runtime_error("Device " +
-                                     d.get_info<sycl::info::device::name>() +
-                                     " does not support type 'float64'");
-        }
-    }
-    else if constexpr (std::is_same_v<T, std::complex<double>>) {
-        if (!d.has(sycl::aspect::fp64)) {
-            throw std::runtime_error("Device " +
-                                     d.get_info<sycl::info::device::name>() +
-                                     " does not support type 'complex128'");
-        }
-    }
-    else if constexpr (std::is_same_v<T, sycl::half>) {
-        if (!d.has(sycl::aspect::fp16)) {
-            throw std::runtime_error("Device " +
-                                     d.get_info<sycl::info::device::name>() +
-                                     " does not support type 'float16'");
-        }
-    }
-}
-
-template <typename T> void validate_type_for_device(const sycl::queue &q)
-{
-    validate_type_for_device<T>(q.get_device());
-}
-
-template <typename Op, typename Vec, std::size_t... I>
-auto vec_cast_impl(const Vec &v, std::index_sequence<I...>)
-{
-    return Op{v[I]...};
-}
-
-template <typename dstT,
-          typename srcT,
-          std::size_t N,
-          typename Indices = std::make_index_sequence<N>>
-auto vec_cast(const sycl::vec<srcT, N> &s)
-{
-    if constexpr (std::is_same_v<srcT, dstT>) {
-        return s;
-    }
-    else {
-        return vec_cast_impl<sycl::vec<dstT, N>, sycl::vec<srcT, N>>(s,
-                                                                     Indices{});
-    }
-}
-
-} // namespace type_utils
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp
deleted file mode 100644
index 10a9ff8378..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators.cpp
+++ /dev/null
@@ -1,401 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/accumulators.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-// Computation of positions of masked elements
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t;
-static cumsum_val_contig_impl_fn_ptr_t
-    mask_positions_contig_i64_dispatch_vector[td_ns::num_types];
-static cumsum_val_contig_impl_fn_ptr_t
-    mask_positions_contig_i32_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t;
-static cumsum_val_strided_impl_fn_ptr_t
-    mask_positions_strided_i64_dispatch_vector[td_ns::num_types];
-static cumsum_val_strided_impl_fn_ptr_t
-    mask_positions_strided_i32_dispatch_vector[td_ns::num_types];
-
-void populate_mask_positions_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::accumulators::
-        MaskPositionsContigFactoryForInt64;
-    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
-                                 MaskPositionsContigFactoryForInt64,
-                                 td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector);
-
-    using dpctl::tensor::kernels::accumulators::
-        MaskPositionsContigFactoryForInt32;
-    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
-                                 MaskPositionsContigFactoryForInt32,
-                                 td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector);
-
-    using dpctl::tensor::kernels::accumulators::
-        MaskPositionsStridedFactoryForInt64;
-    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
-                                 MaskPositionsStridedFactoryForInt64,
-                                 td_ns::num_types>
-        dvb3;
-    dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector);
-
-    using dpctl::tensor::kernels::accumulators::
-        MaskPositionsStridedFactoryForInt32;
-    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
-                                 MaskPositionsStridedFactoryForInt32,
-                                 td_ns::num_types>
-        dvb4;
-    dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector);
-
-    return;
-}
-
-std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
-                              const dpctl::tensor::usm_ndarray &cumsum,
-                              sycl::queue &exec_q,
-                              const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum);
-
-    // cumsum is 1D
-    if (cumsum.get_ndim() != 1) {
-        throw py::value_error("Result array must be one-dimensional.");
-    }
-
-    if (!cumsum.is_c_contiguous()) {
-        throw py::value_error("Expecting `cumsum` array must be C-contiguous.");
-    }
-
-    // cumsum.shape == (mask.size,)
-    auto mask_size = mask.get_size();
-    auto cumsum_size = cumsum.get_shape(0);
-    if (cumsum_size != mask_size) {
-        throw py::value_error("Inconsistent dimensions");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) {
-        // FIXME: use ExecutionPlacementError
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    if (mask_size == 0) {
-        return 0;
-    }
-
-    int mask_typenum = mask.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    // mask can be any type
-    const char *mask_data = mask.get_data();
-    char *cumsum_data = cumsum.get_data();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-
-    int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    // cumsum must be int32_t/int64_t only
-    static constexpr int int32_typeid =
-        static_cast<int>(td_ns::typenum_t::INT32);
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
-        throw py::value_error(
-            "Cumulative sum array must have int32 or int64 data-type.");
-    }
-
-    const bool use_i32 = (cumsum_typeid == int32_typeid);
-
-    std::vector<sycl::event> host_task_events;
-
-    if (mask.is_c_contiguous()) {
-        auto fn = (use_i32)
-                      ? mask_positions_contig_i32_dispatch_vector[mask_typeid]
-                      : mask_positions_contig_i64_dispatch_vector[mask_typeid];
-
-        std::size_t total_set;
-
-        {
-            py::gil_scoped_release release;
-
-            total_set = fn(exec_q, mask_size, mask_data, cumsum_data,
-                           host_task_events, depends);
-
-            sycl::event::wait(host_task_events);
-        }
-        return total_set;
-    }
-
-    const py::ssize_t *shape = mask.get_shape_raw();
-    auto const &strides_vector = mask.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT compact_shape;
-    shT compact_strides;
-
-    int mask_nd = mask.get_ndim();
-    int nd = mask_nd;
-
-    dpctl::tensor::py_internal::compact_iteration_space(
-        nd, shape, strides_vector, compact_shape, compact_strides);
-
-    // Strided implementation
-    auto strided_fn =
-        (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid]
-                  : mask_positions_strided_i64_dispatch_vector[mask_typeid];
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, compact_shape, compact_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        {
-            py::gil_scoped_release release;
-
-            copy_shape_ev.wait();
-            sycl::event::wait(host_task_events);
-
-            // ensure deleter of smart pointer is invoked with GIL released
-            shape_strides_owner.reset(nullptr);
-        }
-        throw std::runtime_error("Unexpected error");
-    }
-
-    std::vector<sycl::event> dependent_events;
-    dependent_events.reserve(depends.size() + 1);
-    dependent_events.insert(dependent_events.end(), copy_shape_ev);
-    dependent_events.insert(dependent_events.end(), depends.begin(),
-                            depends.end());
-
-    std::size_t total_set;
-
-    {
-        py::gil_scoped_release release;
-
-        total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides,
-                               cumsum_data, host_task_events, dependent_events);
-
-        sycl::event::wait(host_task_events);
-        // ensure deleter of smart pointer is invoked with GIL released
-        shape_strides_owner.reset(nullptr);
-    }
-
-    return total_set;
-}
-
-using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t;
-static cumsum_val_strided_impl_fn_ptr_t
-    cumsum_1d_strided_dispatch_vector[td_ns::num_types];
-using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t;
-static cumsum_val_contig_impl_fn_ptr_t
-    cumsum_1d_contig_dispatch_vector[td_ns::num_types];
-
-void populate_cumsum_1d_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory;
-    td_ns::DispatchVectorBuilder<cumsum_val_contig_impl_fn_ptr_t,
-                                 Cumsum1DContigFactory, td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector);
-
-    using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory;
-    td_ns::DispatchVectorBuilder<cumsum_val_strided_impl_fn_ptr_t,
-                                 Cumsum1DStridedFactory, td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector);
-
-    return;
-}
-
-std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
-                         const dpctl::tensor::usm_ndarray &cumsum,
-                         sycl::queue &exec_q,
-                         std::vector<sycl::event> const &depends)
-{
-    // cumsum is 1D
-    if (cumsum.get_ndim() != 1) {
-        throw py::value_error("cumsum array must be one-dimensional.");
-    }
-
-    if (!cumsum.is_c_contiguous()) {
-        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
-    }
-
-    // cumsum.shape == (src.size,)
-    auto src_size = src.get_size();
-    auto cumsum_size = cumsum.get_shape(0);
-    if (cumsum_size != src_size) {
-        throw py::value_error("Inconsistent dimensions");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) {
-        // FIXME: use ExecutionPlacementError
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum);
-
-    if (src_size == 0) {
-        return 0;
-    }
-
-    int src_typenum = src.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    // src can be any type
-    const char *src_data = src.get_data();
-    char *cumsum_data = cumsum.get_data();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    // this cumsum must be int64_t only
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int64_typeid) {
-        throw py::value_error(
-            "Cumulative sum array must have int64 data-type.");
-    }
-
-    std::vector<sycl::event> host_task_events;
-
-    if (src.is_c_contiguous()) {
-        auto fn = cumsum_1d_contig_dispatch_vector[src_typeid];
-        if (fn == nullptr) {
-            throw std::runtime_error(
-                "this cumsum requires integer type, got src_typeid=" +
-                std::to_string(src_typeid));
-        }
-        std::size_t total = fn(exec_q, src_size, src_data, cumsum_data,
-                               host_task_events, depends);
-        {
-            py::gil_scoped_release release;
-            sycl::event::wait(host_task_events);
-        }
-        return total;
-    }
-
-    const py::ssize_t *shape = src.get_shape_raw();
-    auto const &strides_vector = src.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT compact_shape;
-    shT compact_strides;
-
-    int src_nd = src.get_ndim();
-    int nd = src_nd;
-
-    dpctl::tensor::py_internal::compact_iteration_space(
-        nd, shape, strides_vector, compact_shape, compact_strides);
-
-    // Strided implementation
-    auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid];
-    if (strided_fn == nullptr) {
-        throw std::runtime_error(
-            "this cumsum requires integer type, got src_typeid=" +
-            std::to_string(src_typeid));
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, compact_shape, compact_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    if (2 * static_cast<std::size_t>(nd) != std::get<1>(ptr_size_event_tuple)) {
-        {
-            py::gil_scoped_release release;
-
-            copy_shape_ev.wait();
-            sycl::event::wait(host_task_events);
-
-            // ensure USM deleter is called with GIL released
-            shape_strides_owner.reset(nullptr);
-        }
-        throw std::runtime_error("Unexpected error");
-    }
-
-    std::vector<sycl::event> dependent_events;
-    dependent_events.reserve(depends.size() + 1);
-    dependent_events.insert(dependent_events.end(), copy_shape_ev);
-    dependent_events.insert(dependent_events.end(), depends.begin(),
-                            depends.end());
-
-    std::size_t total =
-        strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data,
-                   host_task_events, dependent_events);
-
-    {
-        py::gil_scoped_release release;
-        sycl::event::wait(host_task_events);
-
-        // ensure USM deleter is called with GIL released
-        shape_strides_owner.reset(nullptr);
-    }
-
-    return total;
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators.hpp b/dpctl/tensor/libtensor/source/accumulators.hpp
deleted file mode 100644
index 78e5786da2..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <cstddef>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void populate_mask_positions_dispatch_vectors(void);
-
-extern std::size_t
-py_mask_positions(const dpctl::tensor::usm_ndarray &mask,
-                  const dpctl::tensor::usm_ndarray &cumsum,
-                  sycl::queue &exec_q,
-                  const std::vector<sycl::event> &depends = {});
-
-extern void populate_cumsum_1d_dispatch_vectors(void);
-
-extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src,
-                                const dpctl::tensor::usm_ndarray &cumsum,
-                                sycl::queue &exec_q,
-                                std::vector<sycl::event> const &depends = {});
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
deleted file mode 100644
index 23e0c28c1a..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp
+++ /dev/null
@@ -1,454 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <limits>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/accumulators.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-template <typename strided_fnT, typename contig_fnT>
-std::pair<sycl::event, sycl::event>
-py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src,
-                        const int trailing_dims_to_accumulate,
-                        const dpctl::tensor::usm_ndarray &dst,
-                        sycl::queue &exec_q,
-                        std::vector<sycl::event> const &depends,
-                        const strided_fnT &strided_dispatch_table,
-                        const contig_fnT &contig_dispatch_table)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-    if (src_nd != dst_nd) {
-        throw py::value_error("The input and output arrays must have "
-                              "the same array ranks");
-    }
-    int iter_nd = src_nd - trailing_dims_to_accumulate;
-    if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) {
-        throw py::value_error(
-            "trailing_dims_to_accumulate must be positive, but no "
-            "greater than rank of the input array");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    std::size_t iter_nelems(1);
-    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        iter_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    std::size_t acc_nelems(1);
-    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
-        auto dst_shape_i = dst_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i);
-        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error(
-            "Destination shape does not match the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if ((iter_nelems == 0) || (acc_nelems == 0)) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, acc_nelems * iter_nelems);
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    std::vector<sycl::event> host_task_events;
-
-    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
-        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
-        if (fn == nullptr) {
-            throw std::runtime_error("Datatypes are not supported");
-        }
-
-        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
-                                host_task_events, depends);
-
-        return std::make_pair(
-            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
-            acc_ev);
-    }
-
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    int acc_nd = trailing_dims_to_accumulate;
-
-    using shT = std::vector<py::ssize_t>;
-    shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec));
-
-    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
-                        std::end(src_strides_vec));
-
-    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
-                        std::end(dst_strides_vec));
-
-    shT iter_shape(std::begin(src_shape_vec),
-                   std::begin(src_shape_vec) + iter_nd);
-
-    shT iter_src_strides(std::begin(src_strides_vec),
-                         std::begin(src_strides_vec) + iter_nd);
-
-    shT iter_dst_strides(std::begin(dst_strides_vec),
-                         std::begin(dst_strides_vec) + iter_nd);
-
-    shT simplified_iter_shape;
-    shT simplified_iter_src_strides;
-    shT simplified_iter_dst_strides;
-    py::ssize_t iter_src_offset(0);
-    py::ssize_t iter_dst_offset(0);
-
-    if (iter_nd == 0) {
-        iter_nd = 1;
-        simplified_iter_shape.push_back(1);
-        simplified_iter_src_strides.push_back(0);
-        simplified_iter_dst_strides.push_back(0);
-    }
-    else {
-        simplify_iteration_space(
-            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
-            // output
-            simplified_iter_shape, simplified_iter_src_strides,
-            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
-    }
-
-    // Strided implementation
-    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
-    if (strided_fn == nullptr) {
-        throw std::runtime_error("Datatypes are not supported");
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_iter_shape,
-        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
-        acc_src_strides, acc_dst_strides);
-    auto packed_shapes_and_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shapes_and_strides =
-        packed_shapes_and_strides_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
-    const py::ssize_t *acc_shapes_and_strides =
-        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-
-    sycl::event acc_ev = strided_fn(
-        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
-        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
-        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
-        acc_ev);
-}
-
-template <typename strided_fnT, typename contig_fnT>
-std::pair<sycl::event, sycl::event> py_accumulate_final_axis_include_initial(
-    const dpctl::tensor::usm_ndarray &src,
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    std::vector<sycl::event> const &depends,
-    const strided_fnT &strided_dispatch_table,
-    const contig_fnT &contig_dispatch_table)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("The input and output arrays must have "
-                              "the same array ranks");
-    }
-
-    static constexpr int acc_nd = 1;
-
-    int iter_nd = src_nd - acc_nd;
-    if (iter_nd < 0) {
-        throw py::value_error("accumulation axis must not be greater than rank "
-                              "of the input array");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    std::size_t iter_nelems(1);
-    for (int i = 0; same_shapes && (i < iter_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        iter_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    std::size_t acc_nelems(1);
-    for (int i = iter_nd; same_shapes && (i < src_nd); ++i) {
-        auto dst_shape_i = dst_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i);
-        acc_nelems *= static_cast<std::size_t>(dst_shape_i);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error(
-            "Destination shape does not match the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if ((iter_nelems == 0) || (acc_nelems == 0)) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, acc_nelems * iter_nelems);
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    std::vector<sycl::event> host_task_events;
-
-    if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) {
-        auto fn = contig_dispatch_table[src_typeid][dst_typeid];
-        if (fn == nullptr) {
-            throw std::runtime_error("Datatypes are not supported");
-        }
-
-        sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data,
-                                host_task_events, depends);
-
-        return std::make_pair(
-            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}),
-            acc_ev);
-    }
-
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec));
-
-    shT acc_src_strides(std::begin(src_strides_vec) + iter_nd,
-                        std::end(src_strides_vec));
-
-    shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd,
-                        std::end(dst_strides_vec));
-
-    shT iter_shape(std::begin(src_shape_vec),
-                   std::begin(src_shape_vec) + iter_nd);
-
-    shT iter_src_strides(std::begin(src_strides_vec),
-                         std::begin(src_strides_vec) + iter_nd);
-
-    shT iter_dst_strides(std::begin(dst_strides_vec),
-                         std::begin(dst_strides_vec) + iter_nd);
-
-    shT simplified_iter_shape;
-    shT simplified_iter_src_strides;
-    shT simplified_iter_dst_strides;
-    py::ssize_t iter_src_offset(0);
-    py::ssize_t iter_dst_offset(0);
-
-    if (iter_nd == 0) {
-        iter_nd = 1;
-        simplified_iter_shape.push_back(1);
-        simplified_iter_src_strides.push_back(0);
-        simplified_iter_dst_strides.push_back(0);
-    }
-    else {
-        simplify_iteration_space(
-            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
-            // output
-            simplified_iter_shape, simplified_iter_src_strides,
-            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
-    }
-
-    // Strided implementation
-    auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid];
-    if (strided_fn == nullptr) {
-        throw std::runtime_error("Datatypes are not supported");
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_iter_shape,
-        simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape,
-        acc_src_strides, acc_dst_strides);
-    auto packed_shapes_and_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const auto &copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shapes_and_strides =
-        packed_shapes_and_strides_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
-    const py::ssize_t *acc_shapes_and_strides =
-        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), copy_shapes_strides_ev);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-
-    sycl::event acc_ev = strided_fn(
-        exec_q, iter_nelems, acc_nelems, src_data, iter_nd,
-        iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd,
-        acc_shapes_and_strides, dst_data, host_task_events, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {acc_ev}, packed_shapes_and_strides_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events),
-        acc_ev);
-}
-
-/*! @brief Template implementing Python API for querying accumulation
- * type support */
-template <typename fnT>
-bool py_accumulate_dtype_supported(const py::dtype &input_dtype,
-                                   const py::dtype &output_dtype,
-                                   const fnT &dispatch_table)
-{
-    int arg_tn =
-        input_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int out_tn =
-        output_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int arg_typeid = -1;
-    int out_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
-        out_typeid = array_types.typenum_to_lookup_id(out_tn);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
-        throw std::runtime_error("Reduction type support check: lookup failed");
-    }
-
-    // remove_all_extents gets underlying type of table
-    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
-    fn_ptrT fn = nullptr;
-
-    fn = dispatch_table[arg_typeid][out_typeid];
-
-    return (fn != nullptr);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp
deleted file mode 100644
index 2938f541bf..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "cumulative_logsumexp.hpp"
-#include "cumulative_prod.hpp"
-#include "cumulative_sum.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-/*! @brief Add accumulators to Python module */
-void init_accumulator_functions(py::module_ m)
-{
-    init_cumulative_logsumexp(m);
-    init_cumulative_prod(m);
-    init_cumulative_sum(m);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp
deleted file mode 100644
index 5f60dfa676..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/accumulators_common.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_accumulator_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
deleted file mode 100644
index aa1f4aa091..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp
+++ /dev/null
@@ -1,343 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "accumulate_over_axis.hpp"
-#include "kernels/accumulators.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace su_ns = dpctl::tensor::sycl_utils;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
-static accumulate_strided_impl_fn_ptr_t
-    cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types]
-                                                         [td_ns::num_types];
-
-static accumulate_strided_impl_fn_ptr_t
-    cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types]
-                                                       [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForLogSumExpAccumulation
-{
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumLogSumExp1DContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ScanOpT = su_ns::LogSumExp<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumLogSumExp1DIncludeInitialContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ScanOpT = su_ns::LogSumExp<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumLogSumExpStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ScanOpT = su_ns::LogSumExp<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumLogSumExpIncludeInitialStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForLogSumExpAccumulation<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ScanOpT = su_ns::LogSumExp<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_cumlogsumexp_dispatch_tables(void)
-{
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumLogSumExp1DContigFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumLogSumExpStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumLogSumExp1DIncludeInitialContigFactory,
-                                td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(
-        cumlogsumexp_1d_include_initial_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumLogSumExpIncludeInitialStridedFactory,
-                                td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        cumlogsumexp_include_initial_strided_dispatch_table);
-
-    return;
-}
-
-} // namespace impl
-
-void init_cumulative_logsumexp(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    using impl::populate_cumlogsumexp_dispatch_tables;
-    populate_cumlogsumexp_dispatch_tables();
-
-    using impl::cumlogsumexp_1d_contig_dispatch_table;
-    using impl::cumlogsumexp_strided_dispatch_table;
-    auto cumlogsumexp_pyapi = [&](const arrayT &src,
-                                  int trailing_dims_to_accumulate,
-                                  const arrayT &dst, sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-        using dpctl::tensor::py_internal::py_accumulate_over_axis;
-        return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst,
-                                       exec_q, depends,
-                                       cumlogsumexp_strided_dispatch_table,
-                                       cumlogsumexp_1d_contig_dispatch_table);
-    };
-    m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"),
-          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table;
-    using impl::cumlogsumexp_include_initial_strided_dispatch_table;
-    auto cumlogsumexp_include_initial_pyapi =
-        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
-            const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::
-                py_accumulate_final_axis_include_initial;
-            return py_accumulate_final_axis_include_initial(
-                src, dst, exec_q, depends,
-                cumlogsumexp_include_initial_strided_dispatch_table,
-                cumlogsumexp_1d_include_initial_contig_dispatch_table);
-        };
-    m.def("_cumlogsumexp_final_axis_include_initial",
-          cumlogsumexp_include_initial_pyapi, "", py::arg("src"),
-          py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype,
-                                            const py::dtype &output_dtype) {
-        using dpctl::tensor::py_internal::py_accumulate_dtype_supported;
-        return py_accumulate_dtype_supported(
-            input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table);
-    };
-    m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "",
-          py::arg("arg_dtype"), py::arg("out_dtype"));
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
deleted file mode 100644
index ba434689b2..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cumulative_logsumexp(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
deleted file mode 100644
index 750da77a1b..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.cpp
+++ /dev/null
@@ -1,353 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "accumulate_over_axis.hpp"
-#include "kernels/accumulators.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace su_ns = dpctl::tensor::sycl_utils;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
-static accumulate_strided_impl_fn_ptr_t
-    cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-
-static accumulate_strided_impl_fn_ptr_t
-    cumprod_include_initial_strided_dispatch_table[td_ns::num_types]
-                                                  [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForProdAccumulation
-{
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename T>
-using CumProdScanOpT = std::conditional_t<std::is_same_v<T, bool>,
-                                          sycl::logical_and<T>,
-                                          sycl::multiplies<T>>;
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumProd1DContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            using ScanOpT = CumProdScanOpT<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumProd1DIncludeInitialContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            using ScanOpT = CumProdScanOpT<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumProdStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            using ScanOpT = CumProdScanOpT<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumProdIncludeInitialStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForProdAccumulation<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            using ScanOpT = CumProdScanOpT<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_cumprod_dispatch_tables(void)
-{
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumProd1DContigFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumProdStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(cumprod_strided_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumProd1DIncludeInitialContigFactory,
-                                td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(
-        cumprod_1d_include_initial_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumProdIncludeInitialStridedFactory,
-                                td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        cumprod_include_initial_strided_dispatch_table);
-
-    return;
-}
-
-} // namespace impl
-
-void init_cumulative_prod(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    using impl::populate_cumprod_dispatch_tables;
-    populate_cumprod_dispatch_tables();
-
-    using impl::cumprod_1d_contig_dispatch_table;
-    using impl::cumprod_strided_dispatch_table;
-    auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-        using dpctl::tensor::py_internal::py_accumulate_over_axis;
-        return py_accumulate_over_axis(
-            src, trailing_dims_to_accumulate, dst, exec_q, depends,
-            cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table);
-    };
-    m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"),
-          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    using impl::cumprod_1d_include_initial_contig_dispatch_table;
-    using impl::cumprod_include_initial_strided_dispatch_table;
-    auto cumprod_include_initial_pyapi =
-        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
-            const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::
-                py_accumulate_final_axis_include_initial;
-            return py_accumulate_final_axis_include_initial(
-                src, dst, exec_q, depends,
-                cumprod_include_initial_strided_dispatch_table,
-                cumprod_1d_include_initial_contig_dispatch_table);
-        };
-    m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi,
-          "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    auto cumprod_dtype_supported = [&](const py::dtype &input_dtype,
-                                       const py::dtype &output_dtype) {
-        using dpctl::tensor::py_internal::py_accumulate_dtype_supported;
-        return py_accumulate_dtype_supported(input_dtype, output_dtype,
-                                             cumprod_strided_dispatch_table);
-    };
-    m.def("_cumprod_dtype_supported", cumprod_dtype_supported, "",
-          py::arg("arg_dtype"), py::arg("out_dtype"));
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp
deleted file mode 100644
index e7ad65672d..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_prod.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cumulative_prod(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
deleted file mode 100644
index 7136e9005a..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.cpp
+++ /dev/null
@@ -1,351 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "accumulate_over_axis.hpp"
-#include "kernels/accumulators.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace su_ns = dpctl::tensor::sycl_utils;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t;
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t;
-static accumulate_strided_impl_fn_ptr_t
-    cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static accumulate_1d_contig_impl_fn_ptr_t
-    cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types]
-                                                   [td_ns::num_types];
-
-static accumulate_strided_impl_fn_ptr_t
-    cumsum_include_initial_strided_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForSumAccumulation
-{
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename T>
-using CumSumScanOpT = std::
-    conditional_t<std::is_same_v<T, bool>, sycl::logical_or<T>, sycl::plus<T>>;
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumSum1DContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
-            using ScanOpT = CumSumScanOpT<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumSum1DIncludeInitialContigFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
-            using ScanOpT = CumSumScanOpT<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              NoOpTransformer<dstTy>, ScanOpT,
-                                              include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_1d_contig_impl<srcTy, dstTy,
-                                              CastTransformer<srcTy, dstTy>,
-                                              ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumSumStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
-            using ScanOpT = CumSumScanOpT<dstTy>;
-            static constexpr bool include_initial = false;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct CumSumIncludeInitialStridedFactory
-{
-    fnT get()
-    {
-        if constexpr (TypePairSupportDataForSumAccumulation<srcTy,
-                                                            dstTy>::is_defined)
-        {
-            using ScanOpT = CumSumScanOpT<dstTy>;
-            static constexpr bool include_initial = true;
-            if constexpr (std::is_same_v<srcTy, dstTy>) {
-                using dpctl::tensor::kernels::accumulators::NoOpTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            NoOpTransformer<dstTy>, ScanOpT,
-                                            include_initial>;
-                return fn;
-            }
-            else {
-                using dpctl::tensor::kernels::accumulators::CastTransformer;
-                fnT fn = dpctl::tensor::kernels::accumulators::
-                    accumulate_strided_impl<srcTy, dstTy,
-                                            CastTransformer<srcTy, dstTy>,
-                                            ScanOpT, include_initial>;
-                return fn;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_cumsum_dispatch_tables(void)
-{
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumSum1DContigFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumSumStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(cumsum_strided_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_1d_contig_impl_fn_ptr_t,
-                                CumSum1DIncludeInitialContigFactory,
-                                td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(
-        cumsum_1d_include_initial_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<accumulate_strided_impl_fn_ptr_t,
-                                CumSumIncludeInitialStridedFactory,
-                                td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table);
-
-    return;
-}
-
-} // namespace impl
-
-void init_cumulative_sum(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    using impl::populate_cumsum_dispatch_tables;
-    populate_cumsum_dispatch_tables();
-
-    using impl::cumsum_1d_contig_dispatch_table;
-    using impl::cumsum_strided_dispatch_table;
-    auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate,
-                            const arrayT &dst, sycl::queue &exec_q,
-                            const event_vecT &depends = {}) {
-        using dpctl::tensor::py_internal::py_accumulate_over_axis;
-        return py_accumulate_over_axis(
-            src, trailing_dims_to_accumulate, dst, exec_q, depends,
-            cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table);
-    };
-    m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"),
-          py::arg("trailing_dims_to_accumulate"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    using impl::cumsum_1d_include_initial_contig_dispatch_table;
-    using impl::cumsum_include_initial_strided_dispatch_table;
-    auto cumsum_include_initial_pyapi =
-        [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
-            const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::
-                py_accumulate_final_axis_include_initial;
-            return py_accumulate_final_axis_include_initial(
-                src, dst, exec_q, depends,
-                cumsum_include_initial_strided_dispatch_table,
-                cumsum_1d_include_initial_contig_dispatch_table);
-        };
-    m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi,
-          "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    auto cumsum_dtype_supported = [&](const py::dtype &input_dtype,
-                                      const py::dtype &output_dtype) {
-        using dpctl::tensor::py_internal::py_accumulate_dtype_supported;
-        return py_accumulate_dtype_supported(input_dtype, output_dtype,
-                                             cumsum_strided_dispatch_table);
-    };
-    m.def("_cumsum_dtype_supported", cumsum_dtype_supported, "",
-          py::arg("arg_dtype"), py::arg("out_dtype"));
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp
deleted file mode 100644
index 076b98ca1a..0000000000
--- a/dpctl/tensor/libtensor/source/accumulators/cumulative_sum.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cumulative_sum(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
deleted file mode 100644
index 1de4bd74e0..0000000000
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ /dev/null
@@ -1,850 +0,0 @@
-//===-- boolean_advanced_indexing.cpp -                       --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines implementation functions of dpctl.tensor.place and
-/// dpctl.tensor.extract, dpctl.tensor.nonzero
-//===----------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "boolean_advanced_indexing.hpp"
-#include "kernels/boolean_advanced_indexing.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-// Masked extraction
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::indexing::
-    masked_extract_all_slices_strided_impl_fn_ptr_t;
-
-static masked_extract_all_slices_strided_impl_fn_ptr_t
-    masked_extract_all_slices_strided_i32_impl_dispatch_vector
-        [td_ns::num_types];
-static masked_extract_all_slices_strided_impl_fn_ptr_t
-    masked_extract_all_slices_strided_i64_impl_dispatch_vector
-        [td_ns::num_types];
-
-using dpctl::tensor::kernels::indexing::
-    masked_extract_all_slices_contig_impl_fn_ptr_t;
-
-static masked_extract_all_slices_contig_impl_fn_ptr_t
-    masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types];
-static masked_extract_all_slices_contig_impl_fn_ptr_t
-    masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::indexing::
-    masked_extract_some_slices_strided_impl_fn_ptr_t;
-
-static masked_extract_some_slices_strided_impl_fn_ptr_t
-    masked_extract_some_slices_strided_i32_impl_dispatch_vector
-        [td_ns::num_types];
-static masked_extract_some_slices_strided_impl_fn_ptr_t
-    masked_extract_some_slices_strided_i64_impl_dispatch_vector
-        [td_ns::num_types];
-
-void populate_masked_extract_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractAllSlicesStridedFactoryForInt32;
-    td_ns::DispatchVectorBuilder<
-        masked_extract_all_slices_strided_impl_fn_ptr_t,
-        MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(
-        masked_extract_all_slices_strided_i32_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractAllSlicesStridedFactoryForInt64;
-    td_ns::DispatchVectorBuilder<
-        masked_extract_all_slices_strided_impl_fn_ptr_t,
-        MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(
-        masked_extract_all_slices_strided_i64_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractSomeSlicesStridedFactoryForInt32;
-    td_ns::DispatchVectorBuilder<
-        masked_extract_some_slices_strided_impl_fn_ptr_t,
-        MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types>
-        dvb3;
-    dvb3.populate_dispatch_vector(
-        masked_extract_some_slices_strided_i32_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractSomeSlicesStridedFactoryForInt64;
-    td_ns::DispatchVectorBuilder<
-        masked_extract_some_slices_strided_impl_fn_ptr_t,
-        MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types>
-        dvb4;
-    dvb4.populate_dispatch_vector(
-        masked_extract_some_slices_strided_i64_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractAllSlicesContigFactoryForInt32;
-    td_ns::DispatchVectorBuilder<masked_extract_all_slices_contig_impl_fn_ptr_t,
-                                 MaskExtractAllSlicesContigFactoryForInt32,
-                                 td_ns::num_types>
-        dvb5;
-    dvb5.populate_dispatch_vector(
-        masked_extract_all_slices_contig_i32_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskExtractAllSlicesContigFactoryForInt64;
-    td_ns::DispatchVectorBuilder<masked_extract_all_slices_contig_impl_fn_ptr_t,
-                                 MaskExtractAllSlicesContigFactoryForInt64,
-                                 td_ns::num_types>
-        dvb6;
-    dvb6.populate_dispatch_vector(
-        masked_extract_all_slices_contig_i64_impl_dispatch_vector);
-}
-
-std::pair<sycl::event, sycl::event>
-py_extract(const dpctl::tensor::usm_ndarray &src,
-           const dpctl::tensor::usm_ndarray &cumsum,
-           int axis_start, // axis_start <= mask_i < axis_end
-           int axis_end,
-           const dpctl::tensor::usm_ndarray &dst,
-           sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    int src_nd = src.get_ndim();
-    if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) {
-        throw py::value_error("Specified axes_start and axes_end are invalid.");
-    }
-    int mask_span_sz = axis_end - axis_start;
-
-    int dst_nd = dst.get_ndim();
-    if (src_nd != dst_nd + (mask_span_sz - 1)) {
-        throw py::value_error("Number of dimensions of source and destination "
-                              "arrays is not consistent");
-    }
-
-    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
-        throw py::value_error("cumsum array must be a C-contiguous vector");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    py::ssize_t cumsum_sz = cumsum.get_size();
-
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool same_ortho_dims(true);
-    std::size_t ortho_nelems(1); // number of orthogonal iterations
-
-    for (auto i = 0; i < axis_start; ++i) {
-        auto src_sh_i = src_shape[i];
-        ortho_nelems *= src_sh_i;
-        same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]);
-    }
-    for (auto i = axis_end; i < src_nd; ++i) {
-        auto src_sh_i = src_shape[i];
-        ortho_nelems *= src_sh_i;
-        same_ortho_dims =
-            same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]);
-    }
-
-    std::size_t masked_src_nelems(1);
-    std::size_t masked_dst_nelems(dst_shape[axis_start]);
-    for (auto i = axis_start; i < axis_end; ++i) {
-        masked_src_nelems *= src_shape[i];
-    }
-
-    // masked_dst_nelems is number of set elements in the mask, or last element
-    // in cumsum
-    if (!same_ortho_dims ||
-        (masked_src_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, ortho_nelems * masked_dst_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src, not with cumsum.
-    if (overlap(dst, cumsum) || overlap(dst, src)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    static constexpr int int32_typeid =
-        static_cast<int>(td_ns::typenum_t::INT32);
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
-        throw py::value_error("Unexpected data type of cumsum array, expecting "
-                              "'int32' or 'int64'");
-    }
-
-    const bool use_i32 = (cumsum_typeid == int32_typeid);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data types");
-    }
-
-    char *src_data_p = src.get_data();
-    char *dst_data_p = dst.get_data();
-    char *cumsum_data_p = cumsum.get_data();
-
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    sycl::event extract_ev;
-    std::vector<sycl::event> host_task_events{};
-    if (axis_start == 0 && axis_end == src_nd) {
-        assert(dst_shape_vec.size() == 1);
-        assert(dst_strides_vec.size() == 1);
-
-        if (src.is_c_contiguous()) {
-            auto fn =
-                (use_i32)
-                    ? masked_extract_all_slices_contig_i32_impl_dispatch_vector
-                          [src_typeid]
-                    : masked_extract_all_slices_contig_i64_impl_dispatch_vector
-                          [src_typeid];
-
-            extract_ev =
-                fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p,
-                   dst_shape_vec[0], dst_strides_vec[0], depends);
-
-            //
-            host_task_events.push_back(extract_ev);
-        }
-        else {
-            // empty orthogonal directions
-            auto fn =
-                (use_i32)
-                    ? masked_extract_all_slices_strided_i32_impl_dispatch_vector
-                          [src_typeid]
-                    : masked_extract_all_slices_strided_i64_impl_dispatch_vector
-                          [src_typeid];
-
-            using dpctl::tensor::offset_utils::device_allocate_and_pack;
-            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-                exec_q, host_task_events, src_shape_vec, src_strides_vec);
-            auto packed_src_shape_strides_owner =
-                std::move(std::get<0>(ptr_size_event_tuple1));
-            sycl::event copy_src_shape_strides_ev =
-                std::get<2>(ptr_size_event_tuple1);
-            const py::ssize_t *packed_src_shape_strides =
-                packed_src_shape_strides_owner.get();
-
-            std::vector<sycl::event> all_deps;
-            all_deps.reserve(depends.size() + 1);
-            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-            all_deps.push_back(copy_src_shape_strides_ev);
-
-            assert(all_deps.size() == depends.size() + 1);
-
-            extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p,
-                            dst_data_p, src_nd, packed_src_shape_strides,
-                            dst_shape_vec[0], dst_strides_vec[0], all_deps);
-
-            sycl::event cleanup_tmp_allocations_ev =
-                dpctl::tensor::alloc_utils::async_smart_free(
-                    exec_q, {extract_ev}, packed_src_shape_strides_owner);
-            host_task_events.push_back(cleanup_tmp_allocations_ev);
-        }
-    }
-    else {
-        // non-empty othogonal directions
-        auto fn =
-            (use_i32)
-                ? masked_extract_some_slices_strided_i32_impl_dispatch_vector
-                      [src_typeid]
-                : masked_extract_some_slices_strided_i64_impl_dispatch_vector
-                      [src_typeid];
-
-        int masked_src_nd = mask_span_sz;
-        int ortho_nd = src_nd - masked_src_nd;
-
-        using shT = std::vector<py::ssize_t>;
-
-        shT ortho_src_shape;
-        shT masked_src_shape;
-        shT ortho_src_strides;
-        shT masked_src_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            src_shape_vec, src_strides_vec, axis_start, axis_end,
-            ortho_src_shape,
-            masked_src_shape, // 4 vectors modified
-            ortho_src_strides, masked_src_strides);
-
-        shT ortho_dst_shape;
-        shT masked_dst_shape;
-        shT ortho_dst_strides;
-        shT masked_dst_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1,
-            ortho_dst_shape,
-            masked_dst_shape, // 4 vectors modified
-            ortho_dst_strides, masked_dst_strides);
-
-        assert(ortho_src_shape.size() == static_cast<std::size_t>(ortho_nd));
-        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
-        assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(),
-                          ortho_dst_shape.begin()));
-
-        std::vector<py::ssize_t> simplified_ortho_shape;
-        std::vector<py::ssize_t> simplified_ortho_src_strides;
-        std::vector<py::ssize_t> simplified_ortho_dst_strides;
-
-        const py::ssize_t *_shape = ortho_src_shape.data();
-
-        py::ssize_t ortho_src_offset(0);
-        py::ssize_t ortho_dst_offset(0);
-
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            ortho_nd, _shape, ortho_src_strides, ortho_dst_strides,
-            // output
-            simplified_ortho_shape, simplified_ortho_src_strides,
-            simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset);
-
-        assert(masked_dst_shape.size() == 1);
-        assert(masked_dst_strides.size() == 1);
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_ortho_shape,
-            simplified_ortho_src_strides, simplified_ortho_dst_strides,
-            masked_src_shape, masked_src_strides);
-        auto packed_shapes_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_shapes_strides =
-            packed_shapes_strides_owner.get();
-
-        const py::ssize_t *packed_ortho_src_dst_shape_strides =
-            packed_shapes_strides;
-        const py::ssize_t *packed_masked_src_shape_strides =
-            packed_shapes_strides + (3 * ortho_nd);
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT
-        // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_
-        extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p,
-                        cumsum_data_p, dst_data_p,
-                        // data to build orthog_src_dst_indexer
-                        ortho_nd, packed_ortho_src_dst_shape_strides,
-                        ortho_src_offset, ortho_dst_offset,
-                        // data to build masked_src_indexer
-                        masked_src_nd, packed_masked_src_shape_strides,
-                        // data to build masked_dst_indexer,
-                        masked_dst_shape[0], masked_dst_strides[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {extract_ev}, packed_shapes_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-
-    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
-        exec_q, {src, cumsum, dst}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, extract_ev);
-}
-
-// Masked placement
-
-using dpctl::tensor::kernels::indexing::
-    masked_place_all_slices_strided_impl_fn_ptr_t;
-
-static masked_place_all_slices_strided_impl_fn_ptr_t
-    masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types];
-static masked_place_all_slices_strided_impl_fn_ptr_t
-    masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::indexing::
-    masked_place_some_slices_strided_impl_fn_ptr_t;
-
-static masked_place_some_slices_strided_impl_fn_ptr_t
-    masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types];
-static masked_place_some_slices_strided_impl_fn_ptr_t
-    masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types];
-
-void populate_masked_place_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::indexing::
-        MaskPlaceAllSlicesStridedFactoryForInt32;
-    td_ns::DispatchVectorBuilder<masked_place_all_slices_strided_impl_fn_ptr_t,
-                                 MaskPlaceAllSlicesStridedFactoryForInt32,
-                                 td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(
-        masked_place_all_slices_strided_i32_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskPlaceAllSlicesStridedFactoryForInt64;
-    td_ns::DispatchVectorBuilder<masked_place_all_slices_strided_impl_fn_ptr_t,
-                                 MaskPlaceAllSlicesStridedFactoryForInt64,
-                                 td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(
-        masked_place_all_slices_strided_i64_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskPlaceSomeSlicesStridedFactoryForInt32;
-    td_ns::DispatchVectorBuilder<masked_place_some_slices_strided_impl_fn_ptr_t,
-                                 MaskPlaceSomeSlicesStridedFactoryForInt32,
-                                 td_ns::num_types>
-        dvb3;
-    dvb3.populate_dispatch_vector(
-        masked_place_some_slices_strided_i32_impl_dispatch_vector);
-
-    using dpctl::tensor::kernels::indexing::
-        MaskPlaceSomeSlicesStridedFactoryForInt64;
-    td_ns::DispatchVectorBuilder<masked_place_some_slices_strided_impl_fn_ptr_t,
-                                 MaskPlaceSomeSlicesStridedFactoryForInt64,
-                                 td_ns::num_types>
-        dvb4;
-    dvb4.populate_dispatch_vector(
-        masked_place_some_slices_strided_i64_impl_dispatch_vector);
-}
-
-/*
- * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id]  if cumsum[i] ==
- * ((i > 0) ? cumsum[i-1] + 1 : 1)
- */
-std::pair<sycl::event, sycl::event>
-py_place(const dpctl::tensor::usm_ndarray &dst,
-         const dpctl::tensor::usm_ndarray &cumsum,
-         int axis_start, // axis_start <= mask_i < axis_end
-         int axis_end,
-         const dpctl::tensor::usm_ndarray &rhs,
-         sycl::queue &exec_q,
-         const std::vector<sycl::event> &depends)
-{
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    int dst_nd = dst.get_ndim();
-    if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) {
-        throw py::value_error("Specified axes_start and axes_end are invalid.");
-    }
-    int mask_span_sz = axis_end - axis_start;
-
-    int rhs_nd = rhs.get_ndim();
-    if (dst_nd != rhs_nd + (mask_span_sz - 1)) {
-        throw py::value_error("Number of dimensions of source and destination "
-                              "arrays is not consistent");
-    }
-
-    if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) {
-        throw py::value_error("cumsum array must be a C-contiguous vector");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    py::ssize_t cumsum_sz = cumsum.get_size();
-
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    const py::ssize_t *rhs_shape = rhs.get_shape_raw();
-    bool same_ortho_dims(true);
-    std::size_t ortho_nelems(1); // number of orthogonal iterations
-
-    for (auto i = 0; i < axis_start; ++i) {
-        auto dst_sh_i = dst_shape[i];
-        ortho_nelems *= dst_sh_i;
-        same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]);
-    }
-    for (auto i = axis_end; i < dst_nd; ++i) {
-        auto dst_sh_i = dst_shape[i];
-        ortho_nelems *= dst_sh_i;
-        same_ortho_dims =
-            same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]);
-    }
-
-    std::size_t masked_dst_nelems(1);
-    for (auto i = axis_start; i < axis_end; ++i) {
-        masked_dst_nelems *= dst_shape[i];
-    }
-
-    if (!same_ortho_dims ||
-        (masked_dst_nelems != static_cast<std::size_t>(cumsum_sz)))
-    {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, ortho_nelems * masked_dst_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src, not with cumsum.
-    if (overlap(dst, rhs) || overlap(dst, cumsum)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int dst_typenum = dst.get_typenum();
-    int rhs_typenum = rhs.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    static constexpr int int32_typeid =
-        static_cast<int>(td_ns::typenum_t::INT32);
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) {
-        throw py::value_error("Unexpected data type of cumsum array, expecting "
-                              "'int32' or 'int64'");
-    }
-
-    const bool use_i32 = (cumsum_typeid == int32_typeid);
-
-    if (dst_typeid != rhs_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data types");
-    }
-
-    char *dst_data_p = dst.get_data();
-    char *rhs_data_p = rhs.get_data();
-    char *cumsum_data_p = cumsum.get_data();
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    auto rhs_shape_vec = rhs.get_shape_vector();
-    auto rhs_strides_vec = rhs.get_strides_vector();
-
-    sycl::event place_ev;
-    std::vector<sycl::event> host_task_events{};
-    if (axis_start == 0 && axis_end == dst_nd) {
-        // empty orthogonal directions
-        auto fn = (use_i32)
-                      ? masked_place_all_slices_strided_i32_impl_dispatch_vector
-                            [dst_typeid]
-                      : masked_place_all_slices_strided_i64_impl_dispatch_vector
-                            [dst_typeid];
-
-        assert(rhs_shape_vec.size() == 1);
-        assert(rhs_strides_vec.size() == 1);
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, dst_shape_vec, dst_strides_vec);
-        auto packed_dst_shape_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_dst_shape_strides_ev =
-            std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_dst_shape_strides =
-            packed_dst_shape_strides_owner.get();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_dst_shape_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p,
-                      dst_nd, packed_dst_shape_strides, rhs_shape_vec[0],
-                      rhs_strides_vec[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {place_ev}, packed_dst_shape_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-    else {
-        // non-empty othogonal directions
-        auto fn =
-            (use_i32)
-                ? masked_place_some_slices_strided_i32_impl_dispatch_vector
-                      [dst_typeid]
-                : masked_place_some_slices_strided_i64_impl_dispatch_vector
-                      [dst_typeid];
-
-        int masked_dst_nd = mask_span_sz;
-        int ortho_nd = dst_nd - masked_dst_nd;
-
-        using shT = std::vector<py::ssize_t>;
-
-        shT ortho_dst_shape;
-        shT masked_dst_shape;
-        shT ortho_dst_strides;
-        shT masked_dst_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis_start, axis_end,
-            ortho_dst_shape,
-            masked_dst_shape, // 4 vectors modified
-            ortho_dst_strides, masked_dst_strides);
-
-        shT ortho_rhs_shape;
-        shT masked_rhs_shape;
-        shT ortho_rhs_strides;
-        shT masked_rhs_strides;
-        dpctl::tensor::py_internal::split_iteration_space(
-            rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1,
-            ortho_rhs_shape,
-            masked_rhs_shape, // 4 vectors modified
-            ortho_rhs_strides, masked_rhs_strides);
-
-        assert(ortho_dst_shape.size() == static_cast<std::size_t>(ortho_nd));
-        assert(ortho_rhs_shape.size() == static_cast<std::size_t>(ortho_nd));
-        assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(),
-                          ortho_rhs_shape.begin()));
-
-        std::vector<py::ssize_t> simplified_ortho_shape;
-        std::vector<py::ssize_t> simplified_ortho_dst_strides;
-        std::vector<py::ssize_t> simplified_ortho_rhs_strides;
-
-        const py::ssize_t *_shape = ortho_dst_shape.data();
-
-        py::ssize_t ortho_dst_offset(0);
-        py::ssize_t ortho_rhs_offset(0);
-
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides,
-            simplified_ortho_shape, simplified_ortho_dst_strides,
-            simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset);
-
-        assert(masked_rhs_shape.size() == 1);
-        assert(masked_rhs_strides.size() == 1);
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_ortho_shape,
-            simplified_ortho_dst_strides, simplified_ortho_rhs_strides,
-            masked_dst_shape, masked_dst_strides);
-        auto packed_shapes_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_shapes_strides =
-            packed_shapes_strides_owner.get();
-
-        const py::ssize_t *packed_ortho_dst_rhs_shape_strides =
-            packed_shapes_strides;
-        const py::ssize_t *packed_masked_dst_shape_strides =
-            packed_shapes_strides + (3 * ortho_nd);
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p,
-                      cumsum_data_p, rhs_data_p,
-                      // data to build orthog_dst_rhs_indexer
-                      ortho_nd, packed_ortho_dst_rhs_shape_strides,
-                      ortho_dst_offset, ortho_rhs_offset,
-                      // data to build masked_dst_indexer
-                      masked_dst_nd, packed_masked_dst_shape_strides,
-                      // data to build masked_dst_indexer,
-                      masked_rhs_shape[0], masked_rhs_strides[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {place_ev}, packed_shapes_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-
-    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
-        exec_q, {dst, cumsum, rhs}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, place_ev);
-}
-
-// Non-zero
-
-std::pair<sycl::event, sycl::event>
-py_nonzero(const dpctl::tensor::usm_ndarray
-               &cumsum, // int32/int64 input array, 1D, C-contiguous
-           const dpctl::tensor::usm_ndarray
-               &indexes, // int32/int64 2D output array, C-contiguous
-           const std::vector<py::ssize_t>
-               &mask_shape, // shape of array from which cumsum was computed
-           sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends)
-{
-    if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes);
-
-    int cumsum_nd = cumsum.get_ndim();
-    if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) {
-        throw py::value_error("Cumsum array must be a C-contiguous vector");
-    }
-
-    int indexes_nd = indexes.get_ndim();
-    if (indexes_nd != 2 || !indexes.is_c_contiguous()) {
-        throw py::value_error("Index array must be a C-contiguous matrix");
-    }
-
-    std::size_t _ndim = mask_shape.size();
-    if (_ndim > std::numeric_limits<int>::max()) {
-        throw py::value_error("Shape is too large");
-    }
-    int ndim = static_cast<int>(_ndim);
-
-    const py::ssize_t *indexes_shape = indexes.get_shape_raw();
-
-    if (ndim != indexes_shape[0]) {
-        throw py::value_error(
-            "Length of shape must equal width of index matrix");
-    }
-
-    auto cumsum_sz = cumsum.get_size();
-    py::ssize_t shape_nelems =
-        std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1),
-                        std::multiplies<py::ssize_t>());
-
-    if (cumsum_sz != shape_nelems) {
-        throw py::value_error("Shape and cumsum size are not constent");
-    }
-
-    py::ssize_t nz_elems = indexes_shape[1];
-
-    int indexes_typenum = indexes.get_typenum();
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum);
-
-    int cumsum_typenum = cumsum.get_typenum();
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    constexpr int int32_typeid = static_cast<int>(td_ns::typenum_t::INT32);
-    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
-
-    // cumsum must be int32_t or int64_t only
-    if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) ||
-        (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid))
-    {
-        throw py::value_error("Cumulative sum array and index array must have "
-                              "int32 or int64 data-type");
-    }
-
-    if (cumsum_sz == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(cumsum, indexes)) {
-        throw py::value_error("Arrays are expected to ave no memory overlap");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        indexes, nz_elems * _ndim);
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto mask_shape_copying_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, mask_shape);
-    auto src_shape_device_owner =
-        std::move(std::get<0>(mask_shape_copying_tuple));
-    sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple);
-    const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_ev);
-
-    using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t;
-    using dpctl::tensor::kernels::indexing::non_zero_indexes_impl;
-
-    int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) +
-                   ((indexes_typeid == int64_typeid) ? 2 : 0);
-    std::array<non_zero_indexes_fn_ptr_t, 4> fn_impls = {
-        non_zero_indexes_impl<std::int32_t, std::int32_t>,
-        non_zero_indexes_impl<std::int64_t, std::int32_t>,
-        non_zero_indexes_impl<std::int32_t, std::int64_t>,
-        non_zero_indexes_impl<std::int64_t, std::int64_t>};
-    auto fn = fn_impls[fn_index];
-
-    sycl::event non_zero_indexes_ev =
-        fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(),
-           indexes.get_data(), src_shape_device_ptr, all_deps);
-
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {non_zero_indexes_ev}, src_shape_device_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
-        exec_q, {cumsum, indexes}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
deleted file mode 100644
index 9ea72e969c..0000000000
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- boolean_advanced_indexing.hpp -                       --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares Python API for implementation functions of
-/// dpctl.tensor.place, dpctl.tensor.extract, and dpctl.tensor.nonzero
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-py_extract(const dpctl::tensor::usm_ndarray &src,
-           const dpctl::tensor::usm_ndarray &cumsum,
-           int axis_start, // axis_start <= mask_i < axis_end
-           int axis_end,
-           const dpctl::tensor::usm_ndarray &dst,
-           sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends = {});
-
-extern void populate_masked_extract_dispatch_vectors(void);
-
-extern std::pair<sycl::event, sycl::event>
-py_place(const dpctl::tensor::usm_ndarray &dst,
-         const dpctl::tensor::usm_ndarray &cumsum,
-         int axis_start, // axis_start <= mask_i < axis_end
-         int axis_end,
-         const dpctl::tensor::usm_ndarray &rhs,
-         sycl::queue &exec_q,
-         const std::vector<sycl::event> &depends = {});
-
-extern void populate_masked_place_dispatch_vectors(void);
-
-extern std::pair<sycl::event, sycl::event>
-py_nonzero(const dpctl::tensor::usm_ndarray
-               &cumsum, // int32 input array, 1D, C-contiguous
-           const dpctl::tensor::usm_ndarray
-               &indexes, // int32 2D output array, C-contiguous
-           const std::vector<py::ssize_t>
-               &mask_shape, // shape of array from which cumsum was computed
-           sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends = {});
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp
deleted file mode 100644
index 1149e26bd1..0000000000
--- a/dpctl/tensor/libtensor/source/clip.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-//===-- clip.cpp - Implementation of clip  --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines Python API for implementation functions of
-/// dpctl.tensor.clip
-//===----------------------------------------------------------------------===//
-
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "clip.hpp"
-#include "kernels/clip.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
-using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
-
-static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
-static clip_strided_impl_fn_ptr_t
-    clip_strided_dispatch_vector[td_ns::num_types];
-
-void init_clip_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::clip::ClipContigFactory;
-    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
-
-    using dpctl::tensor::kernels::clip::ClipStridedFactory;
-    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
-}
-
-using dpctl::utils::keep_args_alive;
-
-std::pair<sycl::event, sycl::event>
-py_clip(const dpctl::tensor::usm_ndarray &src,
-        const dpctl::tensor::usm_ndarray &min,
-        const dpctl::tensor::usm_ndarray &max,
-        const dpctl::tensor::usm_ndarray &dst,
-        sycl::queue &exec_q,
-        const std::vector<sycl::event> &depends)
-{
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    int nd = src.get_ndim();
-    int min_nd = min.get_ndim();
-    int max_nd = max.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (nd != min_nd || nd != max_nd) {
-        throw py::value_error(
-            "Input arrays are not of appropriate dimension for clip kernel.");
-    }
-
-    if (nd != dst_nd) {
-        throw py::value_error(
-            "Destination is not of appropriate dimension for clip kernel.");
-    }
-
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *min_shape = min.get_shape_raw();
-    const py::ssize_t *max_shape = max.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-
-    bool shapes_equal(true);
-    std::size_t nelems(1);
-    for (int i = 0; i < nd; ++i) {
-        const auto &sh_i = dst_shape[i];
-        nelems *= static_cast<std::size_t>(sh_i);
-        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
-                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
-    }
-
-    if (!shapes_equal) {
-        throw py::value_error("Arrays are not of matching shapes.");
-    }
-
-    if (nelems == 0) {
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
-        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
-        (overlap(dst, max) && !same_logical_tensors(dst, max)))
-    {
-        throw py::value_error("Destination array overlaps with input.");
-    }
-
-    int min_typenum = min.get_typenum();
-    int max_typenum = max.get_typenum();
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
-    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
-        src_typeid != max_typeid)
-    {
-        throw py::value_error("Input, min, max, and destination arrays must "
-                              "have the same data type");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
-
-    char *src_data = src.get_data();
-    char *min_data = min.get_data();
-    char *max_data = max.get_data();
-    char *dst_data = dst.get_data();
-
-    bool is_min_c_contig = min.is_c_contiguous();
-    bool is_min_f_contig = min.is_f_contiguous();
-
-    bool is_max_c_contig = max.is_c_contiguous();
-    bool is_max_f_contig = max.is_f_contiguous();
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-
-    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
-                         is_src_c_contig && is_dst_c_contig);
-    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
-                         is_src_f_contig && is_dst_f_contig);
-
-    if (all_c_contig || all_f_contig) {
-        auto fn = clip_contig_dispatch_vector[src_typeid];
-
-        sycl::event clip_ev =
-            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
-        sycl::event ht_ev =
-            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
-
-        return std::make_pair(ht_ev, clip_ev);
-    }
-
-    auto const &src_strides = src.get_strides_vector();
-    auto const &min_strides = min.get_strides_vector();
-    auto const &max_strides = max.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_min_strides;
-    shT simplified_max_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t min_offset(0);
-    py::ssize_t max_offset(0);
-    py::ssize_t dst_offset(0);
-
-    dpctl::tensor::py_internal::simplify_iteration_space_4(
-        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
-        // outputs
-        simplified_shape, simplified_src_strides, simplified_min_strides,
-        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
-        max_offset, dst_offset);
-
-    auto fn = clip_strided_dispatch_vector[src_typeid];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events,
-        // common shape and strides
-        simplified_shape, simplified_src_strides, simplified_min_strides,
-        simplified_max_strides, simplified_dst_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shape_strides_ev);
-
-    assert(all_deps.size() == depends.size() + 1);
-
-    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
-                             dst_data, packed_shape_strides, src_offset,
-                             min_offset, max_offset, dst_offset, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {clip_ev}, packed_shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    sycl::event arg_cleanup_ev =
-        keep_args_alive(exec_q, {src, min, max, dst}, host_task_events);
-
-    return std::make_pair(arg_cleanup_ev, clip_ev);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp
deleted file mode 100644
index cb771282e8..0000000000
--- a/dpctl/tensor/libtensor/source/clip.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===--                      clip.hpp -                       --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares Python API for implementation functions of
-/// dpctl.tensor.clip
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-py_clip(const dpctl::tensor::usm_ndarray &src,
-        const dpctl::tensor::usm_ndarray &min,
-        const dpctl::tensor::usm_ndarray &max,
-        const dpctl::tensor::usm_ndarray &dst,
-        sycl::queue &exec_q,
-        const std::vector<sycl::event> &depends);
-
-extern void init_clip_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
deleted file mode 100644
index 4c6946505b..0000000000
--- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-//===-- tensor_py.cpp - Implementation of _tensor_impl module  --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <thread>
-#include <type_traits>
-#include <utility>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/copy_and_cast.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "copy_as_contig.hpp"
-#include "simplify_iteration_space.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t;
-using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t;
-using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t;
-
-static copy_and_cast_generic_fn_ptr_t
-    copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types];
-static copy_and_cast_1d_fn_ptr_t
-    copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types];
-static copy_and_cast_contig_fn_ptr_t
-    copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-namespace py = pybind11;
-
-using dpctl::utils::keep_args_alive;
-
-std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src,
-                                  const dpctl::tensor::usm_ndarray &dst,
-                                  sycl::queue &exec_q,
-                                  const std::vector<sycl::event> &depends = {})
-{
-    // array dimensions must be the same
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    // shapes must be the same
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-
-    for (int i = 0; shapes_equal && (i < src_nd); ++i) {
-        src_nelems *= static_cast<std::size_t>(src_shape[i]);
-        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    if (src_nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    // check that arrays do not overlap, and concurrent copying is safe.
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        // TODO: could use a temporary, but this is done by the caller
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-
-    // check for applicability of special cases:
-    //      (both C-contiguous || both F-contiguous)
-    bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
-    bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
-    if (both_c_contig || both_f_contig) {
-
-        sycl::event copy_ev;
-        if (src_type_id == dst_type_id) {
-
-            int src_elem_size = src.get_elemsize();
-
-            copy_ev = exec_q.memcpy(static_cast<void *>(dst_data),
-                                    static_cast<const void *>(src_data),
-                                    src_nelems * src_elem_size, depends);
-        }
-        else {
-            auto contig_fn =
-                copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id];
-            copy_ev =
-                contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
-        }
-        // make sure src and dst are not GC-ed before copy_ev is complete
-        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
-                              copy_ev);
-    }
-
-    if ((src_type_id == dst_type_id) && (src_nd > 1)) {
-        if (is_dst_c_contig) {
-            return py_as_c_contig(src, dst, exec_q, depends);
-        }
-        else if (is_dst_f_contig) {
-            return py_as_f_contig(src, dst, exec_q, depends);
-        }
-    }
-
-    auto const &src_strides = src.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd;
-    const py::ssize_t *shape = src_shape;
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (nd < 2) {
-        if (nd == 1) {
-            std::array<py::ssize_t, 1> shape_arr = {simplified_shape[0]};
-            std::array<py::ssize_t, 1> src_strides_arr = {
-                simplified_src_strides[0]};
-            std::array<py::ssize_t, 1> dst_strides_arr = {
-                simplified_dst_strides[0]};
-
-            sycl::event copy_and_cast_1d_event;
-            if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) &&
-                (src_offset == 0) && (dst_offset == 0))
-            {
-                auto contig_fn =
-                    copy_and_cast_contig_dispatch_table[dst_type_id]
-                                                       [src_type_id];
-                copy_and_cast_1d_event =
-                    contig_fn(exec_q, src_nelems, src_data, dst_data, depends);
-            }
-            else {
-                auto fn =
-                    copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
-                copy_and_cast_1d_event =
-                    fn(exec_q, src_nelems, shape_arr, src_strides_arr,
-                       dst_strides_arr, src_data, src_offset, dst_data,
-                       dst_offset, depends);
-            }
-            return std::make_pair(
-                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}),
-                copy_and_cast_1d_event);
-        }
-        else if (nd == 0) { // case of a scalar
-            assert(src_nelems == 1);
-            std::array<py::ssize_t, 1> shape_arr = {1};
-            std::array<py::ssize_t, 1> src_strides_arr = {1};
-            std::array<py::ssize_t, 1> dst_strides_arr = {1};
-
-            auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id];
-
-            sycl::event copy_and_cast_0d_event = fn(
-                exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr,
-                src_data, src_offset, dst_data, dst_offset, depends);
-
-            return std::make_pair(
-                keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}),
-                copy_and_cast_0d_event);
-        }
-    }
-
-    // Generic implementation
-    auto copy_and_cast_fn =
-        copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_src_strides,
-        simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn(
-        exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data,
-        dst_offset, depends, {copy_shape_ev});
-
-    // async free of shape_strides temporary
-    const auto &temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {copy_and_cast_generic_ev}, shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          copy_and_cast_generic_ev);
-}
-
-void init_copy_and_cast_usm_to_usm_dispatch_tables(void)
-{
-    using namespace td_ns;
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory;
-    DispatchTableBuilder<copy_and_cast_contig_fn_ptr_t,
-                         CopyAndCastContigFactory, num_types>
-        dtb_contig;
-    dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table);
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory;
-    DispatchTableBuilder<copy_and_cast_generic_fn_ptr_t,
-                         CopyAndCastGenericFactory, num_types>
-        dtb_generic;
-    dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table);
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory;
-    DispatchTableBuilder<copy_and_cast_1d_fn_ptr_t, CopyAndCast1DFactory,
-                         num_types>
-        dtb_1d;
-    dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
deleted file mode 100644
index 3d1dabe97f..0000000000
--- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_into_usm_ndarray(const dpctl::tensor::usm_ndarray &src,
-                                  const dpctl::tensor::usm_ndarray &dst,
-                                  sycl::queue &exec_q,
-                                  const std::vector<sycl::event> &depends = {});
-
-extern void init_copy_and_cast_usm_to_usm_dispatch_tables();
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.cpp b/dpctl/tensor/libtensor/source/copy_as_contig.cpp
deleted file mode 100644
index 8b988e73ed..0000000000
--- a/dpctl/tensor/libtensor/source/copy_as_contig.cpp
+++ /dev/null
@@ -1,747 +0,0 @@
-//==- copy_ascontig.cpp - Implementation of _tensor_impl module   -*-C++-*-/==//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <cstddef>
-#include <numeric>
-#include <stdexcept>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include <dpctl4pybind11.hpp>
-#include <pybind11/pybind11.h>
-
-#include "kernels/copy_as_contiguous.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "copy_as_contig.hpp"
-#include "simplify_iteration_space.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::copy_as_contig::
-    as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t;
-using dpctl::tensor::kernels::copy_as_contig::
-    as_c_contiguous_array_impl_fn_ptr_t;
-using dpctl::tensor::kernels::copy_as_contig::
-    as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t;
-using dpctl::utils::keep_args_alive;
-
-static as_c_contiguous_array_impl_fn_ptr_t
-    as_c_contig_array_dispatch_vector[td_ns::num_types];
-
-static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t
-    as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types];
-
-static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t
-    as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types];
-
-void init_copy_as_contig_dispatch_vectors(void)
-{
-
-    using dpctl::tensor::kernels::copy_as_contig::
-        AsCContig1DBatchOfSquareMatricesFactory;
-    using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory;
-    using dpctl::tensor::kernels::copy_as_contig::
-        AsCContigNDBatchOfSquareMatricesFactory;
-    using td_ns::DispatchVectorBuilder;
-
-    // Generic to c-contig
-    DispatchVectorBuilder<as_c_contiguous_array_impl_fn_ptr_t, AsCContigFactory,
-                          td_ns::num_types>
-        dtv_as_c_contig_array;
-
-    dtv_as_c_contig_array.populate_dispatch_vector(
-        as_c_contig_array_dispatch_vector);
-
-    // 1D batch of square views into F-contig matrices to c-contig array
-    DispatchVectorBuilder<
-        as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t,
-        AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types>
-        dtv_as_c_contig_1d_batch_of_square_matrices;
-
-    dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector(
-        as_c_contig_1d_batch_of_square_matrices_dispatch_vector);
-
-    // ND batch of square views into F-contig matrices to c-contig array
-    DispatchVectorBuilder<
-        as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t,
-        AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types>
-        dtv_as_c_contig_nd_batch_of_square_matrices;
-
-    dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector(
-        as_c_contig_nd_batch_of_square_matrices_dispatch_vector);
-}
-
-namespace
-{
-
-template <typename dimT> std::size_t get_nelems(const std::vector<dimT> &shape)
-{
-    auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t {
-        return prod * static_cast<std::size_t>(term);
-    };
-
-    static constexpr std::size_t unit{1};
-
-    const std::size_t nelems =
-        std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn);
-    return nelems;
-}
-
-} // end of anonymous namespace
-
-std::pair<sycl::event, sycl::event>
-py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src,
-                   const dpctl::tensor::usm_ndarray &dst,
-                   sycl::queue &exec_q,
-                   const std::vector<sycl::event> &depends);
-
-std::pair<sycl::event, sycl::event>
-py_as_c_contig(const dpctl::tensor::usm_ndarray &src,
-               const dpctl::tensor::usm_ndarray &dst,
-               sycl::queue &exec_q,
-               const std::vector<sycl::event> &depends)
-{
-    /*  Same dimensions, same shape, same data-type
-     *  dst is C-contiguous.
-     */
-    const int src_nd = src.get_ndim();
-    const int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("Number of dimensions must be the same");
-    }
-
-    const auto &src_shape_vec = src.get_shape_vector();
-    const auto &dst_shape_vec = dst.get_shape_vector();
-
-    if (src_shape_vec != dst_shape_vec) {
-        throw py::value_error("Shapes must be equal");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_type_id != dst_type_id) {
-        throw py::value_error(
-            "Source and destination arrays must have the same data type");
-    }
-
-    // ensures also that destination is plenty ample to accommodate all
-    // elements of src array
-    if (!dst.is_c_contiguous()) {
-        throw py::value_error("Destination array must be C-contiguous");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    const auto &src_strides_vec = src.get_strides_vector();
-
-    if (src_nd >= 2) {
-        auto n = dst_shape_vec.back();
-        if (n == dst_shape_vec[src_nd - 2]) {
-            static constexpr auto unit_stride = py::ssize_t(1);
-            if (src_strides_vec[src_nd - 2] == unit_stride) {
-                return py_as_c_contig_f2c(src, dst, exec_q, depends);
-            }
-        }
-    }
-
-    const std::size_t nelems = get_nelems(src_shape_vec);
-
-    if (nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    // simplify iteration space
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd;
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (!((0 == src_offset) && (0 == dst_offset))) {
-        throw std::runtime_error(
-            "Unexpected result of simplifying iteration space, 1");
-    }
-
-    std::vector<sycl::event> host_task_events{};
-    auto ptr_size_event_tuple =
-        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_shape, simplified_src_strides);
-    auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_stride = shape_stride_owner.get();
-
-    auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id];
-
-    std::vector<sycl::event> all_depends;
-    all_depends.reserve(depends.size() + 1);
-    all_depends.insert(std::end(all_depends), std::begin(depends),
-                       std::end(depends));
-    all_depends.push_back(copy_shape_ev);
-
-    sycl::event ascontig_ev =
-        ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(),
-                    dst.get_data(), all_depends);
-
-    const auto &temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev},
-                                                     shape_stride_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          ascontig_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src,
-                   const dpctl::tensor::usm_ndarray &dst,
-                   sycl::queue &exec_q,
-                   const std::vector<sycl::event> &depends);
-
-std::pair<sycl::event, sycl::event>
-py_as_f_contig(const dpctl::tensor::usm_ndarray &src,
-               const dpctl::tensor::usm_ndarray &dst,
-               sycl::queue &exec_q,
-               const std::vector<sycl::event> &depends)
-{
-    /*  Same dimensions, same shape, same data-type
-     *  dst is F-contiguous.
-     */
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("Number of dimensions must be the same");
-    }
-
-    const auto &src_shape_vec = src.get_shape_vector();
-    const auto &dst_shape_vec = dst.get_shape_vector();
-
-    if (src_shape_vec != dst_shape_vec) {
-        throw py::value_error("Shapes must be equal");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_type_id != dst_type_id) {
-        throw py::value_error(
-            "Source and destination arrays must have the same data type");
-    }
-
-    // ensures also that destination is plenty ample to accommodate all
-    // elements of src array
-    if (!dst.is_f_contiguous()) {
-        throw py::value_error("Destination array must be F-contiguous");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    const auto &src_strides_vec = src.get_strides_vector();
-
-    if (src_nd >= 2) {
-        auto n = dst_shape_vec.front();
-        if (n == dst_shape_vec[1]) {
-            static constexpr auto unit_stride = py::ssize_t(1);
-            if (src_strides_vec[1] == unit_stride) {
-                return py_as_f_contig_c2f(src, dst, exec_q, depends);
-            }
-        }
-    }
-
-    const std::size_t nelems = get_nelems(src_shape_vec);
-
-    if (nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    // simplify batch iteration space
-    // NB: simplification reverses dst strides to C contig,
-    // it also reverses simplified_shape and simplified_src_strides
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd;
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(),
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (!((0 == src_offset) && (0 == dst_offset))) {
-        throw std::runtime_error(
-            "Unexpected result of simplifying iteration space, 1");
-    }
-
-    std::vector<sycl::event> host_task_events{};
-    auto ptr_size_event_tuple =
-        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_shape, simplified_src_strides);
-    auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_stride = shape_stride_owner.get();
-
-    auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id];
-
-    std::vector<sycl::event> all_depends;
-    all_depends.reserve(depends.size() + 1);
-    all_depends.insert(std::end(all_depends), std::begin(depends),
-                       std::end(depends));
-    all_depends.push_back(copy_shape_ev);
-
-    sycl::event ascontig_ev =
-        ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(),
-                    dst.get_data(), all_depends);
-
-    const auto &temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev},
-                                                     shape_stride_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          ascontig_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src,
-                   const dpctl::tensor::usm_ndarray &dst,
-                   sycl::queue &exec_q,
-                   const std::vector<sycl::event> &depends)
-{
-    /*  Same dimensions, same shape, same data-type
-     *  dst is C-contiguous.
-     */
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("Number of dimensions must be the same.");
-    }
-    if (src_nd < 2) {
-        throw py::value_error("Arrays must have 2 or more axes");
-    }
-
-    const auto &src_shape_vec = src.get_shape_vector();
-    const auto &dst_shape_vec = dst.get_shape_vector();
-
-    std::size_t nelems{1};
-    bool equal_shapes = true;
-
-    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
-        auto sh_i = src_shape_vec[i];
-        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
-        nelems *= static_cast<std::size_t>(sh_i);
-    }
-
-    if (!equal_shapes) {
-        throw py::value_error("Shapes must be equal");
-    }
-
-    const auto n = src_shape_vec.back();
-    if (src_shape_vec[src_nd - 2] != n) {
-        throw py::value_error("Matrices must be square");
-    }
-
-    const auto &src_strides_vec = src.get_strides_vector();
-
-    if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) {
-        throw py::value_error("Unexpected destination array layout");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_type_id != dst_type_id) {
-        throw py::value_error(
-            "Source and destination arrays must have the same data type");
-    }
-
-    // ensures also that destination is plenty ample to accommodate all
-    // elements of src array
-    if (!dst.is_c_contiguous()) {
-        throw py::value_error("Destination array must be C-contiguous");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check compatibility of execution queue and allocation queue
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    if (nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    const auto &dst_strides_vec = dst.get_strides_vector();
-
-    const std::size_t batch_nelems =
-        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
-    const py::ssize_t dst_batch_step =
-        (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3];
-
-    std::vector<py::ssize_t> src_batch_strides_vec;
-    std::vector<py::ssize_t> dst_batch_strides_vec;
-    std::vector<py::ssize_t> batch_shape_vec;
-
-    if (src_nd == 2) {
-        batch_shape_vec.push_back(py::ssize_t(1));
-        src_batch_strides_vec.push_back(py::ssize_t(0));
-        dst_batch_strides_vec.push_back(dst_batch_step);
-    }
-    else {
-        batch_shape_vec.insert(std::end(batch_shape_vec),
-                               std::begin(src_shape_vec),
-                               std::end(src_shape_vec) - 2);
-        src_batch_strides_vec.insert(std::end(src_batch_strides_vec),
-                                     std::begin(src_strides_vec),
-                                     std::end(src_strides_vec) - 2);
-        dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec),
-                                     std::begin(dst_strides_vec),
-                                     std::end(dst_strides_vec) - 2);
-    }
-
-    // simplify batch iteration space
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = static_cast<int>(batch_shape_vec.size());
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, batch_shape_vec.data(), src_batch_strides_vec,
-        dst_batch_strides_vec,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (!((0 == src_offset) && (0 == dst_offset))) {
-        throw std::runtime_error(
-            "Unexpected result of simplifying iteration space, 1");
-    }
-
-    if (1 == nd) {
-        const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
-        if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
-            throw std::runtime_error(
-                "Unexpected result of simplifying iteration space, 2");
-        }
-
-        auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector
-            [src_type_id];
-        const py::ssize_t src_batch_step = simplified_src_strides.front();
-
-        sycl::event ascontig_ev =
-            impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n,
-                    src.get_data(), src_strides_vec.back(), dst.get_data(),
-                    dst_strides_vec[src_nd - 2], depends);
-
-        return std::make_pair(
-            keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev);
-    }
-
-    auto impl_fn =
-        as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_src_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_depends;
-    all_depends.reserve(depends.size() + 1);
-    all_depends.insert(std::end(all_depends), std::begin(depends),
-                       std::end(depends));
-    all_depends.push_back(copy_shape_ev);
-
-    sycl::event ascontig_ev =
-        impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step,
-                n, src.get_data(), src_strides_vec.back(), dst.get_data(),
-                dst_strides_vec[src_nd - 2], all_depends);
-
-    // async free of shape_strides temporary
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {ascontig_ev}, packed_shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          ascontig_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src,
-                   const dpctl::tensor::usm_ndarray &dst,
-                   sycl::queue &exec_q,
-                   const std::vector<sycl::event> &depends)
-{
-    /*  Same dimensions, same shape, same data-type
-     *  dst is F-contiguous.
-     */
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (src_nd != dst_nd) {
-        throw py::value_error("Number of dimensions must be the same.");
-    }
-    if (src_nd < 2) {
-        throw py::value_error("Arrays must have 2 or more axes");
-    }
-
-    // ensures also that destination is plenty ample to accommodate all
-    // elements of src array
-    if (!dst.is_f_contiguous()) {
-        throw py::value_error("Destination array must be C-contiguous");
-    }
-
-    const auto &src_shape_vec = src.get_shape_vector();
-    const auto &dst_shape_vec = dst.get_shape_vector();
-
-    std::size_t nelems{1};
-    bool equal_shapes = true;
-
-    for (int i = 0; equal_shapes && (i < src_nd); ++i) {
-        auto sh_i = src_shape_vec[i];
-        equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]);
-        nelems *= static_cast<std::size_t>(sh_i);
-    }
-
-    if (!equal_shapes) {
-        throw py::value_error("Shapes must be equal");
-    }
-
-    const auto n = dst_shape_vec.front();
-    if (dst_shape_vec[1] != n) {
-        throw py::value_error("Matrices must be square");
-    }
-
-    const auto &src_strides_vec = src.get_strides_vector();
-
-    if (src_strides_vec[1] != py::ssize_t(1)) {
-        throw py::value_error("Unexpected destination array layout");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    const int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_type_id != dst_type_id) {
-        throw py::value_error(
-            "Source and destination arrays must have the same data type");
-    }
-
-    if (nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    const auto &dst_strides_vec = dst.get_strides_vector();
-
-    const std::size_t batch_nelems =
-        (src_nd == 2) ? std::size_t(1) : (nelems / (n * n));
-    const py::ssize_t dst_batch_step =
-        (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[2];
-
-    std::vector<py::ssize_t> src_batch_strides_vec;
-    std::vector<py::ssize_t> dst_batch_strides_vec;
-    std::vector<py::ssize_t> batch_shape_vec;
-
-    if (src_nd == 2) {
-        batch_shape_vec.push_back(py::ssize_t(1));
-        src_batch_strides_vec.push_back(py::ssize_t(0));
-        dst_batch_strides_vec.push_back(dst_batch_step);
-    }
-    else {
-        batch_shape_vec.insert(std::end(batch_shape_vec),
-                               std::begin(src_shape_vec) + 2,
-                               std::end(src_shape_vec));
-        src_batch_strides_vec.insert(std::end(src_batch_strides_vec),
-                                     std::begin(src_strides_vec) + 2,
-                                     std::end(src_strides_vec));
-        dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec),
-                                     std::begin(dst_strides_vec) + 2,
-                                     std::end(dst_strides_vec));
-    }
-
-    // simplify batch iteration space
-    // NB: simplification reverses dst strides to C contig,
-    // it also reverses simplified_shape and simplified_src_strides
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = static_cast<int>(batch_shape_vec.size());
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, batch_shape_vec.data(), src_batch_strides_vec,
-        dst_batch_strides_vec,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (!((0 == src_offset) && (0 == dst_offset))) {
-        throw std::runtime_error(
-            "Unexpected result of simplifying iteration space, 1");
-    }
-
-    if (1 == nd) {
-        const auto expected_dim = static_cast<py::ssize_t>(batch_nelems);
-        if ((simplified_shape.front() != expected_dim) ||
-            (simplified_dst_strides.front() != dst_batch_step))
-        {
-            throw std::runtime_error(
-                "Unexpected result of simplifying iteration space, 2");
-        }
-
-        auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector
-            [src_type_id];
-        const py::ssize_t src_batch_step = simplified_src_strides.front();
-
-        sycl::event ascontig_ev =
-            impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n,
-                    src.get_data(), src_strides_vec.front(), dst.get_data(),
-                    dst_strides_vec[1], depends);
-
-        return std::make_pair(
-            keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev);
-    }
-
-    auto impl_fn =
-        as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_src_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_depends;
-    all_depends.reserve(depends.size() + 1);
-    all_depends.insert(std::end(all_depends), std::begin(depends),
-                       std::end(depends));
-    all_depends.push_back(copy_shape_ev);
-
-    sycl::event ascontig_ev =
-        impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step,
-                n, src.get_data(), src_strides_vec.front(), dst.get_data(),
-                dst_strides_vec[1], all_depends);
-
-    // async free of shape_strides
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {ascontig_ev}, packed_shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          ascontig_ev);
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_as_contig.hpp b/dpctl/tensor/libtensor/source/copy_as_contig.hpp
deleted file mode 100644
index f2957593e0..0000000000
--- a/dpctl/tensor/libtensor/source/copy_as_contig.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-std::pair<sycl::event, sycl::event>
-py_as_c_contig(const dpctl::tensor::usm_ndarray &,
-               const dpctl::tensor::usm_ndarray &,
-               sycl::queue &,
-               const std::vector<sycl::event> &);
-
-std::pair<sycl::event, sycl::event>
-py_as_f_contig(const dpctl::tensor::usm_ndarray &,
-               const dpctl::tensor::usm_ndarray &,
-               sycl::queue &,
-               const std::vector<sycl::event> &);
-
-void init_copy_as_contig_dispatch_vectors(void);
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp
deleted file mode 100644
index 5eb0809768..0000000000
--- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-#include "copy_for_reshape.hpp"
-#include "kernels/copy_and_cast.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t;
-using dpctl::utils::keep_args_alive;
-
-// define static vector
-static copy_for_reshape_fn_ptr_t
-    copy_for_reshape_generic_dispatch_vector[td_ns::num_types];
-
-/*
- * Copies src into dst (same data type) of different shapes by using flat
- * iterations.
- *
- * Equivalent to the following loop:
- *
- * for i for range(src.size):
- *     dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
- */
-std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends)
-{
-    py::ssize_t src_nelems = src.get_size();
-    py::ssize_t dst_nelems = dst.get_size();
-
-    // Must have the same number of elements
-    if (src_nelems != dst_nelems) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_reshape requires src and dst to "
-            "have the same number of elements.");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    // typenames must be the same
-    if (src_typenum != dst_typenum) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_reshape requires src and dst to "
-            "have the same type.");
-    }
-
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    // check same contexts
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if (src_nelems == 1) {
-        // handle special case of 1-element array
-        int src_elemsize = src.get_elemsize();
-        const char *src_data = src.get_data();
-        char *dst_data = dst.get_data();
-        sycl::event copy_ev =
-            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
-        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
-                              copy_ev);
-    }
-
-    // dimensions may be different
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int type_id = array_types.typenum_to_lookup_id(src_typenum);
-
-    auto fn = copy_for_reshape_generic_dispatch_vector[type_id];
-
-    auto src_shape = src.get_shape_vector();
-    auto src_strides = src.get_strides_vector();
-
-    auto dst_shape = dst.get_shape_vector();
-    auto dst_strides = dst.get_strides_vector();
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    // shape_strides = [src_shape, src_strides, dst_shape, dst_strides]
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, src_shape, src_strides, dst_shape,
-        dst_strides);
-    auto copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    std::vector<sycl::event> all_deps(depends.size() + 1);
-    all_deps.push_back(copy_shape_ev);
-    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
-
-    sycl::event copy_for_reshape_event =
-        fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data,
-           dst_data, all_deps);
-
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {copy_for_reshape_event}, shape_strides_owner);
-
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          copy_for_reshape_event);
-}
-
-void init_copy_for_reshape_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory;
-
-    DispatchVectorBuilder<copy_for_reshape_fn_ptr_t,
-                          CopyForReshapeGenericFactory, num_types>
-        dvb;
-    dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp
deleted file mode 100644
index ac1e1b0941..0000000000
--- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {});
-
-extern void init_copy_for_reshape_dispatch_vectors();
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
deleted file mode 100644
index ed46fac47d..0000000000
--- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-#include "copy_for_roll.hpp"
-#include "kernels/copy_and_cast.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "simplify_iteration_space.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t;
-using dpctl::tensor::kernels::copy_and_cast::
-    copy_for_roll_ndshift_strided_fn_ptr_t;
-using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t;
-using dpctl::utils::keep_args_alive;
-
-// define static vector
-static copy_for_roll_strided_fn_ptr_t
-    copy_for_roll_strided_dispatch_vector[td_ns::num_types];
-
-static copy_for_roll_contig_fn_ptr_t
-    copy_for_roll_contig_dispatch_vector[td_ns::num_types];
-
-static copy_for_roll_ndshift_strided_fn_ptr_t
-    copy_for_roll_ndshift_dispatch_vector[td_ns::num_types];
-
-/*
- * Copies src into dst (same data type) of different shapes by using flat
- * iterations.
- *
- * Equivalent to the following loop:
- *
- * for i for range(src.size):
- *     dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)]
- */
-std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             py::ssize_t shift,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    // Must have the same number of dimensions
-    if (src_nd != dst_nd) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_1d requires src and dst to "
-            "have the same number of dimensions.");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_1d requires src and dst to "
-            "have the same shape.");
-    }
-
-    py::ssize_t src_nelems = src.get_size();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    // typenames must be the same
-    if (src_typenum != dst_typenum) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_1d requires src and dst to "
-            "have the same type.");
-    }
-
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    // check same contexts
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if (src_nelems == 1) {
-        // handle special case of 1-element array
-        int src_elemsize = src.get_elemsize();
-        const char *src_data = src.get_data();
-        char *dst_data = dst.get_data();
-        sycl::event copy_ev =
-            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
-        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
-                              copy_ev);
-    }
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int type_id = array_types.typenum_to_lookup_id(src_typenum);
-
-    const bool is_src_c_contig = src.is_c_contiguous();
-    const bool is_src_f_contig = src.is_f_contiguous();
-
-    const bool is_dst_c_contig = dst.is_c_contiguous();
-    const bool is_dst_f_contig = dst.is_f_contiguous();
-
-    const bool both_c_contig = is_src_c_contig && is_dst_c_contig;
-    const bool both_f_contig = is_src_f_contig && is_dst_f_contig;
-
-    // normalize shift parameter to be 0 <= offset < src_nelems
-    std::size_t offset =
-        (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems);
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    if (both_c_contig || both_f_contig) {
-        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
-
-        if (fn != nullptr) {
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event copy_for_roll_ev =
-                fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data,
-                   zero_offset, depends);
-
-            sycl::event ht_ev =
-                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
-
-            return std::make_pair(ht_ev, copy_for_roll_ev);
-        }
-    }
-
-    auto const &src_strides = src.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd;
-    const py::ssize_t *shape = src_shape_ptr;
-
-    // nd, simplified_* and *_offset are modified by reference
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (nd == 1 && simplified_src_strides[0] == 1 &&
-        simplified_dst_strides[0] == 1)
-    {
-        auto fn = copy_for_roll_contig_dispatch_vector[type_id];
-
-        if (fn != nullptr) {
-
-            sycl::event copy_for_roll_ev =
-                fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data,
-                   dst_offset, depends);
-
-            sycl::event ht_ev =
-                keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev});
-
-            return std::make_pair(ht_ev, copy_for_roll_ev);
-        }
-    }
-
-    auto fn = copy_for_roll_strided_dispatch_vector[type_id];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    // shape_strides = [src_shape, src_strides, dst_strides]
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_src_strides,
-        simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps(depends.size() + 1);
-    all_deps.push_back(copy_shape_ev);
-    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
-
-    sycl::event copy_for_roll_event =
-        fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data,
-           src_offset, dst_data, dst_offset, all_deps);
-
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {copy_for_roll_event}, shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          copy_for_roll_event);
-}
-
-std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             const std::vector<py::ssize_t> &shifts,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    // Must have the same number of dimensions
-    if (src_nd != dst_nd) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_nd requires src and dst to "
-            "have the same number of dimensions.");
-    }
-
-    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_nd requires shifts to "
-            "contain an integral shift for each array dimension.");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_nd requires src and dst to "
-            "have the same shape.");
-    }
-
-    py::ssize_t src_nelems = src.get_size();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    // typenames must be the same
-    if (src_typenum != dst_typenum) {
-        throw py::value_error(
-            "copy_usm_ndarray_for_roll_nd requires src and dst to "
-            "have the same type.");
-    }
-
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    // check for compatible queues
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    if (src_nelems == 1) {
-        // handle special case of 1-element array
-        int src_elemsize = src.get_elemsize();
-        const char *src_data = src.get_data();
-        char *dst_data = dst.get_data();
-        sycl::event copy_ev =
-            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
-        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
-                              copy_ev);
-    }
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int type_id = array_types.typenum_to_lookup_id(src_typenum);
-
-    std::vector<py::ssize_t> normalized_shifts{};
-    normalized_shifts.reserve(src_nd);
-
-    for (int i = 0; i < src_nd; ++i) {
-        // normalize shift parameter to be 0 <= offset < dim
-        py::ssize_t dim = src_shape_ptr[i];
-        std::size_t offset =
-            (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim);
-
-        normalized_shifts.push_back(offset);
-    }
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    auto const &src_strides = src.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-    auto const &common_shape = src.get_shape_vector();
-
-    static constexpr py::ssize_t src_offset = 0;
-    static constexpr py::ssize_t dst_offset = 0;
-
-    auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    // shape_strides = [src_shape, src_strides, dst_strides]
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, common_shape, src_strides, dst_strides,
-        normalized_shifts);
-    auto shape_strides_shifts_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get();
-
-    std::vector<sycl::event> all_deps(depends.size() + 1);
-    all_deps.push_back(copy_shape_ev);
-    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
-
-    sycl::event copy_for_roll_event =
-        fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data,
-           src_offset, dst_data, dst_offset, all_deps);
-
-    auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {copy_for_roll_event}, shape_strides_shifts_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
-                          copy_for_roll_event);
-}
-
-void init_copy_for_roll_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory;
-
-    DispatchVectorBuilder<copy_for_roll_strided_fn_ptr_t,
-                          CopyForRollStridedFactory, num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector);
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory;
-    DispatchVectorBuilder<copy_for_roll_contig_fn_ptr_t,
-                          CopyForRollContigFactory, num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector);
-
-    using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory;
-    DispatchVectorBuilder<copy_for_roll_ndshift_strided_fn_ptr_t,
-                          CopyForRollNDShiftFactory, num_types>
-        dvb3;
-    dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp
deleted file mode 100644
index 5ebfedce5e..0000000000
--- a/dpctl/tensor/libtensor/source/copy_for_roll.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             py::ssize_t shift,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {});
-
-extern std::pair<sycl::event, sycl::event>
-copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             const std::vector<py::ssize_t> &shifts,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {});
-
-extern void init_copy_for_roll_dispatch_vectors();
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
deleted file mode 100644
index 6b52de7eee..0000000000
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-
-#include "kernels/copy_and_cast.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
-#include "simplify_iteration_space.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::tensor::kernels::copy_and_cast::
-    copy_and_cast_from_host_blocking_fn_ptr_t;
-
-static copy_and_cast_from_host_blocking_fn_ptr_t
-    copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types]
-                                                   [td_ns::num_types];
-
-using dpctl::tensor::kernels::copy_and_cast::
-    copy_and_cast_from_host_contig_blocking_fn_ptr_t;
-
-static copy_and_cast_from_host_contig_blocking_fn_ptr_t
-    copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types]
-                                                          [td_ns::num_types];
-
-void copy_numpy_ndarray_into_usm_ndarray(
-    const py::array &npy_src,
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends)
-{
-    int src_ndim = npy_src.ndim();
-    int dst_ndim = dst.get_ndim();
-
-    if (src_ndim != dst_ndim) {
-        throw py::value_error("Source ndarray and destination usm_ndarray have "
-                              "different array ranks, "
-                              "i.e. different number of indices needed to "
-                              "address array elements.");
-    }
-
-    const py::ssize_t *src_shape = npy_src.shape();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
-        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
-        src_nelems *= static_cast<std::size_t>(src_shape[i]);
-    }
-
-    if (!shapes_equal) {
-        throw py::value_error("Source ndarray and destination usm_ndarray have "
-                              "difference shapes.");
-    }
-
-    if (src_nelems == 0) {
-        // nothing to do
-        return;
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error("Execution queue is not compatible with the "
-                              "allocation queue");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // here we assume that NumPy's type numbers agree with ours for types
-    // supported in both
-    int src_typenum =
-        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    py::buffer_info src_pybuf = npy_src.request();
-    const char *const src_data = static_cast<const char *const>(src_pybuf.ptr);
-    char *dst_data = dst.get_data();
-
-    int src_flags = npy_src.flags();
-
-    // check for applicability of special cases:
-    //      (same type && (both C-contiguous || both F-contiguous)
-    const bool both_c_contig =
-        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
-    const bool both_f_contig =
-        ((src_flags & py::array::f_style) && dst.is_f_contiguous());
-
-    const bool same_data_types = (src_type_id == dst_type_id);
-
-    if (both_c_contig || both_f_contig) {
-        if (same_data_types) {
-            int src_elem_size = npy_src.itemsize();
-
-            sycl::event copy_ev =
-                exec_q.memcpy(static_cast<void *>(dst_data),
-                              static_cast<const void *>(src_data),
-                              src_nelems * src_elem_size, depends);
-
-            {
-                // wait for copy_ev to complete
-                // release GIL to allow other threads (host_tasks)
-                // a chance to acquire GIL
-                py::gil_scoped_release lock{};
-                copy_ev.wait();
-            }
-
-            return;
-        }
-        else {
-            py::gil_scoped_release lock{};
-
-            auto copy_and_cast_from_host_contig_blocking_fn =
-                copy_and_cast_from_host_contig_blocking_dispatch_table
-                    [dst_type_id][src_type_id];
-
-            static constexpr py::ssize_t zero_offset(0);
-
-            copy_and_cast_from_host_contig_blocking_fn(
-                exec_q, src_nelems, src_data, zero_offset, dst_data,
-                zero_offset, depends);
-
-            return;
-        }
-    }
-
-    auto const &dst_strides =
-        dst.get_strides_vector(); // N.B.: strides in elements
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_ndim;
-    const py::ssize_t *shape = src_shape;
-
-    const py::ssize_t *src_strides_p =
-        npy_src.strides();                         // N.B.: strides in bytes
-    py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes
-
-    bool is_src_c_contig = ((src_flags & py::array::c_style) != 0);
-    bool is_src_f_contig = ((src_flags & py::array::f_style) != 0);
-
-    shT src_strides_in_elems;
-    if (src_strides_p) {
-        src_strides_in_elems.resize(nd);
-        // copy and convert strides from bytes to elements
-        std::transform(
-            src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems),
-            [src_itemsize](py::ssize_t el) {
-                py::ssize_t q = el / src_itemsize;
-                if (q * src_itemsize != el) {
-                    throw std::runtime_error(
-                        "NumPy array strides are not multiple of itemsize");
-                }
-                return q;
-            });
-    }
-    else {
-        if (is_src_c_contig) {
-            src_strides_in_elems =
-                dpctl::tensor::c_contiguous_strides(nd, src_shape);
-        }
-        else if (is_src_f_contig) {
-            src_strides_in_elems =
-                dpctl::tensor::f_contiguous_strides(nd, src_shape);
-        }
-        else {
-            throw py::value_error("NumPy source array has null strides but is "
-                                  "neither C- nor F-contiguous.");
-        }
-    }
-
-    // nd, simplified_* vectors and offsets are modified by reference
-    simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides,
-                             // outputs
-                             simplified_shape, simplified_src_strides,
-                             simplified_dst_strides, src_offset, dst_offset);
-
-    assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-    assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
-    assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-
-    // handle nd == 0
-    if (nd == 0) {
-        nd = 1;
-        simplified_shape.reserve(nd);
-        simplified_shape.push_back(1);
-
-        simplified_src_strides.reserve(nd);
-        simplified_src_strides.push_back(1);
-
-        simplified_dst_strides.reserve(nd);
-        simplified_dst_strides.push_back(1);
-    }
-
-    const bool is_contig_vector =
-        ((nd == 1) && (simplified_src_strides.front() == 1) &&
-         (simplified_dst_strides.front() == 1));
-
-    const bool can_use_memcpy = (same_data_types && is_contig_vector &&
-                                 (src_offset == 0) && (dst_offset == 0));
-
-    if (can_use_memcpy) {
-        int src_elem_size = npy_src.itemsize();
-
-        sycl::event copy_ev = exec_q.memcpy(
-            static_cast<void *>(dst_data), static_cast<const void *>(src_data),
-            src_nelems * src_elem_size, depends);
-
-        {
-            // wait for copy_ev to complete
-            // release GIL to allow other threads (host_tasks)
-            // a chance to acquire GIL
-            py::gil_scoped_release lock{};
-
-            copy_ev.wait();
-        }
-
-        return;
-    }
-
-    // Minimum and maximum element offsets for source np.ndarray
-    py::ssize_t npy_src_min_nelem_offset(src_offset);
-    py::ssize_t npy_src_max_nelem_offset(src_offset);
-    for (int i = 0; i < nd; ++i) {
-        if (simplified_src_strides[i] < 0) {
-            npy_src_min_nelem_offset +=
-                simplified_src_strides[i] * (simplified_shape[i] - 1);
-        }
-        else {
-            npy_src_max_nelem_offset +=
-                simplified_src_strides[i] * (simplified_shape[i] - 1);
-        }
-    }
-
-    if (is_contig_vector) {
-        // release GIL for the blocking call
-        py::gil_scoped_release lock{};
-
-        auto copy_and_cast_from_host_contig_blocking_fn =
-            copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id]
-                                                                  [src_type_id];
-
-        copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data,
-                                                   src_offset, dst_data,
-                                                   dst_offset, depends);
-
-        return;
-    }
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(1);
-
-    // Copy shape strides into device memory
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, simplified_shape, simplified_src_strides,
-        simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    {
-        // release GIL for the blocking call
-        py::gil_scoped_release lock{};
-
-        // Get implementation function pointer
-        auto copy_and_cast_from_host_blocking_fn =
-            copy_and_cast_from_host_blocking_dispatch_table[dst_type_id]
-                                                           [src_type_id];
-
-        copy_and_cast_from_host_blocking_fn(
-            exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
-            npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data,
-            dst_offset, depends, {copy_shape_ev});
-
-        // invoke USM deleter in smart pointer while GIL is held
-        shape_strides_owner.reset(nullptr);
-    }
-
-    return;
-}
-
-void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory;
-
-    DispatchTableBuilder<copy_and_cast_from_host_blocking_fn_ptr_t,
-                         CopyAndCastFromHostFactory, num_types>
-        dtb_copy_from_numpy;
-
-    dtb_copy_from_numpy.populate_dispatch_table(
-        copy_and_cast_from_host_blocking_dispatch_table);
-
-    using dpctl::tensor::kernels::copy_and_cast::
-        CopyAndCastFromHostContigFactory;
-
-    DispatchTableBuilder<copy_and_cast_from_host_contig_blocking_fn_ptr_t,
-                         CopyAndCastFromHostContigFactory, num_types>
-        dtb_copy_from_numpy_contig;
-
-    dtb_copy_from_numpy_contig.populate_dispatch_table(
-        copy_and_cast_from_host_contig_blocking_dispatch_table);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
deleted file mode 100644
index fed827a803..0000000000
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void copy_numpy_ndarray_into_usm_ndarray(
-    const py::array &npy_src,
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends = {});
-
-extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp
deleted file mode 100644
index 1a76f0b30b..0000000000
--- a/dpctl/tensor/libtensor/source/device_support_queries.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <string>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace
-{
-
-std::string _default_device_fp_type(const sycl::device &d)
-{
-    if (d.has(sycl::aspect::fp64)) {
-        return "f8";
-    }
-    else {
-        return "f4";
-    }
-}
-
-int get_numpy_major_version()
-{
-    namespace py = pybind11;
-
-    py::module_ numpy = py::module_::import("numpy");
-    py::str version_string = numpy.attr("__version__");
-    py::module_ numpy_lib = py::module_::import("numpy.lib");
-
-    py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string);
-    int major_version = numpy_version.attr("major").cast<int>();
-
-    return major_version;
-}
-
-std::string _default_device_int_type(const sycl::device &)
-{
-    const int np_ver = get_numpy_major_version();
-
-    if (np_ver >= 2) {
-        return "i8";
-    }
-    else {
-        // code for numpy.dtype('long') to be consistent
-        // with NumPy's default integer type across
-        // platforms.
-        return "l";
-    }
-}
-
-std::string _default_device_uint_type(const sycl::device &)
-{
-    const int np_ver = get_numpy_major_version();
-
-    if (np_ver >= 2) {
-        return "u8";
-    }
-    else {
-        // code for numpy.dtype('long') to be consistent
-        // with NumPy's default integer type across
-        // platforms.
-        return "L";
-    }
-}
-
-std::string _default_device_complex_type(const sycl::device &d)
-{
-    if (d.has(sycl::aspect::fp64)) {
-        return "c16";
-    }
-    else {
-        return "c8";
-    }
-}
-
-std::string _default_device_bool_type(const sycl::device &) { return "b1"; }
-
-std::string _default_device_index_type(const sycl::device &) { return "i8"; }
-
-sycl::device _extract_device(const py::object &arg)
-{
-    auto const &api = dpctl::detail::dpctl_capi::get();
-
-    PyObject *source = arg.ptr();
-    if (api.PySyclQueue_Check_(source)) {
-        const sycl::queue &q = py::cast<sycl::queue>(arg);
-        return q.get_device();
-    }
-    else if (api.PySyclDevice_Check_(source)) {
-        return py::cast<sycl::device>(arg);
-    }
-    else {
-        throw py::type_error(
-            "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`.");
-    }
-}
-
-} // namespace
-
-std::string default_device_fp_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_fp_type(d);
-}
-
-std::string default_device_int_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_int_type(d);
-}
-
-std::string default_device_uint_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_uint_type(d);
-}
-
-std::string default_device_bool_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_bool_type(d);
-}
-
-std::string default_device_complex_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_complex_type(d);
-}
-
-std::string default_device_index_type(const py::object &arg)
-{
-    const sycl::device &d = _extract_device(arg);
-    return _default_device_index_type(d);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/device_support_queries.hpp b/dpctl/tensor/libtensor/source/device_support_queries.hpp
deleted file mode 100644
index 84bf368f56..0000000000
--- a/dpctl/tensor/libtensor/source/device_support_queries.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <string>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::string default_device_fp_type(const py::object &);
-extern std::string default_device_int_type(const py::object &);
-extern std::string default_device_uint_type(const py::object &);
-extern std::string default_device_bool_type(const py::object &);
-extern std::string default_device_complex_type(const py::object &);
-extern std::string default_device_index_type(const py::object &);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp
deleted file mode 100644
index 0696a19cc3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "abs.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/abs.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U01: ==== ABS   (x)
-namespace impl
-{
-
-namespace abs_fn_ns = dpctl::tensor::kernels::abs;
-
-static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
-static int abs_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    abs_strided_dispatch_vector[td_ns::num_types];
-
-void populate_abs_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = abs_fn_ns;
-
-    using fn_ns::AbsContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
-
-    using fn_ns::AbsStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
-
-    using fn_ns::AbsTypeMapFactory;
-    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_abs(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_abs_dispatch_vectors();
-        using impl::abs_contig_dispatch_vector;
-        using impl::abs_output_typeid_vector;
-        using impl::abs_strided_dispatch_vector;
-
-        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, abs_output_typeid_vector,
-                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
-        };
-        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
-        };
-        m.def("_abs_result_type", abs_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
deleted file mode 100644
index 0dde150f2c..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_abs(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp
deleted file mode 100644
index a36d324ac3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "acos.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/acos.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U02: ==== ACOS   (x)
-namespace impl
-{
-
-namespace acos_fn_ns = dpctl::tensor::kernels::acos;
-
-static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types];
-static int acos_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    acos_strided_dispatch_vector[td_ns::num_types];
-
-void populate_acos_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = acos_fn_ns;
-
-    using fn_ns::AcosContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcosContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(acos_contig_dispatch_vector);
-
-    using fn_ns::AcosStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcosStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(acos_strided_dispatch_vector);
-
-    using fn_ns::AcosTypeMapFactory;
-    DispatchVectorBuilder<int, AcosTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(acos_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_acos(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_acos_dispatch_vectors();
-        using impl::acos_contig_dispatch_vector;
-        using impl::acos_output_typeid_vector;
-        using impl::acos_strided_dispatch_vector;
-
-        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acos_output_typeid_vector,
-                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
-        };
-        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
-        };
-        m.def("_acos_result_type", acos_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp
deleted file mode 100644
index f509a09667..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_acos(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp
deleted file mode 100644
index 9fa4034e72..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "acosh.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/acosh.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U03: ==== ACOSH   (x)
-namespace impl
-{
-
-namespace acosh_fn_ns = dpctl::tensor::kernels::acosh;
-
-static unary_contig_impl_fn_ptr_t
-    acosh_contig_dispatch_vector[td_ns::num_types];
-static int acosh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    acosh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_acosh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = acosh_fn_ns;
-
-    using fn_ns::AcoshContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcoshContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector);
-
-    using fn_ns::AcoshStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcoshStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector);
-
-    using fn_ns::AcoshTypeMapFactory;
-    DispatchVectorBuilder<int, AcoshTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(acosh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_acosh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_acosh_dispatch_vectors();
-        using impl::acosh_contig_dispatch_vector;
-        using impl::acosh_output_typeid_vector;
-        using impl::acosh_strided_dispatch_vector;
-
-        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acosh_output_typeid_vector,
-                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
-        };
-        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              acosh_output_typeid_vector);
-        };
-        m.def("_acosh_result_type", acosh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp
deleted file mode 100644
index 6f4ff05cee..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_acosh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
deleted file mode 100644
index 31a0b7f053..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/elementwise_functions/add.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "add.hpp"
-#include "elementwise_functions.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B01: ===== ADD (x1, x2)
-namespace impl
-{
-
-namespace add_fn_ns = dpctl::tensor::kernels::add;
-
-static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types]
-                                                            [td_ns::num_types];
-
-static int add_output_id_table[td_ns::num_types][td_ns::num_types];
-static int add_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    add_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// add(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types]
-                                                         [td_ns::num_types];
-
-// add(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types]
-                                                         [td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_add_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = add_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::AddTypeMapFactory;
-    DispatchTableBuilder<int, AddTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(add_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::AddStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, AddStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(add_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::AddContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, AddContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(add_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::AddContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        AddContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        add_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::AddContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        AddContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        add_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::AddInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         AddInplaceStridedFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::AddInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         AddInplaceContigFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::AddInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         AddInplaceRowMatrixBroadcastFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::AddInplaceTypeMapFactory;
-    DispatchTableBuilder<int, AddInplaceTypeMapFactory, num_types> dtb9;
-    dtb9.populate_dispatch_table(add_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_add(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_add_dispatch_tables();
-        using impl::add_contig_dispatch_table;
-        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::add_output_id_table;
-        using impl::add_strided_dispatch_table;
-
-        auto add_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, add_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                add_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                add_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               add_output_id_table);
-        };
-        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_add_result_type", add_result_type_pyapi, "");
-
-        using impl::add_inplace_contig_dispatch_table;
-        using impl::add_inplace_output_id_table;
-        using impl::add_inplace_row_matrix_dispatch_table;
-        using impl::add_inplace_strided_dispatch_table;
-
-        auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                     sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, add_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                add_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                add_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                add_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp
deleted file mode 100644
index 5753117ff6..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_add(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp
deleted file mode 100644
index 2012ec93af..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "angle.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/angle.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U43: ==== ANGLE   (x)
-namespace impl
-{
-
-namespace angle_fn_ns = dpctl::tensor::kernels::angle;
-
-static unary_contig_impl_fn_ptr_t
-    angle_contig_dispatch_vector[td_ns::num_types];
-static int angle_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    angle_strided_dispatch_vector[td_ns::num_types];
-
-void populate_angle_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = angle_fn_ns;
-
-    using fn_ns::AngleContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AngleContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(angle_contig_dispatch_vector);
-
-    using fn_ns::AngleStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AngleStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(angle_strided_dispatch_vector);
-
-    using fn_ns::AngleTypeMapFactory;
-    DispatchVectorBuilder<int, AngleTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(angle_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_angle(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_angle_dispatch_vectors();
-        using impl::angle_contig_dispatch_vector;
-        using impl::angle_output_typeid_vector;
-        using impl::angle_strided_dispatch_vector;
-
-        auto angle_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, angle_output_typeid_vector,
-                angle_contig_dispatch_vector, angle_strided_dispatch_vector);
-        };
-        m.def("_angle", angle_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto angle_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              angle_output_typeid_vector);
-        };
-        m.def("_angle_result_type", angle_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/angle.hpp
deleted file mode 100644
index b7ed14a75d..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/angle.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_angle(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp
deleted file mode 100644
index b475e0346a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "asin.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/asin.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U04: ==== ASIN   (x)
-namespace impl
-{
-
-namespace asin_fn_ns = dpctl::tensor::kernels::asin;
-
-static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types];
-static int asin_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    asin_strided_dispatch_vector[td_ns::num_types];
-
-void populate_asin_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = asin_fn_ns;
-
-    using fn_ns::AsinContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(asin_contig_dispatch_vector);
-
-    using fn_ns::AsinStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(asin_strided_dispatch_vector);
-
-    using fn_ns::AsinTypeMapFactory;
-    DispatchVectorBuilder<int, AsinTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(asin_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_asin(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_asin_dispatch_vectors();
-        using impl::asin_contig_dispatch_vector;
-        using impl::asin_output_typeid_vector;
-        using impl::asin_strided_dispatch_vector;
-
-        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asin_output_typeid_vector,
-                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
-        };
-        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
-        };
-        m.def("_asin_result_type", asin_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp
deleted file mode 100644
index a78e10f936..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_asin(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp
deleted file mode 100644
index d189449f8d..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "asinh.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/asinh.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U05: ==== ASINH   (x)
-namespace impl
-{
-
-namespace asinh_fn_ns = dpctl::tensor::kernels::asinh;
-
-static unary_contig_impl_fn_ptr_t
-    asinh_contig_dispatch_vector[td_ns::num_types];
-static int asinh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    asinh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_asinh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = asinh_fn_ns;
-
-    using fn_ns::AsinhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector);
-
-    using fn_ns::AsinhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector);
-
-    using fn_ns::AsinhTypeMapFactory;
-    DispatchVectorBuilder<int, AsinhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(asinh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_asinh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_asinh_dispatch_vectors();
-        using impl::asinh_contig_dispatch_vector;
-        using impl::asinh_output_typeid_vector;
-        using impl::asinh_strided_dispatch_vector;
-
-        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asinh_output_typeid_vector,
-                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
-        };
-        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              asinh_output_typeid_vector);
-        };
-        m.def("_asinh_result_type", asinh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp
deleted file mode 100644
index 7c906bf61a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_asinh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp
deleted file mode 100644
index 2252276d31..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "atan.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/atan.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U06: ==== ATAN   (x)
-namespace impl
-{
-
-namespace atan_fn_ns = dpctl::tensor::kernels::atan;
-
-static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types];
-static int atan_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    atan_strided_dispatch_vector[td_ns::num_types];
-
-void populate_atan_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = atan_fn_ns;
-
-    using fn_ns::AtanContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(atan_contig_dispatch_vector);
-
-    using fn_ns::AtanStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(atan_strided_dispatch_vector);
-
-    using fn_ns::AtanTypeMapFactory;
-    DispatchVectorBuilder<int, AtanTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(atan_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_atan(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_atan_dispatch_vectors();
-        using impl::atan_contig_dispatch_vector;
-        using impl::atan_output_typeid_vector;
-        using impl::atan_strided_dispatch_vector;
-
-        auto atan_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atan_output_typeid_vector,
-                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
-        };
-        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
-        };
-        m.def("_atan_result_type", atan_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp
deleted file mode 100644
index 29ed49f7ae..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_atan(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp
deleted file mode 100644
index 3cc66735cf..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "atan2.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/atan2.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B02: ===== ATAN2 (x1, x2)
-namespace impl
-{
-namespace atan2_fn_ns = dpctl::tensor::kernels::atan2;
-
-static binary_contig_impl_fn_ptr_t
-    atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int atan2_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_atan2_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = atan2_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::Atan2TypeMapFactory;
-    DispatchTableBuilder<int, Atan2TypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(atan2_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::Atan2StridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, Atan2StridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(atan2_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::Atan2ContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, Atan2ContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(atan2_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_atan2(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_atan2_dispatch_tables();
-        using impl::atan2_contig_dispatch_table;
-        using impl::atan2_output_id_table;
-        using impl::atan2_strided_dispatch_table;
-
-        auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                               const arrayT &dst, sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, atan2_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                atan2_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                atan2_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               atan2_output_id_table);
-        };
-        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp
deleted file mode 100644
index 010c42c2e2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_atan2(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp
deleted file mode 100644
index f3f4ab071a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "atanh.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/atanh.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U07: ==== ATANH   (x)
-namespace impl
-{
-
-namespace atanh_fn_ns = dpctl::tensor::kernels::atanh;
-
-static unary_contig_impl_fn_ptr_t
-    atanh_contig_dispatch_vector[td_ns::num_types];
-static int atanh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    atanh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_atanh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = atanh_fn_ns;
-
-    using fn_ns::AtanhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector);
-
-    using fn_ns::AtanhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector);
-
-    using fn_ns::AtanhTypeMapFactory;
-    DispatchVectorBuilder<int, AtanhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(atanh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_atanh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_atanh_dispatch_vectors();
-        using impl::atanh_contig_dispatch_vector;
-        using impl::atanh_output_typeid_vector;
-        using impl::atanh_strided_dispatch_vector;
-
-        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atanh_output_typeid_vector,
-                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
-        };
-        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              atanh_output_typeid_vector);
-        };
-        m.def("_atanh_result_type", atanh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp
deleted file mode 100644
index 803ea555ff..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_atanh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
deleted file mode 100644
index a4965a4db3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_and.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_and.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B03: ===== BITWISE_AND (x1, x2)
-namespace impl
-{
-namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
-static int bitwise_and_inplace_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    bitwise_and_inplace_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    bitwise_and_inplace_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_bitwise_and_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_and_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseAndTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseAndStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseAndContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::BitwiseAndInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         BitwiseAndInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::BitwiseAndInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         BitwiseAndInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::BitwiseAndInplaceTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseAndInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(bitwise_and_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_bitwise_and(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_and_dispatch_tables();
-        using impl::bitwise_and_contig_dispatch_table;
-        using impl::bitwise_and_output_id_table;
-        using impl::bitwise_and_strided_dispatch_table;
-
-        auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                     const arrayT &dst, sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_and_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_and_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_and_output_id_table);
-        };
-        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
-
-        using impl::bitwise_and_inplace_contig_dispatch_table;
-        using impl::bitwise_and_inplace_output_id_table;
-        using impl::bitwise_and_inplace_strided_dispatch_table;
-
-        auto bitwise_and_inplace_pyapi = [&](const arrayT &src,
-                                             const arrayT &dst,
-                                             sycl::queue &exec_q,
-                                             const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, bitwise_and_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                bitwise_and_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                bitwise_and_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "",
-              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
deleted file mode 100644
index 6784538d19..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_and(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
deleted file mode 100644
index d33204ca34..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_invert.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_invert.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U08: ===== BITWISE_INVERT        (x)
-namespace impl
-{
-
-namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
-
-static unary_contig_impl_fn_ptr_t
-    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
-static int bitwise_invert_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
-
-void populate_bitwise_invert_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_invert_fn_ns;
-
-    using fn_ns::BitwiseInvertContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
-                          BitwiseInvertContigFactory, num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
-
-    using fn_ns::BitwiseInvertStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
-                          BitwiseInvertStridedFactory, num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
-
-    using fn_ns::BitwiseInvertTypeMapFactory;
-    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_bitwise_invert(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_invert_dispatch_vectors();
-        using impl::bitwise_invert_contig_dispatch_vector;
-        using impl::bitwise_invert_output_typeid_vector;
-        using impl::bitwise_invert_strided_dispatch_vector;
-
-        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                        sycl::queue &exec_q,
-                                        const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  bitwise_invert_output_typeid_vector,
-                                  bitwise_invert_contig_dispatch_vector,
-                                  bitwise_invert_strided_dispatch_vector);
-        };
-        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-
-        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(
-                dtype, bitwise_invert_output_typeid_vector);
-        };
-        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
deleted file mode 100644
index f7f6927440..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_invert(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
deleted file mode 100644
index e46bb5ac91..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_left_shift.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
-namespace impl
-{
-namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
-                                            [td_ns::num_types];
-
-static int bitwise_left_shift_output_id_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static int bitwise_left_shift_inplace_output_id_table[td_ns::num_types]
-                                                     [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types]
-                                                     [td_ns::num_types];
-
-void populate_bitwise_left_shift_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_left_shift_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseLeftShiftTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseLeftShiftStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         BitwiseLeftShiftStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseLeftShiftContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         BitwiseLeftShiftContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::BitwiseLeftShiftInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         BitwiseLeftShiftInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        bitwise_left_shift_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::BitwiseLeftShiftInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         BitwiseLeftShiftInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        bitwise_left_shift_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::BitwiseLeftShiftInplaceTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseLeftShiftInplaceTypeMapFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(bitwise_left_shift_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_bitwise_left_shift(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_left_shift_dispatch_tables();
-        using impl::bitwise_left_shift_contig_dispatch_table;
-        using impl::bitwise_left_shift_output_id_table;
-        using impl::bitwise_left_shift_strided_dispatch_table;
-
-        auto bitwise_left_shift_pyapi = [&](const arrayT &src1,
-                                            const arrayT &src2,
-                                            const arrayT &dst,
-                                            sycl::queue &exec_q,
-                                            const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends,
-                bitwise_left_shift_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_left_shift_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_left_shift_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_left_shift_result_type_pyapi =
-            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-                return py_binary_ufunc_result_type(
-                    dtype1, dtype2, bitwise_left_shift_output_id_table);
-            };
-        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
-              py::arg("src1"), py::arg("src2"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_left_shift_result_type",
-              bitwise_left_shift_result_type_pyapi, "");
-
-        using impl::bitwise_left_shift_inplace_contig_dispatch_table;
-        using impl::bitwise_left_shift_inplace_output_id_table;
-        using impl::bitwise_left_shift_inplace_strided_dispatch_table;
-
-        auto bitwise_left_shift_inplace_pyapi =
-            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
-                const event_vecT &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends,
-                    bitwise_left_shift_inplace_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    bitwise_left_shift_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    bitwise_left_shift_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    td_ns::NullPtrTable<
-                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-            };
-        m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi,
-              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
deleted file mode 100644
index 665b55a3e9..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_left_shift(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
deleted file mode 100644
index b85f1f2b54..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_or.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_or.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B05: ===== BITWISE_OR (x1, x2)
-namespace impl
-{
-namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types];
-static int bitwise_or_inplace_output_id_table[td_ns::num_types]
-                                             [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    bitwise_or_inplace_contig_dispatch_table[td_ns::num_types]
-                                            [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    bitwise_or_inplace_strided_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-
-void populate_bitwise_or_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_or_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseOrTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_or_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseOrStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseOrContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::BitwiseOrInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         BitwiseOrInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::BitwiseOrInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         BitwiseOrInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::BitwiseOrInplaceTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseOrInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(bitwise_or_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_bitwise_or(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_or_dispatch_tables();
-        using impl::bitwise_or_contig_dispatch_table;
-        using impl::bitwise_or_output_id_table;
-        using impl::bitwise_or_strided_dispatch_table;
-
-        auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                    const arrayT &dst, sycl::queue &exec_q,
-                                    const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_or_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_or_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_or_output_id_table);
-        };
-        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
-
-        using impl::bitwise_or_inplace_contig_dispatch_table;
-        using impl::bitwise_or_inplace_output_id_table;
-        using impl::bitwise_or_inplace_strided_dispatch_table;
-
-        auto bitwise_or_inplace_pyapi = [&](const arrayT &src,
-                                            const arrayT &dst,
-                                            sycl::queue &exec_q,
-                                            const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, bitwise_or_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                bitwise_or_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                bitwise_or_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "",
-              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
deleted file mode 100644
index 774f5d5d51..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_or(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
deleted file mode 100644
index ebef7e5e1b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_right_shift.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_right_shift.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
-namespace impl
-{
-namespace bitwise_right_shift_fn_ns =
-    dpctl::tensor::kernels::bitwise_right_shift;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_right_shift_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-
-static int bitwise_right_shift_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static int bitwise_right_shift_inplace_output_id_table[td_ns::num_types]
-                                                      [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_right_shift_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types]
-                                                     [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types]
-                                                      [td_ns::num_types];
-
-void populate_bitwise_right_shift_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_right_shift_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseRightShiftTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseRightShiftStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         BitwiseRightShiftStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseRightShiftContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         BitwiseRightShiftContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::BitwiseRightShiftInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         BitwiseRightShiftInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        bitwise_right_shift_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::BitwiseRightShiftInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         BitwiseRightShiftInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        bitwise_right_shift_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::BitwiseRightShiftInplaceTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseRightShiftInplaceTypeMapFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(bitwise_right_shift_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_bitwise_right_shift(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_right_shift_dispatch_tables();
-        using impl::bitwise_right_shift_contig_dispatch_table;
-        using impl::bitwise_right_shift_output_id_table;
-        using impl::bitwise_right_shift_strided_dispatch_table;
-
-        auto bitwise_right_shift_pyapi = [&](const arrayT &src1,
-                                             const arrayT &src2,
-                                             const arrayT &dst,
-                                             sycl::queue &exec_q,
-                                             const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends,
-                bitwise_right_shift_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_right_shift_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_right_shift_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_right_shift_result_type_pyapi =
-            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-                return py_binary_ufunc_result_type(
-                    dtype1, dtype2, bitwise_right_shift_output_id_table);
-            };
-        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "",
-              py::arg("src1"), py::arg("src2"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_right_shift_result_type",
-              bitwise_right_shift_result_type_pyapi, "");
-
-        using impl::bitwise_right_shift_inplace_contig_dispatch_table;
-        using impl::bitwise_right_shift_inplace_output_id_table;
-        using impl::bitwise_right_shift_inplace_strided_dispatch_table;
-
-        auto bitwise_right_shift_inplace_pyapi =
-            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
-                const event_vecT &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends,
-                    bitwise_right_shift_inplace_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    bitwise_right_shift_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    bitwise_right_shift_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    td_ns::NullPtrTable<
-                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-            };
-        m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi,
-              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
deleted file mode 100644
index 21286f044c..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_right_shift(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
deleted file mode 100644
index b133d22758..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "bitwise_xor.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/bitwise_xor.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B07: ===== BITWISE_XOR (x1, x2)
-namespace impl
-{
-namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types];
-static int bitwise_xor_inplace_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_bitwise_xor_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_xor_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseXorTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_xor_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseXorStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseXorStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseXorContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseXorContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::BitwiseXorInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         BitwiseXorInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::BitwiseXorInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         BitwiseXorInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::BitwiseXorInplaceTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseXorInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(bitwise_xor_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_bitwise_xor(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_bitwise_xor_dispatch_tables();
-        using impl::bitwise_xor_contig_dispatch_table;
-        using impl::bitwise_xor_output_id_table;
-        using impl::bitwise_xor_strided_dispatch_table;
-
-        auto bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                     const arrayT &dst, sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_xor_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_xor_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_xor_output_id_table);
-        };
-        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
-
-        using impl::bitwise_xor_inplace_contig_dispatch_table;
-        using impl::bitwise_xor_inplace_output_id_table;
-        using impl::bitwise_xor_inplace_strided_dispatch_table;
-
-        auto bitwise_xor_inplace_pyapi = [&](const arrayT &src,
-                                             const arrayT &dst,
-                                             sycl::queue &exec_q,
-                                             const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, bitwise_xor_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                bitwise_xor_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                bitwise_xor_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "",
-              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
deleted file mode 100644
index 680284e879..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_bitwise_xor(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp
deleted file mode 100644
index 84ae322ea2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "cbrt.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/cbrt.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U37: ==== CBRT   (x)
-namespace impl
-{
-
-namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
-
-static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
-static int cbrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cbrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cbrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cbrt_fn_ns;
-
-    using fn_ns::CbrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
-
-    using fn_ns::CbrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
-
-    using fn_ns::CbrtTypeMapFactory;
-    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_cbrt(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_cbrt_dispatch_vectors();
-        using impl::cbrt_contig_dispatch_vector;
-        using impl::cbrt_output_typeid_vector;
-        using impl::cbrt_strided_dispatch_vector;
-
-        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, cbrt_output_typeid_vector,
-                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
-        };
-        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
-        };
-        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp
deleted file mode 100644
index 2269290f50..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cbrt(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp
deleted file mode 100644
index 9e925f39d1..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "ceil.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/ceil.hpp"
-#include "kernels/elementwise_functions/common.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U09: ==== CEIL   (x)
-namespace impl
-{
-
-namespace ceil_fn_ns = dpctl::tensor::kernels::ceil;
-
-static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types];
-static int ceil_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    ceil_strided_dispatch_vector[td_ns::num_types];
-
-void populate_ceil_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = ceil_fn_ns;
-
-    using fn_ns::CeilContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CeilContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector);
-
-    using fn_ns::CeilStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CeilStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector);
-
-    using fn_ns::CeilTypeMapFactory;
-    DispatchVectorBuilder<int, CeilTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(ceil_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_ceil(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_ceil_dispatch_vectors();
-        using impl::ceil_contig_dispatch_vector;
-        using impl::ceil_output_typeid_vector;
-        using impl::ceil_strided_dispatch_vector;
-
-        auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, ceil_output_typeid_vector,
-                ceil_contig_dispatch_vector, ceil_strided_dispatch_vector);
-        };
-        m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto ceil_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector);
-        };
-        m.def("_ceil_result_type", ceil_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp
deleted file mode 100644
index 07407a5e3b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_ceil(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
deleted file mode 100644
index 80cb5e390d..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "conj.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/conj.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U10: ==== CONJ   (x)
-namespace impl
-{
-
-namespace conj_fn_ns = dpctl::tensor::kernels::conj;
-
-static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types];
-static int conj_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    conj_strided_dispatch_vector[td_ns::num_types];
-
-void populate_conj_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = conj_fn_ns;
-
-    using fn_ns::ConjContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ConjContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(conj_contig_dispatch_vector);
-
-    using fn_ns::ConjStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ConjStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(conj_strided_dispatch_vector);
-
-    using fn_ns::ConjTypeMapFactory;
-    DispatchVectorBuilder<int, ConjTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(conj_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_conj(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_conj_dispatch_vectors();
-        using impl::conj_contig_dispatch_vector;
-        using impl::conj_output_typeid_vector;
-        using impl::conj_strided_dispatch_vector;
-
-        auto conj_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, conj_output_typeid_vector,
-                conj_contig_dispatch_vector, conj_strided_dispatch_vector);
-        };
-        m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto conj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector);
-        };
-        m.def("_conj_result_type", conj_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp
deleted file mode 100644
index 3e77e9b40b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_conj(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp
deleted file mode 100644
index 48a66d339e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "copysign.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/copysign.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B25: ===== COPYSIGN (x1, x2)
-namespace impl
-{
-namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
-
-static binary_contig_impl_fn_ptr_t
-    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_copysign_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = copysign_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::CopysignTypeMapFactory;
-    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(copysign_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::CopysignStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::CopysignContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_copysign(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_copysign_dispatch_tables();
-        using impl::copysign_contig_dispatch_table;
-        using impl::copysign_output_id_table;
-        using impl::copysign_strided_dispatch_table;
-
-        auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                  const arrayT &dst, sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, copysign_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                copysign_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                copysign_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               copysign_output_id_table);
-        };
-        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp
deleted file mode 100644
index 6e2328a009..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_copysign(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp
deleted file mode 100644
index 4f9cfbd4c9..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "cos.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/cos.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U11: ==== COS   (x)
-namespace impl
-{
-
-namespace cos_fn_ns = dpctl::tensor::kernels::cos;
-
-static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types];
-static int cos_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cos_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cos_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cos_fn_ns;
-
-    using fn_ns::CosContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CosContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cos_contig_dispatch_vector);
-
-    using fn_ns::CosStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CosStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cos_strided_dispatch_vector);
-
-    using fn_ns::CosTypeMapFactory;
-    DispatchVectorBuilder<int, CosTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cos_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_cos(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_cos_dispatch_vectors();
-        using impl::cos_contig_dispatch_vector;
-        using impl::cos_output_typeid_vector;
-        using impl::cos_strided_dispatch_vector;
-
-        auto cos_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, cos_output_typeid_vector,
-                cos_contig_dispatch_vector, cos_strided_dispatch_vector);
-        };
-        m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector);
-        };
-        m.def("_cos_result_type", cos_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp
deleted file mode 100644
index 438bcd33e5..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cos(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp
deleted file mode 100644
index 2c41db31c4..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "cosh.hpp"
-#include "elementwise_functions.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/cosh.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U12: ==== COSH   (x)
-namespace impl
-{
-
-namespace cosh_fn_ns = dpctl::tensor::kernels::cosh;
-
-static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types];
-static int cosh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cosh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cosh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cosh_fn_ns;
-
-    using fn_ns::CoshContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CoshContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector);
-
-    using fn_ns::CoshStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CoshStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector);
-
-    using fn_ns::CoshTypeMapFactory;
-    DispatchVectorBuilder<int, CoshTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cosh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_cosh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_cosh_dispatch_vectors();
-        using impl::cosh_contig_dispatch_vector;
-        using impl::cosh_output_typeid_vector;
-        using impl::cosh_strided_dispatch_vector;
-
-        auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, cosh_output_typeid_vector,
-                cosh_contig_dispatch_vector, cosh_strided_dispatch_vector);
-        };
-        m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector);
-        };
-        m.def("_cosh_result_type", cosh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp
deleted file mode 100644
index b244a1649f..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_cosh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
deleted file mode 100644
index 21c3468325..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "abs.hpp"
-#include "acos.hpp"
-#include "acosh.hpp"
-#include "add.hpp"
-#include "angle.hpp"
-#include "asin.hpp"
-#include "asinh.hpp"
-#include "atan.hpp"
-#include "atan2.hpp"
-#include "atanh.hpp"
-#include "bitwise_and.hpp"
-#include "bitwise_invert.hpp"
-#include "bitwise_left_shift.hpp"
-#include "bitwise_or.hpp"
-#include "bitwise_right_shift.hpp"
-#include "bitwise_xor.hpp"
-#include "cbrt.hpp"
-#include "ceil.hpp"
-#include "conj.hpp"
-#include "copysign.hpp"
-#include "cos.hpp"
-#include "cosh.hpp"
-#include "equal.hpp"
-#include "exp.hpp"
-#include "exp2.hpp"
-#include "expm1.hpp"
-#include "floor.hpp"
-#include "floor_divide.hpp"
-#include "greater.hpp"
-#include "greater_equal.hpp"
-#include "hypot.hpp"
-#include "imag.hpp"
-#include "isfinite.hpp"
-#include "isinf.hpp"
-#include "isnan.hpp"
-#include "less.hpp"
-#include "less_equal.hpp"
-#include "log.hpp"
-#include "log10.hpp"
-#include "log1p.hpp"
-#include "log2.hpp"
-#include "logaddexp.hpp"
-#include "logical_and.hpp"
-#include "logical_not.hpp"
-#include "logical_or.hpp"
-#include "logical_xor.hpp"
-#include "maximum.hpp"
-#include "minimum.hpp"
-#include "multiply.hpp"
-#include "negative.hpp"
-#include "nextafter.hpp"
-#include "not_equal.hpp"
-#include "positive.hpp"
-#include "pow.hpp"
-#include "proj.hpp"
-#include "real.hpp"
-#include "reciprocal.hpp"
-#include "remainder.hpp"
-#include "round.hpp"
-#include "rsqrt.hpp"
-#include "sign.hpp"
-#include "signbit.hpp"
-#include "sin.hpp"
-#include "sinh.hpp"
-#include "sqrt.hpp"
-#include "square.hpp"
-#include "subtract.hpp"
-#include "tan.hpp"
-#include "tanh.hpp"
-#include "true_divide.hpp"
-#include "trunc.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-/*! @brief Add elementwise functions to Python module */
-void init_elementwise_functions(py::module_ m)
-{
-    init_abs(m);
-    init_acos(m);
-    init_acosh(m);
-    init_add(m);
-    init_angle(m);
-    init_asin(m);
-    init_asinh(m);
-    init_atan(m);
-    init_atan2(m);
-    init_atanh(m);
-    init_bitwise_and(m);
-    init_bitwise_invert(m);
-    init_bitwise_left_shift(m);
-    init_bitwise_or(m);
-    init_bitwise_right_shift(m);
-    init_bitwise_xor(m);
-    init_cbrt(m);
-    init_ceil(m);
-    init_conj(m);
-    init_copysign(m);
-    init_cos(m);
-    init_cosh(m);
-    init_divide(m);
-    init_equal(m);
-    init_exp(m);
-    init_exp2(m);
-    init_expm1(m);
-    init_floor(m);
-    init_floor_divide(m);
-    init_greater(m);
-    init_greater_equal(m);
-    init_hypot(m);
-    init_imag(m);
-    init_isfinite(m);
-    init_isinf(m);
-    init_isnan(m);
-    init_less(m);
-    init_less_equal(m);
-    init_log(m);
-    init_log10(m);
-    init_log1p(m);
-    init_log2(m);
-    init_logaddexp(m);
-    init_logical_and(m);
-    init_logical_not(m);
-    init_logical_or(m);
-    init_logical_xor(m);
-    init_maximum(m);
-    init_minimum(m);
-    init_multiply(m);
-    init_nextafter(m);
-    init_negative(m);
-    init_not_equal(m);
-    init_positive(m);
-    init_pow(m);
-    init_proj(m);
-    init_real(m);
-    init_reciprocal(m);
-    init_remainder(m);
-    init_round(m);
-    init_rsqrt(m);
-    init_sign(m);
-    init_signbit(m);
-    init_sin(m);
-    init_sinh(m);
-    init_sqrt(m);
-    init_square(m);
-    init_subtract(m);
-    init_tan(m);
-    init_tanh(m);
-    init_trunc(m);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
deleted file mode 100644
index 9a78a0eb6e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_elementwise_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
deleted file mode 100644
index 59c9c570c9..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp
+++ /dev/null
@@ -1,813 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-#pragma once
-
-#include <cstddef>
-#include <exception>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "elementwise_functions_type_utils.hpp"
-#include "kernels/alignment.hpp"
-#include "kernels/dpctl_tensor_types.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::tensor::kernels::alignment_utils::is_aligned;
-using dpctl::tensor::kernels::alignment_utils::required_alignment;
-
-/*! @brief Template implementing Python API for unary elementwise functions */
-template <typename output_typesT,
-          typename contig_dispatchT,
-          typename strided_dispatchT>
-std::pair<sycl::event, sycl::event>
-py_unary_ufunc(const dpctl::tensor::usm_ndarray &src,
-               const dpctl::tensor::usm_ndarray &dst,
-               sycl::queue &q,
-               const std::vector<sycl::event> &depends,
-               //
-               const output_typesT &output_type_vec,
-               const contig_dispatchT &contig_dispatch_vector,
-               const strided_dispatchT &strided_dispatch_vector)
-{
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    int func_output_typeid = output_type_vec[src_typeid];
-
-    // check that types are supported
-    if (dst_typeid != func_output_typeid) {
-        throw py::value_error(
-            "Destination array has unexpected elemental data type.");
-    }
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check that dimensions are the same
-    int src_nd = src.get_ndim();
-    if (src_nd != dst.get_ndim()) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    // check that shapes are the same
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-
-    for (int i = 0; i < src_nd; ++i) {
-        src_nelems *= static_cast<std::size_t>(src_shape[i]);
-        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    // if nelems is zero, return
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    // check memory overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    // handle contiguous inputs
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-
-    bool both_c_contig = (is_src_c_contig && is_dst_c_contig);
-    bool both_f_contig = (is_src_f_contig && is_dst_f_contig);
-
-    if (both_c_contig || both_f_contig) {
-        auto contig_fn = contig_dispatch_vector[src_typeid];
-
-        if (contig_fn == nullptr) {
-            throw std::runtime_error(
-                "Contiguous implementation is missing for src_typeid=" +
-                std::to_string(src_typeid));
-        }
-
-        auto comp_ev = contig_fn(q, src_nelems, src_data, dst_data, depends);
-        sycl::event ht_ev =
-            dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev});
-
-        return std::make_pair(ht_ev, comp_ev);
-    }
-
-    // simplify iteration space
-    //     if 1d with strides 1 - input is contig
-    //     dispatch to strided
-
-    auto const &src_strides = src.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd;
-    const py::ssize_t *shape = src_shape;
-
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // output
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (nd == 1 && simplified_src_strides[0] == 1 &&
-        simplified_dst_strides[0] == 1)
-    {
-        // Special case of contiguous data
-        auto contig_fn = contig_dispatch_vector[src_typeid];
-
-        if (contig_fn == nullptr) {
-            throw std::runtime_error(
-                "Contiguous implementation is missing for src_typeid=" +
-                std::to_string(src_typeid));
-        }
-
-        int src_elem_size = src.get_elemsize();
-        int dst_elem_size = dst.get_elemsize();
-        auto comp_ev =
-            contig_fn(q, src_nelems, src_data + src_elem_size * src_offset,
-                      dst_data + dst_elem_size * dst_offset, depends);
-
-        sycl::event ht_ev =
-            dpctl::utils::keep_args_alive(q, {src, dst}, {comp_ev});
-
-        return std::make_pair(ht_ev, comp_ev);
-    }
-
-    // Strided implementation
-    auto strided_fn = strided_dispatch_vector[src_typeid];
-
-    if (strided_fn == nullptr) {
-        throw std::runtime_error(
-            "Strided implementation is missing for src_typeid=" +
-            std::to_string(src_typeid));
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-
-    std::vector<sycl::event> host_tasks{};
-    host_tasks.reserve(2);
-
-    auto ptr_size_event_triple_ = device_allocate_and_pack<py::ssize_t>(
-        q, host_tasks, simplified_shape, simplified_src_strides,
-        simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_triple_));
-    const auto &copy_shape_ev = std::get<2>(ptr_size_event_triple_);
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    sycl::event strided_fn_ev =
-        strided_fn(q, src_nelems, nd, shape_strides, src_data, src_offset,
-                   dst_data, dst_offset, depends, {copy_shape_ev});
-
-    // async free of shape_strides temporary
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        q, {strided_fn_ev}, shape_strides_owner);
-
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(q, {src, dst}, host_tasks),
-        strided_fn_ev);
-}
-
-/*! @brief Template implementing Python API for querying of type support by
- *         unary elementwise functions */
-template <typename output_typesT>
-py::object py_unary_ufunc_result_type(const py::dtype &input_dtype,
-                                      const output_typesT &output_types)
-{
-    int tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int src_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        src_typeid = array_types.typenum_to_lookup_id(tn);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    using dpctl::tensor::py_internal::type_utils::_result_typeid;
-    int dst_typeid = _result_typeid(src_typeid, output_types);
-
-    if (dst_typeid < 0) {
-        auto res = py::none();
-        return py::cast<py::object>(res);
-    }
-    else {
-        using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum;
-
-        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
-        auto dt = _dtype_from_typenum(dst_typenum_t);
-
-        return py::cast<py::object>(dt);
-    }
-}
-
-// ======================== Binary functions ===========================
-
-namespace
-{
-template <class Container, class T>
-bool isEqual(Container const &c, std::initializer_list<T> const &l)
-{
-    return std::equal(std::begin(c), std::end(c), std::begin(l), std::end(l));
-}
-} // namespace
-
-/*! @brief Template implementing Python API for binary elementwise
- *         functions */
-template <typename output_typesT,
-          typename contig_dispatchT,
-          typename strided_dispatchT,
-          typename contig_matrix_row_dispatchT,
-          typename contig_row_matrix_dispatchT>
-std::pair<sycl::event, sycl::event> py_binary_ufunc(
-    const dpctl::tensor::usm_ndarray &src1,
-    const dpctl::tensor::usm_ndarray &src2,
-    const dpctl::tensor::usm_ndarray &dst, // dst = op(src1, src2), elementwise
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> depends,
-    //
-    const output_typesT &output_type_table,
-    const contig_dispatchT &contig_dispatch_table,
-    const strided_dispatchT &strided_dispatch_table,
-    const contig_matrix_row_dispatchT
-        &contig_matrix_row_broadcast_dispatch_table,
-    const contig_row_matrix_dispatchT
-        &contig_row_matrix_broadcast_dispatch_table)
-{
-    // check type_nums
-    int src1_typenum = src1.get_typenum();
-    int src2_typenum = src2.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int src1_typeid = array_types.typenum_to_lookup_id(src1_typenum);
-    int src2_typeid = array_types.typenum_to_lookup_id(src2_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    int output_typeid = output_type_table[src1_typeid][src2_typeid];
-
-    if (output_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array has unexpected elemental data type.");
-    }
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src1, src2, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check shapes, broadcasting is assumed done by caller
-    // check that dimensions are the same
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != src1.get_ndim() || dst_nd != src2.get_ndim()) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    // check that shapes are the same
-    const py::ssize_t *src1_shape = src1.get_shape_raw();
-    const py::ssize_t *src2_shape = src2.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-
-    for (int i = 0; i < dst_nd; ++i) {
-        src_nelems *= static_cast<std::size_t>(src1_shape[i]);
-        shapes_equal = shapes_equal && (src1_shape[i] == dst_shape[i] &&
-                                        src2_shape[i] == dst_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    // if nelems is zero, return
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    if ((overlap(src1, dst) && !same_logical_tensors(src1, dst)) ||
-        (overlap(src2, dst) && !same_logical_tensors(src2, dst)))
-    {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-    // check memory overlap
-    const char *src1_data = src1.get_data();
-    const char *src2_data = src2.get_data();
-    char *dst_data = dst.get_data();
-
-    // handle contiguous inputs
-    bool is_src1_c_contig = src1.is_c_contiguous();
-    bool is_src1_f_contig = src1.is_f_contiguous();
-
-    bool is_src2_c_contig = src2.is_c_contiguous();
-    bool is_src2_f_contig = src2.is_f_contiguous();
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-
-    bool all_c_contig =
-        (is_src1_c_contig && is_src2_c_contig && is_dst_c_contig);
-    bool all_f_contig =
-        (is_src1_f_contig && is_src2_f_contig && is_dst_f_contig);
-
-    // dispatch for contiguous inputs
-    if (all_c_contig || all_f_contig) {
-        auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
-
-        if (contig_fn != nullptr) {
-            auto comp_ev = contig_fn(exec_q, src_nelems, src1_data, 0,
-                                     src2_data, 0, dst_data, 0, depends);
-            sycl::event ht_ev = dpctl::utils::keep_args_alive(
-                exec_q, {src1, src2, dst}, {comp_ev});
-
-            return std::make_pair(ht_ev, comp_ev);
-        }
-    }
-
-    // simplify strides
-    auto const &src1_strides = src1.get_strides_vector();
-    auto const &src2_strides = src2.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src1_strides;
-    shT simplified_src2_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src1_offset(0);
-    py::ssize_t src2_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = dst_nd;
-    const py::ssize_t *shape = src1_shape;
-
-    dpctl::tensor::py_internal::simplify_iteration_space_3(
-        nd, shape, src1_strides, src2_strides, dst_strides,
-        // outputs
-        simplified_shape, simplified_src1_strides, simplified_src2_strides,
-        simplified_dst_strides, src1_offset, src2_offset, dst_offset);
-
-    std::vector<sycl::event> host_tasks{};
-    if (nd < 3) {
-        static constexpr auto unit_stride =
-            std::initializer_list<py::ssize_t>{1};
-
-        if ((nd == 1) && isEqual(simplified_src1_strides, unit_stride) &&
-            isEqual(simplified_src2_strides, unit_stride) &&
-            isEqual(simplified_dst_strides, unit_stride))
-        {
-            auto contig_fn = contig_dispatch_table[src1_typeid][src2_typeid];
-
-            if (contig_fn != nullptr) {
-                auto comp_ev = contig_fn(exec_q, src_nelems, src1_data,
-                                         src1_offset, src2_data, src2_offset,
-                                         dst_data, dst_offset, depends);
-                sycl::event ht_ev = dpctl::utils::keep_args_alive(
-                    exec_q, {src1, src2, dst}, {comp_ev});
-
-                return std::make_pair(ht_ev, comp_ev);
-            }
-        }
-        if (nd == 2) {
-            static constexpr auto zero_one_strides =
-                std::initializer_list<py::ssize_t>{0, 1};
-            static constexpr auto one_zero_strides =
-                std::initializer_list<py::ssize_t>{1, 0};
-            static constexpr py::ssize_t one{1};
-            // special case of C-contiguous matrix and a row
-            if (isEqual(simplified_src2_strides, zero_one_strides) &&
-                isEqual(simplified_src1_strides, {simplified_shape[1], one}) &&
-                isEqual(simplified_dst_strides, {simplified_shape[1], one}))
-            {
-                auto matrix_row_broadcast_fn =
-                    contig_matrix_row_broadcast_dispatch_table[src1_typeid]
-                                                              [src2_typeid];
-                if (matrix_row_broadcast_fn != nullptr) {
-                    int src1_itemsize = src1.get_elemsize();
-                    int src2_itemsize = src2.get_elemsize();
-                    int dst_itemsize = dst.get_elemsize();
-
-                    if (is_aligned<required_alignment>(
-                            src1_data + src1_offset * src1_itemsize) &&
-                        is_aligned<required_alignment>(
-                            src2_data + src2_offset * src2_itemsize) &&
-                        is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
-                        std::size_t n0 = simplified_shape[0];
-                        std::size_t n1 = simplified_shape[1];
-                        sycl::event comp_ev = matrix_row_broadcast_fn(
-                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
-                            src2_data, src2_offset, dst_data, dst_offset,
-                            depends);
-
-                        return std::make_pair(
-                            dpctl::utils::keep_args_alive(
-                                exec_q, {src1, src2, dst}, host_tasks),
-                            comp_ev);
-                    }
-                }
-            }
-            if (isEqual(simplified_src1_strides, one_zero_strides) &&
-                isEqual(simplified_src2_strides, {one, simplified_shape[0]}) &&
-                isEqual(simplified_dst_strides, {one, simplified_shape[0]}))
-            {
-                auto row_matrix_broadcast_fn =
-                    contig_row_matrix_broadcast_dispatch_table[src1_typeid]
-                                                              [src2_typeid];
-                if (row_matrix_broadcast_fn != nullptr) {
-
-                    int src1_itemsize = src1.get_elemsize();
-                    int src2_itemsize = src2.get_elemsize();
-                    int dst_itemsize = dst.get_elemsize();
-
-                    if (is_aligned<required_alignment>(
-                            src1_data + src1_offset * src1_itemsize) &&
-                        is_aligned<required_alignment>(
-                            src2_data + src2_offset * src2_itemsize) &&
-                        is_aligned<required_alignment>(
-                            dst_data + dst_offset * dst_itemsize))
-                    {
-                        std::size_t n0 = simplified_shape[1];
-                        std::size_t n1 = simplified_shape[0];
-                        sycl::event comp_ev = row_matrix_broadcast_fn(
-                            exec_q, host_tasks, n0, n1, src1_data, src1_offset,
-                            src2_data, src2_offset, dst_data, dst_offset,
-                            depends);
-
-                        return std::make_pair(
-                            dpctl::utils::keep_args_alive(
-                                exec_q, {src1, src2, dst}, host_tasks),
-                            comp_ev);
-                    }
-                }
-            }
-        }
-    }
-
-    // dispatch to strided code
-    auto strided_fn = strided_dispatch_table[src1_typeid][src2_typeid];
-
-    if (strided_fn == nullptr) {
-        throw std::runtime_error(
-            "Strided implementation is missing for src1_typeid=" +
-            std::to_string(src1_typeid) +
-            " and src2_typeid=" + std::to_string(src2_typeid));
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_tasks, simplified_shape, simplified_src1_strides,
-        simplified_src2_strides, simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
-    auto &copy_shape_ev = std::get<2>(ptr_sz_event_triple_);
-
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    sycl::event strided_fn_ev = strided_fn(
-        exec_q, src_nelems, nd, shape_strides, src1_data, src1_offset,
-        src2_data, src2_offset, dst_data, dst_offset, depends, {copy_shape_ev});
-
-    // async free of shape_strides temporary
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {strided_fn_ev}, shape_strides_owner);
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {src1, src2, dst}, host_tasks),
-        strided_fn_ev);
-}
-
-/*! @brief Type querying for binary elementwise functions */
-template <typename output_typesT>
-py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype,
-                                       const py::dtype &input2_dtype,
-                                       const output_typesT &output_types_table)
-{
-    int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int src1_typeid = -1;
-    int src2_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        src1_typeid = array_types.typenum_to_lookup_id(tn1);
-        src2_typeid = array_types.typenum_to_lookup_id(tn2);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
-        throw std::runtime_error("binary output type lookup failed");
-    }
-    int dst_typeid = output_types_table[src1_typeid][src2_typeid];
-
-    if (dst_typeid < 0) {
-        auto res = py::none();
-        return py::cast<py::object>(res);
-    }
-    else {
-        using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum;
-
-        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
-        auto dt = _dtype_from_typenum(dst_typenum_t);
-
-        return py::cast<py::object>(dt);
-    }
-}
-
-// ==================== Inplace binary functions =======================
-
-template <typename output_typesT,
-          typename contig_dispatchT,
-          typename strided_dispatchT,
-          typename contig_row_matrix_dispatchT>
-std::pair<sycl::event, sycl::event>
-py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs,
-                        const dpctl::tensor::usm_ndarray &rhs,
-                        sycl::queue &exec_q,
-                        const std::vector<sycl::event> depends,
-                        //
-                        const output_typesT &output_type_table,
-                        const contig_dispatchT &contig_dispatch_table,
-                        const strided_dispatchT &strided_dispatch_table,
-                        const contig_row_matrix_dispatchT
-                            &contig_row_matrix_broadcast_dispatch_table)
-{
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(lhs);
-
-    // check type_nums
-    int rhs_typenum = rhs.get_typenum();
-    int lhs_typenum = lhs.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum);
-    int lhs_typeid = array_types.typenum_to_lookup_id(lhs_typenum);
-
-    int output_typeid = output_type_table[rhs_typeid][lhs_typeid];
-
-    if (output_typeid != lhs_typeid) {
-        throw py::value_error(
-            "Left-hand side array has unexpected elemental data type.");
-    }
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {rhs, lhs})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    // check shapes, broadcasting is assumed done by caller
-    // check that dimensions are the same
-    int lhs_nd = lhs.get_ndim();
-    if (lhs_nd != rhs.get_ndim()) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    // check that shapes are the same
-    const py::ssize_t *rhs_shape = rhs.get_shape_raw();
-    const py::ssize_t *lhs_shape = lhs.get_shape_raw();
-    bool shapes_equal(true);
-    std::size_t rhs_nelems(1);
-
-    for (int i = 0; i < lhs_nd; ++i) {
-        rhs_nelems *= static_cast<std::size_t>(rhs_shape[i]);
-        shapes_equal = shapes_equal && (rhs_shape[i] == lhs_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    // if nelems is zero, return
-    if (rhs_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(lhs, rhs_nelems);
-
-    // check memory overlap
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(rhs, lhs) && !same_logical_tensors(rhs, lhs)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-    // check memory overlap
-    const char *rhs_data = rhs.get_data();
-    char *lhs_data = lhs.get_data();
-
-    // handle contiguous inputs
-    bool is_rhs_c_contig = rhs.is_c_contiguous();
-    bool is_rhs_f_contig = rhs.is_f_contiguous();
-
-    bool is_lhs_c_contig = lhs.is_c_contiguous();
-    bool is_lhs_f_contig = lhs.is_f_contiguous();
-
-    bool both_c_contig = (is_rhs_c_contig && is_lhs_c_contig);
-    bool both_f_contig = (is_rhs_f_contig && is_lhs_f_contig);
-
-    // dispatch for contiguous inputs
-    if (both_c_contig || both_f_contig) {
-        auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
-
-        if (contig_fn != nullptr) {
-            auto comp_ev = contig_fn(exec_q, rhs_nelems, rhs_data, 0, lhs_data,
-                                     0, depends);
-            sycl::event ht_ev =
-                dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, {comp_ev});
-
-            return std::make_pair(ht_ev, comp_ev);
-        }
-    }
-
-    // simplify strides
-    auto const &rhs_strides = rhs.get_strides_vector();
-    auto const &lhs_strides = lhs.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_rhs_strides;
-    shT simplified_lhs_strides;
-    py::ssize_t rhs_offset(0);
-    py::ssize_t lhs_offset(0);
-
-    int nd = lhs_nd;
-    const py::ssize_t *shape = rhs_shape;
-
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, rhs_strides, lhs_strides,
-        // outputs
-        simplified_shape, simplified_rhs_strides, simplified_lhs_strides,
-        rhs_offset, lhs_offset);
-
-    std::vector<sycl::event> host_tasks{};
-    if (nd < 3) {
-        static constexpr auto unit_stride =
-            std::initializer_list<py::ssize_t>{1};
-
-        if ((nd == 1) && isEqual(simplified_rhs_strides, unit_stride) &&
-            isEqual(simplified_lhs_strides, unit_stride))
-        {
-            auto contig_fn = contig_dispatch_table[rhs_typeid][lhs_typeid];
-
-            if (contig_fn != nullptr) {
-                auto comp_ev =
-                    contig_fn(exec_q, rhs_nelems, rhs_data, rhs_offset,
-                              lhs_data, lhs_offset, depends);
-                sycl::event ht_ev = dpctl::utils::keep_args_alive(
-                    exec_q, {rhs, lhs}, {comp_ev});
-
-                return std::make_pair(ht_ev, comp_ev);
-            }
-        }
-        if (nd == 2) {
-            static constexpr auto one_zero_strides =
-                std::initializer_list<py::ssize_t>{1, 0};
-            static constexpr py::ssize_t one{1};
-            // special case of C-contiguous matrix and a row
-            if (isEqual(simplified_rhs_strides, one_zero_strides) &&
-                isEqual(simplified_lhs_strides, {one, simplified_shape[0]}))
-            {
-                auto row_matrix_broadcast_fn =
-                    contig_row_matrix_broadcast_dispatch_table[rhs_typeid]
-                                                              [lhs_typeid];
-                if (row_matrix_broadcast_fn != nullptr) {
-                    std::size_t n0 = simplified_shape[1];
-                    std::size_t n1 = simplified_shape[0];
-                    sycl::event comp_ev = row_matrix_broadcast_fn(
-                        exec_q, host_tasks, n0, n1, rhs_data, rhs_offset,
-                        lhs_data, lhs_offset, depends);
-
-                    return std::make_pair(dpctl::utils::keep_args_alive(
-                                              exec_q, {lhs, rhs}, host_tasks),
-                                          comp_ev);
-                }
-            }
-        }
-    }
-
-    // dispatch to strided code
-    auto strided_fn = strided_dispatch_table[rhs_typeid][lhs_typeid];
-
-    if (strided_fn == nullptr) {
-        throw std::runtime_error(
-            "Strided implementation is missing for rhs_typeid=" +
-            std::to_string(rhs_typeid) +
-            " and lhs_typeid=" + std::to_string(lhs_typeid));
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_tasks, simplified_shape, simplified_rhs_strides,
-        simplified_lhs_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
-    auto copy_shape_ev = std::get<2>(ptr_sz_event_triple_);
-
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    sycl::event strided_fn_ev =
-        strided_fn(exec_q, rhs_nelems, nd, shape_strides, rhs_data, rhs_offset,
-                   lhs_data, lhs_offset, depends, {copy_shape_ev});
-
-    // async free of shape_strides temporary
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {strided_fn_ev}, shape_strides_owner);
-
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {rhs, lhs}, host_tasks),
-        strided_fn_ev);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
deleted file mode 100644
index fa024401f2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions for looking of supported types in elementwise
-/// functions.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <sycl/sycl.hpp>
-
-#include "elementwise_functions_type_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-namespace type_utils
-{
-
-py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t)
-{
-    switch (dst_typenum_t) {
-    case td_ns::typenum_t::BOOL:
-        return py::dtype("?");
-    case td_ns::typenum_t::INT8:
-        return py::dtype("i1");
-    case td_ns::typenum_t::UINT8:
-        return py::dtype("u1");
-    case td_ns::typenum_t::INT16:
-        return py::dtype("i2");
-    case td_ns::typenum_t::UINT16:
-        return py::dtype("u2");
-    case td_ns::typenum_t::INT32:
-        return py::dtype("i4");
-    case td_ns::typenum_t::UINT32:
-        return py::dtype("u4");
-    case td_ns::typenum_t::INT64:
-        return py::dtype("i8");
-    case td_ns::typenum_t::UINT64:
-        return py::dtype("u8");
-    case td_ns::typenum_t::HALF:
-        return py::dtype("f2");
-    case td_ns::typenum_t::FLOAT:
-        return py::dtype("f4");
-    case td_ns::typenum_t::DOUBLE:
-        return py::dtype("f8");
-    case td_ns::typenum_t::CFLOAT:
-        return py::dtype("c8");
-    case td_ns::typenum_t::CDOUBLE:
-        return py::dtype("c16");
-    default:
-        throw py::value_error("Unrecognized dst_typeid");
-    }
-}
-
-int _result_typeid(int arg_typeid, const int *fn_output_id)
-{
-    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) {
-        throw py::value_error("Input typeid " + std::to_string(arg_typeid) +
-                              " is outside of expected bounds.");
-    }
-
-    return fn_output_id[arg_typeid];
-}
-
-} // namespace type_utils
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
deleted file mode 100644
index c4d7a91ae8..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares functions for looking of supported types in elementwise
-/// functions.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <sycl/sycl.hpp>
-
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-namespace type_utils
-{
-
-/*! @brief Produce dtype from a type number */
-extern py::dtype _dtype_from_typenum(td_ns::typenum_t);
-
-/*! @brief Lookup typeid of the result from typeid of
- *         argument and the mapping table */
-extern int _result_typeid(int, const int *);
-
-} // namespace type_utils
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp
deleted file mode 100644
index f09d1b075f..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "equal.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/equal.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B09: ===== EQUAL (x1, x2)
-namespace impl
-{
-namespace equal_fn_ns = dpctl::tensor::kernels::equal;
-
-static binary_contig_impl_fn_ptr_t
-    equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::EqualTypeMapFactory;
-    DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::EqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::EqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(equal_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_equal(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_equal_dispatch_tables();
-        using impl::equal_contig_dispatch_table;
-        using impl::equal_output_id_table;
-        using impl::equal_strided_dispatch_table;
-
-        auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                               const arrayT &dst, sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               equal_output_id_table);
-        };
-        m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_equal_result_type", equal_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp
deleted file mode 100644
index 521b2df24b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_equal(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp
deleted file mode 100644
index 68ea0cc450..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "exp.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/exp.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U13: ==== EXP   (x)
-namespace impl
-{
-
-namespace exp_fn_ns = dpctl::tensor::kernels::exp;
-
-static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types];
-static int exp_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp_fn_ns;
-
-    using fn_ns::ExpContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
-
-    using fn_ns::ExpStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
-
-    using fn_ns::ExpTypeMapFactory;
-    DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_exp(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_exp_dispatch_vectors();
-        using impl::exp_contig_dispatch_vector;
-        using impl::exp_output_typeid_vector;
-        using impl::exp_strided_dispatch_vector;
-
-        auto exp_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, exp_output_typeid_vector,
-                exp_contig_dispatch_vector, exp_strided_dispatch_vector);
-        };
-        m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto exp_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector);
-        };
-        m.def("_exp_result_type", exp_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp
deleted file mode 100644
index fe98d14e8e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_exp(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp
deleted file mode 100644
index c32d293058..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "exp2.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/exp2.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U38: ==== EXP2   (x)
-namespace impl
-{
-
-namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
-
-static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
-static int exp2_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp2_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp2_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp2_fn_ns;
-
-    using fn_ns::Exp2ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
-
-    using fn_ns::Exp2StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
-
-    using fn_ns::Exp2TypeMapFactory;
-    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_exp2(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_exp2_dispatch_vectors();
-        using impl::exp2_contig_dispatch_vector;
-        using impl::exp2_output_typeid_vector;
-        using impl::exp2_strided_dispatch_vector;
-
-        auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, exp2_output_typeid_vector,
-                exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
-        };
-        m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
-        };
-        m.def("_exp2_result_type", exp2_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp
deleted file mode 100644
index 1750e65a07..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_exp2(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp
deleted file mode 100644
index da8709ad44..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "expm1.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/expm1.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U14: ==== EXPM1   (x)
-namespace impl
-{
-
-namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
-
-static unary_contig_impl_fn_ptr_t
-    expm1_contig_dispatch_vector[td_ns::num_types];
-static int expm1_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    expm1_strided_dispatch_vector[td_ns::num_types];
-
-void populate_expm1_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = expm1_fn_ns;
-
-    using fn_ns::Expm1ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
-
-    using fn_ns::Expm1StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
-
-    using fn_ns::Expm1TypeMapFactory;
-    DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_expm1(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_expm1_dispatch_vectors();
-        using impl::expm1_contig_dispatch_vector;
-        using impl::expm1_output_typeid_vector;
-        using impl::expm1_strided_dispatch_vector;
-
-        auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, expm1_output_typeid_vector,
-                expm1_contig_dispatch_vector, expm1_strided_dispatch_vector);
-        };
-        m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto expm1_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              expm1_output_typeid_vector);
-        };
-        m.def("_expm1_result_type", expm1_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
deleted file mode 100644
index 1849561c28..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_expm1(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
deleted file mode 100644
index 3569c38f66..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "floor.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/floor.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U15: ==== FLOOR   (x)
-namespace impl
-{
-
-namespace floor_fn_ns = dpctl::tensor::kernels::floor;
-
-static unary_contig_impl_fn_ptr_t
-    floor_contig_dispatch_vector[td_ns::num_types];
-static int floor_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    floor_strided_dispatch_vector[td_ns::num_types];
-
-void populate_floor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_fn_ns;
-
-    using fn_ns::FloorContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
-
-    using fn_ns::FloorStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
-
-    using fn_ns::FloorTypeMapFactory;
-    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_floor(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_floor_dispatch_vectors();
-        using impl::floor_contig_dispatch_vector;
-        using impl::floor_output_typeid_vector;
-        using impl::floor_strided_dispatch_vector;
-
-        auto floor_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, floor_output_typeid_vector,
-                floor_contig_dispatch_vector, floor_strided_dispatch_vector);
-        };
-        m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto floor_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              floor_output_typeid_vector);
-        };
-        m.def("_floor_result_type", floor_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
deleted file mode 100644
index ffac98b461..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_floor(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
deleted file mode 100644
index 9dacfe59d6..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "floor_divide.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/floor_divide.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B10: ===== FLOOR_DIVIDE (x1, x2)
-namespace impl
-{
-namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
-
-static binary_contig_impl_fn_ptr_t
-    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-static int floor_divide_inplace_output_id_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-void populate_floor_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::FloorDivideTypeMapFactory;
-    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(floor_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::FloorDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         FloorDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::FloorDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::FloorDivideInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         FloorDivideInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::FloorDivideInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         FloorDivideInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::FloorDivideInplaceTypeMapFactory;
-    DispatchTableBuilder<int, FloorDivideInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(floor_divide_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_floor_divide(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_floor_divide_dispatch_tables();
-        using impl::floor_divide_contig_dispatch_table;
-        using impl::floor_divide_output_id_table;
-        using impl::floor_divide_strided_dispatch_table;
-
-        auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                      const arrayT &dst, sycl::queue &exec_q,
-                                      const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, floor_divide_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                floor_divide_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                floor_divide_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                  const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               floor_divide_output_id_table);
-        };
-        m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, "");
-
-        using impl::floor_divide_inplace_contig_dispatch_table;
-        using impl::floor_divide_inplace_output_id_table;
-        using impl::floor_divide_inplace_strided_dispatch_table;
-
-        auto floor_divide_inplace_pyapi = [&](const arrayT &src,
-                                              const arrayT &dst,
-                                              sycl::queue &exec_q,
-                                              const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, floor_divide_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                floor_divide_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                floor_divide_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "",
-              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
deleted file mode 100644
index c7667b133a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_floor_divide(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
deleted file mode 100644
index abc2d3a3e3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "greater.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/greater.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B11: ===== GREATER (x1, x2)
-namespace impl
-{
-namespace greater_fn_ns = dpctl::tensor::kernels::greater;
-
-static binary_contig_impl_fn_ptr_t
-    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterTypeMapFactory;
-    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_greater(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_greater_dispatch_tables();
-        using impl::greater_contig_dispatch_table;
-        using impl::greater_output_id_table;
-        using impl::greater_strided_dispatch_table;
-
-        auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                 const arrayT &dst, sycl::queue &exec_q,
-                                 const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                greater_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                greater_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_result_type_pyapi = [&](const py::dtype &dtype1,
-                                             const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               greater_output_id_table);
-        };
-        m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_greater_result_type", greater_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
deleted file mode 100644
index 7fb0777da4..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_greater(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
deleted file mode 100644
index 56ba8a6cd4..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "greater_equal.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/greater_equal.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B12: ===== GREATER_EQUAL (x1, x2)
-namespace impl
-{
-namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
-
-static binary_contig_impl_fn_ptr_t
-    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterEqualTypeMapFactory;
-    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         GreaterEqualStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterEqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_greater_equal(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_greater_equal_dispatch_tables();
-        using impl::greater_equal_contig_dispatch_table;
-        using impl::greater_equal_output_id_table;
-        using impl::greater_equal_strided_dispatch_table;
-
-        auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                       const arrayT &dst, sycl::queue &exec_q,
-                                       const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, greater_equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                greater_equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                greater_equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                   const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               greater_equal_output_id_table);
-        };
-        m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_greater_equal_result_type", greater_equal_result_type_pyapi,
-              "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
deleted file mode 100644
index 17506f4cf2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_greater_equal(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
deleted file mode 100644
index ff31df7a57..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "hypot.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/hypot.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B24: ===== HYPOT (x1, x2)
-namespace impl
-{
-namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
-
-static binary_contig_impl_fn_ptr_t
-    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_hypot_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = hypot_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::HypotTypeMapFactory;
-    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(hypot_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::HypotStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::HypotContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_hypot(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_hypot_dispatch_tables();
-        using impl::hypot_contig_dispatch_table;
-        using impl::hypot_output_id_table;
-        using impl::hypot_strided_dispatch_table;
-
-        auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                               const arrayT &dst, sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, hypot_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                hypot_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                hypot_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               hypot_output_id_table);
-        };
-        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
deleted file mode 100644
index 9eb5d88172..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_hypot(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
deleted file mode 100644
index 49ab30d379..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "imag.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/imag.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U16: ==== IMAG   (x)
-namespace impl
-{
-
-namespace imag_fn_ns = dpctl::tensor::kernels::imag;
-
-static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
-static int imag_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    imag_strided_dispatch_vector[td_ns::num_types];
-
-void populate_imag_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = imag_fn_ns;
-
-    using fn_ns::ImagContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
-
-    using fn_ns::ImagStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
-
-    using fn_ns::ImagTypeMapFactory;
-    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_imag(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_imag_dispatch_vectors();
-        using impl::imag_contig_dispatch_vector;
-        using impl::imag_output_typeid_vector;
-        using impl::imag_strided_dispatch_vector;
-
-        auto imag_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, imag_output_typeid_vector,
-                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
-        };
-        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
-        };
-        m.def("_imag_result_type", imag_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
deleted file mode 100644
index bb4be2653b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_isfinite(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
deleted file mode 100644
index efc8261d38..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "isfinite.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/isfinite.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U17: ==== ISFINITE   (x)
-namespace impl
-{
-
-namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
-
-static unary_contig_impl_fn_ptr_t
-    isfinite_contig_dispatch_vector[td_ns::num_types];
-static int isfinite_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isfinite_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isfinite_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isfinite_fn_ns;
-
-    using fn_ns::IsFiniteContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
-
-    using fn_ns::IsFiniteStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
-
-    using fn_ns::IsFiniteTypeMapFactory;
-    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_isfinite(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_isfinite_dispatch_vectors();
-        using impl::isfinite_contig_dispatch_vector;
-        using impl::isfinite_output_typeid_vector;
-        using impl::isfinite_strided_dispatch_vector;
-
-        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                  sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  isfinite_output_typeid_vector,
-                                  isfinite_contig_dispatch_vector,
-                                  isfinite_strided_dispatch_vector);
-        };
-        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              isfinite_output_typeid_vector);
-        };
-        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
deleted file mode 100644
index 0f356f22bc..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_imag(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
deleted file mode 100644
index 6a2b52667a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "isinf.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/isinf.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U18: ==== ISINF   (x)
-namespace impl
-{
-
-namespace isinf_fn_ns = dpctl::tensor::kernels::isinf;
-
-static unary_contig_impl_fn_ptr_t
-    isinf_contig_dispatch_vector[td_ns::num_types];
-static int isinf_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isinf_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isinf_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isinf_fn_ns;
-
-    using fn_ns::IsInfContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsInfContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector);
-
-    using fn_ns::IsInfStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsInfStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector);
-
-    using fn_ns::IsInfTypeMapFactory;
-    DispatchVectorBuilder<int, IsInfTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isinf_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_isinf(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_isinf_dispatch_vectors();
-        using impl::isinf_contig_dispatch_vector;
-        using impl::isinf_output_typeid_vector;
-        using impl::isinf_strided_dispatch_vector;
-
-        auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, isinf_output_typeid_vector,
-                isinf_contig_dispatch_vector, isinf_strided_dispatch_vector);
-        };
-        m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto isinf_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              isinf_output_typeid_vector);
-        };
-        m.def("_isinf_result_type", isinf_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
deleted file mode 100644
index 17dc5bc4e8..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_isinf(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
deleted file mode 100644
index 76add38608..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "isnan.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/isnan.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U19: ==== ISNAN   (x)
-namespace impl
-{
-
-namespace isnan_fn_ns = dpctl::tensor::kernels::isnan;
-
-static unary_contig_impl_fn_ptr_t
-    isnan_contig_dispatch_vector[td_ns::num_types];
-static int isnan_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isnan_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isnan_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isnan_fn_ns;
-
-    using fn_ns::IsNanContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsNanContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector);
-
-    using fn_ns::IsNanStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsNanStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector);
-
-    using fn_ns::IsNanTypeMapFactory;
-    DispatchVectorBuilder<int, IsNanTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isnan_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_isnan(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_isnan_dispatch_vectors();
-        using impl::isnan_contig_dispatch_vector;
-        using impl::isnan_output_typeid_vector;
-        using impl::isnan_strided_dispatch_vector;
-
-        auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, isnan_output_typeid_vector,
-                isnan_contig_dispatch_vector, isnan_strided_dispatch_vector);
-        };
-        m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto isnan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              isnan_output_typeid_vector);
-        };
-        m.def("_isnan_result_type", isnan_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
deleted file mode 100644
index 821722f8cc..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_isnan(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
deleted file mode 100644
index 6011697655..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "less.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/less.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B13: ===== LESS (x1, x2)
-namespace impl
-{
-namespace less_fn_ns = dpctl::tensor::kernels::less;
-
-static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types]
-                                                             [td_ns::num_types];
-static int less_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    less_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_less_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = less_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LessTypeMapFactory;
-    DispatchTableBuilder<int, LessTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(less_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LessStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(less_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LessContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(less_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_less(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_less_dispatch_tables();
-        using impl::less_contig_dispatch_table;
-        using impl::less_output_id_table;
-        using impl::less_strided_dispatch_table;
-
-        auto less_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                              const arrayT &dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                less_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                less_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_result_type_pyapi = [&](const py::dtype &dtype1,
-                                          const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               less_output_id_table);
-        };
-        m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_less_result_type", less_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
deleted file mode 100644
index 0ed7f7f154..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_less(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
deleted file mode 100644
index a66da28e6c..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "less_equal.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/less_equal.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B14: ===== LESS_EQUAL (x1, x2)
-namespace impl
-{
-namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal;
-
-static binary_contig_impl_fn_ptr_t
-    less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_less_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = less_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LessEqualTypeMapFactory;
-    DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(less_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LessEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(less_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LessEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(less_equal_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_less_equal(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_less_equal_dispatch_tables();
-        using impl::less_equal_contig_dispatch_table;
-        using impl::less_equal_output_id_table;
-        using impl::less_equal_strided_dispatch_table;
-
-        auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                    const arrayT &dst, sycl::queue &exec_q,
-                                    const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, less_equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                less_equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                less_equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               less_equal_output_id_table);
-        };
-        m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_less_equal_result_type", less_equal_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
deleted file mode 100644
index a9b7992076..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_less_equal(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
deleted file mode 100644
index 236fb40474..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "log.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/log.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U20: ==== LOG   (x)
-namespace impl
-{
-
-namespace log_fn_ns = dpctl::tensor::kernels::log;
-
-static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types];
-static int log_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log_fn_ns;
-
-    using fn_ns::LogContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log_contig_dispatch_vector);
-
-    using fn_ns::LogStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log_strided_dispatch_vector);
-
-    using fn_ns::LogTypeMapFactory;
-    DispatchVectorBuilder<int, LogTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_log(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_log_dispatch_vectors();
-        using impl::log_contig_dispatch_vector;
-        using impl::log_output_typeid_vector;
-        using impl::log_strided_dispatch_vector;
-
-        auto log_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, log_output_typeid_vector,
-                log_contig_dispatch_vector, log_strided_dispatch_vector);
-        };
-        m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log_output_typeid_vector);
-        };
-        m.def("_log_result_type", log_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
deleted file mode 100644
index 273e26ebc5..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_log(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
deleted file mode 100644
index cfcb299256..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "log10.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/log10.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U23: ==== LOG10   (x)
-namespace impl
-{
-
-namespace log10_fn_ns = dpctl::tensor::kernels::log10;
-
-static unary_contig_impl_fn_ptr_t
-    log10_contig_dispatch_vector[td_ns::num_types];
-static int log10_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log10_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log10_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log10_fn_ns;
-
-    using fn_ns::Log10ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log10ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log10_contig_dispatch_vector);
-
-    using fn_ns::Log10StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log10StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log10_strided_dispatch_vector);
-
-    using fn_ns::Log10TypeMapFactory;
-    DispatchVectorBuilder<int, Log10TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log10_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_log10(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_log10_dispatch_vectors();
-        using impl::log10_contig_dispatch_vector;
-        using impl::log10_output_typeid_vector;
-        using impl::log10_strided_dispatch_vector;
-
-        auto log10_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, log10_output_typeid_vector,
-                log10_contig_dispatch_vector, log10_strided_dispatch_vector);
-        };
-        m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log10_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              log10_output_typeid_vector);
-        };
-        m.def("_log10_result_type", log10_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
deleted file mode 100644
index ec745f8758..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_log10(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
deleted file mode 100644
index 0f36b79058..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "log1p.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/log1p.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U21: ==== LOG1P   (x)
-namespace impl
-{
-
-namespace log1p_fn_ns = dpctl::tensor::kernels::log1p;
-
-static unary_contig_impl_fn_ptr_t
-    log1p_contig_dispatch_vector[td_ns::num_types];
-static int log1p_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log1p_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log1p_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log1p_fn_ns;
-
-    using fn_ns::Log1pContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log1pContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector);
-
-    using fn_ns::Log1pStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log1pStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector);
-
-    using fn_ns::Log1pTypeMapFactory;
-    DispatchVectorBuilder<int, Log1pTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log1p_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_log1p(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_log1p_dispatch_vectors();
-        using impl::log1p_contig_dispatch_vector;
-        using impl::log1p_output_typeid_vector;
-        using impl::log1p_strided_dispatch_vector;
-
-        auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, log1p_output_typeid_vector,
-                log1p_contig_dispatch_vector, log1p_strided_dispatch_vector);
-        };
-        m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log1p_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              log1p_output_typeid_vector);
-        };
-        m.def("_log1p_result_type", log1p_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
deleted file mode 100644
index 01e0dc35d3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_log1p(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
deleted file mode 100644
index 1300ada1b0..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "log2.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/log2.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U22: ==== LOG2   (x)
-namespace impl
-{
-
-namespace log2_fn_ns = dpctl::tensor::kernels::log2;
-
-static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types];
-static int log2_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log2_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log2_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log2_fn_ns;
-
-    using fn_ns::Log2ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log2ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log2_contig_dispatch_vector);
-
-    using fn_ns::Log2StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log2StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log2_strided_dispatch_vector);
-
-    using fn_ns::Log2TypeMapFactory;
-    DispatchVectorBuilder<int, Log2TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log2_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_log2(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_log2_dispatch_vectors();
-        using impl::log2_contig_dispatch_vector;
-        using impl::log2_output_typeid_vector;
-        using impl::log2_strided_dispatch_vector;
-
-        auto log2_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, log2_output_typeid_vector,
-                log2_contig_dispatch_vector, log2_strided_dispatch_vector);
-        };
-        m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto log2_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector);
-        };
-        m.def("_log2_result_type", log2_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
deleted file mode 100644
index b4a1311fad..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_log2(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
deleted file mode 100644
index 91cec9caa1..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "logaddexp.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/logaddexp.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B15: ===== LOGADDEXP (x1, x2)
-namespace impl
-{
-namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp;
-
-static binary_contig_impl_fn_ptr_t
-    logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logaddexp_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logaddexp_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogAddExpTypeMapFactory;
-    DispatchTableBuilder<int, LogAddExpTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logaddexp_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogAddExpStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogAddExpStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogAddExpContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogAddExpContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_logaddexp(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_logaddexp_dispatch_tables();
-        using impl::logaddexp_contig_dispatch_table;
-        using impl::logaddexp_output_id_table;
-        using impl::logaddexp_strided_dispatch_table;
-
-        auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                   const arrayT &dst, sycl::queue &exec_q,
-                                   const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logaddexp_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                logaddexp_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                logaddexp_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               logaddexp_output_id_table);
-        };
-        m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
deleted file mode 100644
index 64b3287710..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logaddexp(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
deleted file mode 100644
index f7da59fa25..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "logical_and.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/logical_and.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B16: ===== LOGICAL_AND (x1, x2)
-namespace impl
-{
-namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
-
-static binary_contig_impl_fn_ptr_t
-    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_and_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_and_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalAndTypeMapFactory;
-    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_and_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalAndStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalAndContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_logical_and(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_logical_and_dispatch_tables();
-        using impl::logical_and_contig_dispatch_table;
-        using impl::logical_and_output_id_table;
-        using impl::logical_and_strided_dispatch_table;
-
-        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                     const arrayT &dst, sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                logical_and_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                logical_and_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               logical_and_output_id_table);
-        };
-        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp
deleted file mode 100644
index 461962b39a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logical_and(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
deleted file mode 100644
index d799cc03de..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "logical_not.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/logical_not.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U24: ==== LOGICAL_NOT   (x)
-namespace impl
-{
-
-namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
-
-static unary_contig_impl_fn_ptr_t
-    logical_not_contig_dispatch_vector[td_ns::num_types];
-static int logical_not_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    logical_not_strided_dispatch_vector[td_ns::num_types];
-
-void populate_logical_not_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_not_fn_ns;
-
-    using fn_ns::LogicalNotContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
-
-    using fn_ns::LogicalNotStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
-
-    using fn_ns::LogicalNotTypeMapFactory;
-    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_logical_not(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_logical_not_dispatch_vectors();
-        using impl::logical_not_contig_dispatch_vector;
-        using impl::logical_not_output_typeid_vector;
-        using impl::logical_not_strided_dispatch_vector;
-
-        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                     sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  logical_not_output_typeid_vector,
-                                  logical_not_contig_dispatch_vector,
-                                  logical_not_strided_dispatch_vector);
-        };
-        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-
-        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              logical_not_output_typeid_vector);
-        };
-        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
deleted file mode 100644
index 13a78ed0eb..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logical_not(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
deleted file mode 100644
index e1d5abd4a3..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "logical_or.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/logical_or.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B17: ===== LOGICAL_OR (x1, x2)
-namespace impl
-{
-namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
-
-static binary_contig_impl_fn_ptr_t
-    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_or_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_or_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalOrTypeMapFactory;
-    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_or_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalOrStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalOrContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_logical_or(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_logical_or_dispatch_tables();
-        using impl::logical_or_contig_dispatch_table;
-        using impl::logical_or_output_id_table;
-        using impl::logical_or_strided_dispatch_table;
-
-        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                    const arrayT &dst, sycl::queue &exec_q,
-                                    const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                logical_or_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                logical_or_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               logical_or_output_id_table);
-        };
-        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp
deleted file mode 100644
index 9fc581997e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logical_xor(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
deleted file mode 100644
index 7aa6d27b27..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "logical_xor.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/logical_xor.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B18: ===== LOGICAL_XOR (x1, x2)
-namespace impl
-{
-namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
-
-static binary_contig_impl_fn_ptr_t
-    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_xor_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_xor_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalXorTypeMapFactory;
-    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_xor_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalXorStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalXorContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_logical_xor(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_logical_xor_dispatch_tables();
-        using impl::logical_xor_contig_dispatch_table;
-        using impl::logical_xor_output_id_table;
-        using impl::logical_xor_strided_dispatch_table;
-
-        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                     const arrayT &dst, sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                logical_xor_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                logical_xor_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               logical_xor_output_id_table);
-        };
-        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
deleted file mode 100644
index aa223ad33c..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logical_or(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
deleted file mode 100644
index 4cd20f9f3a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "maximum.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/maximum.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B26: ===== MAXIMUM (x1, x2)
-namespace impl
-{
-namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
-
-static binary_contig_impl_fn_ptr_t
-    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_maximum_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = maximum_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MaximumTypeMapFactory;
-    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(maximum_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MaximumStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MaximumContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_maximum(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_maximum_dispatch_tables();
-        using impl::maximum_contig_dispatch_table;
-        using impl::maximum_output_id_table;
-        using impl::maximum_strided_dispatch_table;
-
-        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                 const arrayT &dst, sycl::queue &exec_q,
-                                 const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, maximum_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                maximum_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                maximum_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
-                                             const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               maximum_output_id_table);
-        };
-        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp
deleted file mode 100644
index c427176a05..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_maximum(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
deleted file mode 100644
index efc19e2147..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "minimum.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/minimum.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B27: ===== MINIMUM (x1, x2)
-namespace impl
-{
-namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
-
-static binary_contig_impl_fn_ptr_t
-    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_minimum_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = minimum_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MinimumTypeMapFactory;
-    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(minimum_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MinimumStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MinimumContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_minimum(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_minimum_dispatch_tables();
-        using impl::minimum_contig_dispatch_table;
-        using impl::minimum_output_id_table;
-        using impl::minimum_strided_dispatch_table;
-
-        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                 const arrayT &dst, sycl::queue &exec_q,
-                                 const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, minimum_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                minimum_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                minimum_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
-                                             const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               minimum_output_id_table);
-        };
-        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp
deleted file mode 100644
index 5d87fb5cda..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_minimum(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
deleted file mode 100644
index d073344d08..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "multiply.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/multiply.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B19: ===== MULTIPLY (x1, x2)
-namespace impl
-{
-
-namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
-
-static binary_contig_impl_fn_ptr_t
-    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
-static int multiply_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// mul(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    multiply_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// mul(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    multiply_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_multiply_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = multiply_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MultiplyTypeMapFactory;
-    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(multiply_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MultiplyStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MultiplyContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::MultiplyInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         MultiplyInplaceStridedFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::MultiplyInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         MultiplyInplaceContigFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::MultiplyInplaceTypeMapFactory;
-    DispatchTableBuilder<int, MultiplyInplaceTypeMapFactory, num_types> dtb9;
-    dtb9.populate_dispatch_table(multiply_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_multiply(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_multiply_dispatch_tables();
-        using impl::multiply_contig_dispatch_table;
-        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::multiply_output_id_table;
-        using impl::multiply_strided_dispatch_table;
-
-        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                  const arrayT &dst, sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, multiply_output_id_table,
-                // function pointers to handle operation on contiguous
-                // arrays (pointers may be nullptr)
-                multiply_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays
-                // (most general case)
-                multiply_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               multiply_output_id_table);
-        };
-        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
-
-        using impl::multiply_inplace_contig_dispatch_table;
-        using impl::multiply_inplace_output_id_table;
-        using impl::multiply_inplace_row_matrix_dispatch_table;
-        using impl::multiply_inplace_strided_dispatch_table;
-
-        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                          sycl::queue &exec_q,
-                                          const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, multiply_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                multiply_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                multiply_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                multiply_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
deleted file mode 100644
index fe9208c545..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_multiply(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
deleted file mode 100644
index ec6354fb74..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "negative.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/negative.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U25: ==== NEGATIVE   (x)
-namespace impl
-{
-
-namespace negative_fn_ns = dpctl::tensor::kernels::negative;
-
-static unary_contig_impl_fn_ptr_t
-    negative_contig_dispatch_vector[td_ns::num_types];
-static int negative_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    negative_strided_dispatch_vector[td_ns::num_types];
-
-void populate_negative_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = negative_fn_ns;
-
-    using fn_ns::NegativeContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
-
-    using fn_ns::NegativeStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
-
-    using fn_ns::NegativeTypeMapFactory;
-    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_negative(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_negative_dispatch_vectors();
-        using impl::negative_contig_dispatch_vector;
-        using impl::negative_output_typeid_vector;
-        using impl::negative_strided_dispatch_vector;
-
-        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                  sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  negative_output_typeid_vector,
-                                  negative_contig_dispatch_vector,
-                                  negative_strided_dispatch_vector);
-        };
-        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              negative_output_typeid_vector);
-        };
-        m.def("_negative_result_type", negative_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
deleted file mode 100644
index dcb5b57bf1..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_negative(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.cpp
deleted file mode 100644
index 45135199f1..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "nextafter.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/nextafter.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B28: ===== NEXTAFTER (x1, x2)
-namespace impl
-{
-namespace nextafter_fn_ns = dpctl::tensor::kernels::nextafter;
-
-static binary_contig_impl_fn_ptr_t
-    nextafter_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int nextafter_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    nextafter_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_nextafter_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = nextafter_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::NextafterTypeMapFactory;
-    DispatchTableBuilder<int, NextafterTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(nextafter_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::NextafterStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NextafterStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(nextafter_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::NextafterContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NextafterContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(nextafter_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_nextafter(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_nextafter_dispatch_tables();
-        using impl::nextafter_contig_dispatch_table;
-        using impl::nextafter_output_id_table;
-        using impl::nextafter_strided_dispatch_table;
-
-        auto nextafter_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                   const arrayT &dst, sycl::queue &exec_q,
-                                   const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, nextafter_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                nextafter_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                nextafter_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto nextafter_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               nextafter_output_id_table);
-        };
-        m.def("_nextafter", nextafter_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_nextafter_result_type", nextafter_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.hpp
deleted file mode 100644
index e704db90d6..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/nextafter.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_nextafter(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
deleted file mode 100644
index 634749541d..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "not_equal.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/not_equal.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-// B20: ===== NOT_EQUAL (x1, x2)
-namespace impl
-{
-namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
-
-static binary_contig_impl_fn_ptr_t
-    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_not_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = not_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::NotEqualTypeMapFactory;
-    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(not_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::NotEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::NotEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
-};
-
-} // namespace impl
-
-void init_not_equal(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_not_equal_dispatch_tables();
-        using impl::not_equal_contig_dispatch_table;
-        using impl::not_equal_output_id_table;
-        using impl::not_equal_strided_dispatch_table;
-
-        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                   const arrayT &dst, sycl::queue &exec_q,
-                                   const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                not_equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                not_equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               not_equal_output_id_table);
-        };
-        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
deleted file mode 100644
index a04189f0a5..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_not_equal(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
deleted file mode 100644
index 1f0c1241b2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "positive.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/positive.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U26: ==== POSITIVE   (x)
-namespace impl
-{
-
-namespace positive_fn_ns = dpctl::tensor::kernels::positive;
-
-static unary_contig_impl_fn_ptr_t
-    positive_contig_dispatch_vector[td_ns::num_types];
-static int positive_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    positive_strided_dispatch_vector[td_ns::num_types];
-
-void populate_positive_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = positive_fn_ns;
-
-    using fn_ns::PositiveContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
-
-    using fn_ns::PositiveStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
-
-    using fn_ns::PositiveTypeMapFactory;
-    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_positive(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_positive_dispatch_vectors();
-        using impl::positive_contig_dispatch_vector;
-        using impl::positive_output_typeid_vector;
-        using impl::positive_strided_dispatch_vector;
-
-        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                  sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  positive_output_typeid_vector,
-                                  positive_contig_dispatch_vector,
-                                  positive_strided_dispatch_vector);
-        };
-        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              positive_output_typeid_vector);
-        };
-        m.def("_positive_result_type", positive_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
deleted file mode 100644
index 9bbd63ca9e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_positive(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
deleted file mode 100644
index fc15932fdd..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "pow.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/pow.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B21: ===== POW (x1, x2)
-namespace impl
-{
-
-namespace pow_fn_ns = dpctl::tensor::kernels::pow;
-
-static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types]
-                                                            [td_ns::num_types];
-
-static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
-static int pow_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_pow_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = pow_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::PowTypeMapFactory;
-    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(pow_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::PowStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::PowContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::PowInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         PowInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::PowInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         PowInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::PowInplaceTypeMapFactory;
-    DispatchTableBuilder<int, PowInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(pow_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_pow(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_pow_dispatch_tables();
-        using impl::pow_contig_dispatch_table;
-        using impl::pow_output_id_table;
-        using impl::pow_strided_dispatch_table;
-
-        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, pow_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                pow_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                pow_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               pow_output_id_table);
-        };
-        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_pow_result_type", pow_result_type_pyapi, "");
-
-        using impl::pow_inplace_contig_dispatch_table;
-        using impl::pow_inplace_output_id_table;
-        using impl::pow_inplace_strided_dispatch_table;
-
-        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                     sycl::queue &exec_q,
-                                     const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, pow_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                pow_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                pow_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
deleted file mode 100644
index 4e052db9bb..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_pow(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
deleted file mode 100644
index 672eae7078..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "proj.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/proj.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U40: ==== PROJ   (x)
-namespace impl
-{
-
-namespace proj_fn_ns = dpctl::tensor::kernels::proj;
-
-static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types];
-static int proj_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    proj_strided_dispatch_vector[td_ns::num_types];
-
-void populate_proj_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = proj_fn_ns;
-
-    using fn_ns::ProjContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
-
-    using fn_ns::ProjStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
-
-    using fn_ns::ProjTypeMapFactory;
-    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_proj(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_proj_dispatch_vectors();
-        using impl::proj_contig_dispatch_vector;
-        using impl::proj_output_typeid_vector;
-        using impl::proj_strided_dispatch_vector;
-
-        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, proj_output_typeid_vector,
-                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
-        };
-        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
-        };
-        m.def("_proj_result_type", proj_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
deleted file mode 100644
index b788837493..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_proj(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp
deleted file mode 100644
index 0e84c4fb4e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "real.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/real.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U27: ==== REAL   (x)
-namespace impl
-{
-
-namespace real_fn_ns = dpctl::tensor::kernels::real;
-
-static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types];
-static int real_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    real_strided_dispatch_vector[td_ns::num_types];
-
-void populate_real_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = real_fn_ns;
-
-    using fn_ns::RealContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RealContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(real_contig_dispatch_vector);
-
-    using fn_ns::RealStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RealStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(real_strided_dispatch_vector);
-
-    using fn_ns::RealTypeMapFactory;
-    DispatchVectorBuilder<int, RealTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(real_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_real(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_real_dispatch_vectors();
-        using impl::real_contig_dispatch_vector;
-        using impl::real_output_typeid_vector;
-        using impl::real_strided_dispatch_vector;
-
-        auto real_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, real_output_typeid_vector,
-                real_contig_dispatch_vector, real_strided_dispatch_vector);
-        };
-        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
-        };
-        m.def("_real_result_type", real_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp
deleted file mode 100644
index f99ac98579..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_real(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
deleted file mode 100644
index 90ce6ff6ea..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "reciprocal.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/reciprocal.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U42: ==== REAL   (x)
-namespace impl
-{
-
-namespace reciprocal_fn_ns = dpctl::tensor::kernels::reciprocal;
-
-static unary_contig_impl_fn_ptr_t
-    reciprocal_contig_dispatch_vector[td_ns::num_types];
-static int reciprocal_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    reciprocal_strided_dispatch_vector[td_ns::num_types];
-
-void populate_reciprocal_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = reciprocal_fn_ns;
-
-    using fn_ns::ReciprocalContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ReciprocalContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(reciprocal_contig_dispatch_vector);
-
-    using fn_ns::ReciprocalStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ReciprocalStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(reciprocal_strided_dispatch_vector);
-
-    using fn_ns::ReciprocalTypeMapFactory;
-    DispatchVectorBuilder<int, ReciprocalTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(reciprocal_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_reciprocal(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_reciprocal_dispatch_vectors();
-        using impl::reciprocal_contig_dispatch_vector;
-        using impl::reciprocal_output_typeid_vector;
-        using impl::reciprocal_strided_dispatch_vector;
-
-        auto reciprocal_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                    sycl::queue &exec_q,
-                                    const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  reciprocal_output_typeid_vector,
-                                  reciprocal_contig_dispatch_vector,
-                                  reciprocal_strided_dispatch_vector);
-        };
-        m.def("_reciprocal", reciprocal_pyapi, "", py::arg("src"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-
-        auto reciprocal_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              reciprocal_output_typeid_vector);
-        };
-        m.def("_reciprocal_result_type", reciprocal_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
deleted file mode 100644
index 20c6dd8654..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_reciprocal(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp
deleted file mode 100644
index 44a23f7ac9..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "remainder.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/remainder.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B22: ===== REMAINDER (x1, x2)
-namespace impl
-{
-
-namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
-
-static binary_contig_impl_fn_ptr_t
-    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
-static int remainder_inplace_output_id_table[td_ns::num_types]
-                                            [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    remainder_inplace_strided_dispatch_table[td_ns::num_types]
-                                            [td_ns::num_types];
-
-void populate_remainder_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = remainder_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::RemainderTypeMapFactory;
-    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(remainder_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::RemainderStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::RemainderContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::RemainderInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         RemainderInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::RemainderInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         RemainderInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::RemainderInplaceTypeMapFactory;
-    DispatchTableBuilder<int, RemainderInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(remainder_inplace_output_id_table);
-}
-
-} // namespace impl
-
-void init_remainder(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_remainder_dispatch_tables();
-        using impl::remainder_contig_dispatch_table;
-        using impl::remainder_output_id_table;
-        using impl::remainder_strided_dispatch_table;
-
-        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                   const arrayT &dst, sycl::queue &exec_q,
-                                   const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, remainder_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                remainder_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                remainder_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               remainder_output_id_table);
-        };
-        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
-
-        using impl::remainder_inplace_contig_dispatch_table;
-        using impl::remainder_inplace_output_id_table;
-        using impl::remainder_inplace_strided_dispatch_table;
-
-        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                           sycl::queue &exec_q,
-                                           const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, remainder_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                remainder_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                remainder_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp
deleted file mode 100644
index 16d4402a1d..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_remainder(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp
deleted file mode 100644
index 41fa4f0a08..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "round.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/round.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U28: ==== ROUND   (x)
-namespace impl
-{
-
-namespace round_fn_ns = dpctl::tensor::kernels::round;
-
-static unary_contig_impl_fn_ptr_t
-    round_contig_dispatch_vector[td_ns::num_types];
-static int round_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    round_strided_dispatch_vector[td_ns::num_types];
-
-void populate_round_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = round_fn_ns;
-
-    using fn_ns::RoundContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RoundContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(round_contig_dispatch_vector);
-
-    using fn_ns::RoundStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RoundStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(round_strided_dispatch_vector);
-
-    using fn_ns::RoundTypeMapFactory;
-    DispatchVectorBuilder<int, RoundTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(round_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_round(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_round_dispatch_vectors();
-        using impl::round_contig_dispatch_vector;
-        using impl::round_output_typeid_vector;
-        using impl::round_strided_dispatch_vector;
-
-        auto round_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, round_output_typeid_vector,
-                round_contig_dispatch_vector, round_strided_dispatch_vector);
-        };
-        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              round_output_typeid_vector);
-        };
-        m.def("_round_result_type", round_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp
deleted file mode 100644
index 0eef7ee1b2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_round(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
deleted file mode 100644
index 67c4ca14eb..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <utility>
-
-#include "elementwise_functions.hpp"
-#include "rsqrt.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/rsqrt.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U39: ==== RSQRT   (x)
-namespace impl
-{
-
-namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
-
-static unary_contig_impl_fn_ptr_t
-    rsqrt_contig_dispatch_vector[td_ns::num_types];
-static int rsqrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    rsqrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_rsqrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = rsqrt_fn_ns;
-
-    using fn_ns::RsqrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
-
-    using fn_ns::RsqrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
-
-    using fn_ns::RsqrtTypeMapFactory;
-    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_rsqrt(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_rsqrt_dispatch_vectors();
-        using impl::rsqrt_contig_dispatch_vector;
-        using impl::rsqrt_output_typeid_vector;
-        using impl::rsqrt_strided_dispatch_vector;
-
-        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
-                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
-        };
-        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              rsqrt_output_typeid_vector);
-        };
-        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
deleted file mode 100644
index 11a011c63b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_rsqrt(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp
deleted file mode 100644
index f8d9183a98..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "sign.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/sign.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U29: ==== SIGN   (x)
-namespace impl
-{
-
-namespace sign_fn_ns = dpctl::tensor::kernels::sign;
-
-static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types];
-static int sign_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sign_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sign_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sign_fn_ns;
-
-    using fn_ns::SignContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sign_contig_dispatch_vector);
-
-    using fn_ns::SignStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sign_strided_dispatch_vector);
-
-    using fn_ns::SignTypeMapFactory;
-    DispatchVectorBuilder<int, SignTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sign_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_sign(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_sign_dispatch_vectors();
-        using impl::sign_contig_dispatch_vector;
-        using impl::sign_output_typeid_vector;
-        using impl::sign_strided_dispatch_vector;
-
-        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sign_output_typeid_vector,
-                sign_contig_dispatch_vector, sign_strided_dispatch_vector);
-        };
-        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
-        };
-        m.def("_sign_result_type", sign_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp
deleted file mode 100644
index acb8db91f7..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_sign(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp
deleted file mode 100644
index 1ed31d2b0e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "signbit.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/signbit.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U41: ==== SIGNBIT   (x)
-namespace impl
-{
-
-namespace signbit_fn_ns = dpctl::tensor::kernels::signbit;
-
-static unary_contig_impl_fn_ptr_t
-    signbit_contig_dispatch_vector[td_ns::num_types];
-static int signbit_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    signbit_strided_dispatch_vector[td_ns::num_types];
-
-void populate_signbit_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = signbit_fn_ns;
-
-    using fn_ns::SignbitContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignbitContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector);
-
-    using fn_ns::SignbitStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignbitStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector);
-
-    using fn_ns::SignbitTypeMapFactory;
-    DispatchVectorBuilder<int, SignbitTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(signbit_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_signbit(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_signbit_dispatch_vectors();
-        using impl::signbit_contig_dispatch_vector;
-        using impl::signbit_output_typeid_vector;
-        using impl::signbit_strided_dispatch_vector;
-
-        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                 sycl::queue &exec_q,
-                                 const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  signbit_output_typeid_vector,
-                                  signbit_contig_dispatch_vector,
-                                  signbit_strided_dispatch_vector);
-        };
-        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              signbit_output_typeid_vector);
-        };
-        m.def("_signbit_result_type", signbit_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp
deleted file mode 100644
index 9ab3154c57..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_signbit(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp
deleted file mode 100644
index 3b624475da..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "sin.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/sin.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U30: ==== SIN   (x)
-namespace impl
-{
-
-namespace sin_fn_ns = dpctl::tensor::kernels::sin;
-
-static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types];
-static int sin_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sin_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sin_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sin_fn_ns;
-
-    using fn_ns::SinContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sin_contig_dispatch_vector);
-
-    using fn_ns::SinStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sin_strided_dispatch_vector);
-
-    using fn_ns::SinTypeMapFactory;
-    DispatchVectorBuilder<int, SinTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sin_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_sin(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_sin_dispatch_vectors();
-        using impl::sin_contig_dispatch_vector;
-        using impl::sin_output_typeid_vector;
-        using impl::sin_strided_dispatch_vector;
-
-        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sin_output_typeid_vector,
-                sin_contig_dispatch_vector, sin_strided_dispatch_vector);
-        };
-        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
-        };
-        m.def("_sin_result_type", sin_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp
deleted file mode 100644
index 25a84ab74a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_sin(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp
deleted file mode 100644
index 974783829b..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "sinh.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/sinh.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U31: ==== SINH   (x)
-namespace impl
-{
-
-namespace sinh_fn_ns = dpctl::tensor::kernels::sinh;
-
-static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types];
-static int sinh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sinh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sinh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sinh_fn_ns;
-
-    using fn_ns::SinhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector);
-
-    using fn_ns::SinhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector);
-
-    using fn_ns::SinhTypeMapFactory;
-    DispatchVectorBuilder<int, SinhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sinh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_sinh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_sinh_dispatch_vectors();
-        using impl::sinh_contig_dispatch_vector;
-        using impl::sinh_output_typeid_vector;
-        using impl::sinh_strided_dispatch_vector;
-
-        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sinh_output_typeid_vector,
-                sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
-        };
-        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector);
-        };
-        m.def("_sinh_result_type", sinh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp
deleted file mode 100644
index c1fa1d6c35..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_sinh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp
deleted file mode 100644
index 7816a5ae31..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "sqrt.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/sqrt.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U33: ==== SQRT   (x)
-namespace impl
-{
-
-namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt;
-
-static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types];
-static int sqrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sqrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sqrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sqrt_fn_ns;
-
-    using fn_ns::SqrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SqrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector);
-
-    using fn_ns::SqrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SqrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector);
-
-    using fn_ns::SqrtTypeMapFactory;
-    DispatchVectorBuilder<int, SqrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sqrt_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_sqrt(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_sqrt_dispatch_vectors();
-        using impl::sqrt_contig_dispatch_vector;
-        using impl::sqrt_output_typeid_vector;
-        using impl::sqrt_strided_dispatch_vector;
-
-        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sqrt_output_typeid_vector,
-                sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
-        };
-        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector);
-        };
-        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp
deleted file mode 100644
index 0a6ea9e9bf..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_sqrt(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp
deleted file mode 100644
index ae6c2a8383..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "square.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/square.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U32: ==== SQUARE   (x)
-namespace impl
-{
-
-namespace square_fn_ns = dpctl::tensor::kernels::square;
-
-static unary_contig_impl_fn_ptr_t
-    square_contig_dispatch_vector[td_ns::num_types];
-static int square_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    square_strided_dispatch_vector[td_ns::num_types];
-
-void populate_square_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = square_fn_ns;
-
-    using fn_ns::SquareContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
-
-    using fn_ns::SquareStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
-
-    using fn_ns::SquareTypeMapFactory;
-    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(square_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_square(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_square_dispatch_vectors();
-        using impl::square_contig_dispatch_vector;
-        using impl::square_output_typeid_vector;
-        using impl::square_strided_dispatch_vector;
-
-        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, square_output_typeid_vector,
-                square_contig_dispatch_vector, square_strided_dispatch_vector);
-        };
-        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              square_output_typeid_vector);
-        };
-        m.def("_square_result_type", square_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp
deleted file mode 100644
index 50ab30c6ca..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_square(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp
deleted file mode 100644
index b4e8ec0a98..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "subtract.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/subtract.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B23: ===== SUBTRACT (x1, x2)
-namespace impl
-{
-namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
-
-static binary_contig_impl_fn_ptr_t
-    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
-static int subtract_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// sub(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    subtract_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// sub(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    subtract_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_subtract_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = subtract_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::SubtractTypeMapFactory;
-    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(subtract_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::SubtractStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::SubtractContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        SubtractContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        SubtractContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::SubtractInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         SubtractInplaceStridedFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::SubtractInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         SubtractInplaceContigFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::SubtractInplaceTypeMapFactory;
-    DispatchTableBuilder<int, SubtractInplaceTypeMapFactory, num_types> dtb9;
-    dtb9.populate_dispatch_table(subtract_inplace_output_id_table);
-};
-
-} // namespace impl
-
-void init_subtract(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_subtract_dispatch_tables();
-        using impl::subtract_contig_dispatch_table;
-        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::subtract_output_id_table;
-        using impl::subtract_strided_dispatch_table;
-
-        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                  const arrayT &dst, sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, subtract_output_id_table,
-                // function pointers to handle operation on contiguous
-                // arrays (pointers may be nullptr)
-                subtract_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays
-                // (most general case)
-                subtract_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               subtract_output_id_table);
-        };
-        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
-
-        using impl::subtract_inplace_contig_dispatch_table;
-        using impl::subtract_inplace_output_id_table;
-        using impl::subtract_inplace_row_matrix_dispatch_table;
-        using impl::subtract_inplace_strided_dispatch_table;
-
-        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                          sycl::queue &exec_q,
-                                          const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, subtract_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                subtract_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                subtract_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                subtract_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp
deleted file mode 100644
index 89cdfd6d0e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_subtract(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp
deleted file mode 100644
index 99d307e11e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "tan.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/tan.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U34: ==== TAN   (x)
-namespace impl
-{
-
-namespace tan_fn_ns = dpctl::tensor::kernels::tan;
-
-static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
-static int tan_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tan_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tan_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tan_fn_ns;
-
-    using fn_ns::TanContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
-
-    using fn_ns::TanStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
-
-    using fn_ns::TanTypeMapFactory;
-    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_tan(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_tan_dispatch_vectors();
-        using impl::tan_contig_dispatch_vector;
-        using impl::tan_output_typeid_vector;
-        using impl::tan_strided_dispatch_vector;
-
-        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, tan_output_typeid_vector,
-                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
-        };
-        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
-        };
-        m.def("_tan_result_type", tan_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp
deleted file mode 100644
index 96df1026a2..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_tanh(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
deleted file mode 100644
index 55c6f5707a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
+++ /dev/null
@@ -1,119 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "tanh.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/tanh.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U35: ==== TANH   (x)
-namespace impl
-{
-
-namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
-
-static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types];
-static int tanh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tanh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tanh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tanh_fn_ns;
-
-    using fn_ns::TanhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
-
-    using fn_ns::TanhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
-
-    using fn_ns::TanhTypeMapFactory;
-    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_tanh(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_tanh_dispatch_vectors();
-        using impl::tanh_contig_dispatch_vector;
-        using impl::tanh_output_typeid_vector;
-        using impl::tanh_strided_dispatch_vector;
-
-        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, tanh_output_typeid_vector,
-                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
-        };
-        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector);
-        };
-        m.def("_tanh_result_type", tanh_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp
deleted file mode 100644
index 1c85a6826a..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_tan(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
deleted file mode 100644
index 1ad4889fbd..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
+++ /dev/null
@@ -1,492 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility> // for std::ignore
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "elementwise_functions.hpp"
-#include "simplify_iteration_space.hpp"
-#include "true_divide.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/common_inplace.hpp"
-#include "kernels/elementwise_functions/true_divide.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
-
-using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
-using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
-
-// B08: ===== DIVIDE (x1, x2)
-namespace impl
-{
-namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
-
-static binary_contig_impl_fn_ptr_t
-    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_inplace_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// divide(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// divide(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-void populate_true_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = true_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::TrueDivideTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(true_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::TrueDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, TrueDivideStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::TrueDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // which types are supported by the in-place kernels
-    using fn_ns::TrueDivideInplaceTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::TrueDivideInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         TrueDivideInplaceStridedFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::TrueDivideInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         TrueDivideInplaceContigFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
-        dtb9;
-    dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
-};
-
-template <typename T> class divide_by_scalar_krn;
-
-typedef sycl::event (*divide_by_scalar_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t,
-    int,
-    const ssize_t *,
-    const char *,
-    py::ssize_t,
-    const char *,
-    char *,
-    py::ssize_t,
-    const std::vector<sycl::event> &);
-
-template <typename T, typename scalarT>
-sycl::event divide_by_scalar(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             int nd,
-                             const ssize_t *shape_and_strides,
-                             const char *arg_p,
-                             py::ssize_t arg_offset,
-                             const char *scalar_ptr,
-                             char *res_p,
-                             py::ssize_t res_offset,
-                             const std::vector<sycl::event> &depends = {})
-{
-    const scalarT sc_v = *reinterpret_cast<const scalarT *>(scalar_ptr);
-
-    sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        using BinOpT =
-            dpctl::tensor::kernels::true_divide::TrueDivideFunctor<T, scalarT,
-                                                                   T>;
-
-        auto op = BinOpT();
-
-        using IndexerT =
-            typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
-
-        const IndexerT two_offsets_indexer{nd, arg_offset, res_offset,
-                                           shape_and_strides};
-
-        const T *arg_tp = reinterpret_cast<const T *>(arg_p);
-        T *res_tp = reinterpret_cast<T *>(res_p);
-
-        cgh.parallel_for<divide_by_scalar_krn<T>>(
-            {nelems}, [=](sycl::id<1> id) {
-                const auto &two_offsets_ =
-                    two_offsets_indexer(static_cast<ssize_t>(id.get(0)));
-
-                const auto &arg_i = two_offsets_.get_first_offset();
-                const auto &res_i = two_offsets_.get_second_offset();
-                res_tp[res_i] = op(arg_tp[arg_i], sc_v);
-            });
-    });
-    return comp_ev;
-}
-
-std::pair<sycl::event, sycl::event>
-py_divide_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    double scalar,
-                    const dpctl::tensor::usm_ndarray &dst,
-                    sycl::queue &exec_q,
-                    const std::vector<sycl::event> &depends = {})
-{
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array has unexpected elemental data type.");
-    }
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-    // check shapes, broadcasting is assumed done by caller
-    // check that dimensions are the same
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != src.get_ndim()) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    // check that shapes are the same
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-
-    for (int i = 0; i < dst_nd; ++i) {
-        src_nelems *= static_cast<std::size_t>(src_shape[i]);
-        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    // if nelems is zero, return
-    if (src_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    if ((overlap(src, dst) && !same_logical_tensors(src, dst))) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    static constexpr int float16_typeid =
-        static_cast<int>(td_ns::typenum_t::HALF);
-    static constexpr int float32_typeid =
-        static_cast<int>(td_ns::typenum_t::FLOAT);
-    static constexpr int float64_typeid =
-        static_cast<int>(td_ns::typenum_t::DOUBLE);
-    static constexpr int complex64_typeid =
-        static_cast<int>(td_ns::typenum_t::CFLOAT);
-    static constexpr int complex128_typeid =
-        static_cast<int>(td_ns::typenum_t::CDOUBLE);
-
-    // statically pre-allocated memory for scalar
-    alignas(double) char scalar_alloc[sizeof(double)] = {0};
-
-    divide_by_scalar_fn_ptr_t fn;
-    // placement new into stack memory means no call to delete is necessary
-    switch (src_typeid) {
-    case float16_typeid:
-    {
-        fn = divide_by_scalar<sycl::half, sycl::half>;
-        std::ignore =
-            new (scalar_alloc) sycl::half(static_cast<sycl::half>(scalar));
-        break;
-    }
-    case float32_typeid:
-    {
-        fn = divide_by_scalar<float, float>;
-        std::ignore = new (scalar_alloc) float(scalar);
-        break;
-    }
-    case float64_typeid:
-    {
-        fn = divide_by_scalar<double, double>;
-        std::ignore = new (scalar_alloc) double(scalar);
-        break;
-    }
-    case complex64_typeid:
-    {
-        fn = divide_by_scalar<std::complex<float>, float>;
-        std::ignore = new (scalar_alloc) float(scalar);
-        break;
-    }
-    case complex128_typeid:
-    {
-        fn = divide_by_scalar<std::complex<double>, double>;
-        std::ignore = new (scalar_alloc) double(scalar);
-        break;
-    }
-    default:
-        throw std::runtime_error("Implementation is missing for typeid=" +
-                                 std::to_string(src_typeid));
-    }
-
-    // simplify strides
-    auto const &src_strides = src.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = dst_nd;
-    const py::ssize_t *shape = src_shape;
-
-    std::vector<sycl::event> host_tasks{};
-    dpctl::tensor::py_internal::simplify_iteration_space(
-        nd, shape, src_strides, dst_strides,
-        // outputs
-        simplified_shape, simplified_src_strides, simplified_dst_strides,
-        src_offset, dst_offset);
-
-    if (nd == 0) {
-        // handle 0d array as 1d array with 1 element
-        static constexpr py::ssize_t one{1};
-        simplified_shape.push_back(one);
-        simplified_src_strides.push_back(one);
-        simplified_dst_strides.push_back(one);
-        src_offset = 0;
-        dst_offset = 0;
-    }
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_sz_event_triple_ = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_tasks, simplified_shape, simplified_src_strides,
-        simplified_dst_strides);
-    auto shape_strides_owner = std::move(std::get<0>(ptr_sz_event_triple_));
-    auto &copy_metadata_ev = std::get<2>(ptr_sz_event_triple_);
-
-    const py::ssize_t *shape_strides = shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.resize(depends.size());
-    std::copy(depends.begin(), depends.end(), all_deps.begin());
-    all_deps.push_back(copy_metadata_ev);
-
-    sycl::event div_ev =
-        fn(exec_q, src_nelems, nd, shape_strides, src_data, src_offset,
-           scalar_alloc, dst_data, dst_offset, all_deps);
-
-    // async free of shape_strides temporary
-    sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {div_ev}, shape_strides_owner);
-
-    host_tasks.push_back(tmp_cleanup_ev);
-
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_tasks), div_ev);
-}
-
-} // namespace impl
-
-void init_divide(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_true_divide_dispatch_tables();
-        using impl::true_divide_contig_dispatch_table;
-        using impl::
-            true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::
-            true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::true_divide_output_id_table;
-        using impl::true_divide_strided_dispatch_table;
-
-        auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
-                                const arrayT &dst, sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                true_divide_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                true_divide_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
-                                            const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               true_divide_output_id_table);
-        };
-        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_divide_result_type", divide_result_type_pyapi, "");
-
-        using impl::true_divide_inplace_contig_dispatch_table;
-        using impl::true_divide_inplace_output_id_table;
-        using impl::true_divide_inplace_row_matrix_dispatch_table;
-        using impl::true_divide_inplace_strided_dispatch_table;
-
-        auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                        sycl::queue &exec_q,
-                                        const event_vecT &depends = {}) {
-            return py_binary_inplace_ufunc(
-                src, dst, exec_q, depends, true_divide_inplace_output_id_table,
-                // function pointers to handle inplace operation on
-                // contiguous arrays (pointers may be nullptr)
-                true_divide_inplace_contig_dispatch_table,
-                // function pointers to handle inplace operation on strided
-                // arrays (most general case)
-                true_divide_inplace_strided_dispatch_table,
-                // function pointers to handle inplace operation on
-                // c-contig matrix with c-contig row with broadcasting
-                // (may be nullptr)
-                true_divide_inplace_row_matrix_dispatch_table);
-        };
-        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-
-        using impl::py_divide_by_scalar;
-        m.def("_divide_by_scalar", &py_divide_by_scalar, "", py::arg("src"),
-              py::arg("scalar"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp
deleted file mode 100644
index dbe0551a61..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_divide(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp
deleted file mode 100644
index 68ae8ad93e..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "elementwise_functions.hpp"
-#include "trunc.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/elementwise_functions/common.hpp"
-#include "kernels/elementwise_functions/trunc.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
-using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
-using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
-
-// U36: ==== TRUNC   (x)
-namespace impl
-{
-
-namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
-
-static unary_contig_impl_fn_ptr_t
-    trunc_contig_dispatch_vector[td_ns::num_types];
-static int trunc_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    trunc_strided_dispatch_vector[td_ns::num_types];
-
-void populate_trunc_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = trunc_fn_ns;
-
-    using fn_ns::TruncContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
-
-    using fn_ns::TruncStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
-
-    using fn_ns::TruncTypeMapFactory;
-    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
-};
-
-} // namespace impl
-
-void init_trunc(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_trunc_dispatch_vectors();
-        using impl::trunc_contig_dispatch_vector;
-        using impl::trunc_output_typeid_vector;
-        using impl::trunc_strided_dispatch_vector;
-
-        auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, trunc_output_typeid_vector,
-                trunc_contig_dispatch_vector, trunc_strided_dispatch_vector);
-        };
-        m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto trunc_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              trunc_output_typeid_vector);
-        };
-        m.def("_trunc_result_type", trunc_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp
deleted file mode 100644
index 443648d55c..0000000000
--- a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for elementwise operations.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_trunc(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp
deleted file mode 100644
index f475907064..0000000000
--- a/dpctl/tensor/libtensor/source/eye_ctor.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-#include "eye_ctor.hpp"
-#include "kernels/constructors.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::utils::keep_args_alive;
-
-using dpctl::tensor::kernels::constructors::eye_fn_ptr_t;
-static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types];
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_eye(py::ssize_t k,
-                const dpctl::tensor::usm_ndarray &dst,
-                sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends)
-{
-    // dst must be 2D
-
-    if (dst.get_ndim() != 2) {
-        throw py::value_error(
-            "usm_ndarray_eye: Expecting 2D array to populate");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error("Execution queue is not compatible with the "
-                              "allocation queue");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_typenum = dst.get_typenum();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    const py::ssize_t nelem = dst.get_size();
-    const py::ssize_t rows = dst.get_shape(0);
-    const py::ssize_t cols = dst.get_shape(1);
-    if (rows == 0 || cols == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-    if (!is_dst_c_contig && !is_dst_f_contig) {
-        throw py::value_error("USM array is not contiguous");
-    }
-
-    py::ssize_t start;
-    if (is_dst_c_contig) {
-        start = (k < 0) ? -k * cols : k;
-    }
-    else {
-        start = (k < 0) ? -k : k * rows;
-    }
-
-    const py::ssize_t *strides = dst.get_strides_raw();
-    py::ssize_t step;
-    if (strides == nullptr) {
-        step = (is_dst_c_contig) ? cols + 1 : rows + 1;
-    }
-    else {
-        step = strides[0] + strides[1];
-    }
-
-    const py::ssize_t length = std::min({rows, cols, rows + k, cols - k});
-    const py::ssize_t end = start + step * (length - 1);
-
-    char *dst_data = dst.get_data();
-    sycl::event eye_event;
-
-    auto fn = eye_dispatch_vector[dst_typeid];
-
-    eye_event = fn(exec_q, static_cast<std::size_t>(nelem), start, end, step,
-                   dst_data, depends);
-
-    return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}),
-                          eye_event);
-}
-
-void init_eye_ctor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::constructors::EyeFactory;
-
-    DispatchVectorBuilder<eye_fn_ptr_t, EyeFactory, num_types> dvb;
-    dvb.populate_dispatch_vector(eye_dispatch_vector);
-
-    return;
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/eye_ctor.hpp b/dpctl/tensor/libtensor/source/eye_ctor.hpp
deleted file mode 100644
index b156f06a1c..0000000000
--- a/dpctl/tensor/libtensor/source/eye_ctor.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_eye(py::ssize_t k,
-                const dpctl::tensor::usm_ndarray &dst,
-                sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {});
-
-extern void init_eye_ctor_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
deleted file mode 100644
index 1b3e14b245..0000000000
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <complex>
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-
-#include "kernels/constructors.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "full_ctor.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::utils::keep_args_alive;
-
-typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &,
-                                            std::size_t,
-                                            const py::object &,
-                                            char *,
-                                            const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to submit kernel to fill given contiguous memory allocation
- * with specified value.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Length of the sequence
- * @param py_value  Python object representing the value to fill the array with.
- * Must be convertible to `dstTy`.
- * @param dst_p  Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename dstTy>
-sycl::event full_contig_impl(sycl::queue &exec_q,
-                             std::size_t nelems,
-                             const py::object &py_value,
-                             char *dst_p,
-                             const std::vector<sycl::event> &depends)
-{
-    dstTy fill_v = py::cast<dstTy>(py_value);
-
-    sycl::event fill_ev;
-
-    if constexpr (sizeof(dstTy) == sizeof(char)) {
-        const auto memset_val = sycl::bit_cast<unsigned char>(fill_v);
-        fill_ev = exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on(depends);
-
-            cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
-                       nelems * sizeof(dstTy));
-        });
-    }
-    else {
-        bool is_zero = false;
-        if constexpr (sizeof(dstTy) == 1) {
-            is_zero = (std::uint8_t{0} == sycl::bit_cast<std::uint8_t>(fill_v));
-        }
-        else if constexpr (sizeof(dstTy) == 2) {
-            is_zero =
-                (std::uint16_t{0} == sycl::bit_cast<std::uint16_t>(fill_v));
-        }
-        else if constexpr (sizeof(dstTy) == 4) {
-            is_zero =
-                (std::uint32_t{0} == sycl::bit_cast<std::uint32_t>(fill_v));
-        }
-        else if constexpr (sizeof(dstTy) == 8) {
-            is_zero =
-                (std::uint64_t{0} == sycl::bit_cast<std::uint64_t>(fill_v));
-        }
-        else if constexpr (sizeof(dstTy) == 16) {
-            struct UInt128
-            {
-
-                constexpr UInt128() : v1{}, v2{} {}
-                UInt128(const UInt128 &) = default;
-
-                operator bool() const { return bool(!v1) && bool(!v2); }
-
-                std::uint64_t v1;
-                std::uint64_t v2;
-            };
-            is_zero = static_cast<bool>(sycl::bit_cast<UInt128>(fill_v));
-        }
-
-        if (is_zero) {
-            static constexpr int memset_val = 0;
-            fill_ev = exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-
-                cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
-                           nelems * sizeof(dstTy));
-            });
-        }
-        else {
-            using dpctl::tensor::kernels::constructors::full_contig_impl;
-
-            fill_ev =
-                full_contig_impl<dstTy>(exec_q, nelems, fill_v, dst_p, depends);
-        }
-    }
-
-    return fill_ev;
-}
-
-template <typename fnT, typename Ty> struct FullContigFactory
-{
-    fnT get()
-    {
-        fnT f = full_contig_impl<Ty>;
-        return f;
-    }
-};
-
-typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &,
-                                             int,
-                                             std::size_t,
-                                             py::ssize_t *,
-                                             const py::object &,
-                                             char *,
-                                             const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to submit kernel to fill given strided memory allocation
- * with specified value.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nd  Array dimensionality
- * @param nelems  Length of the sequence
- * @param shape_strides  Kernel accessible USM pointer to packed shape and
- * strides of array.
- * @param py_value  Python object representing the value to fill the array with.
- * Must be convertible to `dstTy`.
- * @param dst_p  Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename dstTy>
-sycl::event full_strided_impl(sycl::queue &exec_q,
-                              int nd,
-                              std::size_t nelems,
-                              py::ssize_t *shape_strides,
-                              const py::object &py_value,
-                              char *dst_p,
-                              const std::vector<sycl::event> &depends)
-{
-    dstTy fill_v = py::cast<dstTy>(py_value);
-
-    using dpctl::tensor::kernels::constructors::full_strided_impl;
-    sycl::event fill_ev = full_strided_impl<dstTy>(
-        exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends);
-
-    return fill_ev;
-}
-
-template <typename fnT, typename Ty> struct FullStridedFactory
-{
-    fnT get()
-    {
-        fnT f = full_strided_impl<Ty>;
-        return f;
-    }
-};
-
-static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types];
-static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types];
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_full(const py::object &py_value,
-                 const dpctl::tensor::usm_ndarray &dst,
-                 sycl::queue &exec_q,
-                 const std::vector<sycl::event> &depends)
-{
-    // py_value should be coercible into data type of dst
-
-    py::ssize_t dst_nelems = dst.get_size();
-
-    if (dst_nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with the allocation queue");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_typenum = dst.get_typenum();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    char *dst_data = dst.get_data();
-
-    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
-        auto fn = full_contig_dispatch_vector[dst_typeid];
-
-        sycl::event full_contig_event =
-            fn(exec_q, static_cast<std::size_t>(dst_nelems), py_value, dst_data,
-               depends);
-
-        return std::make_pair(
-            keep_args_alive(exec_q, {dst}, {full_contig_event}),
-            full_contig_event);
-    }
-    else {
-        int nd = dst.get_ndim();
-        auto const &dst_shape = dst.get_shape_vector();
-        auto const &dst_strides = dst.get_strides_vector();
-
-        auto fn = full_strided_dispatch_vector[dst_typeid];
-
-        std::vector<sycl::event> host_task_events;
-        host_task_events.reserve(2);
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, dst_shape, dst_strides);
-        auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple));
-        const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple);
-        py::ssize_t *shape_strides = shape_strides_owner.get();
-
-        const sycl::event &full_strided_ev =
-            fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data,
-               {copy_shape_ev});
-
-        // free shape_strides
-        const auto &temporaries_cleanup_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {full_strided_ev}, shape_strides_owner);
-        host_task_events.push_back(temporaries_cleanup_ev);
-
-        return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events),
-                              full_strided_ev);
-    }
-}
-
-void init_full_ctor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-
-    DispatchVectorBuilder<full_contig_fn_ptr_t, FullContigFactory, num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(full_contig_dispatch_vector);
-
-    DispatchVectorBuilder<full_strided_fn_ptr_t, FullStridedFactory, num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(full_strided_dispatch_vector);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/full_ctor.hpp b/dpctl/tensor/libtensor/source/full_ctor.hpp
deleted file mode 100644
index e9eac5d44a..0000000000
--- a/dpctl/tensor/libtensor/source/full_ctor.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_full(const py::object &py_value,
-                 const dpctl::tensor::usm_ndarray &dst,
-                 sycl::queue &exec_q,
-                 const std::vector<sycl::event> &depends = {});
-
-extern void init_full_ctor_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
deleted file mode 100644
index 5eb54bbe70..0000000000
--- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
+++ /dev/null
@@ -1,811 +0,0 @@
-//===-- integer_advanced_indexing.cpp -                         --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines implementation functions of dpctl.tensor.take and
-/// dpctl.tensor.put
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/integer_advanced_indexing.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "integer_advanced_indexing.hpp"
-
-#define INDEXING_MODES 2
-#define WRAP_MODE 0
-#define CLIP_MODE 1
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::indexing::put_fn_ptr_t;
-using dpctl::tensor::kernels::indexing::take_fn_ptr_t;
-
-static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types]
-                                        [td_ns::num_types];
-
-static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types]
-                                      [td_ns::num_types];
-
-namespace py = pybind11;
-
-using dpctl::utils::keep_args_alive;
-
-std::vector<sycl::event>
-_populate_kernel_params(sycl::queue &exec_q,
-                        std::vector<sycl::event> &host_task_events,
-                        char **device_ind_ptrs,
-                        py::ssize_t *device_ind_sh_st,
-                        py::ssize_t *device_ind_offsets,
-                        py::ssize_t *device_orthog_sh_st,
-                        py::ssize_t *device_along_sh_st,
-                        const py::ssize_t *inp_shape,
-                        const py::ssize_t *arr_shape,
-                        std::vector<py::ssize_t> &inp_strides,
-                        std::vector<py::ssize_t> &arr_strides,
-                        std::vector<py::ssize_t> &ind_sh_sts,
-                        std::vector<char *> &ind_ptrs,
-                        std::vector<py::ssize_t> &ind_offsets,
-                        int axis_start,
-                        int k,
-                        int ind_nd,
-                        int inp_nd,
-                        int orthog_sh_elems,
-                        int ind_sh_elems)
-{
-
-    using usm_host_allocator_T =
-        dpctl::tensor::alloc_utils::usm_host_allocator<char *>;
-    using ptrT = std::vector<char *, usm_host_allocator_T>;
-
-    usm_host_allocator_T ptr_allocator(exec_q);
-    std::shared_ptr<ptrT> host_ind_ptrs_shp =
-        std::make_shared<ptrT>(k, ptr_allocator);
-
-    using usm_host_allocatorT =
-        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
-    using shT = std::vector<py::ssize_t, usm_host_allocatorT>;
-
-    usm_host_allocatorT sz_allocator(exec_q);
-    std::shared_ptr<shT> host_ind_sh_st_shp =
-        std::make_shared<shT>(ind_sh_elems * (k + 1), sz_allocator);
-
-    std::shared_ptr<shT> host_ind_offsets_shp =
-        std::make_shared<shT>(k, sz_allocator);
-
-    std::shared_ptr<shT> host_orthog_sh_st_shp =
-        std::make_shared<shT>(3 * orthog_sh_elems, sz_allocator);
-
-    std::shared_ptr<shT> host_along_sh_st_shp =
-        std::make_shared<shT>(2 * (k + ind_sh_elems), sz_allocator);
-
-    std::copy(ind_sh_sts.begin(), ind_sh_sts.end(),
-              host_ind_sh_st_shp->begin());
-    std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin());
-    std::copy(ind_offsets.begin(), ind_offsets.end(),
-              host_ind_offsets_shp->begin());
-
-    const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy<char *>(
-        host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size());
-
-    const sycl::event &device_ind_sh_st_copy_ev =
-        exec_q.copy<py::ssize_t>(host_ind_sh_st_shp->data(), device_ind_sh_st,
-                                 host_ind_sh_st_shp->size());
-
-    const sycl::event &device_ind_offsets_copy_ev = exec_q.copy<py::ssize_t>(
-        host_ind_offsets_shp->data(), device_ind_offsets,
-        host_ind_offsets_shp->size());
-
-    int orthog_nd = inp_nd - k;
-
-    if (orthog_nd > 0) {
-        if (axis_start > 0) {
-            std::copy(inp_shape, inp_shape + axis_start,
-                      host_orthog_sh_st_shp->begin());
-            std::copy(inp_strides.begin(), inp_strides.begin() + axis_start,
-                      host_orthog_sh_st_shp->begin() + orthog_sh_elems);
-            std::copy(arr_strides.begin(), arr_strides.begin() + axis_start,
-                      host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems);
-        }
-        if (inp_nd > (axis_start + k)) {
-            std::copy(inp_shape + axis_start + k, inp_shape + inp_nd,
-                      host_orthog_sh_st_shp->begin() + axis_start);
-            std::copy(inp_strides.begin() + axis_start + k, inp_strides.end(),
-                      host_orthog_sh_st_shp->begin() + orthog_sh_elems +
-                          axis_start);
-
-            std::copy(arr_strides.begin() + axis_start + ind_nd,
-                      arr_strides.end(),
-                      host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems +
-                          axis_start);
-        }
-    }
-
-    if (inp_nd > 0) {
-        std::copy(inp_shape + axis_start, inp_shape + axis_start + k,
-                  host_along_sh_st_shp->begin());
-
-        std::copy(inp_strides.begin() + axis_start,
-                  inp_strides.begin() + axis_start + k,
-                  host_along_sh_st_shp->begin() + k);
-    }
-
-    if (ind_nd > 0) {
-        std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd,
-                  host_along_sh_st_shp->begin() + 2 * k);
-        std::copy(arr_strides.begin() + axis_start,
-                  arr_strides.begin() + axis_start + ind_nd,
-                  host_along_sh_st_shp->begin() + 2 * k + ind_nd);
-    }
-
-    const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy<py::ssize_t>(
-        host_orthog_sh_st_shp->data(), device_orthog_sh_st,
-        host_orthog_sh_st_shp->size());
-
-    const sycl::event &device_along_sh_st_copy_ev = exec_q.copy<py::ssize_t>(
-        host_along_sh_st_shp->data(), device_along_sh_st,
-        host_along_sh_st_shp->size());
-
-    const sycl::event &shared_ptr_cleanup_ev =
-        exec_q.submit([&](sycl::handler &cgh) {
-            cgh.depends_on({device_along_sh_st_copy_ev,
-                            device_orthog_sh_st_copy_ev,
-                            device_ind_offsets_copy_ev,
-                            device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev});
-            cgh.host_task(
-                [host_ind_offsets_shp = std::move(host_ind_offsets_shp),
-                 host_ind_sh_st_shp = std::move(host_ind_sh_st_shp),
-                 host_ind_ptrs_shp = std::move(host_ind_ptrs_shp),
-                 host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp),
-                 host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {});
-        });
-    host_task_events.push_back(shared_ptr_cleanup_ev);
-
-    std::vector<sycl::event> sh_st_pack_deps{
-        device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev,
-        device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev,
-        device_along_sh_st_copy_ev};
-    return sh_st_pack_deps;
-}
-
-/* Utility to parse python object py_ind into vector of `usm_ndarray`s */
-std::vector<dpctl::tensor::usm_ndarray> parse_py_ind(const sycl::queue &q,
-                                                     const py::object &py_ind)
-{
-    std::size_t ind_count = py::len(py_ind);
-    std::vector<dpctl::tensor::usm_ndarray> res;
-    res.reserve(ind_count);
-
-    bool nd_is_known = false;
-    int nd = -1;
-    for (std::size_t i = 0; i < ind_count; ++i) {
-        py::object el_i = py_ind[py::cast(i)];
-        dpctl::tensor::usm_ndarray arr_i =
-            py::cast<dpctl::tensor::usm_ndarray>(el_i);
-        if (!dpctl::utils::queues_are_compatible(q, {arr_i})) {
-            throw py::value_error("Index allocation queue is not compatible "
-                                  "with execution queue");
-        }
-        if (nd_is_known) {
-            if (nd != arr_i.get_ndim()) {
-                throw py::value_error(
-                    "Indices must have the same number of dimensions.");
-            }
-        }
-        else {
-            nd_is_known = true;
-            nd = arr_i.get_ndim();
-        }
-        res.push_back(arr_i);
-    }
-
-    return res;
-}
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_take(const dpctl::tensor::usm_ndarray &src,
-                 const py::object &py_ind,
-                 const dpctl::tensor::usm_ndarray &dst,
-                 int axis_start,
-                 std::uint8_t mode,
-                 sycl::queue &exec_q,
-                 const std::vector<sycl::event> &depends)
-{
-    std::vector<dpctl::tensor::usm_ndarray> ind = parse_py_ind(exec_q, py_ind);
-
-    int k = ind.size();
-
-    if (k == 0) {
-        throw py::value_error("List of indices is empty.");
-    }
-
-    if (axis_start < 0) {
-        throw py::value_error("Axis cannot be negative.");
-    }
-
-    if (mode != 0 && mode != 1) {
-        throw py::value_error("Mode must be 0 or 1.");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    const dpctl::tensor::usm_ndarray ind_rep = ind[0];
-
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-    int ind_nd = ind_rep.get_ndim();
-
-    auto sh_elems = std::max<int>(src_nd, 1);
-
-    if (axis_start + k > sh_elems) {
-        throw py::value_error("Axes are out of range for array of dimension " +
-                              std::to_string(src_nd));
-    }
-    if (src_nd == 0) {
-        if (dst_nd != ind_nd) {
-            throw py::value_error(
-                "Destination is not of appropriate dimension for take kernel.");
-        }
-    }
-    else {
-        if (dst_nd != (src_nd - k + ind_nd)) {
-            throw py::value_error(
-                "Destination is not of appropriate dimension for take kernel.");
-        }
-    }
-
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-
-    bool orthog_shapes_equal(true);
-    std::size_t orthog_nelems(1);
-    for (int i = 0; i < (src_nd - k); ++i) {
-        auto idx1 = (i < axis_start) ? i : i + k;
-        auto idx2 = (i < axis_start) ? i : i + ind_nd;
-
-        orthog_nelems *= static_cast<std::size_t>(src_shape[idx1]);
-        orthog_shapes_equal =
-            orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]);
-    }
-
-    if (!orthog_shapes_equal) {
-        throw py::value_error(
-            "Axes of basic indices are not of matching shapes.");
-    }
-
-    if (orthog_nelems == 0) {
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Array memory overlap.");
-    }
-
-    py::ssize_t src_offset = py::ssize_t(0);
-    py::ssize_t dst_offset = py::ssize_t(0);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_type_id != dst_type_id) {
-        throw py::type_error("Array data types are not the same.");
-    }
-
-    const py::ssize_t *ind_shape = ind_rep.get_shape_raw();
-
-    int ind_typenum = ind_rep.get_typenum();
-    int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum);
-
-    std::size_t ind_nelems(1);
-    for (int i = 0; i < ind_nd; ++i) {
-        ind_nelems *= static_cast<std::size_t>(ind_shape[i]);
-
-        if (!(ind_shape[i] == dst_shape[axis_start + i])) {
-            throw py::value_error(
-                "Indices shape does not match shape of axis in destination.");
-        }
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, orthog_nelems * ind_nelems);
-
-    int ind_sh_elems = std::max<int>(ind_nd, 1);
-
-    std::vector<char *> ind_ptrs;
-    ind_ptrs.reserve(k);
-
-    std::vector<py::ssize_t> ind_offsets;
-    ind_offsets.reserve(k);
-
-    std::vector<py::ssize_t> ind_sh_sts((k + 1) * ind_sh_elems, 0);
-    if (ind_nd > 0) {
-        std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin());
-    }
-    for (int i = 0; i < k; ++i) {
-        dpctl::tensor::usm_ndarray ind_ = ind[i];
-
-        if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) {
-            throw py::value_error(
-                "Execution queue is not compatible with allocation queues");
-        }
-
-        // ndim, type, and shape are checked against the first array
-        if (i > 0) {
-            if (!(ind_.get_ndim() == ind_nd)) {
-                throw py::value_error("Index dimensions are not the same");
-            }
-
-            if (!(ind_type_id ==
-                  array_types.typenum_to_lookup_id(ind_.get_typenum())))
-            {
-                throw py::type_error(
-                    "Indices array data types are not all the same.");
-            }
-
-            const py::ssize_t *ind_shape_ = ind_.get_shape_raw();
-            for (int dim = 0; dim < ind_nd; ++dim) {
-                if (!(ind_shape[dim] == ind_shape_[dim])) {
-                    throw py::value_error("Indices shapes are not all equal.");
-                }
-            }
-        }
-
-        // check for overlap with destination
-        if (overlap(dst, ind_)) {
-            throw py::value_error(
-                "Arrays index overlapping segments of memory");
-        }
-
-        char *ind_data = ind_.get_data();
-
-        // strides are initialized to 0 for 0D indices, so skip here
-        if (ind_nd > 0) {
-            auto ind_strides = ind_.get_strides_vector();
-            std::copy(ind_strides.begin(), ind_strides.end(),
-                      ind_sh_sts.begin() + (i + 1) * ind_nd);
-        }
-
-        ind_ptrs.push_back(ind_data);
-        ind_offsets.push_back(py::ssize_t(0));
-    }
-
-    if (ind_nelems == 0) {
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    auto packed_ind_ptrs_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<char *>(k, exec_q);
-    char **packed_ind_ptrs = packed_ind_ptrs_owner.get();
-
-    // rearrange to past where indices shapes are checked
-    // packed_ind_shapes_strides = [ind_shape,
-    //                              ind[0] strides,
-    //                              ...,
-    //                              ind[k] strides]
-    auto packed_ind_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            (k + 1) * ind_sh_elems, exec_q);
-    py::ssize_t *packed_ind_shapes_strides =
-        packed_ind_shapes_strides_owner.get();
-
-    auto packed_ind_offsets_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(k, exec_q);
-    py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get();
-
-    int orthog_sh_elems = std::max<int>(src_nd - k, 1);
-
-    // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:],
-    //                          src_strides[:axis] + src_strides[axis+k:],
-    //                          dst_strides[:axis] +
-    //                          dst_strides[axis+ind.ndim:]]
-    auto packed_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            3 * orthog_sh_elems, exec_q);
-    py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get();
-
-    // packed_axes_shapes_strides = [src_shape[axis:axis+k],
-    //                               src_strides[axis:axis+k],
-    //                               dst_shape[axis:axis+ind.ndim],
-    //                               dst_strides[axis:axis+ind.ndim]]
-    auto packed_axes_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            2 * (k + ind_sh_elems), exec_q);
-    py::ssize_t *packed_axes_shapes_strides =
-        packed_axes_shapes_strides_owner.get();
-
-    auto src_strides = src.get_strides_vector();
-    auto dst_strides = dst.get_strides_vector();
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    std::vector<sycl::event> pack_deps = _populate_kernel_params(
-        exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides,
-        packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides,
-        src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs,
-        ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems,
-        ind_sh_elems);
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + pack_deps.size());
-    all_deps.insert(std::end(all_deps), std::begin(pack_deps),
-                    std::end(pack_deps));
-    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
-
-    auto fn = take_dispatch_table[mode][src_type_id][ind_type_id];
-
-    if (fn == nullptr) {
-        sycl::event::wait(host_task_events);
-        throw std::runtime_error("Indices must be integer type, got " +
-                                 std::to_string(ind_type_id));
-    }
-
-    sycl::event take_generic_ev =
-        fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k,
-           packed_shapes_strides, packed_axes_shapes_strides,
-           packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs,
-           src_offset, dst_offset, packed_ind_offsets, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {take_generic_ev}, packed_shapes_strides_owner,
-            packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner,
-            packed_ind_ptrs_owner, packed_ind_offsets_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    sycl::event arg_cleanup_ev =
-        keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events);
-
-    return std::make_pair(arg_cleanup_ev, take_generic_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst,
-                const py::object &py_ind,
-                const dpctl::tensor::usm_ndarray &val,
-                int axis_start,
-                std::uint8_t mode,
-                sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends)
-{
-    std::vector<dpctl::tensor::usm_ndarray> ind = parse_py_ind(exec_q, py_ind);
-    int k = ind.size();
-
-    if (k == 0) {
-        // no indices to write to
-        throw py::value_error("List of indices is empty.");
-    }
-
-    if (axis_start < 0) {
-        throw py::value_error("Axis cannot be negative.");
-    }
-
-    if (mode != 0 && mode != 1) {
-        throw py::value_error("Mode must be 0 or 1.");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    const dpctl::tensor::usm_ndarray ind_rep = ind[0];
-
-    int dst_nd = dst.get_ndim();
-    int val_nd = val.get_ndim();
-    int ind_nd = ind_rep.get_ndim();
-
-    auto sh_elems = std::max<int>(dst_nd, 1);
-
-    if (axis_start + k > sh_elems) {
-        throw py::value_error("Axes are out of range for array of dimension " +
-                              std::to_string(dst_nd));
-    }
-    if (dst_nd == 0) {
-        if (val_nd != ind_nd) {
-            throw py::value_error("Destination is not of appropriate dimension "
-                                  "for put function.");
-        }
-    }
-    else {
-        if (val_nd != (dst_nd - k + ind_nd)) {
-            throw py::value_error("Destination is not of appropriate dimension "
-                                  "for put function.");
-        }
-    }
-
-    std::size_t dst_nelems = dst.get_size();
-
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    const py::ssize_t *val_shape = val.get_shape_raw();
-
-    bool orthog_shapes_equal(true);
-    std::size_t orthog_nelems(1);
-    for (int i = 0; i < (dst_nd - k); ++i) {
-        auto idx1 = (i < axis_start) ? i : i + k;
-        auto idx2 = (i < axis_start) ? i : i + ind_nd;
-
-        orthog_nelems *= static_cast<std::size_t>(dst_shape[idx1]);
-        orthog_shapes_equal =
-            orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]);
-    }
-
-    if (!orthog_shapes_equal) {
-        throw py::value_error(
-            "Axes of basic indices are not of matching shapes.");
-    }
-
-    if (orthog_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    char *dst_data = dst.get_data();
-    char *val_data = val.get_data();
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(val, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    py::ssize_t dst_offset = py::ssize_t(0);
-    py::ssize_t val_offset = py::ssize_t(0);
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    int dst_typenum = dst.get_typenum();
-    int val_typenum = val.get_typenum();
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
-    int val_type_id = array_types.typenum_to_lookup_id(val_typenum);
-
-    if (dst_type_id != val_type_id) {
-        throw py::type_error("Array data types are not the same.");
-    }
-
-    const py::ssize_t *ind_shape = ind_rep.get_shape_raw();
-
-    int ind_typenum = ind_rep.get_typenum();
-    int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum);
-
-    std::size_t ind_nelems(1);
-    for (int i = 0; i < ind_nd; ++i) {
-        ind_nelems *= static_cast<std::size_t>(ind_shape[i]);
-
-        if (!(ind_shape[i] == val_shape[axis_start + i])) {
-            throw py::value_error(
-                "Indices shapes does not match shape of axis in vals.");
-        }
-    }
-
-    auto ind_sh_elems = std::max<int>(ind_nd, 1);
-
-    std::vector<char *> ind_ptrs;
-    ind_ptrs.reserve(k);
-    std::vector<py::ssize_t> ind_offsets;
-    ind_offsets.reserve(k);
-    std::vector<py::ssize_t> ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0));
-    if (ind_nd > 0) {
-        std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin());
-    }
-    for (int i = 0; i < k; ++i) {
-        dpctl::tensor::usm_ndarray ind_ = ind[i];
-
-        if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) {
-            throw py::value_error(
-                "Execution queue is not compatible with allocation queues");
-        }
-
-        // ndim, type, and shape are checked against the first array
-        if (i > 0) {
-            if (!(ind_.get_ndim() == ind_nd)) {
-                throw py::value_error("Index dimensions are not the same");
-            }
-
-            if (!(ind_type_id ==
-                  array_types.typenum_to_lookup_id(ind_.get_typenum())))
-            {
-                throw py::type_error(
-                    "Indices array data types are not all the same.");
-            }
-
-            const py::ssize_t *ind_shape_ = ind_.get_shape_raw();
-            for (int dim = 0; dim < ind_nd; ++dim) {
-                if (!(ind_shape[dim] == ind_shape_[dim])) {
-                    throw py::value_error("Indices shapes are not all equal.");
-                }
-            }
-        }
-
-        // check for overlap with destination
-        if (overlap(ind_, dst)) {
-            throw py::value_error(
-                "Arrays index overlapping segments of memory");
-        }
-
-        char *ind_data = ind_.get_data();
-
-        // strides are initialized to 0 for 0D indices, so skip here
-        if (ind_nd > 0) {
-            auto ind_strides = ind_.get_strides_vector();
-            std::copy(ind_strides.begin(), ind_strides.end(),
-                      ind_sh_sts.begin() + (i + 1) * ind_nd);
-        }
-
-        ind_ptrs.push_back(ind_data);
-        ind_offsets.push_back(py::ssize_t(0));
-    }
-
-    if (ind_nelems == 0) {
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    auto packed_ind_ptrs_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<char *>(k, exec_q);
-    char **packed_ind_ptrs = packed_ind_ptrs_owner.get();
-
-    // packed_ind_shapes_strides = [ind_shape,
-    //                              ind[0] strides,
-    //                              ...,
-    //                              ind[k] strides]
-    auto packed_ind_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            (k + 1) * ind_sh_elems, exec_q);
-    py::ssize_t *packed_ind_shapes_strides =
-        packed_ind_shapes_strides_owner.get();
-
-    auto packed_ind_offsets_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(k, exec_q);
-    py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get();
-
-    int orthog_sh_elems = std::max<int>(dst_nd - k, 1);
-
-    // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:],
-    //                          dst_strides[:axis] + dst_strides[axis+k:],
-    //                          val_strides[:axis] +
-    //                          val_strides[axis+ind.ndim:]]
-    auto packed_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            3 * orthog_sh_elems, exec_q);
-    py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get();
-
-    // packed_axes_shapes_strides = [dst_shape[axis:axis+k],
-    //                               dst_strides[axis:axis+k],
-    //                               val_shape[axis:axis+ind.ndim],
-    //                               val_strides[axis:axis+ind.ndim]]
-    auto packed_axes_shapes_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(
-            2 * (k + ind_sh_elems), exec_q);
-    py::ssize_t *packed_axes_shapes_strides =
-        packed_axes_shapes_strides_owner.get();
-
-    auto dst_strides = dst.get_strides_vector();
-    auto val_strides = val.get_strides_vector();
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    std::vector<sycl::event> pack_deps = _populate_kernel_params(
-        exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides,
-        packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides,
-        dst_shape, val_shape, dst_strides, val_strides, ind_sh_sts, ind_ptrs,
-        ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems,
-        ind_sh_elems);
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + pack_deps.size());
-    all_deps.insert(std::end(all_deps), std::begin(pack_deps),
-                    std::end(pack_deps));
-    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
-
-    auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id];
-
-    if (fn == nullptr) {
-        sycl::event::wait(host_task_events);
-        throw std::runtime_error("Indices must be integer type, got " +
-                                 std::to_string(ind_type_id));
-    }
-
-    sycl::event put_generic_ev =
-        fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k,
-           packed_shapes_strides, packed_axes_shapes_strides,
-           packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs,
-           dst_offset, val_offset, packed_ind_offsets, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {put_generic_ev}, packed_shapes_strides_owner,
-            packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner,
-            packed_ind_ptrs_owner, packed_ind_offsets_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    sycl::event arg_cleanup_ev =
-        keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events);
-
-    return std::make_pair(arg_cleanup_ev, put_generic_ev);
-}
-
-void init_advanced_indexing_dispatch_tables(void)
-{
-    using namespace td_ns;
-
-    using dpctl::tensor::kernels::indexing::TakeClipFactory;
-    DispatchTableBuilder<take_fn_ptr_t, TakeClipFactory, num_types>
-        dtb_takeclip;
-    dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]);
-
-    using dpctl::tensor::kernels::indexing::TakeWrapFactory;
-    DispatchTableBuilder<take_fn_ptr_t, TakeWrapFactory, num_types>
-        dtb_takewrap;
-    dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]);
-
-    using dpctl::tensor::kernels::indexing::PutClipFactory;
-    DispatchTableBuilder<put_fn_ptr_t, PutClipFactory, num_types> dtb_putclip;
-    dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]);
-
-    using dpctl::tensor::kernels::indexing::PutWrapFactory;
-    DispatchTableBuilder<put_fn_ptr_t, PutWrapFactory, num_types> dtb_putwrap;
-    dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
deleted file mode 100644
index 10555b3dad..0000000000
--- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-//===-- integer_advanced_indexing.hpp -                         --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares Python API for implementation functions of
-/// dpctl.tensor.take and dpctl.tensor.put
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_take(const dpctl::tensor::usm_ndarray &,
-                 const py::object &,
-                 const dpctl::tensor::usm_ndarray &,
-                 int,
-                 std::uint8_t,
-                 sycl::queue &,
-                 const std::vector<sycl::event> & = {});
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_put(const dpctl::tensor::usm_ndarray &,
-                const py::object &,
-                const dpctl::tensor::usm_ndarray &,
-                int,
-                std::uint8_t,
-                sycl::queue &,
-                const std::vector<sycl::event> & = {});
-
-extern void init_advanced_indexing_dispatch_tables(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
deleted file mode 100644
index 57bf88c00c..0000000000
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp
+++ /dev/null
@@ -1,859 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "dot.hpp"
-#include "dot_atomic_support.hpp"
-#include "dot_dispatch.hpp"
-#include "elementwise_functions/elementwise_functions_type_utils.hpp"
-#include "kernels/linalg_functions/dot_product.hpp"
-#include "kernels/linalg_functions/gemm.hpp"
-#include "reductions/reduction_atomic_support.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-static int dot_output_id_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::dot_product_impl_fn_ptr_t;
-static dot_product_impl_fn_ptr_t dot_product_dispatch_table[td_ns::num_types]
-                                                           [td_ns::num_types];
-
-static dot_product_impl_fn_ptr_t
-    dot_product_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::dot_product_contig_impl_fn_ptr_t;
-static dot_product_contig_impl_fn_ptr_t
-    dot_product_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static dot_product_contig_impl_fn_ptr_t
-    dot_product_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::gemm_impl_fn_ptr_t;
-static gemm_impl_fn_ptr_t gemm_atomic_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-
-static gemm_impl_fn_ptr_t gemm_temps_dispatch_table[td_ns::num_types]
-                                                   [td_ns::num_types];
-
-using dpctl::tensor::kernels::gemm_contig_impl_fn_ptr_t;
-static gemm_contig_impl_fn_ptr_t
-    gemm_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static gemm_contig_impl_fn_ptr_t
-    gemm_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::gemm_batch_impl_fn_ptr_t;
-static gemm_batch_impl_fn_ptr_t
-    gemm_batch_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static gemm_batch_impl_fn_ptr_t
-    gemm_batch_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::tensor::kernels::gemm_batch_contig_impl_fn_ptr_t;
-static gemm_batch_contig_impl_fn_ptr_t
-    gemm_batch_contig_atomic_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static gemm_batch_contig_impl_fn_ptr_t
-    gemm_batch_contig_temps_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void init_dot_dispatch_tables(void)
-{
-    using dpctl::tensor::py_internal::DotTypeMapFactory;
-    td_ns::DispatchTableBuilder<int, DotTypeMapFactory, td_ns::num_types> dtb1;
-    dtb1.populate_dispatch_table(dot_output_id_table);
-
-    using dpctl::tensor::py_internal::GemmBatchAtomicFactory;
-    td_ns::DispatchTableBuilder<gemm_batch_impl_fn_ptr_t,
-                                GemmBatchAtomicFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(gemm_batch_atomic_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmBatchContigAtomicFactory;
-    td_ns::DispatchTableBuilder<gemm_batch_contig_impl_fn_ptr_t,
-                                GemmBatchContigAtomicFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(gemm_batch_contig_atomic_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmAtomicFactory;
-    td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmAtomicFactory,
-                                td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(gemm_atomic_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmContigAtomicFactory;
-    td_ns::DispatchTableBuilder<gemm_contig_impl_fn_ptr_t,
-                                GemmContigAtomicFactory, td_ns::num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(gemm_contig_atomic_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmBatchTempsFactory;
-    td_ns::DispatchTableBuilder<gemm_batch_impl_fn_ptr_t, GemmBatchTempsFactory,
-                                td_ns::num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(gemm_batch_temps_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmBatchContigTempsFactory;
-    td_ns::DispatchTableBuilder<gemm_batch_contig_impl_fn_ptr_t,
-                                GemmBatchContigTempsFactory, td_ns::num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(gemm_batch_contig_temps_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmTempsFactory;
-    td_ns::DispatchTableBuilder<gemm_impl_fn_ptr_t, GemmTempsFactory,
-                                td_ns::num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(gemm_temps_dispatch_table);
-
-    using dpctl::tensor::py_internal::GemmContigTempsFactory;
-    td_ns::DispatchTableBuilder<gemm_contig_impl_fn_ptr_t,
-                                GemmContigTempsFactory, td_ns::num_types>
-        dtb9;
-    dtb9.populate_dispatch_table(gemm_contig_temps_dispatch_table);
-
-    using dpctl::tensor::py_internal::DotProductAtomicFactory;
-    td_ns::DispatchTableBuilder<dot_product_impl_fn_ptr_t,
-                                DotProductAtomicFactory, td_ns::num_types>
-        dtb10;
-    dtb10.populate_dispatch_table(dot_product_dispatch_table);
-
-    using dpctl::tensor::py_internal::DotProductNoAtomicFactory;
-    td_ns::DispatchTableBuilder<dot_product_impl_fn_ptr_t,
-                                DotProductNoAtomicFactory, td_ns::num_types>
-        dtb11;
-    dtb11.populate_dispatch_table(dot_product_temps_dispatch_table);
-
-    using dpctl::tensor::py_internal::DotProductContigAtomicFactory;
-    td_ns::DispatchTableBuilder<dot_product_contig_impl_fn_ptr_t,
-                                DotProductContigAtomicFactory, td_ns::num_types>
-        dtb12;
-    dtb12.populate_dispatch_table(dot_product_contig_dispatch_table);
-
-    using dpctl::tensor::py_internal::DotProductContigNoAtomicFactory;
-    td_ns::DispatchTableBuilder<dot_product_contig_impl_fn_ptr_t,
-                                DotProductContigNoAtomicFactory,
-                                td_ns::num_types>
-        dtb13;
-    dtb13.populate_dispatch_table(dot_product_contig_temps_dispatch_table);
-}
-
-using atomic_support::atomic_support_fn_ptr_t;
-static atomic_support_fn_ptr_t dot_atomic_support_vector[td_ns::num_types];
-
-void init_dot_atomic_support_vector(void)
-{
-
-    using atomic_support::DotAtomicSupportFactory;
-    td_ns::DispatchVectorBuilder<atomic_support_fn_ptr_t,
-                                 DotAtomicSupportFactory, td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(dot_atomic_support_vector);
-}
-
-std::pair<sycl::event, sycl::event>
-py_dot(const dpctl::tensor::usm_ndarray &x1,
-       const dpctl::tensor::usm_ndarray &x2,
-       int batch_dims,
-       int x1_outer_dims,
-       int x2_outer_dims,
-       int inner_dims,
-       const dpctl::tensor::usm_ndarray &dst,
-       sycl::queue &exec_q,
-       const std::vector<sycl::event> &depends)
-{
-    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if (inner_dims == 0) {
-        throw py::value_error("No inner dimension for dot");
-    }
-
-    int x1_nd = x1.get_ndim();
-    int x2_nd = x2.get_ndim();
-    if (x1_nd != (batch_dims + x1_outer_dims + inner_dims) ||
-        x2_nd != (batch_dims + x2_outer_dims + inner_dims))
-    {
-        throw py::value_error("Input arrays do not have dimensions consistent "
-                              "with input dimensions");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != (batch_dims + x1_outer_dims + x2_outer_dims)) {
-        throw py::value_error("Destination array rank does not match input "
-                              "array rank and number of input dimensions");
-    }
-
-    const py::ssize_t *x1_shape_ptr = x1.get_shape_raw();
-    const py::ssize_t *x2_shape_ptr = x2.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    std::size_t batches(1);
-    for (int i = 0; same_shapes && (i < batch_dims); ++i) {
-        same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]) &&
-                      (x2_shape_ptr[i] == dst_shape_ptr[i]);
-        batches *= x1_shape_ptr[i];
-    }
-    std::size_t x1_outer_nelems(1);
-    for (int i = batch_dims; same_shapes && (i < (batch_dims + x1_outer_dims));
-         ++i)
-    {
-        same_shapes = same_shapes && (x1_shape_ptr[i] == dst_shape_ptr[i]);
-        x1_outer_nelems *= x1_shape_ptr[i];
-    }
-    std::size_t inner_nelems(1);
-    for (int i = batch_dims; i < (batch_dims + inner_dims); ++i) {
-        auto x1_shape_idx = x1_outer_dims + i;
-        same_shapes =
-            same_shapes && (x1_shape_ptr[x1_shape_idx] == x2_shape_ptr[i]);
-        inner_nelems *= x1_shape_ptr[x1_shape_idx];
-    }
-    std::size_t x2_outer_nelems(1);
-    for (int i = 0; same_shapes && (i < x2_outer_dims); ++i) {
-        auto x2_shape_idx = batch_dims + inner_dims + i;
-        same_shapes =
-            same_shapes && (x2_shape_ptr[x2_shape_idx] ==
-                            dst_shape_ptr[batch_dims + x1_outer_dims + i]);
-        x2_outer_nelems *= x2_shape_ptr[x2_shape_idx];
-    }
-    if (!same_shapes) {
-        throw py::value_error("Input arrays to tensor dot product do not have "
-                              "appropriate shapes");
-    }
-
-    std::size_t dst_nelems = batches * x1_outer_nelems * x2_outer_nelems;
-    if (dst_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    if (static_cast<std::size_t>(dst.get_size()) != dst_nelems) {
-        throw py::value_error("dst shape and size mismatch");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with x1 or x2
-    if (overlap(dst, x1) || overlap(dst, x2)) {
-        throw py::value_error("Result array overlaps with inputs");
-    }
-
-    int x1_typenum = x1.get_typenum();
-    int x2_typenum = x2.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum);
-    int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    int output_typeid = dot_output_id_table[x1_typeid][x2_typeid];
-
-    if (output_typeid != dst_typeid) {
-        throw py::value_error(
-            "Result array has unexpected elemental data type.");
-    }
-
-    void *data_ptr = dst.get_data();
-    const auto &ctx = exec_q.get_context();
-    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
-    bool supports_atomics =
-        dot_atomic_support_vector[output_typeid](exec_q, usm_type);
-
-    const char *x1_data = x1.get_data();
-    const char *x2_data = x2.get_data();
-    char *dst_data = dst.get_data();
-
-    const auto &x1_shape_vec = x1.get_shape_vector();
-    const auto &x1_strides_vec = x1.get_strides_vector();
-
-    const auto &x2_shape_vec = x2.get_shape_vector();
-    const auto &x2_strides_vec = x2.get_strides_vector();
-
-    const auto &dst_shape_vec = dst.get_shape_vector();
-    const auto &dst_strides_vec = dst.get_strides_vector();
-
-    bool is_x1_c_contig = x1.is_c_contiguous();
-    bool is_x1_f_contig = x1.is_f_contiguous();
-    bool is_x2_c_contig = x2.is_c_contiguous();
-    bool is_x2_f_contig = x2.is_f_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    bool call_vecdot = ((x1_outer_dims == 0 && x1_outer_nelems == 1) &&
-                        (x2_outer_dims == 0 && x2_outer_nelems == 1));
-
-    bool call_batched = (batch_dims != 0 || batches > 1);
-    std::vector<sycl::event> host_task_events{};
-    sycl::event dot_ev;
-    if (call_vecdot) {
-        if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig) ||
-            ((is_x1_f_contig && is_x2_f_contig) && !call_batched))
-        {
-            dot_product_contig_impl_fn_ptr_t fn = nullptr;
-            if (supports_atomics) {
-                fn = dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
-            }
-            else {
-                fn = dot_product_contig_temps_dispatch_table[x1_typeid]
-                                                            [x2_typeid];
-            }
-            if (fn != nullptr) {
-                static constexpr py::ssize_t zero_offset = 0;
-                dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
-                            x2.get_data(), dst.get_data(),
-                            zero_offset, // lhs batch offset
-                            zero_offset, // rhs batch offset
-                            zero_offset, // res batch offset
-                            zero_offset, // lhs reduction offset
-                            zero_offset, // rhs reduction offset
-                            depends);
-                return std::make_pair(dpctl::utils::keep_args_alive(
-                                          exec_q, {x1, x2, dst}, {dot_ev}),
-                                      dot_ev);
-            }
-        }
-        using dpctl::tensor::py_internal::simplify_iteration_space;
-        using dpctl::tensor::py_internal::simplify_iteration_space_3;
-
-        int inner_nd = inner_dims;
-        const py::ssize_t *inner_shape_ptr = x1_shape_ptr + batch_dims;
-        using shT = std::vector<py::ssize_t>;
-        const shT inner_x1_strides(std::begin(x1_strides_vec) + batch_dims,
-                                   std::end(x1_strides_vec));
-        const shT inner_x2_strides(std::begin(x2_strides_vec) + batch_dims,
-                                   std::end(x2_strides_vec));
-
-        shT simplified_inner_shape;
-        shT simplified_inner_x1_strides;
-        shT simplified_inner_x2_strides;
-        py::ssize_t inner_x1_offset(0);
-        py::ssize_t inner_x2_offset(0);
-
-        simplify_iteration_space(
-            inner_nd, inner_shape_ptr, inner_x1_strides, inner_x2_strides,
-            // output
-            simplified_inner_shape, simplified_inner_x1_strides,
-            simplified_inner_x2_strides, inner_x1_offset, inner_x2_offset);
-
-        const py::ssize_t *batch_shape_ptr = x1_shape_ptr;
-
-        const shT batch_x1_strides(std::begin(x1_strides_vec),
-                                   std::begin(x1_strides_vec) + batch_dims);
-        const shT batch_x2_strides(std::begin(x2_strides_vec),
-                                   std::begin(x2_strides_vec) + batch_dims);
-        shT const &batch_dst_strides = dst_strides_vec;
-
-        shT simplified_batch_shape;
-        shT simplified_batch_x1_strides;
-        shT simplified_batch_x2_strides;
-        shT simplified_batch_dst_strides;
-        py::ssize_t batch_x1_offset(0);
-        py::ssize_t batch_x2_offset(0);
-        py::ssize_t batch_dst_offset(0);
-
-        if (batch_dims == 0) {
-            if (dst_nelems != 1) {
-                throw std::runtime_error(
-                    "batch_dims == 0, but dst_nelems != 1");
-            }
-            batch_dims = 1;
-            simplified_batch_shape.push_back(1);
-            simplified_batch_x1_strides.push_back(0);
-            simplified_batch_x2_strides.push_back(0);
-            simplified_batch_dst_strides.push_back(0);
-        }
-        else {
-            simplify_iteration_space_3(
-                batch_dims, batch_shape_ptr, batch_x1_strides, batch_x2_strides,
-                batch_dst_strides,
-                // output
-                simplified_batch_shape, simplified_batch_x1_strides,
-                simplified_batch_x2_strides, simplified_batch_dst_strides,
-                batch_x1_offset, batch_x2_offset, batch_dst_offset);
-        }
-
-        if (inner_nd == 1 && batch_dims == 1) {
-            bool dot_product_c_contig = false;
-            bool reduce_all_elems = false;
-
-            if (simplified_inner_x1_strides[0] == 1 &&
-                simplified_inner_x2_strides[0] == 1)
-            {
-                reduce_all_elems = (simplified_batch_shape[0] == 1);
-                dot_product_c_contig =
-                    (simplified_batch_dst_strides[0] == 1) &&
-                    (static_cast<std::size_t>(simplified_batch_x1_strides[0]) ==
-                     inner_nelems) &&
-                    (static_cast<std::size_t>(simplified_batch_x2_strides[0]) ==
-                     inner_nelems);
-            }
-
-            if (dot_product_c_contig || reduce_all_elems) {
-                dot_product_contig_impl_fn_ptr_t fn = nullptr;
-                if (supports_atomics) {
-                    fn =
-                        dot_product_contig_dispatch_table[x1_typeid][x2_typeid];
-                }
-                else {
-                    fn = dot_product_contig_temps_dispatch_table[x1_typeid]
-                                                                [x2_typeid];
-                }
-                if (fn != nullptr) {
-                    dot_ev = fn(exec_q, dst_nelems, inner_nelems, x1.get_data(),
-                                x2.get_data(), dst.get_data(),
-                                batch_x1_offset,  // lhs batch offset
-                                batch_x2_offset,  // rhs batch offset
-                                batch_dst_offset, // res batch offset
-                                inner_x1_offset,  // lhs reduction offset
-                                inner_x2_offset,  // rhs reduction offset
-                                depends);
-                    return std::make_pair(dpctl::utils::keep_args_alive(
-                                              exec_q, {x1, x2, dst}, {dot_ev}),
-                                          dot_ev);
-                }
-            }
-        }
-
-        dot_product_impl_fn_ptr_t fn = nullptr;
-        if (supports_atomics) {
-            fn = dot_product_dispatch_table[x1_typeid][x2_typeid];
-        }
-        if (fn == nullptr) {
-            fn = dot_product_temps_dispatch_table[x1_typeid][x2_typeid];
-            if (fn == nullptr) {
-                throw std::runtime_error(
-                    "Implementation is missing for x1_typeid=" +
-                    std::to_string(x1_typeid) +
-                    " and x2_typeid=" + std::to_string(x2_typeid));
-            }
-        }
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto arrays_metainfo_packing_triple_ =
-            device_allocate_and_pack<py::ssize_t>(
-                exec_q, host_task_events,
-                // iteration metadata
-                simplified_batch_shape, simplified_batch_x1_strides,
-                simplified_batch_x2_strides, simplified_batch_dst_strides,
-                // reduction metadata
-                simplified_inner_shape, simplified_inner_x1_strides,
-                simplified_inner_x2_strides);
-        auto tmp_alloc_owner =
-            std::move(std::get<0>(arrays_metainfo_packing_triple_));
-        const auto &copy_metadata_ev =
-            std::get<2>(arrays_metainfo_packing_triple_);
-        const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get();
-
-        const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
-        const py::ssize_t *inner_shape_stride =
-            temp_allocation_ptr + 4 * simplified_batch_shape.size();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.resize(depends.size());
-        std::copy(depends.begin(), depends.end(), all_deps.begin());
-        all_deps.push_back(copy_metadata_ev);
-
-        dot_ev =
-            fn(exec_q, dst_nelems, inner_nelems, x1.get_data(), x2.get_data(),
-               dst.get_data(), batch_dims, iter_shape_and_strides,
-               batch_x1_offset, batch_x2_offset, batch_dst_offset,
-               inner_nd, // number dimensions being reduced
-               inner_shape_stride, inner_x1_offset, inner_x2_offset, all_deps);
-
-        sycl::event temp_cleanup_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(exec_q, {dot_ev},
-                                                         tmp_alloc_owner);
-        host_task_events.push_back(temp_cleanup_ev);
-    }
-    else { // if (!call_vecdot)
-        if (!call_batched) {
-            if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) {
-                gemm_contig_impl_fn_ptr_t fn = nullptr;
-                if (supports_atomics) {
-                    fn =
-                        gemm_contig_atomic_dispatch_table[x1_typeid][x2_typeid];
-                }
-                else {
-                    fn = gemm_contig_temps_dispatch_table[x1_typeid][x2_typeid];
-                }
-                if (fn != nullptr) {
-                    dot_ev = fn(exec_q, x1_data, x2_data, dst_data,
-                                x1_outer_nelems, // n
-                                inner_nelems,    // k
-                                x2_outer_nelems, // m
-                                depends);
-                    return std::make_pair(dpctl::utils::keep_args_alive(
-                                              exec_q, {x1, x2, dst}, {dot_ev}),
-                                          dot_ev);
-                }
-            }
-            gemm_impl_fn_ptr_t fn = nullptr;
-            if (supports_atomics) {
-                fn = gemm_atomic_dispatch_table[x1_typeid][x2_typeid];
-            }
-            if (fn == nullptr) {
-                fn = gemm_temps_dispatch_table[x1_typeid][x2_typeid];
-                if (fn == nullptr) {
-                    throw std::runtime_error(
-                        "Implementation is missing for x1_typeid=" +
-                        std::to_string(x1_typeid) +
-                        " and x2_typeid=" + std::to_string(x2_typeid));
-                }
-            }
-            using dpctl::tensor::offset_utils::device_allocate_and_pack;
-            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-                exec_q, host_task_events, x1_shape_vec, x1_strides_vec,
-                x2_shape_vec, x2_strides_vec, dst_shape_vec, dst_strides_vec);
-            auto packed_shapes_strides_owner =
-                std::move(std::get<0>(ptr_size_event_tuple1));
-            sycl::event copy_shapes_strides_ev =
-                std::get<2>(ptr_size_event_tuple1);
-            const py::ssize_t *packed_shapes_strides =
-                packed_shapes_strides_owner.get();
-
-            const py::ssize_t *x1_shape_strides = packed_shapes_strides;
-            const py::ssize_t *x2_shape_strides =
-                packed_shapes_strides + 2 * (x1_nd);
-            const py::ssize_t *dst_shape_strides =
-                packed_shapes_strides + 2 * (x1_nd + x2_nd);
-
-            std::vector<sycl::event> all_deps;
-            all_deps.reserve(depends.size() + 1);
-            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-            all_deps.push_back(copy_shapes_strides_ev);
-
-            // change gemm calls to pass inner dims and outer dims separately
-            dot_ev =
-                fn(exec_q, x1_data, x2_data, dst_data, x1_outer_nelems,
-                   inner_nelems, x2_outer_nelems, inner_dims, x1_outer_dims,
-                   x1_shape_strides, x2_outer_dims, x2_shape_strides,
-                   x1_outer_dims + x2_outer_dims, dst_shape_strides, all_deps);
-
-            sycl::event cleanup_tmp_allocations_ev =
-                dpctl::tensor::alloc_utils::async_smart_free(
-                    exec_q, {dot_ev}, packed_shapes_strides_owner);
-            host_task_events.push_back(cleanup_tmp_allocations_ev);
-        }
-        else { // if (call_batched)
-            using shT = std::vector<py::ssize_t>;
-            // temporary asserts for matmul
-            assert(x1_outer_dims == 1);
-            assert(x2_outer_dims == 1);
-            assert(inner_dims == 1);
-
-            if ((is_x1_c_contig && is_x2_c_contig && is_dst_c_contig)) {
-                gemm_batch_contig_impl_fn_ptr_t fn = nullptr;
-                if (supports_atomics) {
-                    fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid]
-                                                                [x2_typeid];
-                }
-                else {
-                    fn = gemm_batch_contig_temps_dispatch_table[x1_typeid]
-                                                               [x2_typeid];
-                }
-                if (fn != nullptr) {
-                    static constexpr py::ssize_t zero_offset = 0;
-                    dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
-                                x1_outer_nelems, // n
-                                inner_nelems,    // k
-                                x2_outer_nelems, // m
-                                zero_offset, zero_offset, zero_offset, depends);
-                    return std::make_pair(dpctl::utils::keep_args_alive(
-                                              exec_q, {x1, x2, dst}, {dot_ev}),
-                                          dot_ev);
-                }
-            }
-
-            auto x1_outer_inner_dims = x1_nd - batch_dims;
-            auto x2_outer_inner_dims = x2_nd - batch_dims;
-            auto dst_outer_inner_dims = dst_nd - batch_dims;
-
-            shT batch_x1_shape;
-            shT outer_inner_x1_shape;
-            shT batch_x1_strides;
-            shT outer_inner_x1_strides;
-            dpctl::tensor::py_internal::split_iteration_space(
-                x1_shape_vec, x1_strides_vec, batch_dims,
-                batch_dims + x1_outer_inner_dims,
-                // 4 vectors modified
-                batch_x1_shape, outer_inner_x1_shape, batch_x1_strides,
-                outer_inner_x1_strides);
-
-            shT batch_x2_shape;
-            shT outer_inner_x2_shape;
-            shT batch_x2_strides;
-            shT outer_inner_x2_strides;
-            dpctl::tensor::py_internal::split_iteration_space(
-                x2_shape_vec, x2_strides_vec, batch_dims,
-                batch_dims + x2_outer_inner_dims,
-                // 4 vectors modified
-                batch_x2_shape, outer_inner_x2_shape, batch_x2_strides,
-                outer_inner_x2_strides);
-
-            shT batch_dst_shape;
-            shT outer_inner_dst_shape;
-            shT batch_dst_strides;
-            shT outer_inner_dst_strides;
-            dpctl::tensor::py_internal::split_iteration_space(
-                dst_shape_vec, dst_strides_vec, batch_dims,
-                batch_dims + dst_outer_inner_dims,
-                // 4 vectors modified
-                batch_dst_shape, outer_inner_dst_shape, batch_dst_strides,
-                outer_inner_dst_strides);
-
-            using shT = std::vector<py::ssize_t>;
-            shT simplified_batch_shape;
-            shT simplified_batch_x1_strides;
-            shT simplified_batch_x2_strides;
-            shT simplified_batch_dst_strides;
-            py::ssize_t x1_batch_offset(0);
-            py::ssize_t x2_batch_offset(0);
-            py::ssize_t dst_batch_offset(0);
-
-            const py::ssize_t *shape = x1_shape_ptr;
-
-            using dpctl::tensor::py_internal::simplify_iteration_space_3;
-            simplify_iteration_space_3(
-                batch_dims, shape, batch_x1_strides, batch_x2_strides,
-                batch_dst_strides,
-                // outputs
-                simplified_batch_shape, simplified_batch_x1_strides,
-                simplified_batch_x2_strides, simplified_batch_dst_strides,
-                x1_batch_offset, x2_batch_offset, dst_batch_offset);
-
-            if (batch_dims == 1 && x1_outer_dims == 1 && x2_outer_dims == 1 &&
-                inner_dims == 1)
-            {
-                bool gemm_batch_c_contig = false;
-
-                if ((static_cast<std::size_t>(outer_inner_x1_strides[0]) ==
-                         inner_nelems &&
-                     outer_inner_x1_strides[1] == 1) &&
-                    (static_cast<std::size_t>(outer_inner_x2_strides[0]) ==
-                         inner_nelems &&
-                     outer_inner_x2_strides[1] == 1) &&
-                    (static_cast<std::size_t>(outer_inner_dst_strides[0]) ==
-                         x2_outer_nelems &&
-                     outer_inner_dst_strides[1] == 1))
-                {
-                    gemm_batch_c_contig =
-                        (static_cast<std::size_t>(
-                             simplified_batch_x1_strides[0]) ==
-                         x1_outer_nelems * inner_nelems) &&
-                        (static_cast<std::size_t>(
-                             simplified_batch_x2_strides[0]) ==
-                         x2_outer_nelems * inner_nelems) &&
-                        (static_cast<std::size_t>(
-                             simplified_batch_dst_strides[0]) ==
-                         x1_outer_nelems * x2_outer_nelems);
-                }
-
-                if (gemm_batch_c_contig) {
-                    gemm_batch_contig_impl_fn_ptr_t fn = nullptr;
-                    if (supports_atomics) {
-                        fn = gemm_batch_contig_atomic_dispatch_table[x1_typeid]
-                                                                    [x2_typeid];
-                    }
-                    else {
-                        fn = gemm_batch_contig_temps_dispatch_table[x1_typeid]
-                                                                   [x2_typeid];
-                    }
-                    if (fn != nullptr) {
-                        dot_ev = fn(exec_q, x1_data, x2_data, dst_data, batches,
-                                    x1_outer_nelems, // n
-                                    inner_nelems,    // k
-                                    x2_outer_nelems, // m
-                                    x1_batch_offset, x2_batch_offset,
-                                    dst_batch_offset, depends);
-                        return std::make_pair(
-                            dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst},
-                                                          {dot_ev}),
-                            dot_ev);
-                    }
-                }
-            }
-
-            gemm_batch_impl_fn_ptr_t fn = nullptr;
-            if (supports_atomics) {
-                fn = gemm_batch_atomic_dispatch_table[x1_typeid][x2_typeid];
-            }
-            if (fn == nullptr) {
-                fn = gemm_batch_temps_dispatch_table[x1_typeid][x2_typeid];
-                if (fn == nullptr) {
-                    throw std::runtime_error(
-                        "Implementation is missing for x1_typeid=" +
-                        std::to_string(x1_typeid) +
-                        " and x2_typeid=" + std::to_string(x2_typeid));
-                }
-            }
-            using dpctl::tensor::offset_utils::device_allocate_and_pack;
-            auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-                exec_q, host_task_events, simplified_batch_shape,
-                simplified_batch_x1_strides, simplified_batch_x2_strides,
-                simplified_batch_dst_strides, outer_inner_x1_shape,
-                outer_inner_x1_strides, outer_inner_x2_shape,
-                outer_inner_x2_strides, outer_inner_dst_shape,
-                outer_inner_dst_strides,
-                // full shape and strides of the result array
-                // necessary for reduction and initialization
-                simplified_batch_shape, outer_inner_dst_shape,
-                simplified_batch_dst_strides, outer_inner_dst_strides);
-            auto packed_shapes_strides_owner =
-                std::move(std::get<0>(ptr_size_event_tuple1));
-            sycl::event copy_shapes_strides_ev =
-                std::get<2>(ptr_size_event_tuple1);
-            const py::ssize_t *packed_shapes_strides =
-                packed_shapes_strides_owner.get();
-
-            const auto batch_shape_strides = packed_shapes_strides;
-            const auto x1_outer_inner_shapes_strides =
-                packed_shapes_strides + 4 * batch_dims;
-            const auto x2_outer_inner_shapes_strides =
-                packed_shapes_strides + 4 * batch_dims +
-                2 * (x1_outer_inner_dims);
-            const auto dst_outer_shapes_strides =
-                packed_shapes_strides + 4 * batch_dims +
-                2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims);
-            const auto dst_full_shape_strides =
-                packed_shapes_strides + 4 * batch_dims +
-                2 * (x1_outer_inner_dims) + 2 * (x2_outer_inner_dims) +
-                2 * (dst_outer_inner_dims);
-
-            std::vector<sycl::event> all_deps;
-            all_deps.reserve(depends.size() + 1);
-            all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-            all_deps.push_back(copy_shapes_strides_ev);
-
-            dot_ev = fn(
-                exec_q, x1_data, x2_data, dst_data, batches, x1_outer_nelems,
-                inner_nelems, x2_outer_nelems, batch_dims, batch_shape_strides,
-                x1_batch_offset, x2_batch_offset, dst_batch_offset, inner_dims,
-                x1_outer_dims, x1_outer_inner_shapes_strides, x2_outer_dims,
-                x2_outer_inner_shapes_strides, x1_outer_dims + x2_outer_dims,
-                dst_outer_shapes_strides, dst_full_shape_strides, all_deps);
-
-            sycl::event cleanup_tmp_allocations_ev =
-                dpctl::tensor::alloc_utils::async_smart_free(
-                    exec_q, {dot_ev}, packed_shapes_strides_owner);
-            host_task_events.push_back(cleanup_tmp_allocations_ev);
-        }
-    }
-    return std::make_pair(
-        dpctl::utils::keep_args_alive(exec_q, {x1, x2, dst}, host_task_events),
-        dot_ev);
-}
-
-template <typename output_typesT>
-py::object py_dot_result_type(const py::dtype &input1_dtype,
-                              const py::dtype &input2_dtype,
-                              const output_typesT &output_types_table)
-{
-    int tn1 = input1_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int tn2 = input2_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int src1_typeid = -1;
-    int src2_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        src1_typeid = array_types.typenum_to_lookup_id(tn1);
-        src2_typeid = array_types.typenum_to_lookup_id(tn2);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    if (src1_typeid < 0 || src1_typeid >= td_ns::num_types || src2_typeid < 0 ||
-        src2_typeid >= td_ns::num_types)
-    {
-        throw std::runtime_error("binary output type lookup failed");
-    }
-    int dst_typeid = output_types_table[src1_typeid][src2_typeid];
-
-    if (dst_typeid < 0) {
-        auto res = py::none();
-        return py::cast<py::object>(res);
-    }
-    else {
-        using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum;
-
-        auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid);
-        auto dt = _dtype_from_typenum(dst_typenum_t);
-
-        return py::cast<py::object>(dt);
-    }
-}
-
-void init_dot(py::module_ m)
-{
-    using dpctl::tensor::py_internal::init_dot_atomic_support_vector;
-    init_dot_atomic_support_vector();
-    using dpctl::tensor::py_internal::init_dot_dispatch_tables;
-    init_dot_dispatch_tables();
-
-    using dpctl::tensor::py_internal::py_dot;
-    m.def("_dot", &py_dot, "", py::arg("x1"), py::arg("x2"),
-          py::arg("batch_dims"), py::arg("x1_outer_dims"),
-          py::arg("x2_outer_dims"), py::arg("inner_dims"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    using dpctl::tensor::py_internal::dot_output_id_table;
-    auto dot_result_type_pyapi = [&](const py::dtype &dtype1,
-                                     const py::dtype &dtype2) {
-        using dpctl::tensor::py_internal::py_dot_result_type;
-        return py_dot_result_type(dtype1, dtype2, dot_output_id_table);
-    };
-    m.def("_dot_result_type", dot_result_type_pyapi, "");
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.hpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.hpp
deleted file mode 100644
index c97a9f7e85..0000000000
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_dot(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp b/dpctl/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
deleted file mode 100644
index c256d83845..0000000000
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot_atomic_support.hpp
+++ /dev/null
@@ -1,58 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <type_traits>
-
-#include "reductions/reduction_atomic_support.hpp"
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-namespace atomic_support
-{
-
-template <typename fnT, typename T> struct DotAtomicSupportFactory
-{
-    fnT get()
-    {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (is_complex<T>::value) {
-            return atomic_support::fixed_decision<false>;
-        }
-        else {
-            return atomic_support::check_atomic_support<T>;
-        }
-    }
-};
-
-} // namespace atomic_support
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp b/dpctl/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
deleted file mode 100644
index 2437ed40bb..0000000000
--- a/dpctl/tensor/libtensor/source/linalg_functions/dot_dispatch.hpp
+++ /dev/null
@@ -1,391 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstdint>
-#include <type_traits>
-#include <utility>
-
-#include "kernels/linalg_functions/dot_product.hpp"
-#include "kernels/linalg_functions/gemm.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-template <typename T1, typename T2> struct DotAtomicOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, double>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-// add separate type support lists for atomic vs. temps
-// gemm, gevm, and dot product share output type struct
-template <typename T1, typename T2> struct DotNoAtomicOutputType
-{
-    using value_type = typename std::disjunction<
-        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint8_t,
-                                        T2,
-                                        std::uint8_t,
-                                        std::uint8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int8_t,
-                                        T2,
-                                        std::int8_t,
-                                        std::int8_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint16_t,
-                                        T2,
-                                        std::uint16_t,
-                                        std::uint16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int16_t,
-                                        T2,
-                                        std::int16_t,
-                                        std::int16_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint32_t,
-                                        T2,
-                                        std::uint32_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int32_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int32_t,
-                                        T2,
-                                        std::int32_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::uint64_t,
-                                        T2,
-                                        std::uint64_t,
-                                        std::uint64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::int64_t,
-                                        T2,
-                                        std::int64_t,
-                                        std::int64_t>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        sycl::half,
-                                        T2,
-                                        sycl::half,
-                                        sycl::half>,
-        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
-        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<float>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<float>,
-                                        T2,
-                                        std::complex<float>,
-                                        std::complex<double>>,
-        td_ns::BinaryTypeMapResultEntry<T1,
-                                        std::complex<double>,
-                                        T2,
-                                        std::complex<double>,
-                                        std::complex<double>>,
-        td_ns::DefaultResultEntry<void>>::result_type;
-
-    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
-};
-
-template <typename fnT, typename T1, typename T2> struct DotTypeMapFactory
-{
-    /*! @brief get typeid for output type of kernels called by py_dot */
-    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
-    {
-        using rT1 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-        using rT2 = typename DotAtomicOutputType<T1, T2>::value_type;
-        static_assert(std::is_same_v<rT1, rT2> || std::is_same_v<rT2, void>);
-        return td_ns::GetTypeid<rT1>{}.get();
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmBatchAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_batch_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_batch_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct GemmBatchContigAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_batch_contig_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_batch_contig_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmContigAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_contig_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_contig_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmTempsFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmContigTempsFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_contig_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_contig_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct GemmBatchTempsFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_batch_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_batch_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct GemmBatchContigTempsFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::gemm_batch_contig_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = gemm_batch_contig_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2> struct DotProductAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::dot_product_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = dot_product_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct DotProductNoAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::dot_product_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = dot_product_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct DotProductContigAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::dot_product_contig_impl;
-            using T3 = typename DotAtomicOutputType<T1, T2>::value_type;
-            fnT fn = dot_product_contig_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-template <typename fnT, typename T1, typename T2>
-struct DotProductContigNoAtomicFactory
-{
-    fnT get()
-    {
-        if constexpr (!DotNoAtomicOutputType<T1, T2>::is_defined) {
-            fnT fn = nullptr;
-            return fn;
-        }
-        else {
-            using dpctl::tensor::kernels::dot_product_contig_tree_impl;
-            using T3 = typename DotNoAtomicOutputType<T1, T2>::value_type;
-            fnT fn = dot_product_contig_tree_impl<T1, T2, T3>;
-            return fn;
-        }
-    }
-};
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp
deleted file mode 100644
index ca19b572d6..0000000000
--- a/dpctl/tensor/libtensor/source/linear_sequences.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <complex>
-#include <cstddef>
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "kernels/constructors.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "linear_sequences.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-// Constructor to populate tensor with linear sequence defined by
-// start and step data
-
-typedef sycl::event (*lin_space_step_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t, // num_elements
-    const py::object &start,
-    const py::object &step,
-    char *, // dst_data_ptr
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified by starting value and increment
- * given as Python objects.
- *
- * @param q  Sycl queue to which the kernel is submitted
- * @param nelems Length of the sequence
- * @param start Starting value of the sequence as Python object. Must be
- * convertible to array element data type `Ty`.
- * @param step  Increment of the sequence as Python object. Must be convertible
- * to array element data type `Ty`.
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty>
-sycl::event lin_space_step_impl(sycl::queue &exec_q,
-                                std::size_t nelems,
-                                const py::object &start,
-                                const py::object &step,
-                                char *array_data,
-                                const std::vector<sycl::event> &depends)
-{
-    Ty start_v = py::cast<Ty>(start);
-    Ty step_v = py::cast<Ty>(step);
-
-    using dpctl::tensor::kernels::constructors::lin_space_step_impl;
-
-    auto lin_space_step_event = lin_space_step_impl<Ty>(
-        exec_q, nelems, start_v, step_v, array_data, depends);
-
-    return lin_space_step_event;
-}
-
-typedef sycl::event (*lin_space_affine_fn_ptr_t)(
-    sycl::queue &,
-    std::size_t, // num_elements
-    const py::object &start,
-    const py::object &end,
-    bool include_endpoint,
-    char *, // dst_data_ptr
-    const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to submit kernel to populate given contiguous memory
- * allocation with linear sequence specified  by starting and end values given
- * as Python objects.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Length of the sequence
- * @param start Stating value of the sequence as Python object. Must be
- * convertible to array data element type `Ty`.
- * @param end   End-value of the sequence as Python object. Must be convertible
- * to array data element type `Ty`.
- * @param include_endpoint  Whether the end-value is included in the sequence
- * @param array_data Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename Ty>
-sycl::event lin_space_affine_impl(sycl::queue &exec_q,
-                                  std::size_t nelems,
-                                  const py::object &start,
-                                  const py::object &end,
-                                  bool include_endpoint,
-                                  char *array_data,
-                                  const std::vector<sycl::event> &depends)
-{
-    Ty start_v = py::cast<Ty>(start);
-    Ty end_v = py::cast<Ty>(end);
-
-    using dpctl::tensor::kernels::constructors::lin_space_affine_impl;
-
-    auto lin_space_affine_event = lin_space_affine_impl<Ty>(
-        exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends);
-
-    return lin_space_affine_event;
-}
-
-using dpctl::utils::keep_args_alive;
-
-static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types];
-
-static lin_space_affine_fn_ptr_t
-    lin_space_affine_dispatch_vector[td_ns::num_types];
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_linear_sequence_step(const py::object &start,
-                                 const py::object &dt,
-                                 const dpctl::tensor::usm_ndarray &dst,
-                                 sycl::queue &exec_q,
-                                 const std::vector<sycl::event> &depends)
-{
-    // dst must be 1D and C-contiguous
-    // start, end should be coercible into data type of dst
-
-    if (dst.get_ndim() != 1) {
-        throw py::value_error(
-            "usm_ndarray_linspace: Expecting 1D array to populate");
-    }
-
-    if (!dst.is_c_contiguous()) {
-        throw py::value_error(
-            "usm_ndarray_linspace: Non-contiguous arrays are not supported");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with the allocation queue");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_typenum = dst.get_typenum();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    py::ssize_t len = dst.get_shape(0);
-    if (len == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    char *dst_data = dst.get_data();
-    sycl::event linspace_step_event;
-
-    auto fn = lin_space_step_dispatch_vector[dst_typeid];
-
-    linspace_step_event =
-        fn(exec_q, static_cast<std::size_t>(len), start, dt, dst_data, depends);
-
-    return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}),
-                          linspace_step_event);
-}
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_linear_sequence_affine(const py::object &start,
-                                   const py::object &end,
-                                   const dpctl::tensor::usm_ndarray &dst,
-                                   bool include_endpoint,
-                                   sycl::queue &exec_q,
-                                   const std::vector<sycl::event> &depends)
-{
-    // dst must be 1D and C-contiguous
-    // start, end should be coercible into data type of dst
-
-    if (dst.get_ndim() != 1) {
-        throw py::value_error(
-            "usm_ndarray_linspace: Expecting 1D array to populate");
-    }
-
-    if (!dst.is_c_contiguous()) {
-        throw py::value_error(
-            "usm_ndarray_linspace: Non-contiguous arrays are not supported");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error(
-            "Execution queue context is not the same as allocation context");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_typenum = dst.get_typenum();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    py::ssize_t len = dst.get_shape(0);
-    if (len == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    char *dst_data = dst.get_data();
-    sycl::event linspace_affine_event;
-
-    auto fn = lin_space_affine_dispatch_vector[dst_typeid];
-
-    linspace_affine_event = fn(exec_q, static_cast<std::size_t>(len), start,
-                               end, include_endpoint, dst_data, depends);
-
-    return std::make_pair(
-        keep_args_alive(exec_q, {dst}, {linspace_affine_event}),
-        linspace_affine_event);
-}
-
-/*!
- * @brief  Factor to get function pointer of type `fnT` for array with elements
- * of type `Ty`.
- * @defgroup CtorKernels
- */
-template <typename fnT, typename Ty> struct LinSpaceStepFactory
-{
-    fnT get()
-    {
-        fnT f = lin_space_step_impl<Ty>;
-        return f;
-    }
-};
-
-/*!
- * @brief Factory to get function pointer of type `fnT` for array data type
- * `Ty`.
- */
-template <typename fnT, typename Ty> struct LinSpaceAffineFactory
-{
-    fnT get()
-    {
-        fnT f = lin_space_affine_impl<Ty>;
-        return f;
-    }
-};
-
-void init_linear_sequences_dispatch_vectors(void)
-{
-    using namespace td_ns;
-
-    DispatchVectorBuilder<lin_space_step_fn_ptr_t, LinSpaceStepFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector);
-
-    DispatchVectorBuilder<lin_space_affine_fn_ptr_t, LinSpaceAffineFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/linear_sequences.hpp b/dpctl/tensor/libtensor/source/linear_sequences.hpp
deleted file mode 100644
index 516c9e9bf4..0000000000
--- a/dpctl/tensor/libtensor/source/linear_sequences.hpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_linear_sequence_step(const py::object &start,
-                                 const py::object &dt,
-                                 const dpctl::tensor::usm_ndarray &dst,
-                                 sycl::queue &exec_q,
-                                 const std::vector<sycl::event> &depends = {});
-
-extern std::pair<sycl::event, sycl::event> usm_ndarray_linear_sequence_affine(
-    const py::object &start,
-    const py::object &end,
-    const dpctl::tensor::usm_ndarray &dst,
-    bool include_endpoint,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends = {});
-
-extern void init_linear_sequences_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/all.cpp b/dpctl/tensor/libtensor/source/reductions/all.cpp
deleted file mode 100644
index e2ae097c82..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/all.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    all_reduction_strided_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    all_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    all_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
-
-template <typename fnT, typename srcTy> struct AllStridedFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_and<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
-                                                           ReductionOpT>;
-    }
-};
-
-template <typename fnT, typename srcTy> struct AllAxis1ContigFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_and<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
-                                                                ReductionOpT>;
-    }
-};
-
-template <typename fnT, typename srcTy> struct AllAxis0ContigFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_and<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
-                                                                ReductionOpT>;
-    }
-};
-
-void populate_all_dispatch_vectors(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AllStridedFactory,
-                          td_ns::num_types>
-        all_dvb1;
-    all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector);
-
-    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AllAxis1ContigFactory,
-                          td_ns::num_types>
-        all_dvb2;
-    all_dvb2.populate_dispatch_vector(
-        all_reduction_axis1_contig_dispatch_vector);
-
-    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AllAxis0ContigFactory,
-                          td_ns::num_types>
-        all_dvb3;
-    all_dvb3.populate_dispatch_vector(
-        all_reduction_axis0_contig_dispatch_vector);
-};
-
-using atomic_support::atomic_support_fn_ptr_t;
-using atomic_support::check_atomic_support;
-static atomic_support_fn_ptr_t all_atomic_support =
-    check_atomic_support<std::int32_t>;
-
-} // namespace impl
-
-void init_all(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_all_dispatch_vectors();
-        using impl::all_reduction_axis0_contig_dispatch_vector;
-        using impl::all_reduction_axis1_contig_dispatch_vector;
-        using impl::all_reduction_strided_dispatch_vector;
-
-        using impl::all_atomic_support;
-
-        auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_boolean_reduction(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                all_reduction_axis1_contig_dispatch_vector,
-                all_reduction_axis0_contig_dispatch_vector,
-                all_reduction_strided_dispatch_vector, all_atomic_support);
-        };
-        m.def("_all", all_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/all.hpp b/dpctl/tensor/libtensor/source/reductions/all.hpp
deleted file mode 100644
index e50448ce70..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/all.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_all(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/any.cpp b/dpctl/tensor/libtensor/source/reductions/any.cpp
deleted file mode 100644
index afa2e5aaaf..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/any.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    any_reduction_strided_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    any_reduction_axis1_contig_dispatch_vector[td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    any_reduction_axis0_contig_dispatch_vector[td_ns::num_types];
-
-template <typename fnT, typename srcTy> struct AnyStridedFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_or<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
-                                                           ReductionOpT>;
-    }
-};
-
-template <typename fnT, typename srcTy> struct AnyAxis1ContigFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_or<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_axis1_over_group_with_atomics_contig_impl<srcTy, dstTy,
-                                                                ReductionOpT>;
-    }
-};
-
-template <typename fnT, typename srcTy> struct AnyAxis0ContigFactory
-{
-    fnT get() const
-    {
-        using dstTy = std::int32_t;
-        using ReductionOpT = sycl::logical_or<dstTy>;
-        return dpctl::tensor::kernels::
-            reduction_axis0_over_group_with_atomics_contig_impl<srcTy, dstTy,
-                                                                ReductionOpT>;
-    }
-};
-
-void populate_any_dispatch_vectors(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    DispatchVectorBuilder<reduction_strided_impl_fn_ptr, AnyStridedFactory,
-                          td_ns::num_types>
-        any_dvb1;
-    any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector);
-
-    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis1ContigFactory,
-                          td_ns::num_types>
-        any_dvb2;
-    any_dvb2.populate_dispatch_vector(
-        any_reduction_axis1_contig_dispatch_vector);
-
-    DispatchVectorBuilder<reduction_contig_impl_fn_ptr, AnyAxis0ContigFactory,
-                          td_ns::num_types>
-        any_dvb3;
-    any_dvb3.populate_dispatch_vector(
-        any_reduction_axis0_contig_dispatch_vector);
-};
-
-using atomic_support::atomic_support_fn_ptr_t;
-using atomic_support::check_atomic_support;
-static atomic_support_fn_ptr_t any_atomic_support =
-    check_atomic_support<std::int32_t>;
-
-} // namespace impl
-
-void init_any(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        impl::populate_any_dispatch_vectors();
-        using impl::any_reduction_axis0_contig_dispatch_vector;
-        using impl::any_reduction_axis1_contig_dispatch_vector;
-        using impl::any_reduction_strided_dispatch_vector;
-
-        using impl::any_atomic_support;
-
-        auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_boolean_reduction(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                any_reduction_axis1_contig_dispatch_vector,
-                any_reduction_axis0_contig_dispatch_vector,
-                any_reduction_strided_dispatch_vector, any_atomic_support);
-        };
-        m.def("_any", any_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/any.hpp b/dpctl/tensor/libtensor/source/reductions/any.hpp
deleted file mode 100644
index 5c30813e68..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/any.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_any(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.cpp b/dpctl/tensor/libtensor/source/reductions/argmax.cpp
deleted file mode 100644
index 2e6bcfddd3..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/argmax.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_over_axis.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
-static search_strided_impl_fn_ptr
-    argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
-static search_contig_impl_fn_ptr
-    argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
-static search_contig_impl_fn_ptr
-    argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportForArgmaxReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::int64_t>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::int64_t>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgmaxOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_over_group_temps_strided_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_over_group_temps_strided_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgmaxOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis1_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis1_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgmaxOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgmaxReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis0_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Maximum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis0_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_argmax_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    DispatchTableBuilder<search_strided_impl_fn_ptr,
-                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<search_contig_impl_fn_ptr,
-                         ArgmaxOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<search_contig_impl_fn_ptr,
-                         ArgmaxOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table);
-}
-
-} // namespace impl
-
-void init_argmax(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_argmax_over_axis_dispatch_tables;
-        populate_argmax_over_axis_dispatch_tables();
-        using impl::argmax_over_axis0_contig_temps_dispatch_table;
-        using impl::argmax_over_axis1_contig_temps_dispatch_table;
-        using impl::argmax_over_axis_strided_temps_dispatch_table;
-
-        auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                                const arrayT &dst, sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_search_over_axis;
-            return py_search_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                argmax_over_axis_strided_temps_dispatch_table,
-                argmax_over_axis0_contig_temps_dispatch_table,
-                argmax_over_axis1_contig_temps_dispatch_table);
-        };
-        m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.hpp b/dpctl/tensor/libtensor/source/reductions/argmax.hpp
deleted file mode 100644
index b1cf4db60b..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/argmax.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_argmax(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.cpp b/dpctl/tensor/libtensor/source/reductions/argmin.cpp
deleted file mode 100644
index 883ec1d397..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/argmin.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_over_axis.hpp"
-
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
-static search_strided_impl_fn_ptr
-    argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
-static search_contig_impl_fn_ptr
-    argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-using dpctl::tensor::kernels::search_contig_impl_fn_ptr;
-static search_contig_impl_fn_ptr
-    argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportForArgminReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::int64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, std::int64_t>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::int64_t>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::int64_t>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::int64_t>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::int64_t>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgminOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_over_group_temps_strided_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_over_group_temps_strided_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgminOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis1_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis1_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ArgminOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportForArgminReductionTemps<srcTy,
-                                                             dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<srcTy> &&
-                          !std::is_same_v<srcTy, bool>)
-            {
-                // op for values
-                using ReductionOpT = sycl::minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis0_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-            else {
-                // op for values
-                using ReductionOpT = su_ns::Minimum<srcTy>;
-                // op for indices
-                using IndexOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    search_axis0_over_group_temps_contig_impl<
-                        srcTy, dstTy, ReductionOpT, IndexOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_argmin_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::search_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    DispatchTableBuilder<search_strided_impl_fn_ptr,
-                         ArgminOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<search_contig_impl_fn_ptr,
-                         ArgminOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<search_contig_impl_fn_ptr,
-                         ArgminOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table);
-}
-
-} // namespace impl
-
-void init_argmin(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_argmin_over_axis_dispatch_tables;
-        populate_argmin_over_axis_dispatch_tables();
-        using impl::argmin_over_axis0_contig_temps_dispatch_table;
-        using impl::argmin_over_axis1_contig_temps_dispatch_table;
-        using impl::argmin_over_axis_strided_temps_dispatch_table;
-
-        auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                                const arrayT &dst, sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_search_over_axis;
-            return py_search_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                argmin_over_axis_strided_temps_dispatch_table,
-                argmin_over_axis0_contig_temps_dispatch_table,
-                argmin_over_axis1_contig_temps_dispatch_table);
-        };
-        m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.hpp b/dpctl/tensor/libtensor/source/reductions/argmin.hpp
deleted file mode 100644
index 480b1732c2..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/argmin.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_argmin(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp
deleted file mode 100644
index c6f387ef8b..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_over_axis.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                                    [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForLogSumExpReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-#if 1
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-#endif
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct LogSumExpOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::LogSumExp<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                        ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct LogSumExpOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::LogSumExp<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct LogSumExpOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForLogSumExpReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::LogSumExp<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_logsumexp_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         LogSumExpOverAxisTempsStridedFactory, num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(
-        logsumexp_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         LogSumExpOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(
-        logsumexp_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         LogSumExpOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(
-        logsumexp_over_axis0_contig_temps_dispatch_table);
-}
-
-} // namespace impl
-
-void init_logsumexp(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_logsumexp_over_axis_dispatch_tables;
-        populate_logsumexp_over_axis_dispatch_tables();
-        using impl::logsumexp_over_axis0_contig_temps_dispatch_table;
-        using impl::logsumexp_over_axis1_contig_temps_dispatch_table;
-        using impl::logsumexp_over_axis_strided_temps_dispatch_table;
-
-        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-
-        auto logsumexp_pyapi = [&](const arrayT &src,
-                                   int trailing_dims_to_reduce,
-                                   const arrayT &dst, sycl::queue &exec_q,
-                                   const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_tree_reduction_over_axis;
-            return py_tree_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                logsumexp_over_axis_strided_temps_dispatch_table,
-                logsumexp_over_axis0_contig_temps_dispatch_table,
-                logsumexp_over_axis1_contig_temps_dispatch_table);
-        };
-        m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype,
-                                             const py::dtype &output_dtype) {
-            using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported;
-            return py_tree_reduction_dtype_supported(
-                input_dtype, output_dtype,
-                logsumexp_over_axis_strided_temps_dispatch_table);
-        };
-        m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported,
-              "", py::arg("arg_dtype"), py::arg("out_dtype"));
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp
deleted file mode 100644
index 3bc951e14d..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_logsumexp(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/max.cpp b/dpctl/tensor/libtensor/source/reductions/max.cpp
deleted file mode 100644
index 55fff60f9b..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/max.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    max_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    max_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    max_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    max_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-/* @brief Types supported by max reduction code based on atomic_ref */
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForMaxReductionAtomic
-{
-    /* value is true if a kernel for <argTy, outTy> must be instantiated, false
-     * otherwise */
-    static constexpr bool is_defined = std::disjunction<
-        // input int32
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        // input uint32
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        // input int64
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        // input uint64
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForMaxReductionTemps
-{
-    static constexpr bool is_defined = std::disjunction<
-        // input bool
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxisAtomicStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_with_atomics_strided_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_with_atomics_strided_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                            ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                            ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxis1AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxis0AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MaxOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMaxReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Maximum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_max_over_axis_dispatch_tables(void)
-{
-    using td_ns::DispatchTableBuilder;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MaxOverAxisAtomicStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MaxOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MaxOverAxis1AtomicContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MaxOverAxis0AtomicContigFactory, td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MaxOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MaxOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table);
-}
-
-using atomic_support::atomic_support_fn_ptr_t;
-static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types];
-
-void populate_max_atomic_support_dispatch_vector(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    using atomic_support::MaxAtomicSupportFactory;
-    DispatchVectorBuilder<atomic_support_fn_ptr_t, MaxAtomicSupportFactory,
-                          td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(max_atomic_support_vector);
-}
-
-} // namespace impl
-
-void init_max(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_max_over_axis_dispatch_tables;
-        populate_max_over_axis_dispatch_tables();
-        using impl::max_over_axis0_contig_atomic_dispatch_table;
-        using impl::max_over_axis0_contig_temps_dispatch_table;
-        using impl::max_over_axis1_contig_atomic_dispatch_table;
-        using impl::max_over_axis1_contig_temps_dispatch_table;
-        using impl::max_over_axis_strided_atomic_dispatch_table;
-        using impl::max_over_axis_strided_temps_dispatch_table;
-
-        using impl::populate_max_atomic_support_dispatch_vector;
-        populate_max_atomic_support_dispatch_vector();
-        using impl::max_atomic_support_vector;
-
-        auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_reduction_over_axis;
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                max_over_axis_strided_atomic_dispatch_table,
-                max_over_axis0_contig_atomic_dispatch_table,
-                max_over_axis1_contig_atomic_dispatch_table,
-                max_over_axis_strided_temps_dispatch_table,
-                max_over_axis0_contig_temps_dispatch_table,
-                max_over_axis1_contig_temps_dispatch_table,
-                max_atomic_support_vector);
-        };
-        m.def("_max_over_axis", max_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/max.hpp b/dpctl/tensor/libtensor/source/reductions/max.hpp
deleted file mode 100644
index 7b7c63b15c..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/max.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_max(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/min.cpp b/dpctl/tensor/libtensor/source/reductions/min.cpp
deleted file mode 100644
index 1ff5331bf0..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/min.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    min_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    min_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    min_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    min_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-/* @brief Types supported by min reduction code based on atomic_ref */
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForMinReductionAtomic
-{
-    /* value is true if a kernel for <argTy, outTy> must be instantiated, false
-     * otherwise */
-    static constexpr bool is_defined = std::disjunction<
-        // input int32
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        // input uint32
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        // input int64
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        // input uint64
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForMinReductionTemps
-{
-    static constexpr bool is_defined = std::disjunction<
-        // input bool
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxisAtomicStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_with_atomics_strided_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_with_atomics_strided_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                            ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                            ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxis1AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxis0AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_floating_point<dstTy>::value) {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_with_atomics_contig_impl<
-                        srcTy, dstTy, ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct MinOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForMinReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            if constexpr (std::is_integral_v<dstTy> &&
-                          !std::is_same_v<dstTy, bool>)
-            {
-                using ReductionOpT = sycl::minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-            else {
-                using ReductionOpT = su_ns::Minimum<dstTy>;
-                return dpctl::tensor::kernels::
-                    reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                                 ReductionOpT>;
-            }
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_min_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MinOverAxisAtomicStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MinOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis1AtomicContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis0AtomicContigFactory, td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table);
-}
-
-using atomic_support::atomic_support_fn_ptr_t;
-static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types];
-
-void populate_min_atomic_support_dispatch_vector(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    using atomic_support::MinAtomicSupportFactory;
-    DispatchVectorBuilder<atomic_support_fn_ptr_t, MinAtomicSupportFactory,
-                          td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(min_atomic_support_vector);
-}
-
-} // namespace impl
-
-void init_min(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_min_over_axis_dispatch_tables;
-        populate_min_over_axis_dispatch_tables();
-        using impl::min_over_axis0_contig_atomic_dispatch_table;
-        using impl::min_over_axis0_contig_temps_dispatch_table;
-        using impl::min_over_axis1_contig_atomic_dispatch_table;
-        using impl::min_over_axis1_contig_temps_dispatch_table;
-        using impl::min_over_axis_strided_atomic_dispatch_table;
-        using impl::min_over_axis_strided_temps_dispatch_table;
-
-        using impl::populate_min_atomic_support_dispatch_vector;
-        populate_min_atomic_support_dispatch_vector();
-        using impl::min_atomic_support_vector;
-
-        auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_reduction_over_axis;
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                min_over_axis_strided_atomic_dispatch_table,
-                min_over_axis0_contig_atomic_dispatch_table,
-                min_over_axis1_contig_atomic_dispatch_table,
-                min_over_axis_strided_temps_dispatch_table,
-                min_over_axis0_contig_temps_dispatch_table,
-                min_over_axis1_contig_temps_dispatch_table,
-                min_atomic_support_vector);
-        };
-        m.def("_min_over_axis", min_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/min.hpp b/dpctl/tensor/libtensor/source/reductions/min.hpp
deleted file mode 100644
index b9cd6ad743..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/min.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_min(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp
deleted file mode 100644
index 7c768ce179..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/prod.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    prod_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-/* @brief Types supported by plus-reduction code based on atomic_ref */
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForProductReductionAtomic
-{
-
-    /* value if true a kernel for <argTy, outTy> must be instantiated, false
-     * otherwise */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
-        // input int8
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-        // input uint8
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-        // input int16
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-        // input uint16
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-        // input int32
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        // input uint32
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-        // input int64
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        // input uint64
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForProductReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
-        td_ns::
-            TypePairDefinedEntry<argTy, sycl::half, outTy, std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    sycl::half,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<double>>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::complex<double>>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxisAtomicStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::multiplies<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
-                                                               ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
-                                                    sycl::logical_and<dstTy>,
-                                                    sycl::multiplies<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                        ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxis1AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::multiplies<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_with_atomics_contig_impl<
-                    srcTy, dstTy, ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxis0AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::multiplies<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_with_atomics_contig_impl<
-                    srcTy, dstTy, ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
-                                                    sycl::logical_and<dstTy>,
-                                                    sycl::multiplies<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct ProductOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForProductReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = std::conditional_t<std::is_same_v<dstTy, bool>,
-                                                    sycl::logical_and<dstTy>,
-                                                    sycl::multiplies<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_prod_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         ProductOverAxisAtomicStridedFactory, num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         ProductOverAxisTempsStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis1AtomicContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis0AtomicContigFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table);
-}
-
-using atomic_support::atomic_support_fn_ptr_t;
-static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types];
-
-void populate_prod_atomic_support_dispatch_vector(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    using atomic_support::ProductAtomicSupportFactory;
-    DispatchVectorBuilder<atomic_support_fn_ptr_t, ProductAtomicSupportFactory,
-                          td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(prod_atomic_support_vector);
-}
-
-} // namespace impl
-
-void init_prod(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_prod_over_axis_dispatch_tables;
-        populate_prod_over_axis_dispatch_tables();
-        using impl::prod_over_axis0_contig_atomic_dispatch_table;
-        using impl::prod_over_axis0_contig_temps_dispatch_table;
-        using impl::prod_over_axis1_contig_atomic_dispatch_table;
-        using impl::prod_over_axis1_contig_temps_dispatch_table;
-        using impl::prod_over_axis_strided_atomic_dispatch_table;
-        using impl::prod_over_axis_strided_temps_dispatch_table;
-
-        using impl::populate_prod_atomic_support_dispatch_vector;
-        populate_prod_atomic_support_dispatch_vector();
-        using impl::prod_atomic_support_vector;
-
-        auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                              const arrayT &dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_reduction_over_axis;
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                prod_over_axis_strided_atomic_dispatch_table,
-                prod_over_axis0_contig_atomic_dispatch_table,
-                prod_over_axis1_contig_atomic_dispatch_table,
-                prod_over_axis_strided_temps_dispatch_table,
-                prod_over_axis0_contig_temps_dispatch_table,
-                prod_over_axis1_contig_temps_dispatch_table,
-                prod_atomic_support_vector);
-        };
-        m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto prod_dtype_supported =
-            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
-                const std::string &dst_usm_type, sycl::queue &q) {
-                using dpctl::tensor::py_internal::py_reduction_dtype_supported;
-                return py_reduction_dtype_supported(
-                    input_dtype, output_dtype, dst_usm_type, q,
-                    prod_over_axis_strided_atomic_dispatch_table,
-                    prod_over_axis_strided_temps_dispatch_table,
-                    prod_atomic_support_vector);
-            };
-        m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "",
-              py::arg("arg_dtype"), py::arg("out_dtype"),
-              py::arg("dst_usm_type"), py::arg("sycl_queue"));
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/prod.hpp b/dpctl/tensor/libtensor/source/reductions/prod.hpp
deleted file mode 100644
index bd1e1e6227..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/prod.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_prod(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp
deleted file mode 100644
index 63e356077f..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "reduction_over_axis.hpp"
-#include "utils/sycl_utils.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace su_ns = dpctl::tensor::sycl_utils;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForHypotReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct HypotOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::Hypot<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                        ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct HypotOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::Hypot<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct HypotOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForHypotReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = su_ns::Hypot<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_hypot_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         HypotOverAxisTempsStridedFactory, num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         HypotOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         HypotOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table);
-}
-
-} // namespace impl
-
-void init_reduce_hypot(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_hypot_over_axis_dispatch_tables;
-        populate_hypot_over_axis_dispatch_tables();
-        using impl::hypot_over_axis0_contig_temps_dispatch_table;
-        using impl::hypot_over_axis1_contig_temps_dispatch_table;
-        using impl::hypot_over_axis_strided_temps_dispatch_table;
-
-        using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-        using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-
-        auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                               const arrayT &dst, sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_tree_reduction_over_axis;
-            return py_tree_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                hypot_over_axis_strided_temps_dispatch_table,
-                hypot_over_axis0_contig_temps_dispatch_table,
-                hypot_over_axis1_contig_temps_dispatch_table);
-        };
-        m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto hypot_dtype_supported = [&](const py::dtype &input_dtype,
-                                         const py::dtype &output_dtype) {
-            using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported;
-            return py_tree_reduction_dtype_supported(
-                input_dtype, output_dtype,
-                hypot_over_axis_strided_temps_dispatch_table);
-        };
-        m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "",
-              py::arg("arg_dtype"), py::arg("out_dtype"));
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp
deleted file mode 100644
index 8ff5719968..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_reduce_hypot(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
deleted file mode 100644
index bc24c71658..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
+++ /dev/null
@@ -1,140 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <complex>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-#include "utils/type_utils.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-namespace atomic_support
-{
-
-typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
-
-/*! @brief Function which returns a constant value for atomic support */
-template <bool return_value>
-bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
-{
-    return return_value;
-}
-
-/*! @brief Template for querying atomic support for a type on a device */
-template <typename T>
-bool check_atomic_support(const sycl::queue &exec_q,
-                          sycl::usm::alloc usm_alloc_type)
-{
-    static constexpr bool atomic32 = (sizeof(T) == 4);
-    static constexpr bool atomic64 = (sizeof(T) == 8);
-    using dpctl::tensor::type_utils::is_complex;
-    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
-        return fixed_decision<false>(exec_q, usm_alloc_type);
-    }
-    else {
-        bool supports_atomics = false;
-        const sycl::device &dev = exec_q.get_device();
-        if constexpr (atomic64) {
-            if (!dev.has(sycl::aspect::atomic64)) {
-                return false;
-            }
-        }
-        switch (usm_alloc_type) {
-        case sycl::usm::alloc::shared:
-            supports_atomics =
-                dev.has(sycl::aspect::usm_atomic_shared_allocations);
-            break;
-        case sycl::usm::alloc::host:
-            supports_atomics =
-                dev.has(sycl::aspect::usm_atomic_host_allocations);
-            break;
-        case sycl::usm::alloc::device:
-            supports_atomics = true;
-            break;
-        default:
-            supports_atomics = false;
-        }
-        return supports_atomics;
-    }
-}
-
-template <typename fnT, typename T> struct ArithmeticAtomicSupportFactory
-{
-    fnT get()
-    {
-        using dpctl::tensor::type_utils::is_complex;
-        if constexpr (std::is_floating_point_v<T> ||
-                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
-        {
-            // for real- and complex- floating point types, tree reduction has
-            // better round-off accumulation properties (round-off error is
-            // proportional to the log2(reduction_size), while naive elementwise
-            // summation used by atomic implementation has round-off error
-            // growing proportional to the reduction_size.), hence reduction
-            // over floating point types should always use tree_reduction
-            // algorithm, even though atomic implementation may be applicable
-            return fixed_decision<false>;
-        }
-        else {
-            return check_atomic_support<T>;
-        }
-    }
-};
-
-template <typename fnT, typename T> struct MinMaxAtomicSupportFactory
-{
-    fnT get() { return check_atomic_support<T>; }
-};
-
-template <typename fnT, typename T>
-struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
-{
-};
-
-template <typename fnT, typename T>
-struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory<fnT, T>
-{
-};
-
-template <typename fnT, typename T>
-struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
-{
-};
-
-template <typename fnT, typename T>
-struct ProductAtomicSupportFactory
-    : public ArithmeticAtomicSupportFactory<fnT, T>
-{
-};
-
-} // namespace atomic_support
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
deleted file mode 100644
index 1644318c26..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "all.hpp"
-#include "any.hpp"
-#include "argmax.hpp"
-#include "argmin.hpp"
-#include "logsumexp.hpp"
-#include "max.hpp"
-#include "min.hpp"
-#include "prod.hpp"
-#include "reduce_hypot.hpp"
-#include "sum.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-/*! @brief Add reduction functions to Python module */
-void init_reduction_functions(py::module_ m)
-{
-    init_all(m);
-    init_any(m);
-    init_argmax(m);
-    init_argmin(m);
-    init_logsumexp(m);
-    init_max(m);
-    init_min(m);
-    init_prod(m);
-    init_reduce_hypot(m);
-    init_sum(m);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp
deleted file mode 100644
index 747ffc061f..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_reduction_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
deleted file mode 100644
index 14abb2eb2e..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
+++ /dev/null
@@ -1,1320 +0,0 @@
-//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions,
-/// specifically functions for reductions.
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/reductions.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-/* ====================== dtype supported ======================== */
-
-/*! @brief Template implementing Python API for querying type support by
- * reduction which may support atomics */
-template <typename fnT, typename CheckAtomicSupportFnT>
-bool py_reduction_dtype_supported(
-    const py::dtype &input_dtype,
-    const py::dtype &output_dtype,
-    const std::string &dst_usm_type,
-    sycl::queue &q,
-    const fnT &atomic_dispatch_table,
-    const fnT &temps_dispatch_table,
-    const CheckAtomicSupportFnT &check_atomic_support)
-{
-    int arg_tn =
-        input_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int out_tn =
-        output_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int arg_typeid = -1;
-    int out_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
-        out_typeid = array_types.typenum_to_lookup_id(out_tn);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
-        throw std::runtime_error("Reduction type support check: lookup failed");
-    }
-
-    // remove_all_extents gets underlying type of table
-    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
-    fn_ptrT fn = nullptr;
-
-    sycl::usm::alloc kind = sycl::usm::alloc::unknown;
-
-    if (dst_usm_type == "device") {
-        kind = sycl::usm::alloc::device;
-    }
-    else if (dst_usm_type == "shared") {
-        kind = sycl::usm::alloc::shared;
-    }
-    else if (dst_usm_type == "host") {
-        kind = sycl::usm::alloc::host;
-    }
-    else {
-        throw py::value_error("Unrecognized `dst_usm_type` argument.");
-    }
-
-    bool supports_atomics = check_atomic_support[out_typeid](q, kind);
-
-    if (supports_atomics) {
-        fn = atomic_dispatch_table[arg_typeid][out_typeid];
-    }
-
-    if (fn == nullptr) {
-        // use slower reduction implementation using temporaries
-        fn = temps_dispatch_table[arg_typeid][out_typeid];
-    }
-
-    return (fn != nullptr);
-}
-
-/*! @brief Template implementing Python API for querying type support by tree
- * reduction */
-template <typename fnT>
-bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype,
-                                       const py::dtype &output_dtype,
-                                       const fnT &temps_dispatch_table)
-{
-    int arg_tn =
-        input_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int out_tn =
-        output_dtype.num(); // NumPy type numbers are the same as in dpctl
-    int arg_typeid = -1;
-    int out_typeid = -1;
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    try {
-        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
-        out_typeid = array_types.typenum_to_lookup_id(out_tn);
-    } catch (const std::exception &e) {
-        throw py::value_error(e.what());
-    }
-
-    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
-        out_typeid >= td_ns::num_types)
-    {
-        throw std::runtime_error("Reduction type support check: lookup failed");
-    }
-
-    auto fn = temps_dispatch_table[arg_typeid][out_typeid];
-
-    return (fn != nullptr);
-}
-
-/* ==================== Generic reductions ====================== */
-
-/*! @brief Template implementing Python API for reduction over axis which may
- * support atomics */
-template <typename strided_fnT, typename contig_fnT, typename SupportAtomicFnT>
-std::pair<sycl::event, sycl::event> py_reduction_over_axis(
-    const dpctl::tensor::usm_ndarray &src,
-    int trailing_dims_to_reduce, // comp over this many trailing indexes
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends,
-    const strided_fnT &atomic_dispatch_table,
-    const contig_fnT &axis0_atomic_dispatch_table,
-    const contig_fnT &axis1_atomic_dispatch_table,
-    const strided_fnT &temps_dispatch_table,
-    const contig_fnT &axis0_temps_dispatch_table,
-    const contig_fnT &axis1_temps_dispatch_table,
-    const SupportAtomicFnT &check_atomic_support)
-{
-    int src_nd = src.get_ndim();
-    int iteration_nd = src_nd - trailing_dims_to_reduce;
-    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
-        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
-                              "greater than rank of the array being reduced");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != iteration_nd) {
-        throw py::value_error("Destination array rank does not match input "
-                              "array rank and number of reduced dimensions");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
-        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error("Destination shape does not match unreduced "
-                              "dimensions of the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t dst_nelems = dst.get_size();
-
-    if (dst_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    std::size_t reduction_nelems(1);
-    for (int i = dst_nd; i < src_nd; ++i) {
-        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-    }
-
-    // check that dst and src do not overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    namespace td_ns = dpctl::tensor::type_dispatch;
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    void *data_ptr = dst.get_data();
-    const auto &ctx = exec_q.get_context();
-    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
-
-    bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type);
-
-    // handle special case when both reduction and iteration are 1D contiguous
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
-        // remove_all_extents gets underlying type of table
-        using contig_fn_ptr_T =
-            typename std::remove_all_extents<contig_fnT>::type;
-        contig_fn_ptr_T fn;
-        if (supports_atomics) {
-            fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
-        }
-        else {
-            fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
-        }
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-    else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
-        // remove_all_extents gets underlying type of table
-        using contig_fn_ptr_T =
-            typename std::remove_all_extents<contig_fnT>::type;
-        contig_fn_ptr_T fn;
-        if (supports_atomics) {
-            fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
-        }
-        else {
-            fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
-        }
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-
-    using dpctl::tensor::py_internal::simplify_iteration_space;
-    using dpctl::tensor::py_internal::simplify_iteration_space_1;
-
-    auto const &src_shape_vecs = src.get_shape_vector();
-    auto const &src_strides_vecs = src.get_strides_vector();
-    auto const &dst_strides_vecs = dst.get_strides_vector();
-
-    int reduction_nd = trailing_dims_to_reduce;
-    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
-    using shT = std::vector<py::ssize_t>;
-    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
-                              std::end(src_strides_vecs));
-
-    shT simplified_reduction_shape;
-    shT simplified_reduction_src_strides;
-    py::ssize_t reduction_src_offset(0);
-
-    simplify_iteration_space_1(
-        reduction_nd, reduction_shape_ptr, reduction_src_strides,
-        // output
-        simplified_reduction_shape, simplified_reduction_src_strides,
-        reduction_src_offset);
-
-    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
-
-    shT iteration_src_strides(std::begin(src_strides_vecs),
-                              std::begin(src_strides_vecs) + iteration_nd);
-    shT const &iteration_dst_strides = dst_strides_vecs;
-
-    shT simplified_iteration_shape;
-    shT simplified_iteration_src_strides;
-    shT simplified_iteration_dst_strides;
-    py::ssize_t iteration_src_offset(0);
-    py::ssize_t iteration_dst_offset(0);
-
-    if (iteration_nd == 0) {
-        if (dst_nelems != 1) {
-            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
-        }
-        iteration_nd = 1;
-        simplified_iteration_shape.push_back(1);
-        simplified_iteration_src_strides.push_back(0);
-        simplified_iteration_dst_strides.push_back(0);
-    }
-    else {
-        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
-                                 iteration_src_strides, iteration_dst_strides,
-                                 // output
-                                 simplified_iteration_shape,
-                                 simplified_iteration_src_strides,
-                                 simplified_iteration_dst_strides,
-                                 iteration_src_offset, iteration_dst_offset);
-    }
-
-    if ((reduction_nd == 1) && (iteration_nd == 1)) {
-        bool mat_reduce_over_axis1 = false;
-        bool mat_reduce_over_axis0 = false;
-        bool array_reduce_all_elems = false;
-        std::size_t iter_nelems = dst_nelems;
-
-        if (simplified_reduction_src_strides[0] == 1) {
-            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
-            mat_reduce_over_axis1 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (static_cast<std::size_t>(
-                     simplified_iteration_src_strides[0]) == reduction_nelems);
-        }
-        else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
-            mat_reduce_over_axis0 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (simplified_iteration_src_strides[0] == 1);
-        }
-
-        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
-            using contig_fn_ptr_T =
-                typename std::remove_all_extents<contig_fnT>::type;
-            contig_fn_ptr_T fn;
-            if (supports_atomics) {
-                fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
-            }
-            else {
-                fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
-            }
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis1_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis1_contig_ev);
-            }
-        }
-        else if (mat_reduce_over_axis0) {
-            using contig_fn_ptr_T =
-                typename std::remove_all_extents<contig_fnT>::type;
-            contig_fn_ptr_T fn;
-            if (supports_atomics) {
-                fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
-            }
-            else {
-                fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
-            }
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis0_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis0_contig_ev);
-            }
-        }
-    }
-
-    // remove_all_extents gets underlying type of table
-    using strided_fn_ptr_T =
-        typename std::remove_all_extents<strided_fnT>::type;
-    strided_fn_ptr_T fn = nullptr;
-
-    if (supports_atomics) {
-        fn = atomic_dispatch_table[src_typeid][dst_typeid];
-    }
-
-    if (fn == nullptr) {
-        // use slower reduction implementation using temporaries
-        fn = temps_dispatch_table[src_typeid][dst_typeid];
-        if (fn == nullptr) {
-            throw std::runtime_error("Datatypes are not supported");
-        }
-    }
-
-    std::vector<sycl::event> host_task_events{};
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto arrays_metainfo_packing_triple_ =
-        device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events,
-            // iteration metadata
-            simplified_iteration_shape, simplified_iteration_src_strides,
-            simplified_iteration_dst_strides,
-            // reduction metadata
-            simplified_reduction_shape, simplified_reduction_src_strides);
-    auto tmp_alloc_owner =
-        std::move(std::get<0>(arrays_metainfo_packing_triple_));
-    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
-    const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
-    const py::ssize_t *reduction_shape_stride =
-        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.resize(depends.size());
-    std::copy(depends.begin(), depends.end(), all_deps.begin());
-    all_deps.push_back(copy_metadata_ev);
-
-    auto reduction_ev =
-        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
-           iteration_nd, iter_shape_and_strides, iteration_src_offset,
-           iteration_dst_offset,
-           reduction_nd, // number dimensions being reduced
-           reduction_shape_stride, reduction_src_offset, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {reduction_ev}, tmp_alloc_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    sycl::event keep_args_event =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(keep_args_event, reduction_ev);
-}
-
-/* ================= No atomic reductions ====================== */
-
-/*! @brief Template implementing Python API for reduction over axis without
- * atomics */
-template <typename strided_fnT, typename contig_fnT>
-std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
-    const dpctl::tensor::usm_ndarray &src,
-    int trailing_dims_to_reduce, // comp over this many trailing indexes
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends,
-    const strided_fnT &temps_dispatch_table,
-    const contig_fnT &axis0_temps_dispatch_table,
-    const contig_fnT &axis1_temps_dispatch_table)
-{
-    int src_nd = src.get_ndim();
-    int iteration_nd = src_nd - trailing_dims_to_reduce;
-    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
-        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
-                              "greater than rank of the array being reduced");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != iteration_nd) {
-        throw py::value_error("Destination array rank does not match input "
-                              "array rank and number of reduced dimensions");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
-        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error("Destination shape does not match unreduced "
-                              "dimensions of the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t dst_nelems = dst.get_size();
-
-    if (dst_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    std::size_t reduction_nelems(1);
-    for (int i = dst_nd; i < src_nd; ++i) {
-        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-    }
-
-    // check that dst and src do not overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    namespace td_ns = dpctl::tensor::type_dispatch;
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    // handle special case when both reduction and iteration are 1D contiguous
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 1))
-    {
-        auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-    else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
-        auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-
-    using dpctl::tensor::py_internal::simplify_iteration_space;
-    using dpctl::tensor::py_internal::simplify_iteration_space_1;
-
-    auto const &src_shape_vecs = src.get_shape_vector();
-    auto const &src_strides_vecs = src.get_strides_vector();
-    auto const &dst_strides_vecs = dst.get_strides_vector();
-
-    int reduction_nd = trailing_dims_to_reduce;
-    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
-    using shT = std::vector<py::ssize_t>;
-    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
-                              std::end(src_strides_vecs));
-
-    shT simplified_reduction_shape;
-    shT simplified_reduction_src_strides;
-    py::ssize_t reduction_src_offset(0);
-
-    simplify_iteration_space_1(
-        reduction_nd, reduction_shape_ptr, reduction_src_strides,
-        // output
-        simplified_reduction_shape, simplified_reduction_src_strides,
-        reduction_src_offset);
-
-    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
-
-    shT iteration_src_strides(std::begin(src_strides_vecs),
-                              std::begin(src_strides_vecs) + iteration_nd);
-    shT const &iteration_dst_strides = dst_strides_vecs;
-
-    shT simplified_iteration_shape;
-    shT simplified_iteration_src_strides;
-    shT simplified_iteration_dst_strides;
-    py::ssize_t iteration_src_offset(0);
-    py::ssize_t iteration_dst_offset(0);
-
-    if (iteration_nd == 0) {
-        if (dst_nelems != 1) {
-            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
-        }
-        iteration_nd = 1;
-        simplified_iteration_shape.push_back(1);
-        simplified_iteration_src_strides.push_back(0);
-        simplified_iteration_dst_strides.push_back(0);
-    }
-    else {
-        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
-                                 iteration_src_strides, iteration_dst_strides,
-                                 // output
-                                 simplified_iteration_shape,
-                                 simplified_iteration_src_strides,
-                                 simplified_iteration_dst_strides,
-                                 iteration_src_offset, iteration_dst_offset);
-    }
-
-    if ((reduction_nd == 1) && (iteration_nd == 1)) {
-        bool mat_reduce_over_axis1 = false;
-        bool mat_reduce_over_axis0 = false;
-        bool array_reduce_all_elems = false;
-        std::size_t iter_nelems = dst_nelems;
-
-        if (simplified_reduction_src_strides[0] == 1) {
-            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
-            mat_reduce_over_axis1 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (static_cast<std::size_t>(
-                     simplified_iteration_src_strides[0]) == reduction_nelems);
-        }
-        else if (static_cast<std::size_t>(
-                     simplified_reduction_src_strides[0]) == iter_nelems)
-        {
-            mat_reduce_over_axis0 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (simplified_iteration_src_strides[0] == 1);
-        }
-
-        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
-            auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis1_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis1_contig_ev);
-            }
-        }
-        else if (mat_reduce_over_axis0) {
-            auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis0_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis0_contig_ev);
-            }
-        }
-    }
-
-    auto fn = temps_dispatch_table[src_typeid][dst_typeid];
-    if (fn == nullptr) {
-        throw std::runtime_error("Datatypes are not supported");
-    }
-
-    std::vector<sycl::event> host_task_events{};
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto arrays_metainfo_packing_triple_ =
-        device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events,
-            // iteration metadata
-            simplified_iteration_shape, simplified_iteration_src_strides,
-            simplified_iteration_dst_strides,
-            // reduction metadata
-            simplified_reduction_shape, simplified_reduction_src_strides);
-    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
-    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
-    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
-    const py::ssize_t *reduction_shape_stride =
-        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.resize(depends.size());
-    std::copy(depends.begin(), depends.end(), all_deps.begin());
-    all_deps.push_back(copy_metadata_ev);
-
-    auto reduction_ev =
-        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(),
-           iteration_nd, iter_shape_and_strides, iteration_src_offset,
-           iteration_dst_offset,
-           reduction_nd, // number dimensions being reduced
-           reduction_shape_stride, reduction_src_offset, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {reduction_ev}, tmp_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    sycl::event keep_args_event =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(keep_args_event, reduction_ev);
-}
-
-/*! @brief Template implementing Python API for searching over an axis */
-template <typename strided_fnT, typename contig_fnT>
-std::pair<sycl::event, sycl::event> py_search_over_axis(
-    const dpctl::tensor::usm_ndarray &src,
-    int trailing_dims_to_reduce, // comp over this many trailing indexes
-    const dpctl::tensor::usm_ndarray &dst,
-    sycl::queue &exec_q,
-    const std::vector<sycl::event> &depends,
-    const strided_fnT &strided_dispatch_table,
-    const contig_fnT &axis0_contig_dispatch_table,
-    const contig_fnT &axis1_contig_dispatch_table)
-{
-    int src_nd = src.get_ndim();
-    int iteration_nd = src_nd - trailing_dims_to_reduce;
-    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
-        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
-                              "greater than rank of the array being reduced");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != iteration_nd) {
-        throw py::value_error("Destination array rank does not match input "
-                              "array rank and number of reduced dimensions");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
-        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error("Destination shape does not match unreduced "
-                              "dimensions of the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t dst_nelems = dst.get_size();
-
-    if (dst_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    std::size_t reduction_nelems(1);
-    for (int i = dst_nd; i < src_nd; ++i) {
-        reduction_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-    }
-
-    // check that dst and src do not overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    namespace td_ns = dpctl::tensor::type_dispatch;
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    // handle special case when both reduction and iteration are 1D contiguous
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-
-    if (is_src_c_contig && is_dst_c_contig) {
-        auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-    else if (is_src_f_contig && dst_nd == 1) {
-        auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
-        if (fn != nullptr) {
-            std::size_t iter_nelems = dst_nelems;
-
-            static constexpr py::ssize_t zero_offset = 0;
-
-            sycl::event reduction_over_axis_contig_ev =
-                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                   dst.get_data(),
-                   zero_offset, // iteration_src_offset
-                   zero_offset, // iteration_dst_offset
-                   zero_offset, // reduction_src_offset
-                   depends);
-
-            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
-
-            return std::make_pair(keep_args_event,
-                                  reduction_over_axis_contig_ev);
-        }
-    }
-
-    using dpctl::tensor::py_internal::simplify_iteration_space;
-
-    auto const &src_shape_vecs = src.get_shape_vector();
-    auto const &src_strides_vecs = src.get_strides_vector();
-    auto const &dst_strides_vecs = dst.get_strides_vector();
-
-    int reduction_nd = trailing_dims_to_reduce;
-    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
-    using shT = std::vector<py::ssize_t>;
-    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
-                              std::end(src_strides_vecs));
-
-    shT compact_reduction_shape;
-    shT compact_reduction_src_strides;
-    py::ssize_t reduction_src_offset(0);
-
-    compact_iteration_space(
-        reduction_nd, reduction_shape_ptr, reduction_src_strides,
-        // output
-        compact_reduction_shape, compact_reduction_src_strides);
-
-    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
-
-    shT iteration_src_strides(std::begin(src_strides_vecs),
-                              std::begin(src_strides_vecs) + iteration_nd);
-    shT const &iteration_dst_strides = dst_strides_vecs;
-
-    shT simplified_iteration_shape;
-    shT simplified_iteration_src_strides;
-    shT simplified_iteration_dst_strides;
-    py::ssize_t iteration_src_offset(0);
-    py::ssize_t iteration_dst_offset(0);
-
-    if (iteration_nd == 0) {
-        if (dst_nelems != 1) {
-            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
-        }
-        iteration_nd = 1;
-        simplified_iteration_shape.push_back(1);
-        simplified_iteration_src_strides.push_back(0);
-        simplified_iteration_dst_strides.push_back(0);
-    }
-    else {
-        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
-                                 iteration_src_strides, iteration_dst_strides,
-                                 // output
-                                 simplified_iteration_shape,
-                                 simplified_iteration_src_strides,
-                                 simplified_iteration_dst_strides,
-                                 iteration_src_offset, iteration_dst_offset);
-    }
-
-    if ((reduction_nd == 1) && (iteration_nd == 1)) {
-        bool mat_reduce_over_axis1 = false;
-        bool mat_reduce_over_axis0 = false;
-        std::size_t iter_nelems = dst_nelems;
-
-        if (compact_reduction_src_strides[0] == 1) {
-            mat_reduce_over_axis1 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (static_cast<std::size_t>(
-                     simplified_iteration_src_strides[0]) == reduction_nelems);
-        }
-        else if (static_cast<std::size_t>(compact_reduction_src_strides[0]) ==
-                 iter_nelems)
-        {
-            mat_reduce_over_axis0 =
-                (simplified_iteration_dst_strides[0] == 1) &&
-                (simplified_iteration_src_strides[0] == 1);
-        }
-
-        if (mat_reduce_over_axis1) {
-            auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis1_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis1_contig_ev);
-            }
-        }
-        else if (mat_reduce_over_axis0) {
-            auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
-            if (fn != nullptr) {
-                sycl::event reduction_over_axis0_contig_ev =
-                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
-                       dst.get_data(), iteration_src_offset,
-                       iteration_dst_offset, reduction_src_offset, depends);
-
-                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
-                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
-
-                return std::make_pair(keep_args_event,
-                                      reduction_over_axis0_contig_ev);
-            }
-        }
-    }
-
-    auto fn = strided_dispatch_table[src_typeid][dst_typeid];
-    if (fn == nullptr) {
-        throw std::runtime_error("Datatypes are not supported");
-    }
-
-    std::vector<sycl::event> host_task_events{};
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-
-    auto arrays_metainfo_packing_triple_ =
-        device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events,
-            // iteration metadata
-            simplified_iteration_shape, simplified_iteration_src_strides,
-            simplified_iteration_dst_strides,
-            // reduction metadata
-            compact_reduction_shape, compact_reduction_src_strides);
-    auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_));
-    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
-    const py::ssize_t *temp_allocation_ptr = tmp_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
-    const py::ssize_t *reduction_shape_stride =
-        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.resize(depends.size());
-    std::copy(depends.begin(), depends.end(), all_deps.begin());
-    all_deps.push_back(copy_metadata_ev);
-
-    auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
-                      dst.get_data(), iteration_nd, iter_shape_and_strides,
-                      iteration_src_offset, iteration_dst_offset,
-                      reduction_nd, // number dimensions being reduced
-                      reduction_shape_stride, reduction_src_offset, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {comp_ev}, tmp_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    sycl::event keep_args_event =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(keep_args_event, comp_ev);
-}
-
-/* ================= Atomic only reductions ====================== */
-
-/*! @brief Template implementing Python API for boolean reductions over an axis
- */
-template <typename contig_dispatchT,
-          typename strided_dispatchT,
-          typename atomic_support_fnT>
-std::pair<sycl::event, sycl::event>
-py_boolean_reduction(const dpctl::tensor::usm_ndarray &src,
-                     int trailing_dims_to_reduce,
-                     const dpctl::tensor::usm_ndarray &dst,
-                     sycl::queue &exec_q,
-                     const std::vector<sycl::event> &depends,
-                     const contig_dispatchT &axis1_contig_dispatch_vector,
-                     const contig_dispatchT &axis0_contig_dispatch_vector,
-                     const strided_dispatchT &strided_dispatch_vector,
-                     const atomic_support_fnT check_atomic_support)
-{
-    int src_nd = src.get_ndim();
-    int iter_nd = src_nd - trailing_dims_to_reduce;
-    if (trailing_dims_to_reduce <= 0 || iter_nd < 0) {
-        throw py::value_error("Trailing_dim_to_reduce must be positive, but no "
-                              "greater than rank of the array being reduced");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != iter_nd) {
-        throw py::value_error("Destination array rank does not match input "
-                              "array rank and number of reduced dimensions");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
-        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error("Destination shape does not match unreduced "
-                              "dimensions of the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t dst_nelems = dst.get_size();
-
-    std::size_t red_nelems(1);
-    for (int i = dst_nd; i < src_nd; ++i) {
-        red_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(dst, src)) {
-        throw py::value_error("Arrays are expected to have no memory overlap");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems);
-
-    const char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    static constexpr int int32_typeid =
-        static_cast<int>(td_ns::typenum_t::INT32);
-    if (dst_typeid != int32_typeid) {
-        throw py::value_error(
-            "Unexpected data type of destination array, expecting 'int32'");
-    }
-
-    void *data_ptr = dst.get_data();
-    const auto &ctx = exec_q.get_context();
-    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
-
-    bool supports_atomics = check_atomic_support(exec_q, usm_type);
-    if (!supports_atomics) {
-        throw py::value_error(
-            "This reduction is not supported for this device and usm_type.");
-    }
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_src_f_contig = src.is_f_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    if ((is_src_c_contig && is_dst_c_contig) ||
-        (is_src_f_contig && dst_nelems == 0))
-    {
-        auto fn = axis1_contig_dispatch_vector[src_typeid];
-        static constexpr py::ssize_t zero_offset = 0;
-
-        sycl::event red_ev =
-            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
-               zero_offset, zero_offset, depends);
-
-        sycl::event keep_args_event =
-            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
-
-        return std::make_pair(keep_args_event, red_ev);
-    }
-    else if (is_src_f_contig &&
-             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
-    {
-        auto fn = axis0_contig_dispatch_vector[src_typeid];
-        static constexpr py::ssize_t zero_offset = 0;
-
-        sycl::event red_ev =
-            fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset,
-               zero_offset, zero_offset, depends);
-
-        sycl::event keep_args_event =
-            dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
-
-        return std::make_pair(keep_args_event, red_ev);
-    }
-
-    auto src_shape_vecs = src.get_shape_vector();
-    auto src_strides_vecs = src.get_strides_vector();
-    auto dst_strides_vecs = dst.get_strides_vector();
-
-    int simplified_red_nd = trailing_dims_to_reduce;
-
-    using shT = std::vector<py::ssize_t>;
-    shT red_src_strides(std::begin(src_strides_vecs) + dst_nd,
-                        std::end(src_strides_vecs));
-
-    shT simplified_red_shape;
-    shT simplified_red_src_strides;
-    py::ssize_t red_src_offset(0);
-
-    using dpctl::tensor::py_internal::simplify_iteration_space_1;
-    simplify_iteration_space_1(
-        simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides,
-        // output
-        simplified_red_shape, simplified_red_src_strides, red_src_offset);
-
-    shT iter_src_strides(std::begin(src_strides_vecs),
-                         std::begin(src_strides_vecs) + iter_nd);
-    shT const &iter_dst_strides = dst_strides_vecs;
-
-    shT simplified_iter_shape;
-    shT simplified_iter_src_strides;
-    shT simplified_iter_dst_strides;
-    py::ssize_t iter_src_offset(0);
-    py::ssize_t iter_dst_offset(0);
-
-    if (iter_nd == 0) {
-        if (dst_nelems != 1) {
-            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
-        }
-        iter_nd = 1;
-        simplified_iter_shape.push_back(1);
-        simplified_iter_src_strides.push_back(0);
-        simplified_iter_dst_strides.push_back(0);
-    }
-    else {
-        using dpctl::tensor::py_internal::simplify_iteration_space;
-        simplify_iteration_space(
-            iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides,
-            // output
-            simplified_iter_shape, simplified_iter_src_strides,
-            simplified_iter_dst_strides, iter_src_offset, iter_dst_offset);
-    }
-
-    if (simplified_red_nd == 1 && iter_nd == 1) {
-        bool mat_reduce_over_axis1 = false;
-        bool mat_reduce_over_axis0 = false;
-        bool array_reduce_all_elems = false;
-        std::size_t iter_nelems = dst_nelems;
-
-        if (simplified_red_src_strides[0] == 1) {
-            array_reduce_all_elems = (simplified_iter_shape[0] == 1);
-            mat_reduce_over_axis1 =
-                (simplified_iter_dst_strides[0] == 1) &&
-                (static_cast<std::size_t>(simplified_iter_src_strides[0]) ==
-                 red_nelems);
-        }
-        else if (static_cast<std::size_t>(simplified_red_src_strides[0]) ==
-                 iter_nelems)
-        {
-            mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) &&
-                                    (simplified_iter_src_strides[0] == 1);
-        }
-        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
-            auto fn = axis1_contig_dispatch_vector[src_typeid];
-
-            sycl::event red_ev =
-                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
-                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
-
-            sycl::event keep_args_event =
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
-
-            return std::make_pair(keep_args_event, red_ev);
-        }
-        else if (mat_reduce_over_axis0) {
-            auto fn = axis0_contig_dispatch_vector[src_typeid];
-
-            sycl::event red_ev =
-                fn(exec_q, iter_nelems, red_nelems, src_data, dst_data,
-                   iter_src_offset, iter_dst_offset, red_src_offset, depends);
-
-            sycl::event keep_args_event =
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev});
-
-            return std::make_pair(keep_args_event, red_ev);
-        }
-    }
-
-    auto fn = strided_dispatch_vector[src_typeid];
-
-    std::vector<sycl::event> host_task_events{};
-    auto iter_red_metadata_packing_triple_ =
-        dpctl::tensor::offset_utils::device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_iter_shape,
-            simplified_iter_src_strides, simplified_iter_dst_strides,
-            simplified_red_shape, simplified_red_src_strides);
-    auto packed_shapes_strides_owner =
-        std::move(std::get<0>(iter_red_metadata_packing_triple_));
-    const auto &copy_metadata_ev =
-        std::get<2>(iter_red_metadata_packing_triple_);
-    const py::ssize_t *packed_shapes_and_strides =
-        packed_shapes_strides_owner.get();
-
-    const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides;
-    const py::ssize_t *red_shape_stride =
-        packed_shapes_and_strides + 3 * simplified_iter_shape.size();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.resize(depends.size());
-    std::copy(depends.begin(), depends.end(), all_deps.begin());
-    all_deps.push_back(copy_metadata_ev);
-
-    auto red_ev =
-        fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd,
-           iter_shape_and_strides, iter_src_offset, iter_dst_offset,
-           simplified_red_nd, red_shape_stride, red_src_offset, all_deps);
-
-    sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
-        exec_q, {red_ev}, packed_shapes_strides_owner);
-    host_task_events.push_back(temp_cleanup_ev);
-
-    sycl::event keep_args_event =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(keep_args_event, red_ev);
-}
-
-extern void init_reduction_functions(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/sum.cpp b/dpctl/tensor/libtensor/source/reductions/sum.cpp
deleted file mode 100644
index f449a6cde3..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/sum.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-#include <vector>
-
-#include "kernels/reductions.hpp"
-#include "utils/type_dispatch_building.hpp"
-
-#include "reduction_atomic_support.hpp"
-#include "reduction_over_axis.hpp"
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    sum_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-/* @brief Types supported by plus-reduction code based on atomic_ref */
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForSumReductionAtomic
-{
-
-    /* value if true a kernel for <argTy, outTy> must be instantiated, false
-     * otherwise */
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
-        // input int8
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-        // input uint8
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-        // input int16
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-        // input uint16
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-        // input int32
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        // input uint32
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-        // input int64
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        // input uint64
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename argTy, typename outTy>
-struct TypePairSupportDataForSumReductionTemps
-{
-
-    static constexpr bool is_defined = std::disjunction<
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, bool>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
-
-        // input int8_t
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
-
-        // input uint8_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
-
-        // input int16_t
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
-
-        // input uint16_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
-
-        // input int32_t
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
-
-        // input uint32_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
-
-        // input int64_t
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
-
-        // input uint64_t
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
-
-        // input half
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, double>,
-        td_ns::
-            TypePairDefinedEntry<argTy, sycl::half, outTy, std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    sycl::half,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // input float
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, float>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, double>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy, float, outTy, std::complex<double>>,
-
-        // input double
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, double>,
-        td_ns::TypePairDefinedEntry<argTy, double, outTy, std::complex<double>>,
-
-        // input std::complex
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<float>>,
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<float>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        td_ns::TypePairDefinedEntry<argTy,
-                                    std::complex<double>,
-                                    outTy,
-                                    std::complex<double>>,
-
-        // fall-through
-        td_ns::NotDefinedEntry>::is_defined;
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxisAtomicStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::plus<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_with_atomics_strided_impl<srcTy, dstTy,
-                                                               ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxisTempsStridedFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT =
-                std::conditional_t<std::is_same_v<dstTy, bool>,
-                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_over_group_temps_strided_impl<srcTy, dstTy,
-                                                        ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxis1AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::plus<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_with_atomics_contig_impl<
-                    srcTy, dstTy, ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxis0AtomicContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionAtomic<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT = sycl::plus<dstTy>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_with_atomics_contig_impl<
-                    srcTy, dstTy, ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxis1TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT =
-                std::conditional_t<std::is_same_v<dstTy, bool>,
-                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_axis1_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename srcTy, typename dstTy>
-struct SumOverAxis0TempsContigFactory
-{
-    fnT get() const
-    {
-        if constexpr (TypePairSupportDataForSumReductionTemps<
-                          srcTy, dstTy>::is_defined)
-        {
-            using ReductionOpT =
-                std::conditional_t<std::is_same_v<dstTy, bool>,
-                                   sycl::logical_or<dstTy>, sycl::plus<dstTy>>;
-            return dpctl::tensor::kernels::
-                reduction_axis0_over_group_temps_contig_impl<srcTy, dstTy,
-                                                             ReductionOpT>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void populate_sum_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         SumOverAxisAtomicStridedFactory, num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         SumOverAxisTempsStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis1AtomicContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis0AtomicContigFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis1TempsContigFactory, td_ns::num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table);
-
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis0TempsContigFactory, td_ns::num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table);
-}
-
-using atomic_support::atomic_support_fn_ptr_t;
-static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types];
-
-void populate_sum_atomic_support_dispatch_vector(void)
-{
-    using td_ns::DispatchVectorBuilder;
-
-    using atomic_support::SumAtomicSupportFactory;
-    DispatchVectorBuilder<atomic_support_fn_ptr_t, SumAtomicSupportFactory,
-                          td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(sum_atomic_support_vector);
-}
-
-} // namespace impl
-
-void init_sum(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-    {
-        using impl::populate_sum_over_axis_dispatch_tables;
-        populate_sum_over_axis_dispatch_tables();
-        using impl::sum_over_axis0_contig_atomic_dispatch_table;
-        using impl::sum_over_axis0_contig_temps_dispatch_table;
-        using impl::sum_over_axis1_contig_atomic_dispatch_table;
-        using impl::sum_over_axis1_contig_temps_dispatch_table;
-        using impl::sum_over_axis_strided_atomic_dispatch_table;
-        using impl::sum_over_axis_strided_temps_dispatch_table;
-
-        using impl::populate_sum_atomic_support_dispatch_vector;
-        populate_sum_atomic_support_dispatch_vector();
-        using impl::sum_atomic_support_vector;
-
-        auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_reduction_over_axis;
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                sum_over_axis_strided_atomic_dispatch_table,
-                sum_over_axis0_contig_atomic_dispatch_table,
-                sum_over_axis1_contig_atomic_dispatch_table,
-                sum_over_axis_strided_temps_dispatch_table,
-                sum_over_axis0_contig_temps_dispatch_table,
-                sum_over_axis1_contig_temps_dispatch_table,
-                sum_atomic_support_vector);
-        };
-        m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sum_dtype_supported =
-            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
-                const std::string &dst_usm_type, sycl::queue &q) {
-                using dpctl::tensor::py_internal::py_reduction_dtype_supported;
-                return py_reduction_dtype_supported(
-                    input_dtype, output_dtype, dst_usm_type, q,
-                    sum_over_axis_strided_atomic_dispatch_table,
-                    sum_over_axis_strided_temps_dispatch_table,
-                    sum_atomic_support_vector);
-            };
-        m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "",
-              py::arg("arg_dtype"), py::arg("out_dtype"),
-              py::arg("dst_usm_type"), py::arg("sycl_queue"));
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/sum.hpp b/dpctl/tensor/libtensor/source/reductions/sum.hpp
deleted file mode 100644
index edf03f5b14..0000000000
--- a/dpctl/tensor/libtensor/source/reductions/sum.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_sum(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp
deleted file mode 100644
index 9dfc84681e..0000000000
--- a/dpctl/tensor/libtensor/source/repeat.cpp
+++ /dev/null
@@ -1,817 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/repeat.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "simplify_iteration_space.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t;
-static repeat_by_sequence_fn_ptr_t
-    repeat_by_sequence_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t;
-static repeat_by_sequence_1d_fn_ptr_t
-    repeat_by_sequence_1d_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t;
-static repeat_by_scalar_fn_ptr_t
-    repeat_by_scalar_dispatch_vector[td_ns::num_types];
-
-using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t;
-static repeat_by_scalar_1d_fn_ptr_t
-    repeat_by_scalar_1d_dispatch_vector[td_ns::num_types];
-
-void init_repeat_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::repeat::RepeatSequenceFactory;
-    td_ns::DispatchVectorBuilder<repeat_by_sequence_fn_ptr_t,
-                                 RepeatSequenceFactory, td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector);
-
-    using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory;
-    td_ns::DispatchVectorBuilder<repeat_by_sequence_1d_fn_ptr_t,
-                                 RepeatSequence1DFactory, td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector);
-
-    using dpctl::tensor::kernels::repeat::RepeatScalarFactory;
-    td_ns::DispatchVectorBuilder<repeat_by_scalar_fn_ptr_t, RepeatScalarFactory,
-                                 td_ns::num_types>
-        dvb3;
-    dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector);
-
-    using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory;
-    td_ns::DispatchVectorBuilder<repeat_by_scalar_1d_fn_ptr_t,
-                                 RepeatScalar1DFactory, td_ns::num_types>
-        dvb4;
-    dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector);
-}
-
-std::pair<sycl::event, sycl::event>
-py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst,
-                      const dpctl::tensor::usm_ndarray &reps,
-                      const dpctl::tensor::usm_ndarray &cumsum,
-                      int axis,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> &depends)
-{
-    int src_nd = src.get_ndim();
-    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
-        (axis > 0 && src_nd == 0))
-    {
-        throw py::value_error("Specified axis is invalid.");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
-        throw py::value_error("Number of dimensions of source and destination "
-                              "arrays is not consistent");
-    }
-
-    int reps_nd = reps.get_ndim();
-    if (reps_nd != 1) {
-        throw py::value_error("`reps` array must be 1-dimensional");
-    }
-
-    if (cumsum.get_ndim() != 1) {
-        throw py::value_error("`cumsum` array must be 1-dimensional.");
-    }
-
-    if (!cumsum.is_c_contiguous()) {
-        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t reps_sz = reps.get_size();
-    std::size_t cumsum_sz = cumsum.get_size();
-
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool same_orthog_dims(true);
-    std::size_t orthog_nelems(1); // number of orthogonal iterations
-    for (auto i = 0; i < axis; ++i) {
-        auto src_sh_i = src_shape[i];
-        orthog_nelems *= src_sh_i;
-        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
-    }
-    for (auto i = axis + 1; i < src_nd; ++i) {
-        auto src_sh_i = src_shape[i];
-        orthog_nelems *= src_sh_i;
-        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
-    }
-
-    std::size_t src_axis_nelems(1);
-    if (src_nd > 0) {
-        src_axis_nelems = src_shape[axis];
-    }
-    std::size_t dst_axis_nelems(dst_shape[axis]);
-
-    // shape at repeated axis must be equal to the sum of reps
-    if (!same_orthog_dims || src_axis_nelems != reps_sz ||
-        src_axis_nelems != cumsum_sz)
-    {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    if (orthog_nelems == 0 || src_axis_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, orthog_nelems * dst_axis_nelems);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src or reps
-    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-    int reps_typenum = reps.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data type");
-    }
-
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int64_typeid) {
-        throw py::value_error(
-            "Unexpected data type of `cumsum` array, expecting "
-            "'int64'");
-    }
-
-    if (reps_typeid != cumsum_typeid) {
-        throw py::value_error("`reps` array must have the same elemental "
-                              "data type as cumsum");
-    }
-
-    const char *src_data_p = src.get_data();
-    const char *reps_data_p = reps.get_data();
-    const char *cumsum_data_p = cumsum.get_data();
-    char *dst_data_p = dst.get_data();
-
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    auto reps_shape_vec = reps.get_shape_vector();
-    auto reps_strides_vec = reps.get_strides_vector();
-
-    sycl::event repeat_ev;
-    std::vector<sycl::event> host_task_events{};
-    if (axis == 0 && src_nd < 2) {
-        // empty orthogonal directions
-
-        auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
-
-        assert(dst_shape_vec.size() == 1);
-        assert(dst_strides_vec.size() == 1);
-
-        if (src_nd == 0) {
-            src_shape_vec = {0};
-            src_strides_vec = {0};
-        }
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, src_shape_vec, src_strides_vec);
-        auto packed_src_shape_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_src_shape_strides =
-            packed_src_shape_strides_owner.get();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        repeat_ev =
-            fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p,
-               cumsum_data_p, src_nd, packed_src_shape_strides,
-               dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0],
-               reps_strides_vec[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {repeat_ev}, packed_src_shape_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-    else {
-        // non-empty othogonal directions
-
-        auto fn = repeat_by_sequence_dispatch_vector[src_typeid];
-
-        int orthog_nd = src_nd - 1;
-
-        using shT = std::vector<py::ssize_t>;
-        shT orthog_src_shape;
-        shT orthog_src_strides;
-        shT axis_src_shape;
-        shT axis_src_stride;
-        dpctl::tensor::py_internal::split_iteration_space(
-            src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape,
-            axis_src_shape, orthog_src_strides, axis_src_stride);
-
-        shT orthog_dst_shape;
-        shT orthog_dst_strides;
-        shT axis_dst_shape;
-        shT axis_dst_stride;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape,
-            axis_dst_shape, orthog_dst_strides, axis_dst_stride);
-
-        assert(orthog_src_shape.size() == static_cast<std::size_t>(orthog_nd));
-        assert(orthog_dst_shape.size() == static_cast<std::size_t>(orthog_nd));
-        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
-                          orthog_dst_shape.begin()));
-
-        shT simplified_orthog_shape;
-        shT simplified_orthog_src_strides;
-        shT simplified_orthog_dst_strides;
-
-        const py::ssize_t *_shape = orthog_src_shape.data();
-
-        py::ssize_t orthog_src_offset(0);
-        py::ssize_t orthog_dst_offset(0);
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
-            // output
-            simplified_orthog_shape, simplified_orthog_src_strides,
-            simplified_orthog_dst_strides, orthog_src_offset,
-            orthog_dst_offset);
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_orthog_shape,
-            simplified_orthog_src_strides, simplified_orthog_dst_strides);
-        auto packed_shapes_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_shapes_strides =
-            packed_shapes_strides_owner.get();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p,
-                       dst_data_p, reps_data_p, cumsum_data_p,
-                       // data to build orthog indexer
-                       orthog_nd, packed_shapes_strides, orthog_src_offset,
-                       orthog_dst_offset,
-                       // data to build indexers along repeated axis in src
-                       axis_src_shape[0], axis_src_stride[0],
-                       // data to build indexer along repeated axis in dst
-                       axis_dst_shape[0], axis_dst_stride[0],
-                       // data to build indexer for reps array
-                       reps_shape_vec[0], reps_strides_vec[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {repeat_ev}, packed_shapes_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-
-    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
-        exec_q, {src, reps, cumsum, dst}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst,
-                      const dpctl::tensor::usm_ndarray &reps,
-                      const dpctl::tensor::usm_ndarray &cumsum,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> &depends)
-{
-
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != 1) {
-        throw py::value_error(
-            "`dst` array must be 1-dimensional when repeating a full array");
-    }
-
-    int reps_nd = reps.get_ndim();
-    if (reps_nd != 1) {
-        throw py::value_error("`reps` array must be 1-dimensional");
-    }
-
-    if (cumsum.get_ndim() != 1) {
-        throw py::value_error("`cumsum` array must be 1-dimensional.");
-    }
-
-    if (!cumsum.is_c_contiguous()) {
-        throw py::value_error("Expecting `cumsum` array to be C-contiguous.");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t src_sz = src.get_size();
-    std::size_t reps_sz = reps.get_size();
-    std::size_t cumsum_sz = cumsum.get_size();
-
-    // shape at repeated axis must be equal to the sum of reps
-    if (src_sz != reps_sz || src_sz != cumsum_sz) {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    if (src_sz == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
-                                                               dst.get_size());
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src, cumsum, or reps
-    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-    int reps_typenum = reps.get_typenum();
-    int cumsum_typenum = cumsum.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
-    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data type");
-    }
-
-    static constexpr int int64_typeid =
-        static_cast<int>(td_ns::typenum_t::INT64);
-    if (cumsum_typeid != int64_typeid) {
-        throw py::value_error(
-            "Unexpected data type of `cumsum` array, expecting "
-            "'int64'");
-    }
-
-    if (reps_typeid != cumsum_typeid) {
-        throw py::value_error("`reps` array must have the same elemental "
-                              "data type as cumsum");
-    }
-
-    const char *src_data_p = src.get_data();
-    const char *reps_data_p = reps.get_data();
-    const char *cumsum_data_p = cumsum.get_data();
-    char *dst_data_p = dst.get_data();
-
-    int src_nd = src.get_ndim();
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-    if (src_nd == 0) {
-        src_shape_vec = {0};
-        src_strides_vec = {0};
-    }
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    auto reps_shape_vec = reps.get_shape_vector();
-    auto reps_strides_vec = reps.get_strides_vector();
-
-    std::vector<sycl::event> host_task_events{};
-
-    auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, src_shape_vec, src_strides_vec);
-    auto packed_src_shapes_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple1));
-    sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-    const py::ssize_t *packed_src_shapes_strides =
-        packed_src_shapes_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shapes_strides_ev);
-
-    assert(all_deps.size() == depends.size() + 1);
-
-    sycl::event repeat_ev = fn(
-        exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p,
-        src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0],
-        reps_shape_vec[0], reps_strides_vec[0], all_deps);
-
-    sycl::event cleanup_tmp_allocations_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {repeat_ev}, packed_src_shapes_strides_owner);
-    host_task_events.push_back(cleanup_tmp_allocations_ev);
-
-    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
-        exec_q, {src, reps, cumsum, dst}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    const dpctl::tensor::usm_ndarray &dst,
-                    const py::ssize_t reps,
-                    int axis,
-                    sycl::queue &exec_q,
-                    const std::vector<sycl::event> &depends)
-{
-    int src_nd = src.get_ndim();
-    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
-        (axis > 0 && src_nd == 0))
-    {
-        throw py::value_error("Specified axis is invalid.");
-    }
-
-    int dst_nd = dst.get_ndim();
-    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
-        throw py::value_error("Number of dimensions of source and destination "
-                              "arrays is not consistent");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    bool same_orthog_dims(true);
-    std::size_t orthog_nelems(1); // number of orthogonal iterations
-    for (auto i = 0; i < axis; ++i) {
-        auto src_sh_i = src_shape[i];
-        orthog_nelems *= src_sh_i;
-        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
-    }
-    for (auto i = axis + 1; i < src_nd; ++i) {
-        auto src_sh_i = src_shape[i];
-        orthog_nelems *= src_sh_i;
-        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
-    }
-
-    std::size_t src_axis_nelems(1);
-    if (src_nd > 0) {
-        src_axis_nelems = src_shape[axis];
-    }
-    std::size_t dst_axis_nelems(dst_shape[axis]);
-
-    // shape at repeated axis must be equal to the shape of src at the axis *
-    // reps
-    if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    if (orthog_nelems == 0 || src_axis_nelems == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, orthog_nelems * (src_axis_nelems * reps));
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src
-    if (overlap(dst, src)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data type");
-    }
-
-    const char *src_data_p = src.get_data();
-    char *dst_data_p = dst.get_data();
-
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    sycl::event repeat_ev;
-    std::vector<sycl::event> host_task_events{};
-    if (axis == 0 && src_nd < 2) {
-        // empty orthogonal directions
-
-        auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
-
-        assert(dst_shape_vec.size() == 1);
-        assert(dst_strides_vec.size() == 1);
-
-        if (src_nd == 0) {
-            src_shape_vec = {0};
-            src_strides_vec = {0};
-        }
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, src_shape_vec, src_strides_vec);
-        auto packed_src_shape_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_src_shape_strides =
-            packed_src_shape_strides_owner.get();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps,
-                       src_nd, packed_src_shape_strides, dst_shape_vec[0],
-                       dst_strides_vec[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {repeat_ev}, packed_src_shape_strides_owner);
-
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-    else {
-        // non-empty othogonal directions
-
-        auto fn = repeat_by_scalar_dispatch_vector[src_typeid];
-
-        int orthog_nd = src_nd - 1;
-
-        using shT = std::vector<py::ssize_t>;
-        shT orthog_src_shape;
-        shT orthog_src_strides;
-        shT axis_src_shape;
-        shT axis_src_stride;
-        dpctl::tensor::py_internal::split_iteration_space(
-            src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape,
-            axis_src_shape, orthog_src_strides, axis_src_stride);
-
-        shT orthog_dst_shape;
-        shT orthog_dst_strides;
-        shT axis_dst_shape;
-        shT axis_dst_stride;
-        dpctl::tensor::py_internal::split_iteration_space(
-            dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape,
-            axis_dst_shape, orthog_dst_strides, axis_dst_stride);
-
-        assert(orthog_src_shape.size() == static_cast<std::size_t>(orthog_nd));
-        assert(orthog_dst_shape.size() == static_cast<std::size_t>(orthog_nd));
-        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
-                          orthog_dst_shape.begin()));
-
-        shT simplified_orthog_shape;
-        shT simplified_orthog_src_strides;
-        shT simplified_orthog_dst_strides;
-
-        const py::ssize_t *_shape = orthog_src_shape.data();
-
-        py::ssize_t orthog_src_offset(0);
-        py::ssize_t orthog_dst_offset(0);
-
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
-            // output
-            simplified_orthog_shape, simplified_orthog_src_strides,
-            simplified_orthog_dst_strides, orthog_src_offset,
-            orthog_dst_offset);
-
-        using dpctl::tensor::offset_utils::device_allocate_and_pack;
-        auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-            exec_q, host_task_events, simplified_orthog_shape,
-            simplified_orthog_src_strides, simplified_orthog_dst_strides);
-        auto packed_shapes_strides_owner =
-            std::move(std::get<0>(ptr_size_event_tuple1));
-        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-        const py::ssize_t *packed_shapes_strides =
-            packed_shapes_strides_owner.get();
-
-        std::vector<sycl::event> all_deps;
-        all_deps.reserve(depends.size() + 1);
-        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-        all_deps.push_back(copy_shapes_strides_ev);
-
-        assert(all_deps.size() == depends.size() + 1);
-
-        repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p,
-                       dst_data_p, reps,
-                       // data to build orthog indexer
-                       orthog_nd, packed_shapes_strides, orthog_src_offset,
-                       orthog_dst_offset,
-                       // data to build indexer along repeated axis in src
-                       axis_src_shape[0], axis_src_stride[0],
-                       // data to build indexer along repeated axis in dst
-                       axis_dst_shape[0], axis_dst_stride[0], all_deps);
-
-        sycl::event cleanup_tmp_allocations_ev =
-            dpctl::tensor::alloc_utils::async_smart_free(
-                exec_q, {repeat_ev}, packed_shapes_strides_owner);
-        host_task_events.push_back(cleanup_tmp_allocations_ev);
-    }
-
-    sycl::event py_obj_management_host_task_ev =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
-}
-
-std::pair<sycl::event, sycl::event>
-py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    const dpctl::tensor::usm_ndarray &dst,
-                    const py::ssize_t reps,
-                    sycl::queue &exec_q,
-                    const std::vector<sycl::event> &depends)
-{
-    int dst_nd = dst.get_ndim();
-    if (dst_nd != 1) {
-        throw py::value_error(
-            "`dst` array must be 1-dimensional when repeating a full array");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    std::size_t src_sz = src.get_size();
-    std::size_t dst_sz = dst.get_size();
-
-    // shape at repeated axis must be equal to the shape of src at the axis *
-    // reps
-    if ((src_sz * reps) != dst_sz) {
-        throw py::value_error("Inconsistent array dimensions");
-    }
-
-    if (src_sz == 0) {
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
-                                                               src_sz * reps);
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    // check that dst does not intersect with src
-    if (overlap(dst, src)) {
-        throw py::value_error("Destination array overlaps with inputs");
-    }
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error(
-            "Destination array must have the same elemental data type");
-    }
-
-    const char *src_data_p = src.get_data();
-    char *dst_data_p = dst.get_data();
-
-    int src_nd = src.get_ndim();
-    auto src_shape_vec = src.get_shape_vector();
-    auto src_strides_vec = src.get_strides_vector();
-
-    if (src_nd == 0) {
-        src_shape_vec = {0};
-        src_strides_vec = {0};
-    }
-
-    auto dst_shape_vec = dst.get_shape_vector();
-    auto dst_strides_vec = dst.get_strides_vector();
-
-    std::vector<sycl::event> host_task_events{};
-
-    auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple1 = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events, src_shape_vec, src_strides_vec);
-    auto packed_src_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple1));
-    sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
-    const py::ssize_t *packed_src_shape_strides =
-        packed_src_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shapes_strides_ev);
-
-    assert(all_deps.size() == depends.size() + 1);
-
-    sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps,
-                               src_nd, packed_src_shape_strides,
-                               dst_shape_vec[0], dst_strides_vec[0], all_deps);
-
-    sycl::event cleanup_tmp_allocations_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {repeat_ev}, packed_src_shape_strides_owner);
-    host_task_events.push_back(cleanup_tmp_allocations_ev);
-
-    sycl::event py_obj_management_host_task_ev =
-        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
-
-    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp
deleted file mode 100644
index d0aa91f16e..0000000000
--- a/dpctl/tensor/libtensor/source/repeat.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_repeat_dispatch_vectors(void);
-
-extern std::pair<sycl::event, sycl::event>
-py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst,
-                      const dpctl::tensor::usm_ndarray &reps,
-                      const dpctl::tensor::usm_ndarray &cumsum,
-                      int axis,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> &depends);
-
-extern std::pair<sycl::event, sycl::event>
-py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst,
-                      const dpctl::tensor::usm_ndarray &reps,
-                      const dpctl::tensor::usm_ndarray &cumsum,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> &depends);
-
-extern std::pair<sycl::event, sycl::event>
-py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    const dpctl::tensor::usm_ndarray &dst,
-                    const py::ssize_t reps,
-                    int axis,
-                    sycl::queue &exec_q,
-                    const std::vector<sycl::event> &depends);
-
-extern std::pair<sycl::event, sycl::event>
-py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src,
-                    const dpctl::tensor::usm_ndarray &dst,
-                    const py::ssize_t reps,
-                    sycl::queue &exec_q,
-                    const std::vector<sycl::event> &depends);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
deleted file mode 100644
index 911d3fd053..0000000000
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp
+++ /dev/null
@@ -1,535 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include "simplify_iteration_space.hpp"
-#include "utils/strided_iters.hpp"
-#include <cstddef>
-#include <pybind11/pybind11.h>
-#include <vector>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace py = pybind11;
-
-void simplify_iteration_space_1(int &nd,
-                                const py::ssize_t *const &shape,
-                                std::vector<py::ssize_t> const &strides,
-                                // output
-                                std::vector<py::ssize_t> &simplified_shape,
-                                std::vector<py::ssize_t> &simplified_strides,
-                                py::ssize_t &offset)
-{
-    using dpctl::tensor::strides::simplify_iteration_stride;
-    if (nd > 1) {
-        // Simplify iteration space to reduce dimensionality
-        // and improve access pattern
-        simplified_shape.reserve(nd);
-        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
-
-        simplified_strides.reserve(nd);
-        simplified_strides.insert(std::end(simplified_strides),
-                                  std::begin(strides), std::end(strides));
-
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-        assert(simplified_strides.size() == static_cast<std::size_t>(nd));
-        int contracted_nd = simplify_iteration_stride(
-            nd, simplified_shape.data(), simplified_strides.data(),
-            offset // modified by reference
-        );
-        simplified_shape.resize(contracted_nd);
-        simplified_strides.resize(contracted_nd);
-
-        nd = contracted_nd;
-    }
-    else if (nd == 1) {
-        offset = 0;
-        // Populate vectors
-        simplified_shape.reserve(nd);
-        simplified_shape.push_back(shape[0]);
-
-        simplified_strides.reserve(nd);
-        simplified_strides.push_back((strides[0] >= 0) ? strides[0]
-                                                       : -strides[0]);
-        if ((strides[0] < 0) && (shape[0] > 1)) {
-            offset += (shape[0] - 1) * strides[0];
-        }
-
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-        assert(simplified_strides.size() == static_cast<std::size_t>(nd));
-    }
-}
-
-void simplify_iteration_space(int &nd,
-                              const py::ssize_t *const &shape,
-                              std::vector<py::ssize_t> const &src_strides,
-                              std::vector<py::ssize_t> const &dst_strides,
-                              // output
-                              std::vector<py::ssize_t> &simplified_shape,
-                              std::vector<py::ssize_t> &simplified_src_strides,
-                              std::vector<py::ssize_t> &simplified_dst_strides,
-                              py::ssize_t &src_offset,
-                              py::ssize_t &dst_offset)
-{
-    using dpctl::tensor::strides::simplify_iteration_two_strides;
-    if (nd > 1) {
-        // Simplify iteration space to reduce dimensionality
-        // and improve access pattern
-        simplified_shape.reserve(nd);
-        simplified_shape.insert(std::begin(simplified_shape), shape,
-                                shape + nd);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src_strides.reserve(nd);
-        simplified_src_strides.insert(std::end(simplified_src_strides),
-                                      std::begin(src_strides),
-                                      std::end(src_strides));
-        assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_dst_strides.reserve(nd);
-        simplified_dst_strides.insert(std::end(simplified_dst_strides),
-                                      std::begin(dst_strides),
-                                      std::end(dst_strides));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-
-        int contracted_nd = simplify_iteration_two_strides(
-            nd, simplified_shape.data(), simplified_src_strides.data(),
-            simplified_dst_strides.data(),
-            src_offset, // modified by reference
-            dst_offset  // modified by reference
-        );
-        simplified_shape.resize(contracted_nd);
-        simplified_src_strides.resize(contracted_nd);
-        simplified_dst_strides.resize(contracted_nd);
-
-        nd = contracted_nd;
-    }
-    else if (nd == 1) {
-        src_offset = 0;
-        dst_offset = 0;
-        // Populate vectors
-        simplified_shape.reserve(nd);
-        simplified_shape.push_back(shape[0]);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src_strides.reserve(nd);
-        simplified_dst_strides.reserve(nd);
-
-        if (src_strides[0] < 0 && dst_strides[0] < 0) {
-            simplified_src_strides.push_back(-src_strides[0]);
-            simplified_dst_strides.push_back(-dst_strides[0]);
-            if (shape[0] > 1) {
-                src_offset += (shape[0] - 1) * src_strides[0];
-                dst_offset += (shape[0] - 1) * dst_strides[0];
-            }
-        }
-        else {
-            simplified_src_strides.push_back(src_strides[0]);
-            simplified_dst_strides.push_back(dst_strides[0]);
-        }
-
-        assert(simplified_src_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-    }
-}
-
-void simplify_iteration_space_3(
-    int &nd,
-    const py::ssize_t *const &shape,
-    // src1
-    std::vector<py::ssize_t> const &src1_strides,
-    // src2
-    std::vector<py::ssize_t> const &src2_strides,
-    // dst
-    std::vector<py::ssize_t> const &dst_strides,
-    // output
-    std::vector<py::ssize_t> &simplified_shape,
-    std::vector<py::ssize_t> &simplified_src1_strides,
-    std::vector<py::ssize_t> &simplified_src2_strides,
-    std::vector<py::ssize_t> &simplified_dst_strides,
-    py::ssize_t &src1_offset,
-    py::ssize_t &src2_offset,
-    py::ssize_t &dst_offset)
-{
-    using dpctl::tensor::strides::simplify_iteration_three_strides;
-    if (nd > 1) {
-        // Simplify iteration space to reduce dimensionality
-        // and improve access pattern
-        simplified_shape.reserve(nd);
-        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src1_strides.reserve(nd);
-        simplified_src1_strides.insert(std::end(simplified_src1_strides),
-                                       std::begin(src1_strides),
-                                       std::end(src1_strides));
-        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_src2_strides.reserve(nd);
-        simplified_src2_strides.insert(std::end(simplified_src2_strides),
-                                       std::begin(src2_strides),
-                                       std::end(src2_strides));
-        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_dst_strides.reserve(nd);
-        simplified_dst_strides.insert(std::end(simplified_dst_strides),
-                                      std::begin(dst_strides),
-                                      std::end(dst_strides));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-
-        int contracted_nd = simplify_iteration_three_strides(
-            nd, simplified_shape.data(), simplified_src1_strides.data(),
-            simplified_src2_strides.data(), simplified_dst_strides.data(),
-            src1_offset, // modified by reference
-            src2_offset, // modified by reference
-            dst_offset   // modified by reference
-        );
-        simplified_shape.resize(contracted_nd);
-        simplified_src1_strides.resize(contracted_nd);
-        simplified_src2_strides.resize(contracted_nd);
-        simplified_dst_strides.resize(contracted_nd);
-
-        nd = contracted_nd;
-    }
-    else if (nd == 1) {
-        src1_offset = 0;
-        src2_offset = 0;
-        dst_offset = 0;
-        // Populate vectors
-        simplified_shape.reserve(nd);
-        simplified_shape.push_back(shape[0]);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src1_strides.reserve(nd);
-        simplified_src2_strides.reserve(nd);
-        simplified_dst_strides.reserve(nd);
-
-        if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
-            (dst_strides[0] < 0))
-        {
-            simplified_src1_strides.push_back(-src1_strides[0]);
-            simplified_src2_strides.push_back(-src2_strides[0]);
-            simplified_dst_strides.push_back(-dst_strides[0]);
-            if (shape[0] > 1) {
-                src1_offset += src1_strides[0] * (shape[0] - 1);
-                src2_offset += src2_strides[0] * (shape[0] - 1);
-                dst_offset += dst_strides[0] * (shape[0] - 1);
-            }
-        }
-        else {
-            simplified_src1_strides.push_back(src1_strides[0]);
-            simplified_src2_strides.push_back(src2_strides[0]);
-            simplified_dst_strides.push_back(dst_strides[0]);
-        }
-
-        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-    }
-}
-
-void simplify_iteration_space_4(
-    int &nd,
-    const py::ssize_t *const &shape,
-    // src1
-    std::vector<py::ssize_t> const &src1_strides,
-    // src2
-    std::vector<py::ssize_t> const &src2_strides,
-    // src3
-    std::vector<py::ssize_t> const &src3_strides,
-    // dst
-    std::vector<py::ssize_t> const &dst_strides,
-    // output
-    std::vector<py::ssize_t> &simplified_shape,
-    std::vector<py::ssize_t> &simplified_src1_strides,
-    std::vector<py::ssize_t> &simplified_src2_strides,
-    std::vector<py::ssize_t> &simplified_src3_strides,
-    std::vector<py::ssize_t> &simplified_dst_strides,
-    py::ssize_t &src1_offset,
-    py::ssize_t &src2_offset,
-    py::ssize_t &src3_offset,
-    py::ssize_t &dst_offset)
-{
-    using dpctl::tensor::strides::simplify_iteration_four_strides;
-    if (nd > 1) {
-        // Simplify iteration space to reduce dimensionality
-        // and improve access pattern
-        simplified_shape.reserve(nd);
-        simplified_shape.insert(std::end(simplified_shape), shape, shape + nd);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src1_strides.reserve(nd);
-        simplified_src1_strides.insert(std::end(simplified_src1_strides),
-                                       std::begin(src1_strides),
-                                       std::end(src1_strides));
-        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_src2_strides.reserve(nd);
-        simplified_src2_strides.insert(std::end(simplified_src2_strides),
-                                       std::begin(src2_strides),
-                                       std::end(src2_strides));
-        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_src3_strides.reserve(nd);
-        simplified_src3_strides.insert(std::end(simplified_src3_strides),
-                                       std::begin(src3_strides),
-                                       std::end(src3_strides));
-        assert(simplified_src3_strides.size() == static_cast<std::size_t>(nd));
-
-        simplified_dst_strides.reserve(nd);
-        simplified_dst_strides.insert(std::end(simplified_dst_strides),
-                                      std::begin(dst_strides),
-                                      std::end(dst_strides));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-
-        int contracted_nd = simplify_iteration_four_strides(
-            nd, simplified_shape.data(), simplified_src1_strides.data(),
-            simplified_src2_strides.data(), simplified_src3_strides.data(),
-            simplified_dst_strides.data(),
-            src1_offset, // modified by reference
-            src2_offset, // modified by reference
-            src3_offset, // modified by reference
-            dst_offset   // modified by reference
-        );
-        simplified_shape.resize(contracted_nd);
-        simplified_src1_strides.resize(contracted_nd);
-        simplified_src2_strides.resize(contracted_nd);
-        simplified_src3_strides.resize(contracted_nd);
-        simplified_dst_strides.resize(contracted_nd);
-
-        nd = contracted_nd;
-    }
-    else if (nd == 1) {
-        src1_offset = 0;
-        src2_offset = 0;
-        src3_offset = 0;
-        dst_offset = 0;
-        // Populate vectors
-        simplified_shape.reserve(nd);
-        simplified_shape.push_back(shape[0]);
-        assert(simplified_shape.size() == static_cast<std::size_t>(nd));
-
-        simplified_src1_strides.reserve(nd);
-        simplified_src2_strides.reserve(nd);
-        simplified_src3_strides.reserve(nd);
-        simplified_dst_strides.reserve(nd);
-
-        if ((src1_strides[0] < 0) && (src2_strides[0] < 0) &&
-            (src3_strides[0] < 0) && (dst_strides[0] < 0))
-        {
-            simplified_src1_strides.push_back(-src1_strides[0]);
-            simplified_src2_strides.push_back(-src2_strides[0]);
-            simplified_src3_strides.push_back(-src3_strides[0]);
-            simplified_dst_strides.push_back(-dst_strides[0]);
-            if (shape[0] > 1) {
-                src1_offset += src1_strides[0] * (shape[0] - 1);
-                src2_offset += src2_strides[0] * (shape[0] - 1);
-                src3_offset += src3_strides[0] * (shape[0] - 1);
-                dst_offset += dst_strides[0] * (shape[0] - 1);
-            }
-        }
-        else {
-            simplified_src1_strides.push_back(src1_strides[0]);
-            simplified_src2_strides.push_back(src2_strides[0]);
-            simplified_src3_strides.push_back(src3_strides[0]);
-            simplified_dst_strides.push_back(dst_strides[0]);
-        }
-
-        assert(simplified_src1_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_src2_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_src3_strides.size() == static_cast<std::size_t>(nd));
-        assert(simplified_dst_strides.size() == static_cast<std::size_t>(nd));
-    }
-}
-
-void compact_iteration_space(int &nd,
-                             const py::ssize_t *const &shape,
-                             std::vector<py::ssize_t> const &strides,
-                             // output
-                             std::vector<py::ssize_t> &compact_shape,
-                             std::vector<py::ssize_t> &compact_strides)
-{
-    using dpctl::tensor::strides::compact_iteration;
-    if (nd > 1) {
-        // Compact iteration space to reduce dimensionality
-        // and improve access pattern
-        compact_shape.reserve(nd);
-        compact_shape.insert(std::begin(compact_shape), shape, shape + nd);
-        assert(compact_shape.size() == static_cast<std::size_t>(nd));
-
-        compact_strides.reserve(nd);
-        compact_strides.insert(std::end(compact_strides), std::begin(strides),
-                               std::end(strides));
-        assert(compact_strides.size() == static_cast<std::size_t>(nd));
-
-        int contracted_nd =
-            compact_iteration(nd, compact_shape.data(), compact_strides.data());
-        compact_shape.resize(contracted_nd);
-        compact_strides.resize(contracted_nd);
-
-        nd = contracted_nd;
-    }
-    else if (nd == 1) {
-        // Populate vectors
-        compact_shape.reserve(nd);
-        compact_shape.push_back(shape[0]);
-        assert(compact_shape.size() == static_cast<std::size_t>(nd));
-
-        compact_strides.reserve(nd);
-        compact_strides.push_back(strides[0]);
-        assert(compact_strides.size() == static_cast<std::size_t>(nd));
-    }
-}
-
-/* @brief Split shape/strides into dir1 (complementary to axis_start <= i <
- * axis_end) and dir2 (along given set of axes)
- */
-void split_iteration_space(const std::vector<py::ssize_t> &shape_vec,
-                           const std::vector<py::ssize_t> &strides_vec,
-                           int axis_start,
-                           int axis_end,
-                           std::vector<py::ssize_t> &dir1_shape_vec,
-                           std::vector<py::ssize_t> &dir2_shape_vec,
-                           std::vector<py::ssize_t> &dir1_strides_vec,
-                           std::vector<py::ssize_t> &dir2_strides_vec)
-{
-    int nd = static_cast<int>(shape_vec.size());
-    int dir2_sz = axis_end - axis_start;
-    int dir1_sz = nd - dir2_sz;
-
-    assert(dir1_sz > 0);
-    assert(dir2_sz > 0);
-
-    dir1_shape_vec.resize(dir1_sz);
-    dir2_shape_vec.resize(dir2_sz);
-
-    std::copy(shape_vec.begin(), shape_vec.begin() + axis_start,
-              dir1_shape_vec.begin());
-    std::copy(shape_vec.begin() + axis_end, shape_vec.end(),
-              dir1_shape_vec.begin() + axis_start);
-
-    std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end,
-              dir2_shape_vec.begin());
-
-    dir1_strides_vec.resize(dir1_sz);
-    dir2_strides_vec.resize(dir2_sz);
-
-    std::copy(strides_vec.begin(), strides_vec.begin() + axis_start,
-              dir1_strides_vec.begin());
-    std::copy(strides_vec.begin() + axis_end, strides_vec.end(),
-              dir1_strides_vec.begin() + axis_start);
-
-    std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end,
-              dir2_strides_vec.begin());
-
-    return;
-}
-
-py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &mi,
-                                 std::vector<py::ssize_t> const &shape)
-{
-    std::size_t nd = shape.size();
-    if (nd != mi.size()) {
-        throw py::value_error(
-            "Multi-index and shape vectors must have the same length.");
-    }
-
-    py::ssize_t flat_index = 0;
-    py::ssize_t s = 1;
-    for (std::size_t i = 0; i < nd; ++i) {
-        flat_index += mi.at(nd - 1 - i) * s;
-        s *= shape.at(nd - 1 - i);
-    }
-
-    return flat_index;
-}
-
-py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &mi,
-                                 std::vector<py::ssize_t> const &shape)
-{
-    std::size_t nd = shape.size();
-    if (nd != mi.size()) {
-        throw py::value_error(
-            "Multi-index and shape vectors must have the same length.");
-    }
-
-    py::ssize_t flat_index = 0;
-    py::ssize_t s = 1;
-    for (std::size_t i = 0; i < nd; ++i) {
-        flat_index += mi.at(i) * s;
-        s *= shape.at(i);
-    }
-
-    return flat_index;
-}
-
-std::vector<py::ssize_t> _unravel_index_c(py::ssize_t flat_index,
-                                          std::vector<py::ssize_t> const &shape)
-{
-    std::size_t nd = shape.size();
-    std::vector<py::ssize_t> mi;
-    mi.resize(nd);
-
-    py::ssize_t i_ = flat_index;
-    for (std::size_t dim = 0; dim + 1 < nd; ++dim) {
-        const py::ssize_t si = shape[nd - 1 - dim];
-        const py::ssize_t q = i_ / si;
-        const py::ssize_t r = (i_ - q * si);
-        mi[nd - 1 - dim] = r;
-        i_ = q;
-    }
-    if (nd) {
-        mi[0] = i_;
-    }
-    return mi;
-}
-
-std::vector<py::ssize_t> _unravel_index_f(py::ssize_t flat_index,
-                                          std::vector<py::ssize_t> const &shape)
-{
-    std::size_t nd = shape.size();
-    std::vector<py::ssize_t> mi;
-    mi.resize(nd);
-
-    py::ssize_t i_ = flat_index;
-    for (std::size_t dim = 0; dim + 1 < nd; ++dim) {
-        const py::ssize_t si = shape[dim];
-        const py::ssize_t q = i_ / si;
-        const py::ssize_t r = (i_ - q * si);
-        mi[dim] = r;
-        i_ = q;
-    }
-    if (nd) {
-        mi[nd - 1] = i_;
-    }
-    return mi;
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
deleted file mode 100644
index 441d328f58..0000000000
--- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp
+++ /dev/null
@@ -1,120 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <pybind11/pybind11.h>
-#include <vector>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace py = pybind11;
-
-void simplify_iteration_space_1(int &,
-                                const py::ssize_t *const &,
-                                std::vector<py::ssize_t> const &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                py::ssize_t &);
-
-void simplify_iteration_space(int &,
-                              const py::ssize_t *const &,
-                              std::vector<py::ssize_t> const &,
-                              std::vector<py::ssize_t> const &,
-                              std::vector<py::ssize_t> &,
-                              std::vector<py::ssize_t> &,
-                              std::vector<py::ssize_t> &,
-                              py::ssize_t &,
-                              py::ssize_t &);
-
-void simplify_iteration_space_3(int &,
-                                const py::ssize_t *const &,
-                                // src1
-                                std::vector<py::ssize_t> const &,
-                                // src2
-                                std::vector<py::ssize_t> const &,
-                                // dst
-                                std::vector<py::ssize_t> const &,
-                                // output
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                py::ssize_t &,
-                                py::ssize_t &,
-                                py::ssize_t &);
-
-void simplify_iteration_space_4(int &,
-                                const py::ssize_t *const &,
-                                // src1
-                                std::vector<py::ssize_t> const &,
-                                // src2
-                                std::vector<py::ssize_t> const &,
-                                // src3
-                                std::vector<py::ssize_t> const &,
-                                // dst
-                                std::vector<py::ssize_t> const &,
-                                // output
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                std::vector<py::ssize_t> &,
-                                py::ssize_t &,
-                                py::ssize_t &,
-                                py::ssize_t &,
-                                py::ssize_t &);
-
-void compact_iteration_space(int &,
-                             const py::ssize_t *const &,
-                             std::vector<py::ssize_t> const &,
-                             // output
-                             std::vector<py::ssize_t> &,
-                             std::vector<py::ssize_t> &);
-
-void split_iteration_space(const std::vector<py::ssize_t> &,
-                           const std::vector<py::ssize_t> &,
-                           int,
-                           int,
-                           // output
-                           std::vector<py::ssize_t> &,
-                           std::vector<py::ssize_t> &,
-                           std::vector<py::ssize_t> &,
-                           std::vector<py::ssize_t> &);
-
-py::ssize_t _ravel_multi_index_c(std::vector<py::ssize_t> const &,
-                                 std::vector<py::ssize_t> const &);
-py::ssize_t _ravel_multi_index_f(std::vector<py::ssize_t> const &,
-                                 std::vector<py::ssize_t> const &);
-std::vector<py::ssize_t> _unravel_index_c(py::ssize_t,
-                                          std::vector<py::ssize_t> const &);
-std::vector<py::ssize_t> _unravel_index_f(py::ssize_t,
-                                          std::vector<py::ssize_t> const &);
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/isin.cpp b/dpctl/tensor/libtensor/source/sorting/isin.cpp
deleted file mode 100644
index 97f3acb2b1..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/isin.cpp
+++ /dev/null
@@ -1,319 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/sorting/isin.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "simplify_iteration_space.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace detail
-{
-
-using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t;
-
-static isin_contig_impl_fp_ptr_t
-    isin_contig_impl_dispatch_vector[td_ns::num_types];
-
-template <typename fnT, typename argTy> struct IsinContigFactory
-{
-    constexpr IsinContigFactory() {}
-
-    fnT get() const
-    {
-        using dpctl::tensor::kernels::isin_contig_impl;
-        return isin_contig_impl<argTy>;
-    }
-};
-
-using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t;
-
-static isin_strided_impl_fp_ptr_t
-    isin_strided_impl_dispatch_vector[td_ns::num_types];
-
-template <typename fnT, typename argTy> struct IsinStridedFactory
-{
-    constexpr IsinStridedFactory() {}
-
-    fnT get() const
-    {
-        using dpctl::tensor::kernels::isin_strided_impl;
-        return isin_strided_impl<argTy>;
-    }
-};
-
-void init_isin_dispatch_vector(void)
-{
-
-    // Contiguous input function dispatch
-    td_ns::DispatchVectorBuilder<isin_contig_impl_fp_ptr_t, IsinContigFactory,
-                                 td_ns::num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector);
-
-    // Strided input function dispatch
-    td_ns::DispatchVectorBuilder<isin_strided_impl_fp_ptr_t, IsinStridedFactory,
-                                 td_ns::num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector);
-}
-
-} // namespace detail
-
-/*! @brief search for needle from needles in sorted hay */
-std::pair<sycl::event, sycl::event>
-py_isin(const dpctl::tensor::usm_ndarray &needles,
-        const dpctl::tensor::usm_ndarray &hay,
-        const dpctl::tensor::usm_ndarray &dst,
-        sycl::queue &exec_q,
-        const bool invert,
-        const std::vector<sycl::event> &depends)
-{
-    const int hay_nd = hay.get_ndim();
-    const int needles_nd = needles.get_ndim();
-    const int dst_nd = dst.get_ndim();
-
-    if (hay_nd != 1 || needles_nd != dst_nd) {
-        throw py::value_error("Array dimensions mismatch");
-    }
-
-    // check that needle and dst have the same shape
-    std::size_t needles_nelems(1);
-    bool same_shape(true);
-
-    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
-
-    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = needles.get_shape_raw();
-
-    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
-        const auto needles_sh_i = needles_shape_ptr[i];
-        const auto dst_sh_i = dst_shape_ptr[i];
-
-        same_shape = same_shape && (needles_sh_i == dst_sh_i);
-        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
-    }
-
-    if (!same_shape) {
-        throw py::value_error(
-            "Array of values to search for and array of their "
-            "dst do not have the same shape");
-    }
-
-    // check that dst is ample enough
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst,
-                                                               needles_nelems);
-
-    // check that dst is writable
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    // if output array overlaps with input arrays, race condition results
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(dst, hay) || overlap(dst, needles)) {
-        throw py::value_error("Destination array overlaps with input.");
-    }
-
-    const int hay_typenum = hay.get_typenum();
-    const int needles_typenum = needles.get_typenum();
-    const int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
-    const int needles_typeid =
-        array_types.typenum_to_lookup_id(needles_typenum);
-    const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    // check hay and needle have the same data-type
-    if (needles_typeid != hay_typeid) {
-        throw py::value_error(
-            "Hay array and needles array must have the same data types");
-    }
-    // check that dst has boolean data type
-    const auto dst_typenum_t_v = static_cast<td_ns::typenum_t>(dst_typeid);
-    if (dst_typenum_t_v != td_ns::typenum_t::BOOL) {
-        throw py::value_error("dst array must have data-type bool");
-    }
-
-    if (needles_nelems == 0) {
-        // Nothing to do
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    // if all inputs are contiguous call contiguous implementations
-    // otherwise call strided implementation
-    const bool hay_is_c_contig = hay.is_c_contiguous();
-    const bool hay_is_f_contig = hay.is_f_contiguous();
-
-    const bool needles_is_c_contig = needles.is_c_contiguous();
-    const bool needles_is_f_contig = needles.is_f_contiguous();
-
-    const bool dst_is_c_contig = dst.is_c_contiguous();
-    const bool dst_is_f_contig = dst.is_f_contiguous();
-
-    const bool all_c_contig =
-        (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig);
-    const bool all_f_contig =
-        (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig);
-
-    const char *hay_data = hay.get_data();
-    const char *needles_data = needles.get_data();
-
-    char *dst_data = dst.get_data();
-
-    if (all_c_contig || all_f_contig) {
-        auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid];
-
-        static constexpr py::ssize_t zero_offset(0);
-
-        sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems,
-                                 hay_data, zero_offset, needles_data,
-                                 zero_offset, dst_data, zero_offset, depends);
-
-        return std::make_pair(dpctl::utils::keep_args_alive(
-                                  exec_q, {hay, needles, dst}, {comp_ev}),
-                              comp_ev);
-    }
-
-    // strided case
-
-    const auto &needles_strides = needles.get_strides_vector();
-    const auto &dst_strides = dst.get_strides_vector();
-
-    int simplified_nd = needles_nd;
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_common_shape;
-    shT simplified_needles_strides;
-    shT simplified_dst_strides;
-    py::ssize_t needles_offset(0);
-    py::ssize_t dst_offset(0);
-
-    if (simplified_nd == 0) {
-        // needles and dst have same nd
-        simplified_nd = 1;
-        simplified_common_shape.push_back(1);
-        simplified_needles_strides.push_back(0);
-        simplified_dst_strides.push_back(0);
-    }
-    else {
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            // modified by refernce
-            simplified_nd,
-            // read-only inputs
-            needles_shape_ptr, needles_strides, dst_strides,
-            // output, modified by reference
-            simplified_common_shape, simplified_needles_strides,
-            simplified_dst_strides, needles_offset, dst_offset);
-    }
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events,
-        // vectors being packed
-        simplified_common_shape, simplified_needles_strides,
-        simplified_dst_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_strides_ev =
-        std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shape_strides_ev);
-
-    auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid];
-
-    if (!strided_fn) {
-        throw std::runtime_error(
-            "No implementation for data types of input arrays");
-    }
-
-    static constexpr py::ssize_t zero_offset(0);
-    py::ssize_t hay_step = hay.get_strides_vector()[0];
-
-    const sycl::event &comp_ev = strided_fn(
-        exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset,
-        hay_step, needles_data, needles_offset, dst_data, dst_offset,
-        simplified_nd, packed_shape_strides, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {comp_ev}, packed_shape_strides_owner);
-
-    host_task_events.push_back(temporaries_cleanup_ev);
-    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
-        exec_q, {hay, needles, dst}, host_task_events);
-
-    return std::make_pair(ht_ev, comp_ev);
-}
-
-void init_isin_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::detail::init_isin_dispatch_vector();
-
-    using dpctl::tensor::py_internal::py_isin;
-    m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("invert"),
-          py::arg("depends") = py::list());
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/isin.hpp b/dpctl/tensor/libtensor/source/sorting/isin.hpp
deleted file mode 100644
index c855cd3d4c..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/isin.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_isin_functions(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl/tensor/libtensor/source/sorting/merge_argsort.cpp
deleted file mode 100644
index 779ed0a5bc..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/merge_argsort.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-
-#include "utils/math_utils.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/rich_comparisons.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/sorting/merge_sort.hpp"
-#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
-
-#include "merge_argsort.hpp"
-#include "py_argsort_common.hpp"
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-static sort_contig_fn_ptr_t
-    ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static sort_contig_fn_ptr_t
-    descending_argsort_contig_dispatch_table[td_ns::num_types]
-                                            [td_ns::num_types];
-
-template <typename fnT, typename argTy, typename IndexTy>
-struct AscendingArgSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
-            using dpctl::tensor::rich_comparisons::AscendingSorter;
-            using Comp = typename AscendingSorter<argTy>::type;
-
-            using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl;
-            return stable_argsort_axis1_contig_impl<argTy, IndexTy, Comp>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename argTy, typename IndexTy>
-struct DescendingArgSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (std::is_same_v<IndexTy, std::int64_t> ||
-                      std::is_same_v<IndexTy, std::int32_t>)
-        {
-            using dpctl::tensor::rich_comparisons::DescendingSorter;
-            using Comp = typename DescendingSorter<argTy>::type;
-
-            using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl;
-            return stable_argsort_axis1_contig_impl<argTy, IndexTy, Comp>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void init_merge_argsort_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-
-    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
-                                AscendingArgSortContigFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<
-        sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table);
-}
-
-void init_merge_argsort_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::init_merge_argsort_dispatch_tables();
-
-    auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src,
-                                   const int trailing_dims_to_sort,
-                                   const dpctl::tensor::usm_ndarray &dst,
-                                   sycl::queue &exec_q,
-                                   const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_argsort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                ascending_argsort_contig_dispatch_table);
-    };
-    m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src,
-                                    const int trailing_dims_to_sort,
-                                    const dpctl::tensor::usm_ndarray &dst,
-                                    sycl::queue &exec_q,
-                                    const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_argsort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                descending_argsort_contig_dispatch_table);
-    };
-    m.def("_argsort_descending", py_argsort_descending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    return;
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpctl/tensor/libtensor/source/sorting/merge_argsort.hpp
deleted file mode 100644
index 9d671faf54..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/merge_argsort.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_merge_argsort_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl/tensor/libtensor/source/sorting/merge_sort.cpp
deleted file mode 100644
index 2c53c56f65..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/merge_sort.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "utils/math_utils.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/rich_comparisons.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/sorting/merge_sort.hpp"
-#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
-
-#include "merge_sort.hpp"
-#include "py_sort_common.hpp"
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-static sort_contig_fn_ptr_t
-    ascending_sort_contig_dispatch_vector[td_ns::num_types];
-static sort_contig_fn_ptr_t
-    descending_sort_contig_dispatch_vector[td_ns::num_types];
-
-template <typename fnT, typename argTy> struct AscendingSortContigFactory
-{
-    fnT get()
-    {
-        using dpctl::tensor::rich_comparisons::AscendingSorter;
-        using Comp = typename AscendingSorter<argTy>::type;
-
-        using dpctl::tensor::kernels::stable_sort_axis1_contig_impl;
-        return stable_sort_axis1_contig_impl<argTy, Comp>;
-    }
-};
-
-template <typename fnT, typename argTy> struct DescendingSortContigFactory
-{
-    fnT get()
-    {
-        using dpctl::tensor::rich_comparisons::DescendingSorter;
-        using Comp = typename DescendingSorter<argTy>::type;
-
-        using dpctl::tensor::kernels::stable_sort_axis1_contig_impl;
-        return stable_sort_axis1_contig_impl<argTy, Comp>;
-    }
-};
-
-void init_merge_sort_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-
-    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
-                                 AscendingSortContigFactory, td_ns::num_types>
-        dtv1;
-    dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector);
-
-    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
-                                 DescendingSortContigFactory, td_ns::num_types>
-        dtv2;
-    dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector);
-}
-
-void init_merge_sort_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::init_merge_sort_dispatch_vectors();
-
-    auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
-                                const int trailing_dims_to_sort,
-                                const dpctl::tensor::usm_ndarray &dst,
-                                sycl::queue &exec_q,
-                                const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_sort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector);
-    };
-    m.def("_sort_ascending", py_sort_ascending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
-                                 const int trailing_dims_to_sort,
-                                 const dpctl::tensor::usm_ndarray &dst,
-                                 sycl::queue &exec_q,
-                                 const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_sort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector);
-    };
-    m.def("_sort_descending", py_sort_descending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    return;
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/source/sorting/merge_sort.hpp
deleted file mode 100644
index a817af3649..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/merge_sort.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_merge_sort_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp
deleted file mode 100644
index aa3ea018b3..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/py_argsort_common.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include "dpctl4pybind11.hpp"
-#include <cstddef>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-
-#include "utils/math_utils.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-template <typename sorting_contig_impl_fnT>
-std::pair<sycl::event, sycl::event>
-py_argsort(const dpctl::tensor::usm_ndarray &src,
-           const int trailing_dims_to_sort,
-           const dpctl::tensor::usm_ndarray &dst,
-           sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends,
-           const sorting_contig_impl_fnT &sort_contig_fns)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-    if (src_nd != dst_nd) {
-        throw py::value_error("The input and output arrays must have "
-                              "the same array ranks");
-    }
-    int iteration_nd = src_nd - trailing_dims_to_sort;
-    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
-        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
-                              "greater than rank of the array being sorted");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    std::size_t iter_nelems(1);
-
-    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        iter_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    std::size_t sort_nelems(1);
-    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        sort_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error(
-            "Destination shape does not match the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if ((iter_nelems == 0) || (sort_nelems == 0)) {
-        // Nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    // check that dst and src do not overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, sort_nelems * iter_nelems);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if ((dst_typeid != static_cast<int>(td_ns::typenum_t::INT64)) &&
-        (dst_typeid != static_cast<int>(td_ns::typenum_t::INT32)))
-    {
-        throw py::value_error(
-            "Output index array must have data type int32 or int64");
-    }
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    if (is_src_c_contig && is_dst_c_contig) {
-        if (sort_nelems > 1) {
-            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
-
-            auto fn = sort_contig_fns[src_typeid][dst_typeid];
-
-            if (fn == nullptr) {
-                throw py::value_error(
-                    "Not implemented for dtypes of input arrays");
-            }
-
-            sycl::event comp_ev =
-                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
-                   dst.get_data(), zero_offset, zero_offset, zero_offset,
-                   zero_offset, depends);
-
-            sycl::event keep_args_alive_ev =
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
-
-            return std::make_pair(keep_args_alive_ev, comp_ev);
-        }
-        else {
-            assert(dst.get_size() == iter_nelems);
-            int dst_elemsize = dst.get_elemsize();
-            static constexpr int memset_val(0);
-
-            sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
-                cgh.depends_on(depends);
-
-                cgh.memset(reinterpret_cast<void *>(dst.get_data()), memset_val,
-                           iter_nelems * dst_elemsize);
-            });
-
-            sycl::event keep_args_alive_ev =
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev});
-
-            return std::make_pair(keep_args_alive_ev, fill_ev);
-        }
-    }
-
-    throw py::value_error(
-        "Both source and destination arrays must be C-contiguous");
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
deleted file mode 100644
index 2c02bcc8b6..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/py_sort_common.hpp
+++ /dev/null
@@ -1,170 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <cstddef>
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "utils/math_utils.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-template <typename sorting_contig_impl_fnT>
-std::pair<sycl::event, sycl::event>
-py_sort(const dpctl::tensor::usm_ndarray &src,
-        const int trailing_dims_to_sort,
-        const dpctl::tensor::usm_ndarray &dst,
-        sycl::queue &exec_q,
-        const std::vector<sycl::event> &depends,
-        const sorting_contig_impl_fnT &sort_contig_fns)
-{
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-    if (src_nd != dst_nd) {
-        throw py::value_error("The input and output arrays must have "
-                              "the same array ranks");
-    }
-    int iteration_nd = src_nd - trailing_dims_to_sort;
-    if (trailing_dims_to_sort <= 0 || iteration_nd < 0) {
-        throw py::value_error("Trailing_dim_to_sort must be positive, but no "
-                              "greater than rank of the array being sorted");
-    }
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
-
-    bool same_shapes = true;
-    std::size_t iter_nelems(1);
-
-    for (int i = 0; same_shapes && (i < iteration_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        iter_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    std::size_t sort_nelems(1);
-    for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) {
-        auto src_shape_i = src_shape_ptr[i];
-        same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]);
-        sort_nelems *= static_cast<std::size_t>(src_shape_i);
-    }
-
-    if (!same_shapes) {
-        throw py::value_error(
-            "Destination shape does not match the input shape");
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    if ((iter_nelems == 0) || (sort_nelems == 0)) {
-        // Nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    // check that dst and src do not overlap
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(
-        dst, sort_nelems * iter_nelems);
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (src_typeid != dst_typeid) {
-        throw py::value_error("Both input arrays must have "
-                              "the same value data type");
-    }
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_dst_c_contig = dst.is_c_contiguous();
-
-    if (is_src_c_contig && is_dst_c_contig) {
-        if (sort_nelems > 1) {
-            static constexpr py::ssize_t zero_offset = py::ssize_t(0);
-
-            auto fn = sort_contig_fns[src_typeid];
-
-            if (nullptr == fn) {
-                throw py::value_error(
-                    "Not implemented for the dtype of input arrays");
-            }
-
-            sycl::event comp_ev =
-                fn(exec_q, iter_nelems, sort_nelems, src.get_data(),
-                   dst.get_data(), zero_offset, zero_offset, zero_offset,
-                   zero_offset, depends);
-
-            sycl::event keep_args_alive_ev =
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev});
-
-            return std::make_pair(keep_args_alive_ev, comp_ev);
-        }
-        else {
-            assert(dst.get_size() == iter_nelems);
-            int src_elemsize = src.get_elemsize();
-
-            sycl::event copy_ev =
-                exec_q.copy<char>(src.get_data(), dst.get_data(),
-                                  src_elemsize * iter_nelems, depends);
-
-            return std::make_pair(
-                dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}),
-                copy_ev);
-        }
-    }
-
-    throw py::value_error(
-        "Both source and destination arrays must be C-contiguous");
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp
deleted file mode 100644
index 66a1ae9ee0..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/radix_argsort.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/radix_sort.hpp"
-#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
-
-#include "py_argsort_common.hpp"
-#include "radix_argsort.hpp"
-#include "radix_sort_support.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
-
-using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-
-static sort_contig_fn_ptr_t
-    ascending_radix_argsort_contig_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-static sort_contig_fn_ptr_t
-    descending_radix_argsort_contig_dispatch_table[td_ns::num_types]
-                                                  [td_ns::num_types];
-
-namespace
-{
-
-template <bool is_ascending, typename T, typename I>
-sycl::event argsort_axis1_contig_caller(sycl::queue &q,
-                                        std::size_t iter_nelems,
-                                        std::size_t sort_nelems,
-                                        const char *arg_cp,
-                                        char *res_cp,
-                                        ssize_t iter_arg_offset,
-                                        ssize_t iter_res_offset,
-                                        ssize_t sort_arg_offset,
-                                        ssize_t sort_res_offset,
-                                        const std::vector<sycl::event> &depends)
-{
-    using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl;
-
-    return radix_argsort_axis1_contig_impl<T, I>(
-        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
-        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
-        depends);
-}
-
-} // end of anonymous namespace
-
-template <typename fnT, typename argTy, typename IndexTy>
-struct AscendingRadixArgSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
-                      (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
-            return argsort_axis1_contig_caller<
-                /*ascending*/ true, argTy, IndexTy>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename argTy, typename IndexTy>
-struct DescendingRadixArgSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (RadixSortSupportVector<argTy>::is_defined &&
-                      (std::is_same_v<IndexTy, std::int64_t> ||
-                       std::is_same_v<IndexTy, std::int32_t>))
-        {
-            return argsort_axis1_contig_caller<
-                /*ascending*/ false, argTy, IndexTy>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void init_radix_argsort_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-
-    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
-                                AscendingRadixArgSortContigFactory,
-                                td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table);
-
-    td_ns::DispatchTableBuilder<sort_contig_fn_ptr_t,
-                                DescendingRadixArgSortContigFactory,
-                                td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(
-        descending_radix_argsort_contig_dispatch_table);
-}
-
-void init_radix_argsort_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::init_radix_argsort_dispatch_tables();
-
-    auto py_radix_argsort_ascending =
-        [](const dpctl::tensor::usm_ndarray &src,
-           const int trailing_dims_to_sort,
-           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_argsort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                ascending_radix_argsort_contig_dispatch_table);
-    };
-    m.def("_radix_argsort_ascending", py_radix_argsort_ascending,
-          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto py_radix_argsort_descending =
-        [](const dpctl::tensor::usm_ndarray &src,
-           const int trailing_dims_to_sort,
-           const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-           const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_argsort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                descending_radix_argsort_contig_dispatch_table);
-    };
-    m.def("_radix_argsort_descending", py_radix_argsort_descending,
-          py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    return;
-}
-
-} // namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/radix_argsort.hpp b/dpctl/tensor/libtensor/source/sorting/radix_argsort.hpp
deleted file mode 100644
index 1651d35dc8..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/radix_argsort.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_radix_argsort_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp
deleted file mode 100644
index 5ec39415df..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/radix_sort.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "kernels/dpctl_tensor_types.hpp"
-#include "kernels/sorting/radix_sort.hpp"
-#include "kernels/sorting/sort_impl_fn_ptr_t.hpp"
-
-#include "py_sort_common.hpp"
-#include "radix_sort.hpp"
-#include "radix_sort_support.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace impl_ns = dpctl::tensor::kernels::radix_sort_details;
-
-using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-static sort_contig_fn_ptr_t
-    ascending_radix_sort_contig_dispatch_vector[td_ns::num_types];
-static sort_contig_fn_ptr_t
-    descending_radix_sort_contig_dispatch_vector[td_ns::num_types];
-
-namespace
-{
-
-template <bool is_ascending, typename T>
-sycl::event sort_axis1_contig_caller(sycl::queue &q,
-                                     std::size_t iter_nelems,
-                                     std::size_t sort_nelems,
-                                     const char *arg_cp,
-                                     char *res_cp,
-                                     ssize_t iter_arg_offset,
-                                     ssize_t iter_res_offset,
-                                     ssize_t sort_arg_offset,
-                                     ssize_t sort_res_offset,
-                                     const std::vector<sycl::event> &depends)
-{
-    using dpctl::tensor::kernels::radix_sort_axis1_contig_impl;
-
-    return radix_sort_axis1_contig_impl<T>(
-        q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp,
-        iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset,
-        depends);
-}
-
-} // end of anonymous namespace
-
-template <typename fnT, typename argTy> struct AscendingRadixSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
-            return sort_axis1_contig_caller</*ascending*/ true, argTy>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename argTy> struct DescendingRadixSortContigFactory
-{
-    fnT get()
-    {
-        if constexpr (RadixSortSupportVector<argTy>::is_defined) {
-            return sort_axis1_contig_caller</*ascending*/ false, argTy>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void init_radix_sort_dispatch_vectors(void)
-{
-    using dpctl::tensor::kernels::sort_contig_fn_ptr_t;
-
-    td_ns::DispatchVectorBuilder<
-        sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types>
-        dtv1;
-    dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector);
-
-    td_ns::DispatchVectorBuilder<sort_contig_fn_ptr_t,
-                                 DescendingRadixSortContigFactory,
-                                 td_ns::num_types>
-        dtv2;
-    dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector);
-}
-
-bool py_radix_sort_defined(int typenum)
-{
-    const auto &array_types = td_ns::usm_ndarray_types();
-
-    try {
-        int type_id = array_types.typenum_to_lookup_id(typenum);
-        return (nullptr !=
-                ascending_radix_sort_contig_dispatch_vector[type_id]);
-    } catch (const std::exception &e) {
-        return false;
-    }
-}
-
-void init_radix_sort_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::init_radix_sort_dispatch_vectors();
-
-    auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src,
-                                      const int trailing_dims_to_sort,
-                                      const dpctl::tensor::usm_ndarray &dst,
-                                      sycl::queue &exec_q,
-                                      const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_sort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                ascending_radix_sort_contig_dispatch_vector);
-    };
-    m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src,
-                                       const int trailing_dims_to_sort,
-                                       const dpctl::tensor::usm_ndarray &dst,
-                                       sycl::queue &exec_q,
-                                       const std::vector<sycl::event> &depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return dpctl::tensor::py_internal::py_sort(
-            src, trailing_dims_to_sort, dst, exec_q, depends,
-            dpctl::tensor::py_internal::
-                descending_radix_sort_contig_dispatch_vector);
-    };
-    m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"),
-          py::arg("trailing_dims_to_sort"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_radix_sort_dtype_supported", py_radix_sort_defined);
-
-    return;
-}
-
-} // namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/source/sorting/radix_sort.hpp
deleted file mode 100644
index 1a0a5fcb6a..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/radix_sort.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_radix_sort_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpctl/tensor/libtensor/source/sorting/radix_sort_support.hpp
deleted file mode 100644
index a4692b417d..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/radix_sort_support.hpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <type_traits>
-
-#include <sycl/sycl.hpp>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-template <typename Ty, typename ArgTy>
-struct TypeDefinedEntry : std::bool_constant<std::is_same_v<Ty, ArgTy>>
-{
-    static constexpr bool is_defined = true;
-};
-
-struct NotDefinedEntry : std::true_type
-{
-    static constexpr bool is_defined = false;
-};
-
-template <typename T> struct RadixSortSupportVector
-{
-    using resolver_t =
-        typename std::disjunction<TypeDefinedEntry<T, bool>,
-                                  TypeDefinedEntry<T, std::int8_t>,
-                                  TypeDefinedEntry<T, std::uint8_t>,
-                                  TypeDefinedEntry<T, std::int16_t>,
-                                  TypeDefinedEntry<T, std::uint16_t>,
-                                  TypeDefinedEntry<T, std::int32_t>,
-                                  TypeDefinedEntry<T, std::uint32_t>,
-                                  TypeDefinedEntry<T, std::int64_t>,
-                                  TypeDefinedEntry<T, std::uint64_t>,
-                                  TypeDefinedEntry<T, sycl::half>,
-                                  TypeDefinedEntry<T, float>,
-                                  TypeDefinedEntry<T, double>,
-                                  NotDefinedEntry>;
-
-    static constexpr bool is_defined = resolver_t::is_defined;
-};
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
deleted file mode 100644
index 0162bf770b..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/searchsorted.cpp
+++ /dev/null
@@ -1,473 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/sorting/searchsorted.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/rich_comparisons.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "simplify_iteration_space.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-namespace tu_ns = dpctl::tensor::type_utils;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace detail
-{
-
-using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t;
-
-static searchsorted_contig_impl_fp_ptr_t
-    left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
-
-static searchsorted_contig_impl_fp_ptr_t
-    right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types];
-
-template <typename fnT, typename argTy, typename indTy>
-struct LeftSideSearchSortedContigFactory
-{
-    constexpr LeftSideSearchSortedContigFactory() {}
-
-    fnT get() const
-    {
-        if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
-            static constexpr bool left_side_search(true);
-            using dpctl::tensor::kernels::searchsorted_contig_impl;
-            using dpctl::tensor::rich_comparisons::AscendingSorter;
-
-            using Compare = typename AscendingSorter<argTy>::type;
-
-            return searchsorted_contig_impl<argTy, indTy, left_side_search,
-                                            Compare>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename argTy, typename indTy>
-struct RightSideSearchSortedContigFactory
-{
-    constexpr RightSideSearchSortedContigFactory() {}
-
-    fnT get() const
-    {
-        if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
-            static constexpr bool right_side_search(false);
-
-            using dpctl::tensor::kernels::searchsorted_contig_impl;
-            using dpctl::tensor::rich_comparisons::AscendingSorter;
-
-            using Compare = typename AscendingSorter<argTy>::type;
-
-            return searchsorted_contig_impl<argTy, indTy, right_side_search,
-                                            Compare>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t;
-
-static searchsorted_strided_impl_fp_ptr_t
-    left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
-
-static searchsorted_strided_impl_fp_ptr_t
-    right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types];
-
-template <typename fnT, typename argTy, typename indTy>
-struct LeftSideSearchSortedStridedFactory
-{
-    constexpr LeftSideSearchSortedStridedFactory() {}
-
-    fnT get() const
-    {
-        if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
-            static constexpr bool left_side_search(true);
-            using dpctl::tensor::kernels::searchsorted_strided_impl;
-            using dpctl::tensor::rich_comparisons::AscendingSorter;
-
-            using Compare = typename AscendingSorter<argTy>::type;
-
-            return searchsorted_strided_impl<argTy, indTy, left_side_search,
-                                             Compare>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-template <typename fnT, typename argTy, typename indTy>
-struct RightSideSearchSortedStridedFactory
-{
-    constexpr RightSideSearchSortedStridedFactory() {}
-
-    fnT get() const
-    {
-        if constexpr (std::is_same_v<indTy, std::int32_t> ||
-                      std::is_same_v<indTy, std::int64_t>)
-        {
-            static constexpr bool right_side_search(false);
-            using dpctl::tensor::kernels::searchsorted_strided_impl;
-            using dpctl::tensor::rich_comparisons::AscendingSorter;
-
-            using Compare = typename AscendingSorter<argTy>::type;
-
-            return searchsorted_strided_impl<argTy, indTy, right_side_search,
-                                             Compare>;
-        }
-        else {
-            return nullptr;
-        }
-    }
-};
-
-void init_searchsorted_dispatch_table(void)
-{
-
-    // Contiguous input function dispatch
-    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
-                                LeftSideSearchSortedContigFactory,
-                                td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl);
-
-    td_ns::DispatchTableBuilder<searchsorted_contig_impl_fp_ptr_t,
-                                RightSideSearchSortedContigFactory,
-                                td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl);
-
-    // Strided input function dispatch
-    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
-                                LeftSideSearchSortedStridedFactory,
-                                td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl);
-
-    td_ns::DispatchTableBuilder<searchsorted_strided_impl_fp_ptr_t,
-                                RightSideSearchSortedStridedFactory,
-                                td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl);
-}
-
-} // namespace detail
-
-/*! @brief search for needle from needles in sorted hay */
-std::pair<sycl::event, sycl::event>
-py_searchsorted(const dpctl::tensor::usm_ndarray &hay,
-                const dpctl::tensor::usm_ndarray &needles,
-                const dpctl::tensor::usm_ndarray &positions,
-                sycl::queue &exec_q,
-                const bool search_left_side,
-                const std::vector<sycl::event> &depends)
-{
-    const int hay_nd = hay.get_ndim();
-    const int needles_nd = needles.get_ndim();
-    const int positions_nd = positions.get_ndim();
-
-    if (hay_nd != 1 || needles_nd != positions_nd) {
-        throw py::value_error("Array dimensions mismatch");
-    }
-
-    // check that needle and positions have the same shape
-    std::size_t needles_nelems(1);
-    bool same_shape(true);
-
-    const std::size_t hay_nelems = static_cast<std::size_t>(hay.get_shape(0));
-
-    const py::ssize_t *needles_shape_ptr = needles.get_shape_raw();
-    const py::ssize_t *positions_shape_ptr = needles.get_shape_raw();
-
-    for (int i = 0; (i < needles_nd) && same_shape; ++i) {
-        const auto needles_sh_i = needles_shape_ptr[i];
-        const auto positions_sh_i = positions_shape_ptr[i];
-
-        same_shape = same_shape && (needles_sh_i == positions_sh_i);
-        needles_nelems *= static_cast<std::size_t>(needles_sh_i);
-    }
-
-    if (!same_shape) {
-        throw py::value_error(
-            "Array of values to search for and array of their "
-            "positions do not have the same shape");
-    }
-
-    // check that positions is ample enough
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions,
-                                                               needles_nelems);
-
-    // check that positions is writable
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions);
-
-    // check that queues are compatible
-    if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    // if output array overlaps with input arrays, race condition results
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(positions, hay) || overlap(positions, needles)) {
-        throw py::value_error("Destination array overlaps with input.");
-    }
-
-    const int hay_typenum = hay.get_typenum();
-    const int needles_typenum = needles.get_typenum();
-    const int positions_typenum = positions.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum);
-    const int needles_typeid =
-        array_types.typenum_to_lookup_id(needles_typenum);
-    const int positions_typeid =
-        array_types.typenum_to_lookup_id(positions_typenum);
-
-    // check hay and needle have the same data-type
-    if (needles_typeid != hay_typeid) {
-        throw py::value_error(
-            "Hay array and needles array must have the same data types");
-    }
-    // check that positions has indexing data-type (int32, or int64)
-    const auto positions_typenum_t_v =
-        static_cast<td_ns::typenum_t>(positions_typeid);
-    if (positions_typenum_t_v != td_ns::typenum_t::INT32 &&
-        positions_typenum_t_v != td_ns::typenum_t::INT64)
-    {
-        throw py::value_error(
-            "Positions array must have data-type int32, or int64");
-    }
-
-    if (needles_nelems == 0) {
-        // Nothing to do
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    // if all inputs are contiguous call contiguous implementations
-    // otherwise call strided implementation
-    const bool hay_is_c_contig = hay.is_c_contiguous();
-    const bool hay_is_f_contig = hay.is_f_contiguous();
-
-    const bool needles_is_c_contig = needles.is_c_contiguous();
-    const bool needles_is_f_contig = needles.is_f_contiguous();
-
-    const bool positions_is_c_contig = positions.is_c_contiguous();
-    const bool positions_is_f_contig = positions.is_f_contiguous();
-
-    const bool all_c_contig =
-        (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig);
-    const bool all_f_contig =
-        (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig);
-
-    const char *hay_data = hay.get_data();
-    const char *needles_data = needles.get_data();
-
-    char *positions_data = positions.get_data();
-
-    if (all_c_contig || all_f_contig) {
-        auto fn =
-            (search_left_side)
-                ? detail::left_side_searchsorted_contig_impl[hay_typeid]
-                                                            [positions_typeid]
-                : detail::right_side_searchsorted_contig_impl[hay_typeid]
-                                                             [positions_typeid];
-
-        if (fn) {
-            static constexpr py::ssize_t zero_offset(0);
-
-            sycl::event comp_ev =
-                fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset,
-                   needles_data, zero_offset, positions_data, zero_offset,
-                   depends);
-
-            return std::make_pair(
-                dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions},
-                                              {comp_ev}),
-                comp_ev);
-        }
-    }
-
-    // strided case
-
-    const auto &needles_strides = needles.get_strides_vector();
-    const auto &positions_strides = positions.get_strides_vector();
-
-    int simplified_nd = needles_nd;
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_common_shape;
-    shT simplified_needles_strides;
-    shT simplified_positions_strides;
-    py::ssize_t needles_offset(0);
-    py::ssize_t positions_offset(0);
-
-    if (simplified_nd == 0) {
-        // needles and positions have same nd
-        simplified_nd = 1;
-        simplified_common_shape.push_back(1);
-        simplified_needles_strides.push_back(0);
-        simplified_positions_strides.push_back(0);
-    }
-    else {
-        dpctl::tensor::py_internal::simplify_iteration_space(
-            // modified by refernce
-            simplified_nd,
-            // read-only inputs
-            needles_shape_ptr, needles_strides, positions_strides,
-            // output, modified by reference
-            simplified_common_shape, simplified_needles_strides,
-            simplified_positions_strides, needles_offset, positions_offset);
-    }
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events,
-        // vectors being packed
-        simplified_common_shape, simplified_needles_strides,
-        simplified_positions_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    const sycl::event &copy_shape_strides_ev =
-        std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shape_strides_ev);
-
-    auto strided_fn =
-        (search_left_side)
-            ? detail::left_side_searchsorted_strided_impl[hay_typeid]
-                                                         [positions_typeid]
-            : detail::right_side_searchsorted_strided_impl[hay_typeid]
-                                                          [positions_typeid];
-
-    if (!strided_fn) {
-        throw std::runtime_error(
-            "No implementation for data types of input arrays");
-    }
-
-    static constexpr py::ssize_t zero_offset(0);
-    py::ssize_t hay_step = hay.get_strides_vector()[0];
-
-    const sycl::event &comp_ev = strided_fn(
-        exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step,
-        needles_data, needles_offset, positions_data, positions_offset,
-        simplified_nd, packed_shape_strides, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {comp_ev}, packed_shape_strides_owner);
-
-    host_task_events.push_back(temporaries_cleanup_ev);
-    const sycl::event &ht_ev = dpctl::utils::keep_args_alive(
-        exec_q, {hay, needles, positions}, host_task_events);
-
-    return std::make_pair(ht_ev, comp_ev);
-}
-
-/*! @brief search for needle from needles in sorted hay,
- *         hay[pos] <= needle < hay[pos + 1]
- */
-std::pair<sycl::event, sycl::event>
-py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay,
-                     const dpctl::tensor::usm_ndarray &needles,
-                     const dpctl::tensor::usm_ndarray &positions,
-                     sycl::queue &exec_q,
-                     const std::vector<sycl::event> &depends)
-{
-    static constexpr bool side_left(true);
-    return py_searchsorted(hay, needles, positions, exec_q, side_left, depends);
-}
-
-/*! @brief search for needle from needles in sorted hay,
- *         hay[pos] < needle <= hay[pos + 1]
- */
-std::pair<sycl::event, sycl::event>
-py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay,
-                      const dpctl::tensor::usm_ndarray &needles,
-                      const dpctl::tensor::usm_ndarray &positions,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> &depends)
-{
-    static constexpr bool side_right(false);
-    return py_searchsorted(hay, needles, positions, exec_q, side_right,
-                           depends);
-}
-
-void init_searchsorted_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::detail::init_searchsorted_dispatch_table();
-
-    using dpctl::tensor::py_internal::py_searchsorted_left;
-    using dpctl::tensor::py_internal::py_searchsorted_right;
-
-    m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"),
-          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-    m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"),
-          py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/searchsorted.hpp b/dpctl/tensor/libtensor/source/sorting/searchsorted.hpp
deleted file mode 100644
index c1ae4506b9..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/searchsorted.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_searchsorted_functions(py::module_ m);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp
deleted file mode 100644
index 7e0e4f82aa..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/topk.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <optional>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/sorting/topk.hpp"
-#include "utils/math_utils.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/rich_comparisons.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "topk.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &,
-                                          std::size_t,
-                                          std::size_t,
-                                          std::size_t,
-                                          bool,
-                                          const char *,
-                                          char *,
-                                          char *,
-                                          const std::vector<sycl::event> &);
-
-static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types];
-
-namespace
-{
-
-template <typename T, typename = void>
-struct use_radix_sort : public std::false_type
-{
-};
-
-template <typename T>
-struct use_radix_sort<
-    T,
-    std::enable_if_t<std::disjunction<std::is_same<T, bool>,
-                                      std::is_same<T, std::uint8_t>,
-                                      std::is_same<T, std::int8_t>,
-                                      std::is_same<T, std::uint16_t>,
-                                      std::is_same<T, std::int16_t>>::value>>
-    : public std::true_type
-{
-};
-
-template <typename argTy, typename IndexTy>
-sycl::event topk_caller(sycl::queue &exec_q,
-                        std::size_t iter_nelems, // number of sub-arrays
-                        std::size_t axis_nelems, // size of each sub-array
-                        std::size_t k,
-                        bool largest,
-                        const char *arg_cp,
-                        char *vals_cp,
-                        char *inds_cp,
-                        const std::vector<sycl::event> &depends)
-{
-    if constexpr (use_radix_sort<argTy>::value) {
-        using dpctl::tensor::kernels::topk_radix_impl;
-        auto ascending = !largest;
-        return topk_radix_impl<argTy, IndexTy>(exec_q, iter_nelems, axis_nelems,
-                                               k, ascending, arg_cp, vals_cp,
-                                               inds_cp, depends);
-    }
-    else {
-        using dpctl::tensor::kernels::topk_merge_impl;
-        if (largest) {
-            using CompTy =
-                typename dpctl::tensor::rich_comparisons::DescendingSorter<
-                    argTy>::type;
-            return topk_merge_impl<argTy, IndexTy, CompTy>(
-                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
-                depends);
-        }
-        else {
-            using CompTy =
-                typename dpctl::tensor::rich_comparisons::AscendingSorter<
-                    argTy>::type;
-            return topk_merge_impl<argTy, IndexTy, CompTy>(
-                exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp,
-                depends);
-        }
-    }
-}
-
-} // namespace
-
-std::pair<sycl::event, sycl::event>
-py_topk(const dpctl::tensor::usm_ndarray &src,
-        std::optional<const int> trailing_dims_to_search,
-        const std::size_t k,
-        const bool largest,
-        const dpctl::tensor::usm_ndarray &vals,
-        const dpctl::tensor::usm_ndarray &inds,
-        sycl::queue &exec_q,
-        const std::vector<sycl::event> &depends)
-{
-    int src_nd = src.get_ndim();
-    int vals_nd = vals.get_ndim();
-    int inds_nd = inds.get_ndim();
-
-    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
-    const py::ssize_t *vals_shape_ptr = vals.get_shape_raw();
-    const py::ssize_t *inds_shape_ptr = inds.get_shape_raw();
-
-    std::size_t axis_nelems(1);
-    std::size_t iter_nelems(1);
-    if (trailing_dims_to_search.has_value()) {
-        if (src_nd != vals_nd || src_nd != inds_nd) {
-            throw py::value_error("The input and output arrays must have "
-                                  "the same array ranks");
-        }
-
-        auto trailing_dims = trailing_dims_to_search.value();
-        int iter_nd = src_nd - trailing_dims;
-        if (trailing_dims <= 0 || iter_nd < 0) {
-            throw py::value_error(
-                "trailing_dims_to_search must be positive, but no "
-                "greater than rank of the array being searched");
-        }
-
-        bool same_shapes = true;
-        for (int i = 0; same_shapes && (i < iter_nd); ++i) {
-            auto src_shape_i = src_shape_ptr[i];
-            same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] &&
-                                          src_shape_i == inds_shape_ptr[i]);
-            iter_nelems *= static_cast<std::size_t>(src_shape_i);
-        }
-
-        if (!same_shapes) {
-            throw py::value_error(
-                "Destination shape does not match the input shape");
-        }
-
-        std::size_t vals_k(1);
-        std::size_t inds_k(1);
-        for (int i = iter_nd; i < src_nd; ++i) {
-            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-            vals_k *= static_cast<std::size_t>(vals_shape_ptr[i]);
-            inds_k *= static_cast<std::size_t>(inds_shape_ptr[i]);
-        }
-
-        bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k);
-        if (!valid_k) {
-            throw py::value_error("The value of k is invalid for the input and "
-                                  "destination arrays");
-        }
-    }
-    else {
-        if (vals_nd != 1 || inds_nd != 1) {
-            throw py::value_error("Output arrays must be one-dimensional");
-        }
-
-        for (int i = 0; i < src_nd; ++i) {
-            axis_nelems *= static_cast<std::size_t>(src_shape_ptr[i]);
-        }
-
-        bool valid_k = (axis_nelems >= k &&
-                        static_cast<std::size_t>(vals_shape_ptr[0]) == k &&
-                        static_cast<std::size_t>(inds_shape_ptr[0]) == k);
-        if (!valid_k) {
-            throw py::value_error("The value of k is invalid for the input and "
-                                  "destination arrays");
-        }
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals);
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds);
-
-    if ((iter_nelems == 0) || (axis_nelems == 0)) {
-        // Nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, vals) || overlap(src, inds)) {
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals,
-                                                               k * iter_nelems);
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds,
-                                                               k * iter_nelems);
-
-    int src_typenum = src.get_typenum();
-    int vals_typenum = vals.get_typenum();
-    int inds_typenum = inds.get_typenum();
-
-    const auto &array_types = td_ns::usm_ndarray_types();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum);
-    int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum);
-
-    if (src_typeid != vals_typeid) {
-        throw py::value_error("Input array and vals array must have "
-                              "the same data type");
-    }
-
-    if (inds_typeid != static_cast<int>(td_ns::typenum_t::INT64)) {
-        throw py::value_error("Inds array must have data type int64");
-    }
-
-    bool is_src_c_contig = src.is_c_contiguous();
-    bool is_vals_c_contig = vals.is_c_contiguous();
-    bool is_inds_c_contig = inds.is_c_contiguous();
-
-    if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) {
-        auto fn = topk_dispatch_vector[src_typeid];
-
-        sycl::event comp_ev =
-            fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(),
-               vals.get_data(), inds.get_data(), depends);
-
-        sycl::event keep_args_alive_ev =
-            dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev});
-
-        return std::make_pair(keep_args_alive_ev, comp_ev);
-    }
-
-    return std::make_pair(sycl::event(), sycl::event());
-}
-
-template <typename fnT, typename T> struct TopKFactory
-{
-    fnT get()
-    {
-        using IdxT = std::int64_t;
-        return topk_caller<T, IdxT>;
-    }
-};
-
-void init_topk_dispatch_vectors(void)
-{
-    td_ns::DispatchVectorBuilder<topk_impl_fn_ptr_t, TopKFactory,
-                                 td_ns::num_types>
-        dvb;
-    dvb.populate_dispatch_vector(topk_dispatch_vector);
-}
-
-void init_topk_functions(py::module_ m)
-{
-    dpctl::tensor::py_internal::init_topk_dispatch_vectors();
-
-    m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"),
-          py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-}
-
-} // end of namespace py_internal
-} // end of namespace tensor
-} // end of namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/sorting/topk.hpp b/dpctl/tensor/libtensor/source/sorting/topk.hpp
deleted file mode 100644
index 2fe5c00429..0000000000
--- a/dpctl/tensor/libtensor/source/sorting/topk.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_sorting_impl
-/// extension.
-//===--------------------------------------------------------------------===//
-
-#pragma once
-
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern void init_topk_functions(py::module_);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl/tensor/libtensor/source/tensor_accumulation.cpp
deleted file mode 100644
index f1b6eedbd6..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_accumulation.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-//===-- tensor_accumulation.cpp -                              --*-C++-*-/===//
-//   Implementation of _tensor_accumulation_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "accumulators/accumulators_common.hpp"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_accumulation_impl, m)
-{
-    dpctl::tensor::py_internal::init_accumulator_functions(m);
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp
deleted file mode 100644
index 2dccc4e359..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp
+++ /dev/null
@@ -1,492 +0,0 @@
-//===-- tensor_ctors.cpp -                                    ---*-C++-*-/===//
-//   Implementation of _tensor_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <sycl/sycl.hpp>
-#include <thread>
-#include <type_traits>
-#include <utility>
-
-#include "dpctl4pybind11.hpp"
-
-#include "accumulators.hpp"
-#include "boolean_advanced_indexing.hpp"
-#include "clip.hpp"
-#include "copy_and_cast_usm_to_usm.hpp"
-#include "copy_as_contig.hpp"
-#include "copy_for_reshape.hpp"
-#include "copy_for_roll.hpp"
-#include "copy_numpy_ndarray_into_usm_ndarray.hpp"
-#include "device_support_queries.hpp"
-#include "eye_ctor.hpp"
-#include "full_ctor.hpp"
-#include "integer_advanced_indexing.hpp"
-#include "kernels/dpctl_tensor_types.hpp"
-#include "linear_sequences.hpp"
-#include "repeat.hpp"
-#include "simplify_iteration_space.hpp"
-#include "triul_ctor.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/strided_iters.hpp"
-#include "where.hpp"
-#include "zeros_ctor.hpp"
-
-namespace py = pybind11;
-
-static_assert(std::is_same_v<py::ssize_t, dpctl::tensor::ssize_t>);
-
-namespace
-{
-
-using dpctl::tensor::c_contiguous_strides;
-using dpctl::tensor::f_contiguous_strides;
-
-using dpctl::tensor::overlap::MemoryOverlap;
-using dpctl::tensor::overlap::SameLogicalTensors;
-
-using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
-using dpctl::tensor::py_internal::py_as_c_contig;
-using dpctl::tensor::py_internal::py_as_f_contig;
-
-/* =========================== Copy for reshape ============================= */
-
-using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape;
-
-/* =========================== Copy for roll ============================= */
-
-using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d;
-using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd;
-
-/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */
-
-using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray;
-
-/* ============= linear-sequence ==================== */
-
-using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine;
-using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step;
-
-/* ================ Full ================== */
-
-using dpctl::tensor::py_internal::usm_ndarray_full;
-
-/* ================ Zeros ================== */
-
-using dpctl::tensor::py_internal::usm_ndarray_zeros;
-
-/* ============== Advanced Indexing ============= */
-using dpctl::tensor::py_internal::usm_ndarray_put;
-using dpctl::tensor::py_internal::usm_ndarray_take;
-
-using dpctl::tensor::py_internal::py_extract;
-using dpctl::tensor::py_internal::py_mask_positions;
-using dpctl::tensor::py_internal::py_nonzero;
-using dpctl::tensor::py_internal::py_place;
-
-/* ================= Repeat ====================*/
-using dpctl::tensor::py_internal::py_cumsum_1d;
-using dpctl::tensor::py_internal::py_repeat_by_scalar;
-using dpctl::tensor::py_internal::py_repeat_by_sequence;
-
-/* ================ Eye ================== */
-
-using dpctl::tensor::py_internal::usm_ndarray_eye;
-
-/* =========================== Tril and triu ============================== */
-
-using dpctl::tensor::py_internal::usm_ndarray_triul;
-
-/* =========================== Where ============================== */
-
-using dpctl::tensor::py_internal::py_where;
-
-/* =========================== Clip ============================== */
-using dpctl::tensor::py_internal::py_clip;
-
-// populate dispatch tables
-void init_dispatch_tables(void)
-{
-    using namespace dpctl::tensor::py_internal;
-
-    init_copy_and_cast_usm_to_usm_dispatch_tables();
-    init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables();
-    init_advanced_indexing_dispatch_tables();
-    init_where_dispatch_tables();
-    return;
-}
-
-// populate dispatch vectors
-void init_dispatch_vectors(void)
-{
-    using namespace dpctl::tensor::py_internal;
-
-    init_copy_as_contig_dispatch_vectors();
-    init_copy_for_reshape_dispatch_vectors();
-    init_copy_for_roll_dispatch_vectors();
-    init_linear_sequences_dispatch_vectors();
-    init_full_ctor_dispatch_vectors();
-    init_zeros_ctor_dispatch_vectors();
-    init_eye_ctor_dispatch_vectors();
-    init_triul_ctor_dispatch_vectors();
-
-    populate_masked_extract_dispatch_vectors();
-    populate_masked_place_dispatch_vectors();
-
-    populate_mask_positions_dispatch_vectors();
-
-    populate_cumsum_1d_dispatch_vectors();
-    init_repeat_dispatch_vectors();
-
-    init_clip_dispatch_vectors();
-
-    return;
-}
-
-} // namespace
-
-PYBIND11_MODULE(_tensor_impl, m)
-{
-    init_dispatch_tables();
-    init_dispatch_vectors();
-
-    using dpctl::tensor::strides::contract_iter;
-    m.def(
-        "_contract_iter", &contract_iter<py::ssize_t, py::value_error>,
-        "Simplifies iteration of array of given shape & stride. Returns "
-        "a triple: shape, stride and offset for the new iterator of possible "
-        "smaller dimension, which traverses the same elements as the original "
-        "iterator, possibly in a different order.");
-
-    m.def("_copy_usm_ndarray_into_usm_ndarray",
-          &copy_usm_ndarray_into_usm_ndarray,
-          "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same "
-          "shape. "
-          "Returns a tuple of events: (host_task_event, compute_task_event)",
-          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_as_c_contig", &py_as_c_contig,
-          "Copies from usm_ndarray `src` into C-contiguous usm_ndarray "
-          "`dst` of the same shape and the same data type. "
-          "Returns a tuple of events: (host_task_event, compute_task_event)",
-          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_as_f_contig", &py_as_f_contig,
-          "Copies from usm_ndarray `src` into F-contiguous usm_ndarray "
-          "`dst` of the same shape and the same data type. "
-          "Returns a tuple of events: (host_task_event, compute_task_event)",
-          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    using dpctl::tensor::strides::contract_iter2;
-    m.def(
-        "_contract_iter2", &contract_iter2<py::ssize_t, py::value_error>,
-        "Simplifies iteration over elements of pair of arrays of given shape "
-        "with strides stride1 and stride2. Returns "
-        "a 5-tuple: shape, stride and offset for the new iterator of possible "
-        "smaller dimension for each array, which traverses the same elements "
-        "as the original "
-        "iterator, possibly in a different order.");
-
-    using dpctl::tensor::strides::contract_iter3;
-    m.def(
-        "_contract_iter3", &contract_iter3<py::ssize_t, py::value_error>,
-        "Simplifies iteration over elements of 3-tuple of arrays of given "
-        "shape "
-        "with strides stride1, stride2, and stride3. Returns "
-        "a 7-tuple: shape, stride and offset for the new iterator of possible "
-        "smaller dimension for each array, which traverses the same elements "
-        "as the original "
-        "iterator, possibly in a different order.");
-
-    using dpctl::tensor::strides::contract_iter4;
-    m.def(
-        "_contract_iter4", &contract_iter4<py::ssize_t, py::value_error>,
-        "Simplifies iteration over elements of 4-tuple of arrays of given "
-        "shape "
-        "with strides stride1, stride2, stride3, and stride4. Returns "
-        "a 9-tuple: shape, stride and offset for the new iterator of possible "
-        "smaller dimension for each array, which traverses the same elements "
-        "as the original "
-        "iterator, possibly in a different order.");
-
-    static constexpr char orderC = 'C';
-    m.def(
-        "_ravel_multi_index",
-        [](const std::vector<py::ssize_t> &mi,
-           const std::vector<py::ssize_t> &shape, char order = 'C') {
-            if (order == orderC) {
-                return dpctl::tensor::py_internal::_ravel_multi_index_c(mi,
-                                                                        shape);
-            }
-            else {
-                return dpctl::tensor::py_internal::_ravel_multi_index_f(mi,
-                                                                        shape);
-            }
-        },
-        "");
-
-    m.def(
-        "_unravel_index",
-        [](py::ssize_t flat_index, const std::vector<py::ssize_t> &shape,
-           char order = 'C') {
-            if (order == orderC) {
-                return dpctl::tensor::py_internal::_unravel_index_c(flat_index,
-                                                                    shape);
-            }
-            else {
-                return dpctl::tensor::py_internal::_unravel_index_f(flat_index,
-                                                                    shape);
-            }
-        },
-        "");
-
-    m.def("_copy_usm_ndarray_for_reshape", &copy_usm_ndarray_for_reshape,
-          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
-          "number of elements using underlying 'C'-contiguous order for flat "
-          "traversal. "
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_copy_usm_ndarray_for_roll_1d", &copy_usm_ndarray_for_roll_1d,
-          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
-          "shapes using underlying 'C'-contiguous order for flat "
-          "traversal with shift. "
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("src"), py::arg("dst"), py::arg("shift"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_copy_usm_ndarray_for_roll_nd", &copy_usm_ndarray_for_roll_nd,
-          "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same "
-          "shapes using underlying 'C'-contiguous order for "
-          "traversal with shifts along each axis. "
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("src"), py::arg("dst"), py::arg("shifts"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_linspace_step", &usm_ndarray_linear_sequence_step,
-          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
-          "specified by "
-          "starting point `start` and step `dt`. "
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("start"), py::arg("dt"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine,
-          "Fills input 1D contiguous usm_ndarray `dst` with linear sequence "
-          "specified by "
-          "starting point `start` and end point `end`. "
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("start"), py::arg("end"), py::arg("dst"),
-          py::arg("include_endpoint"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_copy_numpy_ndarray_into_usm_ndarray",
-          &copy_numpy_ndarray_into_usm_ndarray,
-          "Copy from numpy array `src` into usm_ndarray `dst` synchronously.",
-          py::arg("src"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_zeros_usm_ndarray", &usm_ndarray_zeros,
-          "Populate usm_ndarray `dst` with zeros.", py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_full_usm_ndarray", &usm_ndarray_full,
-          "Populate usm_ndarray `dst` with given fill_value.",
-          py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_take", &usm_ndarray_take,
-          "Takes elements at usm_ndarray indices `ind` and axes starting "
-          "at axis `axis_start` from array `src` and copies them "
-          "into usm_ndarray `dst` synchronously."
-          "Returns a tuple of events: (hev, ev)",
-          py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"),
-          py::arg("mode"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_put", &usm_ndarray_put,
-          "Puts elements at usm_ndarray indices `ind` and axes starting "
-          "at axis `axis_start` into array `dst` from "
-          "usm_ndarray `val` synchronously."
-          "Returns a tuple of events: (hev, ev)",
-          py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"),
-          py::arg("mode"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_eye", &usm_ndarray_eye,
-          "Fills input 2D contiguous usm_ndarray `dst` with "
-          "zeros outside of the diagonal "
-          "specified by "
-          "the diagonal index `k` "
-          "which is filled with ones."
-          "Returns a tuple of events: (ht_event, comp_event)",
-          py::arg("k"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("default_device_fp_type",
-          dpctl::tensor::py_internal::default_device_fp_type,
-          "Gives default floating point type supported by device.",
-          py::arg("dev"));
-
-    m.def("default_device_int_type",
-          dpctl::tensor::py_internal::default_device_int_type,
-          "Gives default signed integer type supported by device.",
-          py::arg("dev"));
-
-    m.def("default_device_uint_type",
-          dpctl::tensor::py_internal::default_device_uint_type,
-          "Gives default unsigned integer type supported by device.",
-          py::arg("dev"));
-
-    m.def("default_device_bool_type",
-          dpctl::tensor::py_internal::default_device_bool_type,
-          "Gives default boolean type supported by device.", py::arg("dev"));
-
-    m.def("default_device_complex_type",
-          dpctl::tensor::py_internal::default_device_complex_type,
-          "Gives default complex floating point type supported by device.",
-          py::arg("dev"));
-
-    m.def("default_device_index_type",
-          dpctl::tensor::py_internal::default_device_index_type,
-          "Gives default index type supported by device.", py::arg("dev"));
-
-    auto tril_fn = [](const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends);
-    };
-    m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"),
-          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    auto triu_fn = [](const dpctl::tensor::usm_ndarray &src,
-                      const dpctl::tensor::usm_ndarray &dst, py::ssize_t k,
-                      sycl::queue &exec_q,
-                      const std::vector<sycl::event> depends)
-        -> std::pair<sycl::event, sycl::event> {
-        return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends);
-    };
-    m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"),
-          py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("mask_positions", &py_mask_positions, "", py::arg("mask"),
-          py::arg("cumsum"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"),
-          py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto overlap = [](const dpctl::tensor::usm_ndarray &x1,
-                      const dpctl::tensor::usm_ndarray &x2) -> bool {
-        auto const &overlap = MemoryOverlap();
-        return overlap(x1, x2);
-    };
-    m.def("_array_overlap", overlap,
-          "Determines if the memory regions indexed by each array overlap",
-          py::arg("array1"), py::arg("array2"));
-
-    auto same_logical_tensors =
-        [](const dpctl::tensor::usm_ndarray &x1,
-           const dpctl::tensor::usm_ndarray &x2) -> bool {
-        auto const &same_logical_tensors = SameLogicalTensors();
-        return same_logical_tensors(x1, x2);
-    };
-    m.def("_same_logical_tensors", same_logical_tensors,
-          "Determines if the memory regions indexed by each array are the same",
-          py::arg("array1"), py::arg("array2"));
-
-    m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
-          py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"),
-          py::arg("mask_shape"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"),
-          py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src,
-                              const dpctl::tensor::usm_ndarray &dst,
-                              const dpctl::tensor::usm_ndarray &reps,
-                              const dpctl::tensor::usm_ndarray &cumsum,
-                              std::optional<int> axis, sycl::queue &exec_q,
-                              const std::vector<sycl::event> depends)
-        -> std::pair<sycl::event, sycl::event> {
-        if (axis) {
-            return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(),
-                                         exec_q, depends);
-        }
-        else {
-            return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q,
-                                         depends);
-        }
-    };
-    m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"),
-          py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-    auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src,
-                            const dpctl::tensor::usm_ndarray &dst,
-                            const py::ssize_t reps, std::optional<int> axis,
-                            sycl::queue &exec_q,
-                            const std::vector<sycl::event> depends)
-        -> std::pair<sycl::event, sycl::event> {
-        if (axis) {
-            return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q,
-                                       depends);
-        }
-        else {
-            return py_repeat_by_scalar(src, dst, reps, exec_q, depends);
-        }
-    };
-    m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"),
-          py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
-          py::arg("depends") = py::list());
-
-    m.def("_clip", &py_clip,
-          "Clamps elements of array `x` to the range "
-          "[`min`, `max] and writes the result to the "
-          "array `dst` for each element of `x`, `min`, and `max`."
-          "Returns a tuple of events: (hev, ev)",
-          py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"),
-          py::arg("sycl_queue"), py::arg("depends") = py::list());
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
deleted file mode 100644
index 56a5795a17..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- tensor_elementwise.cpp                                ---*-C++-*-/===//
-//    Implementation of _tensor_elementwise_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include "elementwise_functions/elementwise_common.hpp"
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_elementwise_impl, m)
-{
-    dpctl::tensor::py_internal::init_elementwise_functions(m);
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_linalg.cpp b/dpctl/tensor/libtensor/source/tensor_linalg.cpp
deleted file mode 100644
index b21777654e..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_linalg.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- tensor_linalg.cpp                                ---*-C++-*-/===//
-//    Implementation of _tensor_linalg_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include "linalg_functions/dot.hpp"
-#include <pybind11/pybind11.h>
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_linalg_impl, m)
-{
-    dpctl::tensor::py_internal::init_dot(m);
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp
deleted file mode 100644
index ff94c37eb6..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_reductions.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-//===-- tensor_reductions.cpp -                              --*-C++-*-/===//
-//   Implementation of _tensor_reductions_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "reductions/reduction_common.hpp"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_reductions_impl, m)
-{
-    dpctl::tensor::py_internal::init_reduction_functions(m);
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp
deleted file mode 100644
index 3b9bed7768..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_sorting.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- tensor_sorting.cpp -                                 -----*-C++-*-/===//
-//   Implementation of _tensor_reductions_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "sorting/isin.hpp"
-#include "sorting/merge_argsort.hpp"
-#include "sorting/merge_sort.hpp"
-#include "sorting/radix_argsort.hpp"
-#include "sorting/radix_sort.hpp"
-#include "sorting/searchsorted.hpp"
-#include "sorting/topk.hpp"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_sorting_impl, m)
-{
-    dpctl::tensor::py_internal::init_isin_functions(m);
-    dpctl::tensor::py_internal::init_merge_sort_functions(m);
-    dpctl::tensor::py_internal::init_merge_argsort_functions(m);
-    dpctl::tensor::py_internal::init_searchsorted_functions(m);
-    dpctl::tensor::py_internal::init_radix_sort_functions(m);
-    dpctl::tensor::py_internal::init_radix_argsort_functions(m);
-    dpctl::tensor::py_internal::init_topk_functions(m);
-}
diff --git a/dpctl/tensor/libtensor/source/tensor_sorting_radix.cpp b/dpctl/tensor/libtensor/source/tensor_sorting_radix.cpp
deleted file mode 100644
index d0880c4ccf..0000000000
--- a/dpctl/tensor/libtensor/source/tensor_sorting_radix.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- tensor_sorting.cpp -                                 -----*-C++-*-/===//
-//   Implementation of _tensor_reductions_impl module
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===----------------------------------------------------------------------===//
-
-#include <pybind11/pybind11.h>
-
-#include "sorting/radix_argsort.hpp"
-#include "sorting/radix_sort.hpp"
-
-namespace py = pybind11;
-
-PYBIND11_MODULE(_tensor_sorting_radix_impl, m)
-{
-    dpctl::tensor::py_internal::init_radix_sort_functions(m);
-    dpctl::tensor::py_internal::init_radix_argsort_functions(m);
-}
diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp
deleted file mode 100644
index be5788dcdc..0000000000
--- a/dpctl/tensor/libtensor/source/triul_ctor.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <algorithm> // for std::copy
-#include <cstddef>   // for std::size_t
-#include <memory>    // for std::make_shared
-#include <stdexcept> // for std::runtime_error
-#include <utility>   // for std::pair, std::move
-#include <vector>    // for std::vector, std::begin, std::end
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-#include "kernels/constructors.hpp"
-#include "simplify_iteration_space.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::utils::keep_args_alive;
-
-using dpctl::tensor::kernels::constructors::tri_fn_ptr_t;
-
-static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types];
-static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types];
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_triul(sycl::queue &exec_q,
-                  const dpctl::tensor::usm_ndarray &src,
-                  const dpctl::tensor::usm_ndarray &dst,
-                  char part,
-                  py::ssize_t k = 0,
-                  const std::vector<sycl::event> &depends = {})
-{
-    // array dimensions must be the same
-    int src_nd = src.get_ndim();
-    int dst_nd = dst.get_ndim();
-    if (src_nd != dst_nd) {
-        throw py::value_error("Array dimensions are not the same.");
-    }
-
-    if (src_nd < 2) {
-        throw py::value_error("Array dimensions less than 2.");
-    }
-
-    // shapes must be the same
-    const py::ssize_t *src_shape = src.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-
-    bool shapes_equal(true);
-    std::size_t src_nelems(1);
-
-    for (int i = 0; shapes_equal && i < src_nd; ++i) {
-        src_nelems *= static_cast<std::size_t>(src_shape[i]);
-        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
-    }
-    if (!shapes_equal) {
-        throw py::value_error("Array shapes are not the same.");
-    }
-
-    if (src_nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    char *src_data = src.get_data();
-    char *dst_data = dst.get_data();
-
-    // check that arrays do not overlap, and concurrent copying is safe.
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
-        // TODO: could use a temporary, but this is done by the caller
-        throw py::value_error("Arrays index overlapping segments of memory");
-    }
-
-    auto array_types = td_ns::usm_ndarray_types();
-
-    int src_typenum = src.get_typenum();
-    int dst_typenum = dst.get_typenum();
-    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (dst_typeid != src_typeid) {
-        throw py::value_error("Array dtype are not the same.");
-    }
-
-    // check same queues
-    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
-        throw py::value_error(
-            "Execution queue context is not the same as allocation contexts");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto src_strides = src.get_strides_vector();
-    auto dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_src_strides;
-    shT simplified_dst_strides;
-    py::ssize_t src_offset(0);
-    py::ssize_t dst_offset(0);
-
-    int nd = src_nd - 2;
-    const py::ssize_t *shape = src_shape;
-
-    const shT iter_src_strides(std::begin(src_strides),
-                               std::begin(src_strides) + nd);
-    const shT iter_dst_strides(std::begin(dst_strides),
-                               std::begin(dst_strides) + nd);
-
-    simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides,
-                             // output
-                             simplified_shape, simplified_src_strides,
-                             simplified_dst_strides, src_offset, dst_offset);
-
-    if (src_offset != 0 || dst_offset != 0) {
-        throw py::value_error("Reversed slice for dst is not supported");
-    }
-
-    nd += 2;
-
-    using usm_host_allocatorT =
-        dpctl::tensor::alloc_utils::usm_host_allocator<py::ssize_t>;
-    using usmshT = std::vector<py::ssize_t, usm_host_allocatorT>;
-
-    usm_host_allocatorT allocator(exec_q);
-    auto shp_host_shape_and_strides =
-        std::make_shared<usmshT>(3 * nd, allocator);
-
-    std::copy(simplified_shape.begin(), simplified_shape.end(),
-              shp_host_shape_and_strides->begin());
-    (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2];
-    (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1];
-
-    std::copy(simplified_src_strides.begin(), simplified_src_strides.end(),
-              shp_host_shape_and_strides->begin() + nd);
-    (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2];
-    (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1];
-
-    std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(),
-              shp_host_shape_and_strides->begin() + 2 * nd);
-    (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2];
-    (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1];
-
-    auto dev_shape_and_strides_owner =
-        dpctl::tensor::alloc_utils::smart_malloc_device<py::ssize_t>(3 * nd,
-                                                                     exec_q);
-    py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get();
-
-    const sycl::event &copy_shape_and_strides = exec_q.copy<py::ssize_t>(
-        shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd);
-
-    py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2];
-    py::ssize_t outer_range = src_nelems / inner_range;
-
-    sycl::event tri_ev;
-    if (part == 'l') {
-        auto fn = tril_generic_dispatch_vector[src_typeid];
-        tri_ev =
-            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
-               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
-    }
-    else {
-        auto fn = triu_generic_dispatch_vector[src_typeid];
-        tri_ev =
-            fn(exec_q, inner_range, outer_range, src_data, dst_data, nd,
-               dev_shape_and_strides, k, depends, {copy_shape_and_strides});
-    }
-
-    const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(tri_ev);
-        const auto &ctx = exec_q.get_context();
-        using dpctl::tensor::alloc_utils::sycl_free_noexcept;
-        cgh.host_task(
-            [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides),
-             dev_shape_and_strides, ctx]() {
-                // capture of shp_host_shape_and_strides ensure the underlying
-                // vector exists for the entire execution of copying kernel
-                sycl_free_noexcept(dev_shape_and_strides, ctx);
-            });
-    });
-    // since host_task now owns USM allocation, release ownership by smart
-    // pointer
-    dev_shape_and_strides_owner.release();
-
-    return std::make_pair(
-        keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev);
-}
-
-void init_triul_ctor_dispatch_vectors(void)
-{
-
-    using namespace td_ns;
-    using dpctl::tensor::kernels::constructors::TrilGenericFactory;
-    using dpctl::tensor::kernels::constructors::TriuGenericFactory;
-
-    DispatchVectorBuilder<tri_fn_ptr_t, TrilGenericFactory, num_types> dvb1;
-    dvb1.populate_dispatch_vector(tril_generic_dispatch_vector);
-
-    DispatchVectorBuilder<tri_fn_ptr_t, TriuGenericFactory, num_types> dvb2;
-    dvb2.populate_dispatch_vector(triu_generic_dispatch_vector);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/triul_ctor.hpp b/dpctl/tensor/libtensor/source/triul_ctor.hpp
deleted file mode 100644
index b4c4318096..0000000000
--- a/dpctl/tensor/libtensor/source/triul_ctor.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_triul(sycl::queue &exec_q,
-                  const dpctl::tensor::usm_ndarray &src,
-                  const dpctl::tensor::usm_ndarray &dst,
-                  char part,
-                  py::ssize_t k = 0,
-                  const std::vector<sycl::event> &depends = {});
-
-extern void init_triul_ctor_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp
deleted file mode 100644
index 9825b65901..0000000000
--- a/dpctl/tensor/libtensor/source/where.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-//===-- where.cpp - Implementation of where  --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines Python API for implementation functions of
-/// dpctl.tensor.where
-//===----------------------------------------------------------------------===//
-
-#include <complex>
-#include <cstddef>
-#include <cstdint>
-#include <stdexcept>
-#include <utility>
-
-#include <sycl/sycl.hpp>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include "kernels/where.hpp"
-#include "utils/memory_overlap.hpp"
-#include "utils/offset_utils.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/sycl_alloc_utils.hpp"
-#include "utils/type_dispatch.hpp"
-
-#include "simplify_iteration_space.hpp"
-#include "where.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t;
-using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t;
-
-static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types]
-                                                             [td_ns::num_types];
-static where_strided_impl_fn_ptr_t
-    where_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-using dpctl::utils::keep_args_alive;
-
-std::pair<sycl::event, sycl::event>
-py_where(const dpctl::tensor::usm_ndarray &condition,
-         const dpctl::tensor::usm_ndarray &x1,
-         const dpctl::tensor::usm_ndarray &x2,
-         const dpctl::tensor::usm_ndarray &dst,
-         sycl::queue &exec_q,
-         const std::vector<sycl::event> &depends)
-{
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst}))
-    {
-        throw py::value_error(
-            "Execution queue is not compatible with allocation queues");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    int nd = condition.get_ndim();
-    int x1_nd = x1.get_ndim();
-    int x2_nd = x2.get_ndim();
-    int dst_nd = dst.get_ndim();
-
-    if (nd != x1_nd || nd != x2_nd) {
-        throw py::value_error(
-            "Input arrays are not of appropriate dimension for where kernel.");
-    }
-
-    if (nd != dst_nd) {
-        throw py::value_error(
-            "Destination is not of appropriate dimension for where kernel.");
-    }
-
-    const py::ssize_t *x1_shape = x1.get_shape_raw();
-    const py::ssize_t *x2_shape = x2.get_shape_raw();
-    const py::ssize_t *dst_shape = dst.get_shape_raw();
-    const py::ssize_t *cond_shape = condition.get_shape_raw();
-
-    bool shapes_equal(true);
-    std::size_t nelems(1);
-    for (int i = 0; i < nd; ++i) {
-        const auto &sh_i = dst_shape[i];
-        nelems *= static_cast<std::size_t>(sh_i);
-        shapes_equal = shapes_equal && (x1_shape[i] == sh_i) &&
-                       (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i);
-    }
-
-    if (!shapes_equal) {
-        throw py::value_error("Axes are not of matching shapes.");
-    }
-
-    if (nelems == 0) {
-        return std::make_pair(sycl::event{}, sycl::event{});
-    }
-
-    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    auto const &same_logical_tensors =
-        dpctl::tensor::overlap::SameLogicalTensors();
-    if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) ||
-        (overlap(dst, x1) && !same_logical_tensors(dst, x1)) ||
-        (overlap(dst, x2) && !same_logical_tensors(dst, x2)))
-    {
-        throw py::value_error("Destination array overlaps with input.");
-    }
-
-    int x1_typenum = x1.get_typenum();
-    int x2_typenum = x2.get_typenum();
-    int cond_typenum = condition.get_typenum();
-    int dst_typenum = dst.get_typenum();
-
-    auto const &array_types = td_ns::usm_ndarray_types();
-    int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum);
-    int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum);
-    int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum);
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) {
-        throw py::value_error("Value arrays must have the same data type");
-    }
-
-    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems);
-
-    char *cond_data = condition.get_data();
-    char *x1_data = x1.get_data();
-    char *x2_data = x2.get_data();
-    char *dst_data = dst.get_data();
-
-    bool is_x1_c_contig = x1.is_c_contiguous();
-    bool is_x1_f_contig = x1.is_f_contiguous();
-
-    bool is_x2_c_contig = x2.is_c_contiguous();
-    bool is_x2_f_contig = x2.is_f_contiguous();
-
-    bool is_cond_c_contig = condition.is_c_contiguous();
-    bool is_cond_f_contig = condition.is_f_contiguous();
-
-    bool is_dst_c_contig = dst.is_c_contiguous();
-    bool is_dst_f_contig = dst.is_f_contiguous();
-
-    bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig &&
-                         is_dst_c_contig);
-    bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig &&
-                         is_dst_f_contig);
-
-    if (all_c_contig || all_f_contig) {
-        auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid];
-
-        auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data,
-                                  dst_data, depends);
-        sycl::event ht_ev =
-            keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev});
-
-        return std::make_pair(ht_ev, where_ev);
-    }
-
-    auto const &cond_strides = condition.get_strides_vector();
-    auto const &x1_strides = x1.get_strides_vector();
-    auto const &x2_strides = x2.get_strides_vector();
-    auto const &dst_strides = dst.get_strides_vector();
-
-    using shT = std::vector<py::ssize_t>;
-    shT simplified_shape;
-    shT simplified_cond_strides;
-    shT simplified_x1_strides;
-    shT simplified_x2_strides;
-    shT simplified_dst_strides;
-    py::ssize_t cond_offset(0);
-    py::ssize_t x1_offset(0);
-    py::ssize_t x2_offset(0);
-    py::ssize_t dst_offset(0);
-
-    dpctl::tensor::py_internal::simplify_iteration_space_4(
-        nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides,
-        // outputs
-        simplified_shape, simplified_cond_strides, simplified_x1_strides,
-        simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset,
-        x2_offset, dst_offset);
-
-    auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid];
-
-    std::vector<sycl::event> host_task_events;
-    host_task_events.reserve(2);
-
-    using dpctl::tensor::offset_utils::device_allocate_and_pack;
-    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
-        exec_q, host_task_events,
-        // common shape and strides
-        simplified_shape, simplified_cond_strides, simplified_x1_strides,
-        simplified_x2_strides, simplified_dst_strides);
-    auto packed_shape_strides_owner =
-        std::move(std::get<0>(ptr_size_event_tuple));
-    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
-    const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get();
-
-    std::vector<sycl::event> all_deps;
-    all_deps.reserve(depends.size() + 1);
-    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
-    all_deps.push_back(copy_shape_strides_ev);
-
-    assert(all_deps.size() == depends.size() + 1);
-
-    sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data,
-                              dst_data, packed_shape_strides, cond_offset,
-                              x1_offset, x2_offset, dst_offset, all_deps);
-
-    // free packed temporaries
-    sycl::event temporaries_cleanup_ev =
-        dpctl::tensor::alloc_utils::async_smart_free(
-            exec_q, {where_ev}, packed_shape_strides_owner);
-    host_task_events.push_back(temporaries_cleanup_ev);
-
-    sycl::event arg_cleanup_ev =
-        keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events);
-
-    return std::make_pair(arg_cleanup_ev, where_ev);
-}
-
-void init_where_dispatch_tables(void)
-{
-    using namespace td_ns;
-    using dpctl::tensor::kernels::search::WhereContigFactory;
-    DispatchTableBuilder<where_contig_impl_fn_ptr_t, WhereContigFactory,
-                         num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(where_contig_dispatch_table);
-
-    using dpctl::tensor::kernels::search::WhereStridedFactory;
-    DispatchTableBuilder<where_strided_impl_fn_ptr_t, WhereStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(where_strided_dispatch_table);
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/where.hpp b/dpctl/tensor/libtensor/source/where.hpp
deleted file mode 100644
index 54981f9823..0000000000
--- a/dpctl/tensor/libtensor/source/where.hpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===--                      where.hpp -                       --*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares Python API for implementation functions of
-/// dpctl.tensor.where
-//===----------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-py_where(const dpctl::tensor::usm_ndarray &,
-         const dpctl::tensor::usm_ndarray &,
-         const dpctl::tensor::usm_ndarray &,
-         const dpctl::tensor::usm_ndarray &,
-         sycl::queue &,
-         const std::vector<sycl::event> &);
-
-extern void init_where_dispatch_tables(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.cpp b/dpctl/tensor/libtensor/source/zeros_ctor.cpp
deleted file mode 100644
index ef74469a8b..0000000000
--- a/dpctl/tensor/libtensor/source/zeros_ctor.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#include <complex>
-#include <cstddef>
-#include <stdexcept>
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/complex.h>
-#include <pybind11/pybind11.h>
-
-#include "kernels/constructors.hpp"
-#include "utils/output_validation.hpp"
-#include "utils/type_dispatch.hpp"
-#include "utils/type_utils.hpp"
-
-#include "zeros_ctor.hpp"
-
-namespace py = pybind11;
-namespace td_ns = dpctl::tensor::type_dispatch;
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-using dpctl::utils::keep_args_alive;
-
-typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &,
-                                             std::size_t,
-                                             char *,
-                                             const std::vector<sycl::event> &);
-
-/*!
- * @brief Function to submit kernel to fill given contiguous memory allocation
- * with zeros.
- *
- * @param exec_q  Sycl queue to which kernel is submitted for execution.
- * @param nelems  Length of the sequence
- * @param dst_p Kernel accessible USM pointer to the start of array to be
- * populated.
- * @param depends  List of events to wait for before starting computations, if
- * any.
- *
- * @return Event to wait on to ensure that computation completes.
- * @defgroup CtorKernels
- */
-template <typename dstTy>
-sycl::event zeros_contig_impl(sycl::queue &exec_q,
-                              std::size_t nelems,
-                              char *dst_p,
-                              const std::vector<sycl::event> &depends)
-{
-
-    static constexpr int memset_val(0);
-    sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) {
-        cgh.depends_on(depends);
-
-        cgh.memset(reinterpret_cast<void *>(dst_p), memset_val,
-                   nelems * sizeof(dstTy));
-    });
-
-    return fill_ev;
-}
-
-template <typename fnT, typename Ty> struct ZerosContigFactory
-{
-    fnT get()
-    {
-        fnT f = zeros_contig_impl<Ty>;
-        return f;
-    }
-};
-
-static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types];
-
-std::pair<sycl::event, sycl::event>
-usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
-                  sycl::queue &exec_q,
-                  const std::vector<sycl::event> &depends)
-{
-    py::ssize_t dst_nelems = dst.get_size();
-
-    if (dst_nelems == 0) {
-        // nothing to do
-        return std::make_pair(sycl::event(), sycl::event());
-    }
-
-    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
-        throw py::value_error(
-            "Execution queue is not compatible with the allocation queue");
-    }
-
-    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
-
-    auto array_types = td_ns::usm_ndarray_types();
-    int dst_typenum = dst.get_typenum();
-    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
-
-    char *dst_data = dst.get_data();
-
-    if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) {
-        auto fn = zeros_contig_dispatch_vector[dst_typeid];
-
-        sycl::event zeros_contig_event =
-            fn(exec_q, static_cast<std::size_t>(dst_nelems), dst_data, depends);
-
-        return std::make_pair(
-            keep_args_alive(exec_q, {dst}, {zeros_contig_event}),
-            zeros_contig_event);
-    }
-    else {
-        throw std::runtime_error(
-            "Only population of contiguous usm_ndarray objects is supported.");
-    }
-}
-
-void init_zeros_ctor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-
-    DispatchVectorBuilder<zeros_contig_fn_ptr_t, ZerosContigFactory, num_types>
-        dvb;
-    dvb.populate_dispatch_vector(zeros_contig_dispatch_vector);
-
-    return;
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/zeros_ctor.hpp b/dpctl/tensor/libtensor/source/zeros_ctor.hpp
deleted file mode 100644
index 05bd9449e1..0000000000
--- a/dpctl/tensor/libtensor/source/zeros_ctor.hpp
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- ------------ Implementation of _tensor_impl module  ----*-C++-*-/===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===--------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines functions of dpctl.tensor._tensor_impl extensions
-//===--------------------------------------------------------------------===//
-
-#pragma once
-#include <sycl/sycl.hpp>
-#include <utility>
-#include <vector>
-
-#include "dpctl4pybind11.hpp"
-#include <pybind11/pybind11.h>
-
-namespace dpctl
-{
-namespace tensor
-{
-namespace py_internal
-{
-
-extern std::pair<sycl::event, sycl::event>
-usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst,
-                  sycl::queue &exec_q,
-                  const std::vector<sycl::event> &depends = {});
-
-extern void init_zeros_ctor_dispatch_vectors(void);
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/tests/test_copy.py b/dpctl/tensor/libtensor/tests/test_copy.py
deleted file mode 100644
index 085acdd49e..0000000000
--- a/dpctl/tensor/libtensor/tests/test_copy.py
+++ /dev/null
@@ -1,309 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-
-_usm_types_list = ["shared", "device", "host"]
-_typestrs_list = [
-    "b1",
-    "u1",
-    "i1",
-    "u2",
-    "i2",
-    "u4",
-    "i4",
-    "u8",
-    "i8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-def _typestr_has_fp64(arr_typestr):
-    return arr_typestr in ["f8", "c16"]
-
-
-def _typestr_has_fp16(arr_typestr):
-    return arr_typestr in ["f2"]
-
-
-@pytest.fixture(params=_usm_types_list)
-def usm_type(request):
-    return request.param
-
-
-@pytest.fixture(params=_typestrs_list)
-def src_typestr(request):
-    return request.param
-
-
-@pytest.fixture(params=_typestrs_list)
-def dst_typestr(request):
-    return request.param
-
-
-def _random_vector(n, src_dt):
-    src_dt = np.dtype(src_dt)
-    if np.issubdtype(src_dt, np.integer):
-        Xnp = np.random.randint(0, 2, size=n).astype(src_dt)
-    elif np.issubdtype(src_dt, np.floating):
-        Xnp = np.random.randn(n).astype(src_dt)
-    elif np.issubdtype(src_dt, np.complexfloating):
-        Xnp = np.random.randn(n) + 1j * np.random.randn(n)
-        Xnp = Xnp.astype(src_dt)
-    else:
-        Xnp = np.random.randint(0, 2, size=n).astype(src_dt)
-    return Xnp
-
-
-def _force_cast(Xnp, dst_dt):
-    if np.issubdtype(Xnp.dtype, np.complexfloating) and not np.issubdtype(
-        dst_dt, np.complexfloating
-    ):
-        R = Xnp.real.astype(dst_dt, casting="unsafe", copy=True)
-    else:
-        R = Xnp.astype(dst_dt, casting="unsafe", copy=True)
-    return R
-
-
-def are_close(X1, X2):
-    if np.issubdtype(X2.dtype, np.floating) or np.issubdtype(
-        X2.dtype, np.complexfloating
-    ):
-        return np.allclose(
-            X1, X2, atol=np.finfo(X2.dtype).eps, rtol=np.finfo(X2.dtype).eps
-        )
-    else:
-        return np.allclose(X1, X2)
-
-
-def test_copy1d_c_contig(src_typestr, dst_typestr):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-    if not q.sycl_device.has_aspect_fp64 and (
-        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
-    ):
-        pytest.skip("Device does not support double precision")
-    if not q.sycl_device.has_aspect_fp16 and (
-        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
-    ):
-        pytest.skip("Device does not support half precision")
-    src_dt = np.dtype(src_typestr)
-    dst_dt = np.dtype(dst_typestr)
-    Xnp = _random_vector(4096, src_dt)
-
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.empty(Xnp.shape, dtype=dst_typestr, sycl_queue=q)
-    hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
-    hev.wait()
-    Ynp = _force_cast(Xnp, dst_dt)
-    assert are_close(Ynp, dpt.asnumpy(Y))
-    # q.wait()
-
-
-def test_copy1d_strided(src_typestr, dst_typestr):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-    if not q.sycl_device.has_aspect_fp64 and (
-        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
-    ):
-        pytest.skip("Device does not support double precision")
-    if not q.sycl_device.has_aspect_fp16 and (
-        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
-    ):
-        pytest.skip("Device does not support half precision")
-    src_dt = np.dtype(src_typestr)
-    dst_dt = np.dtype(dst_typestr)
-    Xnp = _random_vector(4096, src_dt)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        X = dpt.asarray(Xnp, sycl_queue=q)[s]
-        Y = dpt.empty(X.shape, dtype=dst_typestr, sycl_queue=q)
-        hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=X, dst=Y, sycl_queue=q
-        )
-        hev.wait()
-        Ynp = _force_cast(Xnp[s], dst_dt)
-        assert are_close(Ynp, dpt.asnumpy(Y))
-
-    # now 0-strided source
-    X = dpt.usm_ndarray(
-        (4096,),
-        dtype=src_typestr,
-        strides=(0,),
-        buffer_ctor_kwargs={"queue": q},
-    )
-    X[0] = Xnp[0]
-    Y = dpt.empty(X.shape, dtype=dst_typestr, sycl_queue=q)
-    hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
-    Ynp = _force_cast(np.broadcast_to(Xnp[0], X.shape), dst_dt)
-    hev.wait()
-    assert are_close(Ynp, dpt.asnumpy(Y))
-
-
-def test_copy1d_strided2(src_typestr, dst_typestr):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-    if not q.sycl_device.has_aspect_fp64 and (
-        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
-    ):
-        pytest.skip("Device does not support double precision")
-    if not q.sycl_device.has_aspect_fp16 and (
-        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
-    ):
-        pytest.skip("Device does not support half precision")
-    src_dt = np.dtype(src_typestr)
-    dst_dt = np.dtype(dst_typestr)
-    Xnp = _random_vector(4096, src_dt)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        X = dpt.asarray(Xnp, sycl_queue=q)[s]
-        Y = dpt.empty(X.shape, dtype=dst_typestr, sycl_queue=q)[::-1]
-        hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=X, dst=Y, sycl_queue=q
-        )
-        Ynp = _force_cast(Xnp[s], dst_dt)
-        hev.wait()
-        assert are_close(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("sgn1", [-1, 1])
-@pytest.mark.parametrize("sgn2", [-1, 1])
-@pytest.mark.parametrize("st1", [5, 3, 1])
-@pytest.mark.parametrize("st2", [1, 2])
-def test_copy2d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-    if not q.sycl_device.has_aspect_fp64 and (
-        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
-    ):
-        pytest.skip("Device does not support double precision")
-    if not q.sycl_device.has_aspect_fp16 and (
-        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
-    ):
-        pytest.skip("Device does not support half precision")
-
-    src_dt = np.dtype(src_typestr)
-    dst_dt = np.dtype(dst_typestr)
-    n1, n2 = 15, 12
-    Snp = _random_vector(st1 * st2 * n1 * n2, src_dt).reshape(
-        (st1 * n1, st2 * n2)
-    )
-    Xnp = Snp[
-        slice(None, None, st1 * sgn1),
-        slice(None, None, st2 * sgn2),
-    ]
-    S = dpt.asarray(Snp, sycl_queue=q)
-    X = S[
-        slice(None, None, st1 * sgn1),
-        slice(None, None, st2 * sgn2),
-    ]
-    Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
-    hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
-    Ynp = _force_cast(Xnp, dst_dt)
-    hev.wait()
-    assert are_close(Ynp, dpt.asnumpy(Y))
-    Yst = dpt.empty((2 * n1, n2), dtype=dst_dt, device=X.device)[::2, ::-1]
-    hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=X, dst=Yst, sycl_queue=q
-    )
-    Y = dpt.empty((n1, n2), dtype=dst_dt, device=X.device)
-    hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=Yst, dst=Y, sycl_queue=q, depends=[ev]
-    )
-    Ynp = _force_cast(Xnp, dst_dt)
-    hev2.wait()
-    hev.wait()
-    assert are_close(Ynp, dpt.asnumpy(Y))
-    # q.wait()
-
-
-@pytest.mark.parametrize("sgn1", [-1, 1])
-@pytest.mark.parametrize("sgn2", [-1, 1])
-@pytest.mark.parametrize("sgn3", [-1, 1])
-@pytest.mark.parametrize("st1", [3, 1])
-@pytest.mark.parametrize("st2", [1, 2])
-@pytest.mark.parametrize("st3", [3, 2])
-def test_copy3d(src_typestr, dst_typestr, st1, sgn1, st2, sgn2, st3, sgn3):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-
-    if not q.sycl_device.has_aspect_fp64 and (
-        _typestr_has_fp64(src_typestr) or _typestr_has_fp64(dst_typestr)
-    ):
-        pytest.skip("Device does not support double precision")
-    if not q.sycl_device.has_aspect_fp16 and (
-        _typestr_has_fp16(src_typestr) or _typestr_has_fp16(dst_typestr)
-    ):
-        pytest.skip("Device does not support half precision")
-    src_dt = np.dtype(src_typestr)
-    dst_dt = np.dtype(dst_typestr)
-    n1, n2, n3 = 5, 4, 6
-    Snp = _random_vector(st1 * st2 * st3 * n1 * n2 * n3, src_dt).reshape(
-        (st1 * n1, st2 * n2, st3 * n3)
-    )
-    Xnp = Snp[
-        slice(None, None, st1 * sgn1),
-        slice(None, None, st2 * sgn2),
-        slice(None, None, st3 * sgn3),
-    ]
-    S = dpt.asarray(Snp, sycl_queue=q)
-    X = S[
-        slice(None, None, st1 * sgn1),
-        slice(None, None, st2 * sgn2),
-        slice(None, None, st3 * sgn3),
-    ]
-    Y = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
-    hev, ev = ti._copy_usm_ndarray_into_usm_ndarray(src=X, dst=Y, sycl_queue=q)
-    Ynp = _force_cast(Xnp, dst_dt)
-    hev.wait()
-    assert are_close(Ynp, dpt.asnumpy(Y)), "1"
-    Yst = dpt.empty((2 * n1, n2, n3), dtype=dst_dt, device=X.device)[::2, ::-1]
-    hev2, ev2 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=X, dst=Yst, sycl_queue=q
-    )
-    Y2 = dpt.empty((n1, n2, n3), dtype=dst_dt, device=X.device)
-    hev3, ev3 = ti._copy_usm_ndarray_into_usm_ndarray(
-        src=Yst, dst=Y2, sycl_queue=q, depends=[ev2]
-    )
-    hev3.wait()
-    hev2.wait()
-    assert are_close(Ynp, dpt.asnumpy(Y2)), "2"
-    # q.wait()
diff --git a/dpctl/tensor/libtensor/tests/test_main.cpp b/dpctl/tensor/libtensor/tests/test_main.cpp
deleted file mode 100644
index 0423aafa2c..0000000000
--- a/dpctl/tensor/libtensor/tests/test_main.cpp
+++ /dev/null
@@ -1,33 +0,0 @@
-//===-------------------- test_main.cpp - Common test runner               ===//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// A common test runner for all tests in dpctl C API.
-///
-//===----------------------------------------------------------------------===//
-
-#include <gtest/gtest.h>
-
-int main(int argc, char **argv)
-{
-    ::testing::InitGoogleTest(&argc, argv);
-    int ret = RUN_ALL_TESTS();
-    return ret;
-}
diff --git a/dpctl/tests/elementwise/__init__.py b/dpctl/tests/elementwise/__init__.py
deleted file mode 100644
index aca5068b6f..0000000000
--- a/dpctl/tests/elementwise/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-"""
-Collection of test and utility files for testing elementwise operations
-over :class:`dpctl.tensor.usm_ndarray`.
-"""
diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
deleted file mode 100644
index 17daf2c1ba..0000000000
--- a/dpctl/tests/elementwise/test_abs.py
+++ /dev/null
@@ -1,204 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-import warnings
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _complex_fp_dtypes, _real_fp_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_abs_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
-    if np.issubdtype(arg_dt, np.complexfloating):
-        type_map = {
-            np.dtype("c8"): np.dtype("f4"),
-            np.dtype("c16"): np.dtype("f8"),
-        }
-        assert dpt.abs(X).dtype == type_map[arg_dt]
-
-        r = dpt.empty_like(X, dtype=type_map[arg_dt])
-        dpt.abs(X, out=r)
-        assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X)))
-    else:
-        assert dpt.abs(X).dtype == arg_dt
-
-        r = dpt.empty_like(X, dtype=arg_dt)
-        dpt.abs(X, out=r)
-        assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.abs(X)))
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_abs_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("i4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    Y = dpt.abs(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = dpt.asnumpy(X)
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-def test_abs_types_property():
-    get_queue_or_skip()
-    types = dpt.abs.types
-    assert isinstance(types, list)
-    assert len(types) > 0
-    assert types == dpt.abs.types_
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_abs_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    exp_dt = np.abs(np.ones(tuple(), dtype=arg_dt)).dtype
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.ones(U.shape, dtype=exp_dt)
-        expected_Y[..., 1::2] = 0
-        expected_Y = np.transpose(expected_Y, perms)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.abs(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_abs_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    Xnp = np.random.standard_normal(
-        size=input_shape
-    ) + 1j * np.random.standard_normal(size=input_shape)
-    Xnp = Xnp.astype(arg_dt)
-    X[...] = Xnp
-
-    for ord in ["C", "F", "A", "K"]:
-        for perms in itertools.permutations(range(4)):
-            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-            Y = dpt.abs(U, order=ord)
-            expected_Y = np.abs(np.transpose(Xnp[:, ::-1, ::-1, :], perms))
-            tol = dpt.finfo(Y.dtype).resolution
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
-
-
-def test_abs_out_overlap():
-    get_queue_or_skip()
-
-    X = dpt.arange(-3, 3, 1, dtype="i4")
-    expected = dpt.asarray([3, 2, 1, 0, 1, 2], dtype="i4")
-    Y = dpt.abs(X, out=X)
-
-    assert Y is X
-    assert dpt.all(expected == X)
-
-    X = dpt.arange(-3, 3, 1, dtype="i4")
-    expected = expected[::-1]
-    Y = dpt.abs(X, out=X[::-1])
-    assert Y is not X
-    assert dpt.all(expected == X)
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_abs_real_fp_special_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    nans_ = [dpt.nan, -dpt.nan]
-    infs_ = [dpt.inf, -dpt.inf]
-    finites_ = [-1.0, -0.0, 0.0, 1.0]
-    inps_ = nans_ + infs_ + finites_
-
-    x = dpt.asarray(inps_, dtype=dtype)
-    r = dpt.abs(x)
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        expected_np = np.abs(np.asarray(inps_, dtype=dtype))
-
-    expected = dpt.asarray(expected_np, dtype=dtype)
-    tol = dpt.finfo(r.dtype).resolution
-
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
-
-
-@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
-def test_abs_complex_fp_special_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    nans_ = [dpt.nan, -dpt.nan]
-    infs_ = [dpt.inf, -dpt.inf]
-    finites_ = [-1.0, -0.0, 0.0, 1.0]
-    inps_ = nans_ + infs_ + finites_
-    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
-
-    z = dpt.asarray(c_, dtype=dtype)
-    r = dpt.abs(z)
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        expected_np = np.abs(np.asarray(c_, dtype=dtype))
-
-    expected = dpt.asarray(expected_np, dtype=dtype)
-    tol = dpt.finfo(r.dtype).resolution
-
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_abs_alignment(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.ones(512, dtype=dtype)
-    r = dpt.abs(x)
-
-    r2 = dpt.abs(x[1:])
-    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
-
-    dpt.abs(x[:-1], out=r[1:])
-    assert np.allclose(dpt.asnumpy(r[1:]), dpt.asnumpy(r2))
diff --git a/dpctl/tests/elementwise/test_add.py b/dpctl/tests/elementwise/test_add.py
deleted file mode 100644
index 44d91d34cc..0000000000
--- a/dpctl/tests/elementwise/test_add.py
+++ /dev/null
@@ -1,574 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-import re
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_add_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.add(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.add(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(ar1, dtype=r.dtype)
-    dpt.add(ar1, ar2, out=r2)
-    assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all()
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.add(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.add(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, 2, dtype=r.dtype)).all()
-
-    r2 = dpt.empty_like(ar1, dtype=r.dtype)
-    dpt.add(ar3[::-1], ar4[::2], out=r2)
-    assert (dpt.asnumpy(r2) == np.full(r2.shape, 2, dtype=r2.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_add_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.add(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_add_order():
-    get_queue_or_skip()
-
-    test_shape = (
-        20,
-        20,
-    )
-    test_shape2 = tuple(2 * dim for dim in test_shape)
-    n = test_shape[-1]
-
-    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
-        r1 = dpt.add(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.add(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.add(ar1, ar2, order="A")
-        assert r3.flags.c_contiguous
-        r4 = dpt.add(ar1, ar2, order="K")
-        assert r4.flags.c_contiguous
-
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
-        r1 = dpt.add(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.add(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.add(ar1, ar2, order="A")
-        assert r3.flags.f_contiguous
-        r4 = dpt.add(ar1, ar2, order="K")
-        assert r4.flags.f_contiguous
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
-        r4 = dpt.add(ar1, ar2, order="K")
-        assert r4.strides == (n, -1)
-        r5 = dpt.add(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
-        r4 = dpt.add(ar1, ar2, order="K")
-        assert r4.strides == (-1, n)
-        r5 = dpt.add(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-
-def test_add_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    r = dpt.add(m, v)
-    assert (dpt.asnumpy(r) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-    r2 = dpt.add(v, m)
-    assert (dpt.asnumpy(r2) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-    r3 = dpt.empty_like(m)
-    dpt.add(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-    r4 = dpt.empty_like(m)
-    dpt.add(v, m, out=r4)
-    assert (dpt.asnumpy(r4) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-
-def test_add_broadcasting_new_shape():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((6, 1), dtype="i4")
-    ar2 = dpt.arange(6, dtype="i4")
-
-    r = dpt.add(ar1, ar2)
-    assert (dpt.asnumpy(r) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all()
-
-    r1 = dpt.add(ar2, ar1)
-    assert (dpt.asnumpy(r1) == np.arange(1, 7, dtype="i4")[np.newaxis, :]).all()
-
-    r2 = dpt.add(ar1[::2], ar2[::2])
-    assert (
-        dpt.asnumpy(r2) == np.arange(1, 7, dtype="i4")[::2][np.newaxis, :]
-    ).all()
-
-    r3 = dpt.empty_like(ar1)
-    with pytest.raises(ValueError):
-        dpt.add(ar1, ar2, out=r3)
-
-    ar3 = dpt.ones((6, 1), dtype="i4")
-    ar4 = dpt.ones((1, 6), dtype="i4")
-
-    r4 = dpt.add(ar3, ar4)
-    assert (dpt.asnumpy(r4) == np.full((6, 6), 2, dtype="i4")).all()
-
-    r5 = dpt.add(ar4, ar3)
-    assert (dpt.asnumpy(r5) == np.full((6, 6), 2, dtype="i4")).all()
-
-    r6 = dpt.add(ar3[::2], ar4[:, ::2])
-    assert (dpt.asnumpy(r6) == np.full((3, 3), 2, dtype="i4")).all()
-
-    r7 = dpt.add(ar3[::2], ar4)
-    assert (dpt.asnumpy(r7) == np.full((3, 6), 2, dtype="i4")).all()
-
-
-def test_add_broadcasting_error():
-    get_queue_or_skip()
-    m = dpt.ones((10, 10), dtype="i4")
-    v = dpt.ones((3,), dtype="i4")
-    with pytest.raises(ValueError):
-        dpt.add(m, v)
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_add_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        R = dpt.add(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.add(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_add_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.add(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_add_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.add(a, c)
-
-
-def test_add_types_property():
-    get_queue_or_skip()
-    types = dpt.add.types
-    assert isinstance(types, list)
-    assert len(types) > 0
-    assert types == dpt.add.types_
-
-
-def test_add_errors():
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
-    ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
-    y = dpt.empty_like(ar1, sycl_queue=cpu_queue)
-    with pytest.raises(ExecutionPlacementError) as excinfo:
-        dpt.add(ar1, ar2, out=y)
-    assert "Input and output allocation queues are not compatible" in str(
-        excinfo.value
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    y = dpt.empty(3)
-    with pytest.raises(ValueError) as excinfo:
-        dpt.add(ar1, ar2, out=y)
-    assert "The shape of input and output arrays are inconsistent" in str(
-        excinfo.value
-    )
-
-    ar1 = np.ones(2, dtype="float32")
-    ar2 = np.ones_like(ar1, dtype="int32")
-    with pytest.raises(ExecutionPlacementError) as excinfo:
-        dpt.add(ar1, ar2)
-    assert re.match(
-        "Execution placement can not be unambiguously inferred.*",
-        str(excinfo.value),
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="int32")
-    y = np.empty(ar1.shape, dtype=ar1.dtype)
-    with pytest.raises(TypeError) as excinfo:
-        dpt.add(ar1, ar2, out=y)
-    assert "output array must be of usm_ndarray type" in str(excinfo.value)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_add_dtype_error(
-    dtype,
-):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    ar1 = dpt.ones(5, dtype=dtype)
-    ar2 = dpt.ones_like(ar1, dtype="f4")
-
-    y = dpt.zeros_like(ar1, dtype="int8")
-    with pytest.raises(ValueError) as excinfo:
-        dpt.add(ar1, ar2, out=y)
-    assert re.match("Output array of type.*is needed", str(excinfo.value))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_add_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X += int(0)
-    elif dt_kind == "f":
-        X += float(0)
-    elif dt_kind == "c":
-        X += complex(0)
-    elif dt_kind == "b":
-        X += bool(0)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_add_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    # operators use a different Python implementation which permits
-    # same kind style casting
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 += ar2
-        assert (
-            dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
-        ).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
-        ar3 += ar4
-        assert (
-            dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
-        ).all()
-    else:
-        with pytest.raises(ValueError):
-            ar1 += ar2
-
-    # here, test the special case where out is the first argument
-    # so an in-place kernel is used for efficiency
-    # this covers a specific branch in the BinaryElementwiseFunc logic
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
-        dpt.add(ar1, ar2, out=ar1)
-        assert (
-            dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype=ar1.dtype)
-        ).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
-        dpt.add(ar3, ar4, out=ar3)
-        assert (
-            dpt.asnumpy(ar3) == np.full(ar3.shape, 2, dtype=ar3.dtype)
-        ).all()
-    else:
-        with pytest.raises(ValueError):
-            dpt.add(ar1, ar2, out=ar1)
-
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.add(ar1, ar2, out=ar2)
-        assert (
-            dpt.asnumpy(ar2) == np.full(ar2.shape, 2, dtype=ar2.dtype)
-        ).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)[::2]
-        dpt.add(ar3, ar4, out=ar4)
-        assert (
-            dpt.asnumpy(ar4) == np.full(ar4.shape, 2, dtype=ar4.dtype)
-        ).all()
-    else:
-        with pytest.raises(ValueError):
-            dpt.add(ar1, ar2, out=ar2)
-
-
-def test_add_inplace_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    dpt.add(m, v, out=m)
-    assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-    # check case where second arg is out
-    dpt.add(v, m, out=m)
-    assert (
-        dpt.asnumpy(m) == np.arange(10, dtype="i4")[np.newaxis, 1:10:2]
-    ).all()
-
-
-def test_add_inplace_operator_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    m += v
-    assert (dpt.asnumpy(m) == np.arange(1, 6, dtype="i4")[np.newaxis, :]).all()
-
-
-def test_add_inplace_operator_mutual_broadcast():
-    get_queue_or_skip()
-
-    x1 = dpt.ones((1, 10), dtype="i4")
-    x2 = dpt.ones((10, 1), dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.add._inplace_op(x1, x2)
-
-
-def test_add_inplace_errors():
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
-    ar2 = dpt.ones_like(ar1, sycl_queue=cpu_queue)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.add(ar1, ar2, out=ar1)
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones(3, dtype="float32")
-    with pytest.raises(ValueError):
-        dpt.add(ar1, ar2, out=ar1)
-
-    ar1 = np.ones(2, dtype="float32")
-    ar2 = dpt.ones(2, dtype="float32")
-    with pytest.raises(TypeError):
-        dpt.add(ar1, ar2, out=ar1)
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dict()
-    with pytest.raises(ValueError):
-        dpt.add(ar1, ar2, out=ar1)
-
-    ar1 = dpt.ones((2, 1), dtype="float32")
-    ar2 = dpt.ones((1, 2), dtype="float32")
-    with pytest.raises(ValueError):
-        dpt.add(ar1, ar2, out=ar1)
-
-
-def test_add_inplace_operator_errors():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="i4", sycl_queue=q1)
-    with pytest.raises(TypeError):
-        dpt.add._inplace_op(dict(), x)
-
-    x.flags["W"] = False
-    with pytest.raises(ValueError):
-        dpt.add._inplace_op(x, 2)
-
-    x_q1 = dpt.ones(10, dtype="i4", sycl_queue=q1)
-    x_q2 = dpt.ones(10, dtype="i4", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.add._inplace_op(x_q1, x_q2)
-
-
-def test_add_inplace_same_tensors():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones(10, dtype="i4")
-    ar1 += ar1
-    assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all()
-
-    ar1 = dpt.ones(10, dtype="i4")
-    ar2 = dpt.ones(10, dtype="i4")
-    dpt.add(ar1, ar2, out=ar1)
-    # all ar1 vals should be 2
-    assert (dpt.asnumpy(ar1) == np.full(ar1.shape, 2, dtype="i4")).all()
-
-    dpt.add(ar2, ar1, out=ar2)
-    # all ar2 vals should be 3
-    assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 3, dtype="i4")).all()
-
-    dpt.add(ar1, ar2, out=ar2)
-    # all ar2 vals should be 5
-    assert (dpt.asnumpy(ar2) == np.full(ar2.shape, 5, dtype="i4")).all()
-
-
-def test_add_str_repr():
-    add_s = str(dpt.add)
-    assert isinstance(add_s, str)
-    assert "add" in add_s
-
-    add_r = repr(dpt.add)
-    assert isinstance(add_r, str)
-    assert "add" in add_r
-
-
-def test_add_cfd():
-    q1 = get_queue_or_skip()
-    q2 = dpctl.SyclQueue(q1.sycl_device)
-
-    x1 = dpt.ones(10, sycl_queue=q1)
-    x2 = dpt.ones(10, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.add(x1, x2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.add(x1, x1, out=x2)
-
-
-def test_add_out_type_check():
-    get_queue_or_skip()
-
-    x1 = dpt.ones(10)
-    x2 = dpt.ones(10)
-
-    out = range(10)
-
-    with pytest.raises(TypeError):
-        dpt.add(x1, x2, out=out)
-
-
-def test_add_out_need_temporary():
-    get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="u4")
-
-    dpt.add(x[:6], 1, out=x[-6:])
-
-    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
diff --git a/dpctl/tests/elementwise/test_angle.py b/dpctl/tests/elementwise/test_angle.py
deleted file mode 100644
index 9a6fcc5c68..0000000000
--- a/dpctl/tests/elementwise/test_angle.py
+++ /dev/null
@@ -1,92 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _complex_fp_dtypes, _no_complex_dtypes
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_angle_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    dt = dpt.dtype(dtype)
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(dt, dpt.complex64, _fp16, _fp64):
-        assert dpt.angle(x).dtype == dpt.float32
-    else:
-        assert dpt.angle(x).dtype == dpt.float64
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
-def test_angle_real(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
-    r = dpt.angle(x)
-
-    assert dpt.all(r == 0)
-
-
-@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
-def test_angle_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    vals = dpt.pi * dpt.arange(10, dtype=dpt.finfo(dtype).dtype, sycl_queue=q)
-
-    x = dpt.zeros(10, dtype=dtype, sycl_queue=q)
-
-    x.imag[...] = vals
-    r = dpt.angle(x)
-    expected = dpt.atan2(x.imag, x.real)
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
-
-    x.real[...] += dpt.pi
-    r = dpt.angle(x)
-    expected = dpt.atan2(x.imag, x.real)
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_angle_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    vals = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0]
-    vals = [complex(*val) for val in itertools.product(vals, repeat=2)]
-
-    x = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
-
-    r = dpt.angle(x)
-    expected = dpt.atan2(x.imag, x.real)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_atan2.py b/dpctl/tests/elementwise/test_atan2.py
deleted file mode 100644
index 197c7b6189..0000000000
--- a/dpctl/tests/elementwise/test_atan2.py
+++ /dev/null
@@ -1,506 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_atan2_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.atan2(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.arctan2(
-        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-
-    tol = 8 * max(
-        dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution
-    )
-    assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol)
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.atan2(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.arctan2(
-        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-
-    tol = 8 * max(
-        dpt.finfo(r.dtype).resolution, dpt.finfo(expected.dtype).resolution
-    )
-    assert_allclose(dpt.asnumpy(r), expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
-def test_atan2_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.atan2(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.atan2(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_one_nan(dt):
-    """If either x1_i or x2_i is NaN, the result is NaN."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([dpt.nan, dpt.nan, 1], dtype=dt)
-    x2 = dpt.asarray([dpt.nan, 1, dpt.nan], dtype=dt)
-
-    y = dpt.atan2(x1, x2)
-    assert dpt.all(dpt.isnan(y))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_positive_and_pzero(dt):
-    """If x1_i is greater than 0 and x2_i is +0, the result
-    is an approximation to +pi/2.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
-    x2 = dpt.asarray([+0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_positive_and_nzero(dt):
-    """If x1_i is greater than 0 and x2_i is -0, the result
-    is an approximation to +pi/2.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
-    x2 = dpt.asarray([-0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pzero_and_positive(dt):
-    """If x1_i is +0 and x2_i is greater than 0,
-    the result is +0.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(+0.0, dtype=dt)
-    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(+0.0, dtype=dt)
-
-    assert dpt.all(dpt.equal(actual, expected))
-    assert not dpt.any(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pzero_and_pzero(dt):
-    """If x1_i is +0 and x2_i is +0, the result is +0."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(+0.0, dtype=dt)
-    x2 = dpt.asarray([+0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(+0.0, dtype=dt)
-
-    assert dpt.all(dpt.equal(actual, expected))
-    assert not dpt.any(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pzero_and_nzero(dt):
-    """
-    If x1_i is +0 and x2_i is -0, the result is an
-    approximation to +pi.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(+0.0, dtype=dt)
-    x2 = dpt.asarray([-0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pzero_and_negatvie(dt):
-    """
-    If x1_i is +0 and x2_i is less than 0, the result
-    is an approximation to +pi.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(+0.0, dtype=dt)
-    x2 = dpt.asarray([-0.5, -1, -2, -dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nzero_and_positive(dt):
-    """If x1_i is -0 and x2_i is greater than 0,
-    the result is -0.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(-0.0, dtype=dt)
-    x2 = dpt.asarray([0.5, 1, 2, dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-0.0, dtype=dt)
-
-    assert dpt.all(dpt.equal(actual, expected))
-    assert dpt.all(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nzero_and_pzero(dt):
-    """If x1_i is -0 and x2_i is +0, the result is -0."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(-0.0, dtype=dt)
-    x2 = dpt.asarray([+0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-0.0, dtype=dt)
-
-    assert dpt.all(dpt.equal(actual, expected))
-    assert dpt.all(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nzero_and_nzero(dt):
-    """If x1_i is -0 and x2_i is -0, the result is
-    an approximation to -pi.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-0.0], dtype=dt)
-    x2 = dpt.asarray([-0.0], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nzero_and_negative(dt):
-    """If x1_i is -0 and x2_i is less than 0, the result
-    is an approximation to -pi.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-0.0], dtype=dt)
-    x2 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_negative_and_pzero(dt):
-    """If x1_i is less than 0 and x2_i is +0, the result
-    is an approximation to -pi/2.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
-    x2 = dpt.asarray(+0.0, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_negative_and_nzero(dt):
-    """If x1_i is less than 0 and x2_i is -0, the result
-    is an approximation to -pi/2."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-dpt.inf, -2, -1, -0.5], dtype=dt)
-    x2 = dpt.asarray(-0.0, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pfinite_and_pinf(dt):
-    """If x1_i is greater than 0, x1_i is a finite number,
-    and x2_i is +infinity, the result is +0."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt)
-    x2 = dpt.asarray(dpt.inf, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(+0.0, dtype=dt)
-    assert dpt.all(dpt.equal(actual, expected))
-    assert not dpt.any(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pfinite_and_ninf(dt):
-    """If x1_i is greater than 0, x1_i is a finite number,
-    and x2_i is -infinity, the result is an approximation
-    to +pi."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([0.5, 1, 2, 5], dtype=dt)
-    x2 = dpt.asarray(-dpt.inf, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nfinite_and_pinf(dt):
-    """If x1_i is less than 0, x1_i is a finite number,
-    and x2_i is +infinity, the result is -0."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt)
-    x2 = dpt.asarray(dpt.inf, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-0.0, dtype=dt)
-    assert dpt.all(dpt.equal(actual, expected))
-    assert dpt.all(dpt.signbit(actual))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_nfinite_and_ninf(dt):
-    """If x1_i is less than 0, x1_i is a finite number, and
-    x2_i is -infinity, the result is an approximation
-    to -pi."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-0.5, -1, -2, -5], dtype=dt)
-    x2 = dpt.asarray(-dpt.inf, dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pinf_and_finite(dt):
-    """If x1_i is +infinity and x2_i is a finite number,
-    the result is an approximation to +pi/2.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(dpt.inf, dtype=dt)
-    x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_ninf_and_finite(dt):
-    """If x1_i is -infinity and x2_i is a finite number,
-    the result is an approximation to -pi/2.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(-dpt.inf, dtype=dt)
-    x2 = dpt.asarray([-2, -0.0, 0.0, 2], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi / 2, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pinf_and_pinf(dt):
-    """If x1_i is +infinity and x2_i is +infinity,
-    the result is an approximation to +pi/4.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(dpt.inf, dtype=dt)
-    x2 = dpt.asarray([dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(dpt.pi / 4, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_pinf_and_ninf(dt):
-    """If x1_i is +infinity and x2_i is -infinity,
-    the result is an approximation to +3*pi/4.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(dpt.inf, dtype=dt)
-    x2 = dpt.asarray([-dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(3 * dpt.pi / 4, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_ninf_and_pinf(dt):
-    """If x1_i is -infinity and x2_i is +infinity,
-    the result is an approximation to -pi/4.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(-dpt.inf, dtype=dt)
-    x2 = dpt.asarray([dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-dpt.pi / 4, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_atan2_special_case_ninf_and_ninf(dt):
-    """If x1_i is -infinity and x2_i is -infinity,
-    the result is an approximation to -3*pi/4.
-    """
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray(-dpt.inf, dtype=dt)
-    x2 = dpt.asarray([-dpt.inf], dtype=dt)
-
-    actual = dpt.atan2(x1, x2)
-    expected = dpt.asarray(-3 * dpt.pi / 4, dtype=dt)
-
-    diff = dpt.abs(dpt.subtract(actual, expected))
-    atol = 8 * dpt.finfo(diff.dtype).eps
-    assert dpt.all(dpt.less_equal(diff, atol))
diff --git a/dpctl/tests/elementwise/test_bitwise_and.py b/dpctl/tests/elementwise/test_bitwise_and.py
deleted file mode 100644
index 292e09e0ea..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_and.py
+++ /dev/null
@@ -1,127 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _integral_dtypes
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_and_dtype_matrix_contig(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 7
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
-
-    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
-
-    r = dpt.bitwise_and(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
-    r_np = np.bitwise_and(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_and_dtype_matrix_strided(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 11
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
-
-    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
-
-    r = dpt.bitwise_and(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
-    r_np = np.bitwise_and(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-def test_bitwise_and_bool():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([True, False])
-    x2 = dpt.asarray([False, True])
-
-    r_bw = dpt.bitwise_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
-    r_lo = dpt.logical_and(x1[:, dpt.newaxis], x2[dpt.newaxis])
-
-    assert dpt.all(dpt.equal(r_bw, r_lo))
-
-
-@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
-def test_bitwise_and_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind == "b":
-        X &= False
-    else:
-        X &= int(0)
-
-
-@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
-def test_bitwise_and_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 &= ar2
-        assert dpt.all(ar1 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 &= ar4
-        assert dpt.all(ar3 == 1)
-    else:
-        with pytest.raises(ValueError):
-            ar1 &= ar2
diff --git a/dpctl/tests/elementwise/test_bitwise_invert.py b/dpctl/tests/elementwise/test_bitwise_invert.py
deleted file mode 100644
index 91202a675d..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_invert.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _integral_dtypes, _usm_types
-
-
-@pytest.mark.parametrize(
-    "op_dtype",
-    [
-        "b1",
-    ]
-    + _integral_dtypes,
-)
-def test_bitwise_invert_dtype_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 7
-    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op_dtype)
-
-    r = dpt.bitwise_invert(ar1)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == ar1.dtype
-
-    expected = np.bitwise_not(dpt.asnumpy(ar1))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.bitwise_invert(ar1, out=r2)
-    assert dpt.all(dpt.equal(r, r2))
-
-    ar2 = dpt.zeros(sz, dtype=op_dtype)
-    r = dpt.bitwise_invert(ar2[::-1])
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.bitwise_not(np.zeros(ar2.shape, dtype=op_dtype))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar2.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    ar3 = dpt.ones(sz, dtype=op_dtype)
-    r2 = dpt.bitwise_invert(ar3[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.bitwise_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert (dpt.asnumpy(r2) == expected).all()
-
-    r3 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.bitwise_invert(ar2[::-1], out=r3)
-    assert dpt.all(dpt.equal(r, r3))
-
-
-@pytest.mark.parametrize("op_usm_type", _usm_types)
-def test_bitwise_invert_usm_type_matrix(op_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type
-    )
-
-    r = dpt.bitwise_invert(ar1)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.usm_type == op_usm_type
-
-
-def test_bitwise_invert_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.bitwise_invert(ar1, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.bitwise_invert(ar1, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.bitwise_invert(ar1, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.bitwise_invert(ar1, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.zeros((20, 20), dtype="i4", order="F")
-    r1 = dpt.bitwise_invert(ar1, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.bitwise_invert(ar1, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.bitwise_invert(ar1, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.bitwise_invert(ar1, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.bitwise_invert(ar1, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.bitwise_invert(ar1, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_bitwise_invert_large_boolean():
-    get_queue_or_skip()
-
-    x = dpt.tril(dpt.ones((32, 32), dtype="?"), k=-1)
-    res = dpt.astype(dpt.bitwise_invert(x), "i4")
-
-    assert dpt.all(res >= 0)
-    assert dpt.all(res <= 1)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
deleted file mode 100644
index 24112a786e..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _integral_dtypes
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_left_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
-        return
-
-    sz = 7
-    n = 2 * sz
-    dt1 = dpt.dtype(op1_dtype)
-    dt2 = dpt.dtype(op2_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
-    x2 = dpt.arange(0, n, dtype=dt2)
-
-    r = dpt.bitwise_left_shift(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.sycl_queue == x1.sycl_queue
-    assert r.sycl_queue == x2.sycl_queue
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
-    x2_np = np.arange(0, n, dtype=op2_dtype)
-    r_np = np.left_shift(x1_np, x2_np)
-
-    assert r.dtype == r_np.dtype
-    assert (dpt.asnumpy(r) == r_np).all()
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_left_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
-        return
-
-    sz = 11
-    n = 2 * sz
-    dt1 = dpt.dtype(op1_dtype)
-    dt2 = dpt.dtype(op2_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
-    x2 = dpt.arange(0, n, dtype=dt2)[::2]
-
-    r = dpt.bitwise_left_shift(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.sycl_queue == x1.sycl_queue
-    assert r.sycl_queue == x2.sycl_queue
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
-    x2_np = np.arange(0, n, dtype=dt2)[::2]
-    r_np = np.left_shift(x1_np, x2_np)
-
-    assert r.dtype == r_np.dtype
-    assert (dpt.asnumpy(r) == r_np).all()
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_left_shift_range(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    x = dpt.ones(255, dtype=op_dtype)
-    y = dpt.asarray(64, dtype=op_dtype)
-
-    z = dpt.bitwise_left_shift(x, y)
-    assert dpt.all(dpt.equal(z, 0))
-
-
-@pytest.mark.parametrize("dtype", _integral_dtypes)
-def test_bitwise_left_shift_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    X <<= int(0)
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 <<= ar2
-        assert dpt.all(ar1 == 2)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 <<= ar4
-        assert dpt.all(ar3 == 2)
-    else:
-        with pytest.raises(ValueError):
-            ar1 <<= ar2
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
deleted file mode 100644
index 70fdff0c42..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_or.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _integral_dtypes
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_or_dtype_matrix_contig(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 7
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
-
-    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
-
-    r = dpt.bitwise_or(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
-    r_np = np.bitwise_or(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_or_dtype_matrix_strided(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 11
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
-
-    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
-
-    r = dpt.bitwise_or(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
-    r_np = np.bitwise_or(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-def test_bitwise_or_bool():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([True, False])
-    x2 = dpt.asarray([False, True])
-
-    r_bw = dpt.bitwise_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
-    r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
-
-    assert dpt.all(dpt.equal(r_bw, r_lo))
-
-
-@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
-def test_bitwise_or_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind == "b":
-        X |= False
-    else:
-        X |= int(0)
-
-
-@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
-def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 |= ar2
-        assert dpt.all(ar1 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 |= ar4
-        assert dpt.all(ar3 == 1)
-    else:
-        with pytest.raises(ValueError):
-            ar1 |= ar2
-            dpt.bitwise_or(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.bitwise_or(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.bitwise_or(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 1)
-    else:
-        with pytest.raises(ValueError):
-            dpt.bitwise_or(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py
deleted file mode 100644
index 7a4e24817e..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_right_shift.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _integral_dtypes
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_right_shift_dtype_matrix_contig(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
-        return
-
-    sz = 7
-    n = 2 * sz
-    dt1 = dpt.dtype(op1_dtype)
-    dt2 = dpt.dtype(op2_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
-    x2 = dpt.arange(0, n, dtype=dt2)
-
-    r = dpt.bitwise_right_shift(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.sycl_queue == x1.sycl_queue
-    assert r.sycl_queue == x2.sycl_queue
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op1_dtype)
-    x2_np = np.arange(0, n, dtype=op2_dtype)
-    r_np = np.right_shift(x1_np, x2_np)
-
-    assert r.dtype == r_np.dtype
-    assert (dpt.asnumpy(r) == r_np).all()
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_right_shift_dtype_matrix_strided(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    if op1_dtype != op2_dtype and "u8" in [op1_dtype, op2_dtype]:
-        return
-
-    sz = 11
-    n = 2 * sz
-    dt1 = dpt.dtype(op1_dtype)
-    dt2 = dpt.dtype(op2_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
-    x2 = dpt.arange(0, n, dtype=dt2)[::2]
-
-    r = dpt.bitwise_right_shift(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.sycl_queue == x1.sycl_queue
-    assert r.sycl_queue == x2.sycl_queue
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::-2]
-    x2_np = np.arange(0, n, dtype=dt2)[::2]
-    r_np = np.right_shift(x1_np, x2_np)
-
-    assert r.dtype == r_np.dtype
-    assert (dpt.asnumpy(r) == r_np).all()
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_right_shift_range(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    x = dpt.ones(255, dtype=op_dtype)
-    y = dpt.asarray(64, dtype=op_dtype)
-
-    z = dpt.bitwise_right_shift(x, y)
-    assert dpt.all(dpt.equal(z, 0))
-
-
-@pytest.mark.parametrize("dtype", _integral_dtypes)
-def test_bitwise_right_shift_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    X >>= int(0)
-
-
-@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
-def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
-        ar1 >>= ar2
-        assert dpt.all(ar1 == 0)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 >>= ar4
-        assert dpt.all(ar3 == 0)
-    else:
-        with pytest.raises(ValueError):
-            ar1 >>= ar2
-            dpt.bitwise_right_shift(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 0)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 0)
-    else:
-        with pytest.raises(ValueError):
-            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
deleted file mode 100644
index 1f2b96e793..0000000000
--- a/dpctl/tests/elementwise/test_bitwise_xor.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless_equal required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _integral_dtypes
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_xor_dtype_matrix_contig(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 7
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)
-
-    x2_range_begin = -sz if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)
-
-    r = dpt.bitwise_xor(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)
-    r_np = np.bitwise_xor(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-@pytest.mark.parametrize("op_dtype", _integral_dtypes)
-def test_bitwise_xor_dtype_matrix_strided(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 11
-    n = 2 * sz
-    dt1 = dpt.dtype(op_dtype)
-    dt2 = dpt.dtype(op_dtype)
-
-    x1_range_begin = -sz if dpt.iinfo(dt1).min < 0 else 0
-    x1 = dpt.arange(x1_range_begin, x1_range_begin + n, dtype=dt1)[::2]
-
-    x2_range_begin = -(sz // 2) if dpt.iinfo(dt2).min < 0 else 0
-    x2 = dpt.arange(x2_range_begin, x2_range_begin + n, dtype=dt1)[::-2]
-
-    r = dpt.bitwise_xor(x1, x2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    x1_np = np.arange(x1_range_begin, x1_range_begin + n, dtype=op_dtype)[::2]
-    x2_np = np.arange(x2_range_begin, x2_range_begin + n, dtype=op_dtype)[::-2]
-    r_np = np.bitwise_xor(x1_np, x2_np)
-
-    assert (r_np == dpt.asnumpy(r)).all()
-
-
-def test_bitwise_xor_bool():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([True, False])
-    x2 = dpt.asarray([False, True])
-
-    r_bw = dpt.bitwise_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
-    r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
-
-    assert dpt.all(dpt.equal(r_bw, r_lo))
-
-
-@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
-def test_bitwise_xor_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind == "b":
-        X ^= False
-    else:
-        X ^= int(0)
-
-
-@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
-@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
-def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 ^= ar2
-        assert dpt.all(ar1 == 0)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 ^= ar4
-        assert dpt.all(ar3 == 0)
-    else:
-        with pytest.raises(ValueError):
-            ar1 ^= ar2
-            dpt.bitwise_xor(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
-        dpt.bitwise_xor(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 0)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.bitwise_xor(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 0)
-    else:
-        with pytest.raises(ValueError):
-            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py
deleted file mode 100644
index a77bebcc81..0000000000
--- a/dpctl/tests/elementwise/test_cbrt.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_cbrt_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.cbrt(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_cbrt_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.cbrt(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_cbrt_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2054
-
-    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.cbrt(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
-def test_cbrt_special_cases():
-    get_queue_or_skip()
-
-    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    res = dpt.cbrt(X)
-    expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    tol = dpt.finfo(dpt.float32).resolution
-
-    assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_complex.py b/dpctl/tests/elementwise/test_complex.py
deleted file mode 100644
index 62c8f12dae..0000000000
--- a/dpctl/tests/elementwise/test_complex.py
+++ /dev/null
@@ -1,221 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-import warnings
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_complex_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.real(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.real(X).dtype == expected_dtype
-
-    expected_dtype = np.imag(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.imag(X).dtype == expected_dtype
-
-    expected_dtype = np.conj(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.conj(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize(
-    "np_call, dpt_call",
-    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
-)
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_complex_output(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-
-    x1 = np.linspace(0, 10, num=n_seq, dtype=dtype)
-    x2 = np.linspace(0, 20, num=n_seq, dtype=dtype)
-    Xnp = x1 + 1j * x2
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Y = dpt_call(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np_call(Xnp), atol=tol, rtol=tol)
-
-    Z = dpt.empty_like(X, dtype=Y.dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(dpt.asnumpy(Z), np_call(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize(
-    "np_call, dpt_call",
-    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
-)
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_complex_usm_type(np_call, dpt_call, usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("c8")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3
-    X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6
-
-    Y = dpt_call(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np_call(np.complex64(np.pi / 6 + 1j * np.pi / 3))
-    expected_Y[..., 1::2] = np_call(np.complex64(np.pi / 3 + 1j * np.pi / 6))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize(
-    "np_call, dpt_call",
-    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
-)
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_complex_order(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = np.pi / 6 + 1j * np.pi / 3
-    X[..., 1::2] = np.pi / 3 + 1j * np.pi / 6
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np_call(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt_call(U, order=ord)
-            assert_allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_projection_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = [
-        complex(1, 2),
-        complex(dpt.inf, -1),
-        complex(0, -dpt.inf),
-        complex(-dpt.inf, dpt.nan),
-    ]
-    Y = [
-        complex(1, 2),
-        complex(np.inf, -0.0),
-        complex(np.inf, -0.0),
-        complex(np.inf, 0.0),
-    ]
-
-    Xf = dpt.asarray(X, dtype=dtype, sycl_queue=q)
-    Yf = np.array(Y, dtype=dtype)
-
-    tol = 8 * dpt.finfo(Xf.dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_projection(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    Xf = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    out_dtype = dpt.proj(Xf).dtype
-    Yf = np.array(complex(1, 0), dtype=out_dtype)
-
-    tol = 8 * dpt.finfo(Yf.dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.proj(Xf)), Yf, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize(
-    "np_call, dpt_call",
-    [(np.real, dpt.real), (np.imag, dpt.imag), (np.conj, dpt.conj)],
-)
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_complex_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    low = -1000.0
-    high = 1000.0
-    for ii in sizes:
-        x1 = np.random.uniform(low=low, high=high, size=ii)
-        x2 = np.random.uniform(low=low, high=high, size=ii)
-        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_complex_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, -np.nan, np.inf, -np.inf, +0.0, -0.0]
-    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
-
-    Xc_np = np.array(xc, dtype=dtype)
-    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    actual = dpt.real(Xc)
-    expected = np.real(Xc_np)
-    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
-
-    actual = dpt.imag(Xc)
-    expected = np.imag(Xc_np)
-    assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
-
-    actual = dpt.conj(Xc)
-    expected = np.conj(Xc_np)
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        assert_allclose(dpt.asnumpy(actual), expected, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py
deleted file mode 100644
index 9425cebb71..0000000000
--- a/dpctl/tests/elementwise/test_copysign.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
-def test_copysign_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.copysign(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.copysign(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.copysign(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.copysign(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _real_fp_dtypes)
-def test_copysign_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.copysign(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.copysign(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dt", _real_fp_dtypes)
-def test_copysign(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.arange(100, dtype=dt, sycl_queue=q)
-    x[1::2] *= -1
-    y = dpt.ones(100, dtype=dt, sycl_queue=q)
-    y[::2] *= -1
-    res = dpt.copysign(x, y)
-    expected = dpt.negative(x)
-    tol = dpt.finfo(dt).resolution
-    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
-
-
-def test_copysign_special_values():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4")
-    y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4")
-    res = dpt.copysign(x1, y1)
-    assert dpt.all(dpt.signbit(res))
-    x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4")
-    res = dpt.copysign(x2, y1)
-    assert dpt.all(dpt.signbit(res))
-    y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4")
-    res = dpt.copysign(x2, y2)
-    assert not dpt.any(dpt.signbit(res))
-    res = dpt.copysign(x1, y2)
-    assert not dpt.any(dpt.signbit(res))
diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py
deleted file mode 100644
index 2945b70cf0..0000000000
--- a/dpctl/tests/elementwise/test_divide.py
+++ /dev/null
@@ -1,298 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._tensor_elementwise_impl import _divide_by_scalar
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import SequentialOrderManager
-
-from .utils import (
-    _all_dtypes,
-    _compare_dtypes,
-    _complex_fp_dtypes,
-    _real_fp_dtypes,
-    _usm_types,
-)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_divide_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.divide(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.divide(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.divide(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.divide(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.divide(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_divide_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.divide(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.divide(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.divide(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.divide(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.divide(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.divide(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.divide(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.divide(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.divide(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.divide(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_divide_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.divide(m, v)
-
-    expected = np.divide(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.divide(v, m)
-    expected2 = np.divide(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_divide_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.divide(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.divide(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_divide_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.divide(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_divide_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.divide(a, c)
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes)
-def test_divide_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind == "f":
-        X /= float(1)
-    elif dt_kind == "c":
-        X /= complex(1)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    # out array only valid if it is inexact
-    if (
-        _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind")
-        and dpt.dtype(op1_dtype).kind in "fc"
-    ):
-        ar1 /= ar2
-        assert dpt.all(ar1 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 /= ar4
-        assert dpt.all(ar3 == 1)
-    else:
-        with pytest.raises(ValueError):
-            ar1 /= ar2
-            dpt.divide(ar1, ar2, out=ar1)
-
-    # out is second arg
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-    if (
-        _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64)
-        and dpt.dtype(op2_dtype).kind in "fc"
-    ):
-        dpt.divide(ar1, ar2, out=ar2)
-        assert dpt.all(ar2 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        dpt.divide(ar3, ar4, out=ar4)
-        dpt.all(ar4 == 1)
-    else:
-        with pytest.raises(ValueError):
-            dpt.divide(ar1, ar2, out=ar2)
-
-
-def test_divide_gh_1711():
-    "See https://github.com/IntelPython/dpctl/issues/1711"
-    get_queue_or_skip()
-
-    res = dpt.divide(-4, dpt.asarray(1, dtype="u4"))
-    assert isinstance(res, dpt.usm_ndarray)
-    assert res.dtype.kind == "f"
-    assert dpt.allclose(res, -4 / dpt.asarray(1, dtype="i4"))
-
-    res = dpt.divide(dpt.asarray(3, dtype="u4"), -2)
-    assert isinstance(res, dpt.usm_ndarray)
-    assert res.dtype.kind == "f"
-    assert dpt.allclose(res, dpt.asarray(3, dtype="i4") / -2)
-
-
-# don't test for overflowing double as Python won't cast
-# a Python integer of that size to a Python float
-@pytest.mark.parametrize("fp_dt", [dpt.float16, dpt.float32])
-def test_divide_by_scalar_overflow(fp_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(fp_dt, q)
-
-    x = dpt.ones(10, dtype=fp_dt, sycl_queue=q)
-    out = dpt.empty_like(x)
-
-    max_exp = np.finfo(fp_dt).maxexp
-    sca = 2**max_exp
-
-    _manager = SequentialOrderManager[q]
-    dep_evs = _manager.submitted_events
-    _, ev = _divide_by_scalar(
-        src=x, scalar=sca, dst=out, sycl_queue=q, depends=dep_evs
-    )
-    ev.wait()
-
-    assert dpt.all(out == 0)
diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py
deleted file mode 100644
index d5906c34e1..0000000000
--- a/dpctl/tests/elementwise/test_elementwise_classes.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip
-
-unary_fn = dpt.negative
-binary_fn = dpt.divide
-
-
-def test_unary_class_getters():
-    fn = unary_fn.get_implementation_function()
-    assert callable(fn)
-
-    fn = unary_fn.get_type_result_resolver_function()
-    assert callable(fn)
-
-
-def test_unary_class_types_property():
-    get_queue_or_skip()
-    loop_types = unary_fn.types
-    assert isinstance(loop_types, list)
-    assert len(loop_types) > 0
-    assert all(isinstance(sig, str) for sig in loop_types)
-    assert all("->" in sig for sig in loop_types)
-
-
-def test_unary_class_str_repr():
-    s = str(unary_fn)
-    r = repr(unary_fn)
-
-    assert isinstance(s, str)
-    assert isinstance(r, str)
-    kl_n = unary_fn.__name__
-    assert kl_n in s
-    assert kl_n in r
-
-
-def test_unary_read_only_out():
-    get_queue_or_skip()
-    x = dpt.arange(32, dtype=dpt.int32)
-    r = dpt.empty_like(x)
-    r.flags["W"] = False
-    with pytest.raises(ValueError):
-        unary_fn(x, out=r)
-
-
-def test_binary_class_getters():
-    fn = binary_fn.get_implementation_function()
-    assert callable(fn)
-
-    fn = binary_fn.get_implementation_inplace_function()
-    assert callable(fn)
-
-    fn = binary_fn.get_type_result_resolver_function()
-    assert callable(fn)
-
-    fn = binary_fn.get_type_promotion_path_acceptance_function()
-    assert callable(fn)
-
-
-def test_binary_class_types_property():
-    get_queue_or_skip()
-    loop_types = binary_fn.types
-    assert isinstance(loop_types, list)
-    assert len(loop_types) > 0
-    assert all(isinstance(sig, str) for sig in loop_types)
-    assert all("->" in sig for sig in loop_types)
-
-
-def test_binary_class_str_repr():
-    s = str(binary_fn)
-    r = repr(binary_fn)
-
-    assert isinstance(s, str)
-    assert isinstance(r, str)
-    kl_n = binary_fn.__name__
-    assert kl_n in s
-    assert kl_n in r
-
-
-def test_unary_class_nin():
-    nin = unary_fn.nin
-    assert isinstance(nin, int)
-    assert nin == 1
-
-
-def test_binary_class_nin():
-    nin = binary_fn.nin
-    assert isinstance(nin, int)
-    assert nin == 2
-
-
-def test_unary_class_nout():
-    nout = unary_fn.nout
-    assert isinstance(nout, int)
-    assert nout == 1
-
-
-def test_binary_class_nout():
-    nout = binary_fn.nout
-    assert isinstance(nout, int)
-    assert nout == 1
-
-
-def test_binary_read_only_out():
-    get_queue_or_skip()
-    x1 = dpt.ones(32, dtype=dpt.float32)
-    x2 = dpt.ones_like(x1)
-    r = dpt.empty_like(x1)
-    r.flags["W"] = False
-    with pytest.raises(ValueError):
-        binary_fn(x1, x2, out=r)
-
-
-def test_binary_no_inplace_op():
-    get_queue_or_skip()
-    x1 = dpt.ones(10, dtype="i4")
-    x2 = dpt.ones_like(x1)
-
-    with pytest.raises(ValueError):
-        dpt.logaddexp._inplace_op(x1, x2)
diff --git a/dpctl/tests/elementwise/test_equal.py b/dpctl/tests/elementwise/test_equal.py
deleted file mode 100644
index 6b80769f69..0000000000
--- a/dpctl/tests/elementwise/test_equal.py
+++ /dev/null
@@ -1,190 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_equal_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.equal(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.equal(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.equal(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, True, dtype=r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_equal_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.equal(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.equal(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.equal(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.equal(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.equal(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.equal(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_equal_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    r = dpt.equal(m, v)
-    expected = np.full((100, 5), [False, True, False, False, False], dtype="?")
-
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.equal(v, m)
-    assert (dpt.asnumpy(r2) == expected).all()
-
-    r3 = dpt.empty_like(m, dtype="?")
-    dpt.equal(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == expected).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_equal_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        R = dpt.equal(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        assert dpt.all(R)
-        R = dpt.equal(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-        assert dpt.all(R)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_equal_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.equal(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_equal_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.equal(a, c)
diff --git a/dpctl/tests/elementwise/test_exp.py b/dpctl/tests/elementwise/test_exp.py
deleted file mode 100644
index 588b294d7a..0000000000
--- a/dpctl/tests/elementwise/test_exp.py
+++ /dev/null
@@ -1,234 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_array_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_exp_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.exp(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.exp(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_exp_real_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt.exp(X)
-    with np.errstate(all="ignore"):
-        Ynp = np.exp(Xnp)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt.exp(X, out=Z)
-
-    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_exp_complex_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    low = -88.0
-    high = 88.0
-    x1 = np.random.uniform(low=low, high=high, size=n_seq)
-    x2 = np.random.uniform(low=low, high=high, size=n_seq)
-    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt.exp(X)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(Y), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt.exp(X, out=Z)
-
-    assert_allclose(
-        dpt.asnumpy(Z), np.repeat(np.exp(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_exp_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 16.0
-    X[..., 1::2] = 23.0
-
-    Y = dpt.exp(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.exp(np.float32(16.0))
-    expected_Y[..., 1::2] = np.exp(np.float32(23.0))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_exp_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 8.0
-    X[..., 1::2] = 11.0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.exp(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.exp(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_exp_analytical_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-    log2_ = 0.69314718055994530943
-    Xnp = np.array(x, dtype=dtype) * log2_
-    X = dpt.asarray(Xnp, dtype=dtype)
-    assert_allclose(dpt.asnumpy(dpt.exp(X)), np.exp(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_exp_real_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    x = [np.nan, np.inf, -np.inf, 0.0, -0.0]
-    Xnp = np.array(x, dtype=dtype)
-    X = dpt.asarray(x, dtype=dtype)
-
-    Y = dpt.asnumpy(dpt.exp(X))
-    Ynp = np.exp(Xnp)
-    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
-    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_exp_real_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    for ii in sizes:
-        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
-        Xnp.astype(dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np.exp(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt.exp(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_exp_complex_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    low = -88.0
-    high = 88.0
-    for ii in sizes:
-        x1 = np.random.uniform(low=low, high=high, size=ii)
-        x2 = np.random.uniform(low=low, high=high, size=ii)
-        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np.exp(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt.exp(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_exp_complex_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, +0.0, -0.0, +1.0, -1.0]
-    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
-
-    Xc_np = np.array(xc, dtype=dtype)
-    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
-
-    with np.errstate(all="ignore"):
-        Ynp = np.exp(Xc_np)
-    Y = dpt.exp(Xc)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
-    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py
deleted file mode 100644
index e5de10129c..0000000000
--- a/dpctl/tests/elementwise/test_exp2.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_exp2_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.exp2(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_exp2_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.exp2(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_exp2_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.exp2(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_exp2_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1 / 4
-    X[..., 1::2] = 1 / 2
-
-    Y = dpt.exp2(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4))
-    expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_exp2_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1 / 4
-    X[..., 1::2] = 1 / 2
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.exp2(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.exp2(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-def test_exp2_special_cases():
-    get_queue_or_skip()
-
-    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4")
-
-    tol = dpt.finfo(X.dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
-
-    # special cases for complex variant
-    num_finite = 1.0
-    vals = [
-        complex(0.0, 0.0),
-        complex(num_finite, dpt.inf),
-        complex(num_finite, dpt.nan),
-        complex(dpt.inf, 0.0),
-        complex(-dpt.inf, num_finite),
-        complex(dpt.inf, num_finite),
-        complex(-dpt.inf, dpt.inf),
-        complex(dpt.inf, dpt.inf),
-        complex(-dpt.inf, dpt.nan),
-        complex(dpt.inf, dpt.nan),
-        complex(dpt.nan, 0.0),
-        complex(dpt.nan, num_finite),
-        complex(dpt.nan, dpt.nan),
-    ]
-    X = dpt.asarray(vals, dtype=dpt.complex64)
-    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
-    c_nan = complex(np.nan, np.nan)
-    res = np.asarray(
-        [
-            complex(1.0, 0.0),
-            c_nan,
-            c_nan,
-            complex(np.inf, 0.0),
-            0.0,
-            np.inf * cis_1,
-            complex(0.0, 0.0),
-            complex(np.inf, np.nan),
-            complex(0.0, 0.0),
-            complex(np.inf, np.nan),
-            complex(np.nan, 0.0),
-            c_nan,
-            c_nan,
-        ],
-        dtype=np.complex64,
-    )
-
-    tol = dpt.finfo(X.dtype).resolution
-    with np.errstate(invalid="ignore"):
-        assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_expm1.py b/dpctl/tests/elementwise/test_expm1.py
deleted file mode 100644
index 0221273056..0000000000
--- a/dpctl/tests/elementwise/test_expm1.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_expm1_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.expm1(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.expm1(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_expm1_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.expm1(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_expm1_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(-2, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.expm1(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.expm1(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_expm1_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1 / 50
-    X[..., 1::2] = 1 / 25
-
-    Y = dpt.expm1(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.expm1(np.float32(1 / 50))
-    expected_Y[..., 1::2] = np.expm1(np.float32(1 / 25))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_expm1_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1 / 50
-    X[..., 1::2] = 1 / 25
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.expm1(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.expm1(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-def test_expm1_special_cases():
-    get_queue_or_skip()
-
-    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    res = np.asarray([np.nan, 0.0, -0.0, np.inf, -1.0], dtype="f4")
-
-    tol = dpt.finfo(X.dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol)
-
-    # special cases for complex variant
-    num_finite = 1.0
-    vals = [
-        complex(0.0, 0.0),
-        complex(num_finite, dpt.inf),
-        complex(num_finite, dpt.nan),
-        complex(dpt.inf, 0.0),
-        complex(-dpt.inf, num_finite),
-        complex(dpt.inf, num_finite),
-        complex(-dpt.inf, dpt.inf),
-        complex(dpt.inf, dpt.inf),
-        complex(-dpt.inf, dpt.nan),
-        complex(dpt.inf, dpt.nan),
-        complex(dpt.nan, 0.0),
-        complex(dpt.nan, num_finite),
-        complex(dpt.nan, dpt.nan),
-    ]
-    X = dpt.asarray(vals, dtype=dpt.complex64)
-    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
-    c_nan = complex(np.nan, np.nan)
-    res = np.asarray(
-        [
-            complex(0.0, 0.0),
-            c_nan,
-            c_nan,
-            complex(np.inf, 0.0),
-            0.0 * cis_1 - 1.0,
-            np.inf * cis_1 - 1.0,
-            complex(-1.0, 0.0),
-            complex(np.inf, np.nan),
-            complex(-1.0, 0.0),
-            complex(np.inf, np.nan),
-            complex(np.nan, 0.0),
-            c_nan,
-            c_nan,
-        ],
-        dtype=np.complex64,
-    )
-
-    tol = dpt.finfo(X.dtype).resolution
-    with np.errstate(invalid="ignore"):
-        assert_allclose(dpt.asnumpy(dpt.expm1(X)), res, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_floor_ceil_trunc.py b/dpctl/tests/elementwise/test_floor_ceil_trunc.py
deleted file mode 100644
index 20bb739b2c..0000000000
--- a/dpctl/tests/elementwise/test_floor_ceil_trunc.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-import re
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_array_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_value_dtypes
-
-_all_funcs = [(np.floor, dpt.floor), (np.ceil, dpt.ceil), (np.trunc, dpt.trunc)]
-
-
-@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_floor_ceil_trunc_out_type(dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.asarray(0.1, dtype=arg_dt, sycl_queue=q)
-    expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device)
-    assert dpt_call(X).dtype == expected_dtype
-
-    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
-    expected_dtype = _map_to_device_dtype(arg_dt, q.sycl_device)
-    Y = dpt.empty_like(X, dtype=expected_dtype)
-    dpt_call(X, out=Y)
-    assert_allclose(dpt.asnumpy(dpt_call(X)), dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_floor_ceil_trunc_usm_type(np_call, dpt_call, usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = -0.4
-    X[..., 1::2] = 0.7
-
-    Y = dpt_call(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np_call(dpt.asnumpy(X))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_floor_ceil_trunc_order(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (4, 4, 4, 4)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = -0.4
-    X[..., 1::2] = 0.7
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np_call(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt_call(U, order=ord)
-            assert_allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dpt_call", [dpt.floor, dpt.ceil, dpt.trunc])
-@pytest.mark.parametrize("dtype", _real_value_dtypes)
-def test_floor_ceil_trunc_error_dtype(dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.zeros(5, dtype=dtype)
-    y = dpt.empty_like(x, dtype="b1")
-    with pytest.raises(ValueError) as excinfo:
-        dpt_call(x, out=y)
-    assert re.match("Output array of type.*is needed", str(excinfo.value))
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_floor_ceil_trunc_contig(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    Xnp = np.linspace(-99.9, 99.9, num=n_seq, dtype=dtype)
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt_call(X)
-
-    assert_allclose(dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep))
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep))
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_floor_ceil_trunc_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 24, 32, 72]
-
-    for ii in sizes:
-        Xnp = np.random.uniform(low=-99.9, high=99.9, size=ii)
-        Xnp.astype(dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-            )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_floor_ceil_trunc_special_cases(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, +0.0, -0.0]
-
-    xf = np.array(x, dtype=dtype)
-    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
-
-    Y_np = np_call(xf)
-    Y = dpt.asnumpy(dpt_call(yf))
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(Y, Y_np, atol=tol, rtol=tol)
-    assert_array_equal(np.signbit(Y), np.signbit(Y_np))
diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py
deleted file mode 100644
index eed9e155da..0000000000
--- a/dpctl/tests/elementwise/test_floor_divide.py
+++ /dev/null
@@ -1,304 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import (
-    _compare_dtypes,
-    _integral_dtypes,
-    _no_complex_dtypes,
-    _usm_types,
-)
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.floor_divide(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.floor_divide(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.floor_divide(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.floor_divide(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_floor_divide_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.floor_divide(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_floor_divide_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.floor_divide(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.floor_divide(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.floor_divide(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.floor_divide(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.floor_divide(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.floor_divide(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.floor_divide(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.floor_divide(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.floor_divide(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.floor_divide(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_floor_divide_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.floor_divide(m, v)
-
-    expected = np.floor_divide(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.floor_divide(v, m)
-    expected2 = np.floor_divide(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
-def test_floor_divide_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.floor_divide(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.floor_divide(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_floor_divide_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.floor_divide(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_floor_divide_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.floor_divide(a, c)
-
-
-def test_floor_divide_gh_1247():
-    get_queue_or_skip()
-
-    x = dpt.ones(1, dtype="i4")
-    res = dpt.floor_divide(x, -2)
-    np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
-    )
-
-    x = dpt.full(1, -1, dtype="i4")
-    res = dpt.floor_divide(x, 2)
-    np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
-    )
-
-
-@pytest.mark.parametrize("dtype", _integral_dtypes)
-def test_floor_divide_integer_zero(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.arange(10, dtype=dtype, sycl_queue=q)
-    y = dpt.zeros_like(x, sycl_queue=q)
-    res = dpt.floor_divide(x, y)
-    np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
-    )
-
-
-def test_floor_divide_special_cases():
-    q = get_queue_or_skip()
-
-    x = dpt.empty(1, dtype="f4", sycl_queue=q)
-    y = dpt.empty_like(x)
-    x[0], y[0] = dpt.inf, dpt.inf
-    res = dpt.floor_divide(x, y)
-    with np.errstate(all="ignore"):
-        res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
-        np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
-
-    x[0], y[0] = 0.0, -1.0
-    res = dpt.floor_divide(x, y)
-    x_np = dpt.asnumpy(x)
-    y_np = dpt.asnumpy(y)
-    res_np = np.floor_divide(x_np, y_np)
-    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
-
-    res = dpt.floor_divide(y, x)
-    with np.errstate(all="ignore"):
-        res_np = np.floor_divide(y_np, x_np)
-        np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
-
-    x[0], y[0] = -1.0, dpt.inf
-    res = dpt.floor_divide(x, y)
-    np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.asarray([-0.0], dtype="f4")
-    )
-
-    res = dpt.floor_divide(y, x)
-    np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.asarray([-dpt.inf], dtype="f4")
-    )
-
-    x[0], y[0] = 1.0, dpt.nan
-    res = dpt.floor_divide(x, y)
-    res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
-    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
-def test_divide_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X //= int(1)
-    elif dt_kind == "f":
-        X //= float(1)
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    # out array only valid if it is inexact
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 //= ar2
-        assert dpt.all(ar1 == 1)
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
-        ar3 //= ar4
-        assert dpt.all(ar3 == 1)
-    else:
-        with pytest.raises(ValueError):
-            ar1 //= ar2
-            dpt.floor_divide(ar1, ar2, out=ar1)
diff --git a/dpctl/tests/elementwise/test_greater.py b/dpctl/tests/elementwise/test_greater.py
deleted file mode 100644
index 52a1a4b39d..0000000000
--- a/dpctl/tests/elementwise/test_greater.py
+++ /dev/null
@@ -1,297 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_greater_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.zeros(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.greater(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.greater(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.greater(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.greater(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_greater_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 10, sz)
-    ar1_np_imag = np.random.randint(0, 10, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 10, sz)
-    ar2_np_imag = np.random.randint(0, 10, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.greater(ar1, ar2)
-    expected = np.greater(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.greater(ar1[::-2], ar2[::2])
-    expected1 = np.greater(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
-    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.greater(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.greater(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.greater(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.greater(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_greater_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
-    ar2 = dpt.full((4,), 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.greater(ar1, ar2)
-    expected = np.greater(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.greater(ar2, ar1)
-    expected1 = np.greater(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
-
-            ar3 = dpt.full((4,), tp)
-            ar3_np = dpt.asnumpy(ar3)
-
-            r2 = dpt.greater(ar1, ar3)
-            expected2 = np.greater(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.greater(ar3, ar1)
-            expected3 = np.greater(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_greater_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.greater(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_greater_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.greater(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.greater(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.greater(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.greater(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.greater(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.greater(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.greater(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.greater(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.greater(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.greater(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_greater_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.greater(m, v)
-
-    expected = np.greater(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.greater(v, m)
-    expected2 = np.greater(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_greater_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.greater(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.greater(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_greater_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.greater(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_greater_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.greater(a, c)
-
-
-def test_greater_mixed_integer_kinds():
-    get_queue_or_skip()
-
-    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
-    x2 = dpt.arange(10, dtype="u8")
-
-    # u8 - i8
-    res = dpt.greater(x2, x1)
-    assert dpt.all(res[1:])
-    assert not res[0]
-    # i8 - u8
-    assert not dpt.any(dpt.greater(x1, x2))
-
-    # Python scalar
-    assert dpt.all(dpt.greater(x2, -1))
-    assert not dpt.any(dpt.greater(-1, x2))
-
-
-def test_greater_very_large_py_int():
-    get_queue_or_skip()
-
-    py_int = dpt.iinfo(dpt.int64).max + 10
-
-    x = dpt.asarray(3, dtype="u8")
-    assert py_int > x
-    assert not dpt.greater(x, py_int)
-
-    x = dpt.asarray(py_int, dtype="u8")
-    assert x > -1
-    assert not dpt.greater(-1, x)
diff --git a/dpctl/tests/elementwise/test_greater_equal.py b/dpctl/tests/elementwise/test_greater_equal.py
deleted file mode 100644
index 1833f3a2f2..0000000000
--- a/dpctl/tests/elementwise/test_greater_equal.py
+++ /dev/null
@@ -1,296 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_greater_equal_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.zeros(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.greater_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.greater_equal(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.greater_equal(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.greater_equal(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_greater_equal_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 10, sz)
-    ar1_np_imag = np.random.randint(0, 10, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 10, sz)
-    ar2_np_imag = np.random.randint(0, 10, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.greater_equal(ar1, ar2)
-    expected = np.greater_equal(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.greater_equal(ar1[::-2], ar2[::2])
-    expected1 = np.greater_equal(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
-    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-    r2 = dpt.greater_equal(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.greater_equal(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.greater_equal(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.greater_equal(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_greater_equal_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
-    ar2 = dpt.full((4,), 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.greater_equal(ar1, ar2)
-    expected = np.greater_equal(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.greater_equal(ar2, ar1)
-    expected1 = np.greater_equal(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
-
-            ar3 = dpt.full((4,), tp)
-            ar3_np = dpt.asnumpy(ar3)
-            r2 = dpt.greater_equal(ar1, ar3)
-            expected2 = np.greater_equal(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.greater_equal(ar3, ar1)
-            expected3 = np.greater_equal(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_greater_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.greater_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_greater_equal_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.greater_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.greater_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.greater_equal(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.greater_equal(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.greater_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.greater_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.greater_equal(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.greater_equal(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.greater_equal(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.greater_equal(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_greater_equal_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.greater_equal(m, v)
-
-    expected = np.greater_equal(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.greater_equal(v, m)
-    expected2 = np.greater_equal(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_greater_equal_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.greater_equal(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.greater_equal(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_greater_equal_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.greater_equal(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_greater_equal_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.greater_equal(a, c)
-
-
-def test_greater_equal_mixed_integer_kinds():
-    get_queue_or_skip()
-
-    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
-    x2 = dpt.arange(10, dtype="u8")
-
-    # u8 - i8
-    res = dpt.greater_equal(x2, x1)
-    assert dpt.all(res)
-    # i8 - u8
-    res = dpt.greater_equal(x1, x2)
-    assert not dpt.any(res[1:])
-    assert res[0]
-
-    # Python scalar
-    assert dpt.all(dpt.greater_equal(x2, -1))
-    assert not dpt.any(dpt.greater_equal(-1, x2))
-
-
-def test_greater_equal_very_large_py_int():
-    get_queue_or_skip()
-
-    py_int = dpt.iinfo(dpt.int64).max + 10
-
-    x = dpt.asarray(3, dtype="u8")
-    assert py_int >= x
-    assert not dpt.greater_equal(x, py_int)
-
-    x = dpt.asarray(py_int, dtype="u8")
-    assert x >= -1
-    assert not dpt.greater_equal(-1, x)
diff --git a/dpctl/tests/elementwise/test_hyperbolic.py b/dpctl/tests/elementwise/test_hyperbolic.py
deleted file mode 100644
index 731f71ae72..0000000000
--- a/dpctl/tests/elementwise/test_hyperbolic.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype
-
-_hyper_funcs = [(np.sinh, dpt.sinh), (np.cosh, dpt.cosh), (np.tanh, dpt.tanh)]
-_inv_hyper_funcs = [
-    (np.arcsinh, dpt.asinh),
-    (np.arccosh, dpt.acosh),
-    (np.arctanh, dpt.atanh),
-]
-_all_funcs = _hyper_funcs + _inv_hyper_funcs
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_hyper_out_type(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    a = 1 if np_call == np.arccosh else 0
-
-    x = dpt.asarray(a, dtype=dtype, sycl_queue=q)
-    expected_dtype = np_call(np.array(a, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt_call(x).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_hyper_real_contig(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    if np_call == np.arctanh:
-        Xnp = np.linspace(-0.9, 0.9, num=n_seq, dtype=dtype)
-    elif np_call == np.arccosh:
-        Xnp = np.linspace(1.01, 10.0, num=n_seq, dtype=dtype)
-    else:
-        Xnp = np.linspace(-10.0, 10.0, num=n_seq, dtype=dtype)
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt_call(X)
-
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(
-        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_hyper_complex_contig(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    low = -9.0
-    high = 9.0
-    x1 = np.random.uniform(low=low, high=high, size=n_seq)
-    x2 = np.random.uniform(low=low, high=high, size=n_seq)
-    Xnp = x1 + 1j * x2
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt_call(X)
-
-    tol = 50 * dpt.finfo(dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(
-        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_hyper_real_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    low = -10.0
-    high = 10.0
-    if np_call == np.arctanh:
-        low = -0.9
-        high = 0.9
-    elif np_call == np.arccosh:
-        low = 1.01
-        high = 100.0
-
-    for ii in sizes:
-        Xnp = np.random.uniform(low=low, high=high, size=ii)
-        Xnp.astype(dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_hyper_complex_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 50 * dpt.finfo(dtype).resolution
-
-    low = -8.0
-    high = 8.0
-    for ii in sizes:
-        x1 = np.random.uniform(low=low, high=high, size=ii)
-        x2 = np.random.uniform(low=low, high=high, size=ii)
-        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_hyper_real_special_cases(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
-
-    xf = np.array(x, dtype=dtype)
-    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
-
-    with np.errstate(all="ignore"):
-        Y_np = np_call(xf)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt_call(yf)), Y_np, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_hypot.py b/dpctl/tests/elementwise/test_hypot.py
deleted file mode 100644
index d4adc2e3b9..0000000000
--- a/dpctl/tests/elementwise/test_hypot.py
+++ /dev/null
@@ -1,193 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_hypot_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.zeros_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.hypot(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.hypot(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype, sycl_queue=q)
-    ar4 = dpt.zeros(2 * sz, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.hypot(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.hypot(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_hypot_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.hypot(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_hypot_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.hypot(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.hypot(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.hypot(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.hypot(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.hypot(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.hypot(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.hypot(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.hypot(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.hypot(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.hypot(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_hypot_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.hypot(m, v)
-
-    expected = np.hypot(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    tol = 8 * np.finfo(r.dtype).resolution
-    assert np.allclose(
-        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
-    )
-
-    r2 = dpt.hypot(v, m)
-    expected2 = np.hypot(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert np.allclose(
-        dpt.asnumpy(r2), expected2.astype(r2.dtype), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
-def test_hypot_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.hypot(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.hypot(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_hypot_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.hypot(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_hypot_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.hypot(a, c)
diff --git a/dpctl/tests/elementwise/test_isfinite.py b/dpctl/tests/elementwise/test_isfinite.py
deleted file mode 100644
index f2c90fb62d..0000000000
--- a/dpctl/tests/elementwise/test_isfinite.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isfinite_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    assert dpt.isfinite(X).dtype == dpt.bool
-
-
-def test_isfinite_output():
-    q = get_queue_or_skip()
-
-    Xnp = np.asarray(np.nan)
-    X = dpt.asarray(np.nan, sycl_queue=q)
-    assert dpt.asnumpy(dpt.isfinite(X)) == np.isfinite(Xnp)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_isfinite_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = complex(np.nan, np.nan)
-    y2 = complex(1, np.nan)
-    y3 = complex(np.nan, 1)
-    y4 = complex(2, 1)
-    y5 = complex(np.inf, 1)
-
-    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 12)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp))
-
-    r = dpt.empty_like(Y, dtype="bool")
-    dpt.isfinite(Y, out=r)
-    assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_isfinite_floats(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = np.nan
-    y2 = 1
-    y3 = np.inf
-
-    for mult in [123, 137, 255, 271, 272]:
-        Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult)
-        Y = dpt.asarray(Ynp, sycl_queue=q)
-        assert np.array_equal(dpt.asnumpy(dpt.isfinite(Y)), np.isfinite(Ynp))
-
-        r = dpt.empty_like(Y, dtype="bool")
-        dpt.isfinite(Y, out=r)
-        assert np.array_equal(dpt.asnumpy(r)[()], np.isfinite(Ynp))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isfinite_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
-        expected_Y = np.full(U.shape, fill_value=True, dtype=dpt.bool)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.isfinite(U, order=ord)
-            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpctl/tests/elementwise/test_isinf.py b/dpctl/tests/elementwise/test_isinf.py
deleted file mode 100644
index 16c5226ee1..0000000000
--- a/dpctl/tests/elementwise/test_isinf.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isinf_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    assert dpt.isinf(X).dtype == dpt.bool
-
-
-def test_isinf_output():
-    q = get_queue_or_skip()
-
-    Xnp = np.asarray(np.inf)
-    X = dpt.asarray(np.inf, sycl_queue=q)
-    assert dpt.asnumpy(dpt.isinf(X)) == np.isinf(Xnp)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_isinf_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = complex(np.inf, np.inf)
-    y2 = complex(1, np.inf)
-    y3 = complex(np.inf, 1)
-    y4 = complex(2, 1)
-    y5 = complex(np.inf, 1)
-    y6 = complex(np.inf, np.nan)
-
-    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5, y6], dtype=dtype), 123)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_isinf_floats(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = np.nan
-    y2 = 1
-    y3 = np.inf
-    y4 = -np.inf
-
-    for mult in [123, 137, 255, 271, 272]:
-        Ynp = np.repeat(np.array([y1, y2, y3, y4], dtype=dtype), mult)
-        Y = dpt.asarray(Ynp, sycl_queue=q)
-        assert np.array_equal(dpt.asnumpy(dpt.isinf(Y)), np.isinf(Ynp))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isinf_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
-        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.isinf(U, order=ord)
-            assert_allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpctl/tests/elementwise/test_isnan.py b/dpctl/tests/elementwise/test_isnan.py
deleted file mode 100644
index 8b7670d502..0000000000
--- a/dpctl/tests/elementwise/test_isnan.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isnan_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    assert dpt.isnan(X).dtype == dpt.bool
-
-
-def test_isnan_output():
-    q = get_queue_or_skip()
-
-    Xnp = np.asarray(np.nan)
-    X = dpt.asarray(np.nan, sycl_queue=q)
-    assert dpt.asnumpy(dpt.isnan(X)) == np.isnan(Xnp)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_isnan_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = complex(np.nan, np.nan)
-    y2 = complex(1, np.nan)
-    y3 = complex(np.nan, 1)
-    y4 = complex(2, 1)
-    y5 = complex(np.inf, 1)
-
-    Ynp = np.repeat(np.array([y1, y2, y3, y4, y5], dtype=dtype), 123)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp))
-
-    r = dpt.empty_like(Y, dtype="bool")
-    dpt.isnan(Y, out=r)
-    assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_isnan_floats(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    y1 = np.nan
-    y2 = 1
-    y3 = np.inf
-
-    for mult in [123, 137, 255, 271, 272]:
-        Ynp = np.repeat(np.array([y1, y2, y3], dtype=dtype), mult)
-        Y = dpt.asarray(Ynp, sycl_queue=q)
-        assert np.array_equal(dpt.asnumpy(dpt.isnan(Y)), np.isnan(Ynp))
-
-        r = dpt.empty_like(Y, dtype="bool")
-        dpt.isnan(Y, out=r)
-        assert np.array_equal(dpt.asnumpy(r)[()], np.isnan(Ynp))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_isnan_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.ones(input_shape, dtype=arg_dt, sycl_queue=q)
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[::2, ::-1, ::-1, ::5], perms)
-        expected_Y = np.full(U.shape, fill_value=False, dtype=dpt.bool)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.isnan(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpctl/tests/elementwise/test_less.py b/dpctl/tests/elementwise/test_less.py
deleted file mode 100644
index 560fbf4dae..0000000000
--- a/dpctl/tests/elementwise/test_less.py
+++ /dev/null
@@ -1,297 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_less_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.zeros(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.less(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.less(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.less(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.less(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_less_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 10, sz)
-    ar1_np_imag = np.random.randint(0, 10, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 10, sz)
-    ar2_np_imag = np.random.randint(0, 10, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.less(ar1, ar2)
-    expected = np.less(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.less(ar1[::-2], ar2[::2])
-    expected1 = np.less(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
-    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.less(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.less(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.less(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.less(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_less_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
-    ar2 = dpt.full((4,), 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.less(ar1, ar2)
-    expected = np.less(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.less(ar2, ar1)
-    expected1 = np.less(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
-
-            ar3 = dpt.full((4,), tp)
-            ar3_np = dpt.asnumpy(ar3)
-
-            r2 = dpt.less(ar1, ar3)
-            expected2 = np.less(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.less(ar3, ar1)
-            expected3 = np.less(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_less_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.less(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_less_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.less(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.less(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.less(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.less(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.less(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.less(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.less(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.less(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.less(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.less(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_less_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.less(m, v)
-
-    expected = np.less(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.less(v, m)
-    expected2 = np.less(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_less_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.less(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.less(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_less_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.less(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_less_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.less(a, c)
-
-
-def test_less_mixed_integer_kinds():
-    get_queue_or_skip()
-
-    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
-    x2 = dpt.arange(10, dtype="u8")
-
-    # u8 - i8
-    assert not dpt.any(dpt.less(x2, x1))
-    # i8 - u8
-    res = dpt.less(x1, x2)
-    assert not res[0]
-    assert dpt.all(res[1:])
-
-    # Python scalar
-    assert not dpt.any(dpt.less(x2, -1))
-    assert dpt.all(dpt.less(-1, x2))
-
-
-def test_less_very_large_py_int():
-    get_queue_or_skip()
-
-    py_int = dpt.iinfo(dpt.int64).max + 10
-
-    x = dpt.asarray(3, dtype="u8")
-    assert not py_int < x
-    assert dpt.less(x, py_int)
-
-    x = dpt.asarray(py_int, dtype="u8")
-    assert not x < -1
-    assert dpt.less(-1, x)
diff --git a/dpctl/tests/elementwise/test_less_equal.py b/dpctl/tests/elementwise/test_less_equal.py
deleted file mode 100644
index eeb18cb5f7..0000000000
--- a/dpctl/tests/elementwise/test_less_equal.py
+++ /dev/null
@@ -1,296 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_less_equal_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.zeros(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.less_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.less_equal(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.less_equal(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.less_equal(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_less_equal_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 10, sz)
-    ar1_np_imag = np.random.randint(0, 10, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 10, sz)
-    ar2_np_imag = np.random.randint(0, 10, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.less_equal(ar1, ar2)
-    expected = np.less_equal(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.less_equal(ar1[::-2], ar2[::2])
-    expected1 = np.less_equal(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype=op_dtype)
-    ar4 = dpt.asarray([2.0 + 0j, dpt.nan, dpt.inf, -dpt.inf], dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.less_equal(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.less_equal(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.less_equal(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.less_equal(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_less_equal_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1.0 + 9j, 2.0 + 0j, 2.0 + 1j, 2.0 + 2j], dtype="c8")
-    ar2 = dpt.full((4,), 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.less_equal(ar1, ar2)
-    expected = np.less_equal(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.less_equal(ar2, ar1)
-    expected1 = np.less_equal(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [dpt.nan, dpt.inf, -dpt.inf]:
-
-            ar3 = dpt.full((4,), tp)
-            ar3_np = dpt.asnumpy(ar3)
-            r2 = dpt.less_equal(ar1, ar3)
-            expected2 = np.less_equal(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.less_equal(ar3, ar1)
-            expected3 = np.less_equal(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_less_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.less_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_less_equal_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.less_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.less_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.less_equal(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.less_equal(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.less_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.less_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.less_equal(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.less_equal(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.less_equal(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.less_equal(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_less_equal_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.less_equal(m, v)
-
-    expected = np.less_equal(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.less_equal(v, m)
-    expected2 = np.less_equal(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_less_equal_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.less_equal(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.less_equal(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_less_equal_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.less_equal(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_less_equal_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.less_equal(a, c)
-
-
-def test_less_equal_mixed_integer_kinds():
-    get_queue_or_skip()
-
-    x1 = dpt.flip(dpt.arange(-9, 1, dtype="i8"))
-    x2 = dpt.arange(10, dtype="u8")
-
-    # u8 - i8
-    res = dpt.less_equal(x2, x1)
-    assert res[0]
-    assert not dpt.any(res[1:])
-    # i8 - u8
-    assert dpt.all(dpt.less_equal(x1, x2))
-
-    # Python scalar
-    assert not dpt.any(dpt.less_equal(x2, -1))
-    assert dpt.all(dpt.less_equal(-1, x2))
-
-
-def test_less_equal_very_large_py_int():
-    get_queue_or_skip()
-
-    py_int = dpt.iinfo(dpt.int64).max + 10
-
-    x = dpt.asarray(3, dtype="u8")
-    assert not py_int <= x
-    assert dpt.less_equal(x, py_int)
-
-    x = dpt.asarray(py_int, dtype="u8")
-    assert not x <= -1
-    assert dpt.less_equal(-1, x)
diff --git a/dpctl/tests/elementwise/test_log.py b/dpctl/tests/elementwise/test_log.py
deleted file mode 100644
index df6e205e6d..0000000000
--- a/dpctl/tests/elementwise/test_log.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.log(np.array(1, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.log(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_log_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    Y = dpt.log(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.log(np.float32(4 * dpt.e))
-    expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.log(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.log(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-def test_log_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -dpt.inf, -1.0, -0.0, 0.0, dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Y = dpt.log(X)
-
-    expected = np.array(
-        [np.nan, np.nan, np.nan, -np.inf, -np.inf, np.inf], dtype="f4"
-    )
-
-    assert_equal(dpt.asnumpy(Y), expected)
diff --git a/dpctl/tests/elementwise/test_log10.py b/dpctl/tests/elementwise/test_log10.py
deleted file mode 100644
index c56b19c3e3..0000000000
--- a/dpctl/tests/elementwise/test_log10.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.log10(np.array(1, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.log10(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log10(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(
-        dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log10(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(
-        dpt.asnumpy(Y), np.log10(Xnp), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_log_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    Y = dpt.log10(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.log10(np.float32(4 * dpt.e))
-    expected_Y[..., 1::2] = np.log10(np.float32(10 * dpt.e))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.log10(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.log10(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
-
-
-def test_log_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Xnp = dpt.asnumpy(X)
-
-    with np.errstate(invalid="ignore", divide="ignore"):
-        assert_equal(dpt.asnumpy(dpt.log10(X)), np.log10(Xnp))
diff --git a/dpctl/tests/elementwise/test_log1p.py b/dpctl/tests/elementwise/test_log1p.py
deleted file mode 100644
index 5d0e87ca64..0000000000
--- a/dpctl/tests/elementwise/test_log1p.py
+++ /dev/null
@@ -1,169 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log1p_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.log1p(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.log1p(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log1p_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log1p(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log1p_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(0, 2, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log1p(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.log1p(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_log1p_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = dpt.e / 1000
-    X[..., 1::2] = dpt.e / 100
-
-    Y = dpt.log1p(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.log1p(np.float32(dpt.e / 1000))
-    expected_Y[..., 1::2] = np.log1p(np.float32(dpt.e / 100))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log1p_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = dpt.e / 1000
-    X[..., 1::2] = dpt.e / 100
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.log1p(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.log1p(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-def test_log1p_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -2.0, -1.0, -0.0, 0.0, dpt.inf],
-        dtype="f4",
-        sycl_queue=q,
-    )
-    res = np.asarray([np.nan, np.nan, -np.inf, -0.0, 0.0, np.inf])
-
-    tol = dpt.finfo(X.dtype).resolution
-    with np.errstate(divide="ignore", invalid="ignore"):
-        assert_allclose(dpt.asnumpy(dpt.log1p(X)), res, atol=tol, rtol=tol)
-
-    # special cases for complex
-    vals = [
-        complex(-1.0, 0.0),
-        complex(2.0, dpt.inf),
-        complex(2.0, dpt.nan),
-        complex(-dpt.inf, 1.0),
-        complex(dpt.inf, 1.0),
-        complex(-dpt.inf, dpt.inf),
-        complex(dpt.inf, dpt.inf),
-        complex(dpt.inf, dpt.nan),
-        complex(dpt.nan, 1.0),
-        complex(dpt.nan, dpt.inf),
-        complex(dpt.nan, dpt.nan),
-    ]
-    X = dpt.asarray(vals, dtype=dpt.complex64)
-    c_nan = complex(np.nan, np.nan)
-    res = np.asarray(
-        [
-            complex(-np.inf, 0.0),
-            complex(np.inf, np.pi / 2),
-            c_nan,
-            complex(np.inf, np.pi),
-            complex(np.inf, 0.0),
-            complex(np.inf, 3 * np.pi / 4),
-            complex(np.inf, np.pi / 4),
-            complex(np.inf, np.nan),
-            c_nan,
-            complex(np.inf, np.nan),
-            c_nan,
-        ],
-        dtype=np.complex64,
-    )
-
-    tol = dpt.finfo(X.dtype).resolution
-    with np.errstate(invalid="ignore"):
-        dpt_res = dpt.asnumpy(dpt.log1p(X))
-        assert_allclose(np.real(dpt_res), np.real(res), atol=tol, rtol=tol)
-        assert_allclose(np.imag(dpt_res), np.imag(res), atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_log2.py b/dpctl/tests/elementwise/test_log2.py
deleted file mode 100644
index 0aa747f8d8..0000000000
--- a/dpctl/tests/elementwise/test_log2.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.log2(np.array(1, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.log2(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log2(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_log_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2 * 1027
-
-    X = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.log2(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log2(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_log_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    Y = dpt.log2(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.log2(np.float32(4 * dpt.e))
-    expected_Y[..., 1::2] = np.log2(np.float32(10 * dpt.e))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_log_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 4 * dpt.e
-    X[..., 1::2] = 10 * dpt.e
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.log2(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.log2(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
-
-
-def test_log_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Xnp = dpt.asnumpy(X)
-
-    with np.errstate(invalid="ignore", divide="ignore"):
-        assert_equal(dpt.asnumpy(dpt.log2(X)), np.log2(Xnp))
diff --git a/dpctl/tests/elementwise/test_logaddexp.py b/dpctl/tests/elementwise/test_logaddexp.py
deleted file mode 100644
index 8a2a73cb7a..0000000000
--- a/dpctl/tests/elementwise/test_logaddexp.py
+++ /dev/null
@@ -1,194 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-import re
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
-def test_logaddexp_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.logaddexp(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.logaddexp(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    tol = 8 * max(
-        np.finfo(r.dtype).resolution, np.finfo(expected.dtype).resolution
-    )
-    assert_allclose(
-        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
-    )
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.logaddexp(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.logaddexp(dpt.asnumpy(ar3)[::-1], dpt.asnumpy(ar4)[::2])
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert_allclose(
-        dpt.asnumpy(r), expected.astype(r.dtype), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_logaddexp_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.logaddexp(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_logaddexp_order():
-    get_queue_or_skip()
-
-    test_shape = (
-        20,
-        20,
-    )
-    test_shape2 = tuple(2 * dim for dim in test_shape)
-    n = test_shape[-1]
-
-    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
-        r1 = dpt.logaddexp(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.logaddexp(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.logaddexp(ar1, ar2, order="A")
-        assert r3.flags.c_contiguous
-        r4 = dpt.logaddexp(ar1, ar2, order="K")
-        assert r4.flags.c_contiguous
-
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
-        r1 = dpt.logaddexp(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.logaddexp(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.logaddexp(ar1, ar2, order="A")
-        assert r3.flags.f_contiguous
-        r4 = dpt.logaddexp(ar1, ar2, order="K")
-        assert r4.flags.f_contiguous
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
-        r4 = dpt.logaddexp(ar1, ar2, order="K")
-        assert r4.strides == (n, -1)
-        r5 = dpt.logaddexp(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
-        r4 = dpt.logaddexp(ar1, ar2, order="K")
-        assert r4.strides == (-1, n)
-        r5 = dpt.logaddexp(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-
-def test_logaddexp_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.logaddexp(m, v)
-
-    expected = np.logaddexp(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.logaddexp(v, m)
-    expected2 = np.logaddexp(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-def test_logaddexp_broadcasting_error():
-    get_queue_or_skip()
-    m = dpt.ones((10, 10), dtype="i4")
-    v = dpt.ones((3,), dtype="i4")
-    with pytest.raises(ValueError):
-        dpt.logaddexp(m, v)
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
-def test_logaddexp_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        R = dpt.logaddexp(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.logaddexp(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_logaddexp_dtype_error(
-    dtype,
-):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    ar1 = dpt.ones(5, dtype=dtype)
-    ar2 = dpt.ones_like(ar1, dtype="f4")
-
-    y = dpt.zeros_like(ar1, dtype="int8")
-    with pytest.raises(ValueError) as excinfo:
-        dpt.logaddexp(ar1, ar2, out=y)
-    assert re.match("Output array of type.*is needed", str(excinfo.value))
diff --git a/dpctl/tests/elementwise/test_logical_and.py b/dpctl/tests/elementwise/test_logical_and.py
deleted file mode 100644
index 4ce8de978b..0000000000
--- a/dpctl/tests/elementwise/test_logical_and.py
+++ /dev/null
@@ -1,304 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_logical_and_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype)
-    ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype)
-
-    r = dpt.logical_and(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_and(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_and(ar1, ar2, out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.logical_and(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.logical_and(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_and(ar3[::-1], ar4[::2], out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_logical_and_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 2, sz)
-    ar1_np_imag = np.random.randint(0, 2, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 2, sz)
-    ar2_np_imag = np.random.randint(0, 2, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.logical_and(ar1, ar2)
-    expected = np.logical_and(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_and(ar1[::-2], ar2[::2])
-    expected1 = np.logical_and(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray(
-        [
-            2.0 + 0j,
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ],
-        dtype=op_dtype,
-    )
-    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.logical_and(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.logical_and(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.logical_and(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.logical_and(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_logical_and_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
-    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.logical_and(ar1, ar2)
-    expected = np.logical_and(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_and(ar2, ar1)
-    expected1 = np.logical_and(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ]:
-            ar3 = dpt.full(ar1.shape, tp)
-            ar3_np = dpt.asnumpy(ar3)
-            r2 = dpt.logical_and(ar1, ar3)
-            expected2 = np.logical_and(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.logical_and(ar3, ar1)
-            expected3 = np.logical_and(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_logical_and_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
-    )
-    ar2 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
-    )
-
-    r = dpt.logical_and(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_logical_and_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.logical_and(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_and(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_and(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.logical_and(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.logical_and(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_and(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_and(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.logical_and(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.logical_and(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.logical_and(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_logical_and_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.logical_and(m, v)
-
-    expected = np.logical_and(dpt.asnumpy(m), dpt.asnumpy(v))
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.logical_and(v, m)
-    expected2 = np.logical_and(dpt.asnumpy(v), dpt.asnumpy(m))
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.empty_like(r)
-    dpt.logical_and(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == expected).all()
-
-    r4 = dpt.empty_like(r)
-    dpt.logical_and(v, m, out=r4)
-    assert (dpt.asnumpy(r4) == expected).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-@pytest.mark.parametrize("scalar_val", [0, 1])
-def test_logical_and_python_scalar(arr_dt, scalar_val):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.asarray(
-        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
-    )
-    py_ones = (
-        bool(scalar_val),
-        int(scalar_val),
-        float(scalar_val),
-        complex(scalar_val),
-        np.float32(scalar_val),
-        ctypes.c_int(scalar_val),
-    )
-    for sc in py_ones:
-        R = dpt.logical_and(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_and(dpt.asnumpy(X), sc)
-        assert (dpt.asnumpy(R) == E).all()
-
-        R = dpt.logical_and(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_and(sc, dpt.asnumpy(X))
-        assert (dpt.asnumpy(R) == E).all()
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_logical_and_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.logical_and(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_logical_and_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.logical_and(a, c)
diff --git a/dpctl/tests/elementwise/test_logical_not.py b/dpctl/tests/elementwise/test_logical_not.py
deleted file mode 100644
index c4440574ad..0000000000
--- a/dpctl/tests/elementwise/test_logical_not.py
+++ /dev/null
@@ -1,179 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op_dtype", _all_dtypes)
-def test_logical_not_dtype_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 7
-    ar1_np = np.random.randint(0, 2, sz)
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    r = dpt.logical_not(ar1)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_not(ar1_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_not(ar1, out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-    ar2 = dpt.zeros(sz, dtype=op_dtype)
-    r = dpt.logical_not(ar2[::-1])
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_not(np.zeros(ar2.shape, dtype=op_dtype))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar2.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    ar3 = dpt.ones(sz, dtype=op_dtype)
-    r2 = dpt.logical_not(ar3[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_not(np.ones(ar3.shape, dtype=op_dtype)[::2])
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert (dpt.asnumpy(r2) == expected).all()
-
-    r3 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_not(ar2[::-1], out=r3)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r3)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_logical_not_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 2, sz)
-    ar1_np_imag = np.random.randint(0, 2, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    r = dpt.logical_not(ar1)
-    expected = np.logical_not(ar1_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_not(ar1[::-2])
-    expected1 = np.logical_not(ar1_np[::-2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar2 = dpt.asarray(
-        [
-            2.0 + 0j,
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ],
-        dtype=op_dtype,
-    )
-    ar2_np = dpt.asnumpy(ar2)
-    r2 = dpt.logical_not(ar2)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.logical_not(ar2_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-
-def test_logical_not_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
-
-    r = dpt.logical_not(ar1)
-    expected = np.logical_not(dpt.asnumpy(ar1))
-    assert (dpt.asnumpy(r) == expected).all()
-
-    with np.errstate(invalid="ignore"):
-        for tp in [
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ]:
-            ar2 = dpt.full(ar1.shape, tp)
-            r2 = dpt.logical_not(ar2)
-            expected2 = np.logical_not(dpt.asnumpy(ar2))
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-
-@pytest.mark.parametrize("op_usm_type", _usm_types)
-def test_logical_not_usm_type_matrix(op_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype="i4", usm_type=op_usm_type
-    )
-
-    r = dpt.logical_not(ar1)
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.usm_type == op_usm_type
-
-
-def test_logical_not_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.logical_not(ar1, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_not(ar1, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_not(ar1, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.logical_not(ar1, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.zeros((20, 20), dtype="i4", order="F")
-    r1 = dpt.logical_not(ar1, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_not(ar1, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_not(ar1, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.logical_not(ar1, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.logical_not(ar1, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.zeros((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.logical_not(ar1, order="K")
-    assert r4.strides == (-1, 20)
diff --git a/dpctl/tests/elementwise/test_logical_or.py b/dpctl/tests/elementwise/test_logical_or.py
deleted file mode 100644
index e4ef4d7ebf..0000000000
--- a/dpctl/tests/elementwise/test_logical_or.py
+++ /dev/null
@@ -1,305 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_logical_or_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op1_dtype)
-    ar2 = dpt.asarray(np.random.randint(0, 2, sz), dtype=op2_dtype)
-
-    r = dpt.logical_or(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_or(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_or(ar1, ar2, out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.logical_or(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.logical_or(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_or(ar3[::-1], ar4[::2], out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_logical_or_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 2, sz)
-    ar1_np_imag = np.random.randint(0, 2, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 2, sz)
-    ar2_np_imag = np.random.randint(0, 2, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.logical_or(ar1, ar2)
-    expected = np.logical_or(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_or(ar1[::-2], ar2[::2])
-    expected1 = np.logical_or(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray(
-        [
-            2.0 + 0j,
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ],
-        dtype=op_dtype,
-    )
-    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.logical_or(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.logical_or(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.logical_or(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.logical_or(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_logical_or_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
-    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar2)
-
-    r = dpt.logical_or(ar1, ar2)
-    expected = np.logical_or(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_or(ar2, ar1)
-    expected1 = np.logical_or(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ]:
-            ar3 = dpt.full(ar1.shape, tp)
-            ar3_np = dpt.asnumpy(ar3)
-
-            r2 = dpt.logical_or(ar1, ar3)
-            expected2 = np.logical_or(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.logical_or(ar3, ar1)
-            expected3 = np.logical_or(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_logical_or_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
-    )
-    ar2 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
-    )
-
-    r = dpt.logical_or(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_logical_or_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.logical_or(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_or(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_or(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.logical_or(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.logical_or(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_or(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_or(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.logical_or(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.logical_or(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.logical_or(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_logical_or_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.logical_or(m, v)
-
-    expected = np.logical_or(dpt.asnumpy(m), dpt.asnumpy(v))
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.logical_or(v, m)
-    expected2 = np.logical_or(dpt.asnumpy(v), dpt.asnumpy(m))
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.empty_like(r)
-    dpt.logical_or(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == expected).all()
-
-    r4 = dpt.empty_like(r)
-    dpt.logical_or(v, m, out=r4)
-    assert (dpt.asnumpy(r4) == expected).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-@pytest.mark.parametrize("scalar_val", [0, 1])
-def test_logical_or_python_scalar(arr_dt, scalar_val):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.asarray(
-        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
-    )
-    py_ones = (
-        bool(scalar_val),
-        int(scalar_val),
-        float(scalar_val),
-        complex(scalar_val),
-        np.float32(scalar_val),
-        ctypes.c_int(scalar_val),
-    )
-    for sc in py_ones:
-        R = dpt.logical_or(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_or(dpt.asnumpy(X), sc)
-        assert (dpt.asnumpy(R) == E).all()
-
-        R = dpt.logical_or(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_or(sc, dpt.asnumpy(X))
-        assert (dpt.asnumpy(R) == E).all()
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_logical_or_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.logical_or(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_logical_or_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.logical_or(a, c)
diff --git a/dpctl/tests/elementwise/test_logical_xor.py b/dpctl/tests/elementwise/test_logical_xor.py
deleted file mode 100644
index 05e5edd891..0000000000
--- a/dpctl/tests/elementwise/test_logical_xor.py
+++ /dev/null
@@ -1,306 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2023-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_logical_xor_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1_np = np.random.randint(0, 2, sz)
-    ar1 = dpt.asarray(ar1_np, dtype=op1_dtype)
-    ar2_np = np.random.randint(0, 2, sz)
-    ar2 = dpt.asarray(ar2_np, dtype=op2_dtype)
-
-    r = dpt.logical_xor(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    expected = np.logical_xor(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_xor(ar1, ar2, out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-    ar3 = dpt.zeros(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.logical_xor(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.logical_xor(
-        np.zeros(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.empty_like(r, dtype=r.dtype)
-    dpt.logical_xor(ar3[::-1], ar4[::2], out=r2)
-    assert (dpt.asnumpy(r) == dpt.asnumpy(r2)).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_logical_xor_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 2, sz)
-    ar1_np_imag = np.random.randint(0, 2, sz)
-    ar1_np = ar1_np_real + 1j * ar1_np_imag
-    ar1 = dpt.asarray(ar1_np, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 2, sz)
-    ar2_np_imag = np.random.randint(0, 2, sz)
-    ar2_np = ar2_np_real + 1j * ar2_np_imag
-    ar2 = dpt.asarray(ar2_np, dtype=op_dtype)
-
-    r = dpt.logical_xor(ar1, ar2)
-    expected = np.logical_xor(ar1_np, ar2_np)
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_xor(ar1[::-2], ar2[::2])
-    expected1 = np.logical_xor(ar1_np[::-2], ar2_np[::2])
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert (dpt.asnumpy(r1) == expected1).all()
-
-    ar3 = dpt.asarray(
-        [
-            2.0 + 0j,
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ],
-        dtype=op_dtype,
-    )
-    ar4 = dpt.full(ar3.shape, fill_value=1.0 + 2j, dtype=op_dtype)
-
-    ar3_np = dpt.asnumpy(ar3)
-    ar4_np = dpt.asnumpy(ar4)
-
-    r2 = dpt.logical_xor(ar3, ar4)
-    with np.errstate(invalid="ignore"):
-        expected2 = np.logical_xor(ar3_np, ar4_np)
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.logical_xor(ar4, ar4)
-    with np.errstate(invalid="ignore"):
-        expected3 = np.logical_xor(ar4_np, ar4_np)
-    assert (dpt.asnumpy(r3) == expected3).all()
-
-
-def test_logical_xor_complex_float():
-    get_queue_or_skip()
-
-    ar1 = dpt.asarray([1j, 1.0 + 9j, 2.0 + 0j, 2.0 + 1j], dtype="c8")
-    ar2 = dpt.full(ar1.shape, 2, dtype="f4")
-
-    ar1_np = dpt.asnumpy(ar1)
-    ar2_np = dpt.asnumpy(ar1)
-
-    r = dpt.logical_xor(ar1, ar2)
-    expected = np.logical_xor(ar1_np, ar2_np)
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r1 = dpt.logical_xor(ar2, ar1)
-    expected1 = np.logical_xor(ar2_np, ar1_np)
-    assert (dpt.asnumpy(r1) == expected1).all()
-    with np.errstate(invalid="ignore"):
-        for tp in [
-            dpt.nan,
-            dpt.nan * 1j,
-            dpt.inf,
-            dpt.inf * 1j,
-            -dpt.inf,
-            -dpt.inf * 1j,
-        ]:
-            ar3 = dpt.full(ar1.shape, tp)
-            ar3_np = dpt.asnumpy(ar3)
-            r2 = dpt.logical_xor(ar1, ar3)
-            expected2 = np.logical_xor(ar1_np, ar3_np)
-            assert (dpt.asnumpy(r2) == expected2).all()
-
-            r3 = dpt.logical_xor(ar3, ar1)
-            expected3 = np.logical_xor(ar3_np, ar1_np)
-            assert (dpt.asnumpy(r3) == expected3).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_logical_xor_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype="i4", usm_type=op1_usm_type
-    )
-    ar2 = dpt.asarray(
-        np.random.randint(0, 2, sz), dtype=ar1.dtype, usm_type=op2_usm_type
-    )
-
-    r = dpt.logical_xor(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_logical_xor_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.logical_xor(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_xor(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_xor(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.logical_xor(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.logical_xor(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.logical_xor(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.logical_xor(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.logical_xor(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.logical_xor(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.logical_xor(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_logical_xor_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.asarray(np.random.randint(0, 2, (100, 5)), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.logical_xor(m, v)
-
-    expected = np.logical_xor(dpt.asnumpy(m), dpt.asnumpy(v))
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.logical_xor(v, m)
-    expected2 = np.logical_xor(dpt.asnumpy(v), dpt.asnumpy(m))
-    assert (dpt.asnumpy(r2) == expected2).all()
-
-    r3 = dpt.empty_like(r)
-    dpt.logical_xor(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == expected).all()
-
-    r4 = dpt.empty_like(r)
-    dpt.logical_xor(v, m, out=r4)
-    assert (dpt.asnumpy(r4) == expected).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-@pytest.mark.parametrize("scalar_val", [0, 1])
-def test_logical_xor_python_scalar(arr_dt, scalar_val):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.asarray(
-        np.random.randint(0, 2, (10, 10)), dtype=arr_dt, sycl_queue=q
-    )
-    py_ones = (
-        bool(scalar_val),
-        int(scalar_val),
-        float(scalar_val),
-        complex(scalar_val),
-        np.float32(scalar_val),
-        ctypes.c_int(scalar_val),
-    )
-    for sc in py_ones:
-        R = dpt.logical_xor(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_xor(dpt.asnumpy(X), sc)
-        assert (dpt.asnumpy(R) == E).all()
-
-        R = dpt.logical_xor(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-        E = np.logical_xor(sc, dpt.asnumpy(X))
-        assert (dpt.asnumpy(R) == E).all()
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_logical_xor_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.logical_xor(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_logical_xor_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.logical_xor(a, c)
diff --git a/dpctl/tests/elementwise/test_maximum_minimum.py b/dpctl/tests/elementwise/test_maximum_minimum.py
deleted file mode 100644
index 984b405a71..0000000000
--- a/dpctl/tests/elementwise/test_maximum_minimum.py
+++ /dev/null
@@ -1,314 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_maximum_minimum_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1_np = np.arange(sz)
-    np.random.shuffle(ar1_np)
-    ar1 = dpt.asarray(ar1_np, dtype=op1_dtype)
-    ar2_np = np.arange(sz)
-    np.random.shuffle(ar2_np)
-    ar2 = dpt.asarray(ar2_np, dtype=op2_dtype)
-
-    r = dpt.maximum(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.maximum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype))
-
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r = dpt.minimum(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.minimum(ar1_np.astype(op1_dtype), ar2_np.astype(op2_dtype))
-
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3_np = np.arange(sz)
-    np.random.shuffle(ar3_np)
-    ar3 = dpt.asarray(ar3_np, dtype=op1_dtype)
-    ar4_np = np.arange(2 * sz)
-    np.random.shuffle(ar4_np)
-    ar4 = dpt.asarray(ar4_np, dtype=op2_dtype)
-
-    r = dpt.maximum(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.maximum(
-        ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype)
-    )
-
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r = dpt.minimum(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.minimum(
-        ar3_np[::-1].astype(op1_dtype), ar4_np[::2].astype(op2_dtype)
-    )
-
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected).all()
-
-
-@pytest.mark.parametrize("op_dtype", ["c8", "c16"])
-def test_maximum_minimum_complex_matrix(op_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op_dtype, q)
-
-    sz = 127
-    ar1_np_real = np.random.randint(0, 10, sz)
-    ar1_np_imag = np.random.randint(0, 10, sz)
-    ar1 = dpt.asarray(ar1_np_real + 1j * ar1_np_imag, dtype=op_dtype)
-
-    ar2_np_real = np.random.randint(0, 10, sz)
-    ar2_np_imag = np.random.randint(0, 10, sz)
-    ar2 = dpt.asarray(ar2_np_real + 1j * ar2_np_imag, dtype=op_dtype)
-
-    r = dpt.maximum(ar1, ar2)
-    expected = np.maximum(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert_array_equal(dpt.asnumpy(r), expected)
-
-    r1 = dpt.maximum(ar1[::-2], ar2[::2])
-    expected1 = np.maximum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2]))
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert_array_equal(dpt.asnumpy(r1), expected1)
-
-    r = dpt.minimum(ar1, ar2)
-    expected = np.minimum(dpt.asnumpy(ar1), dpt.asnumpy(ar2))
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == expected.shape
-    assert_array_equal(dpt.asnumpy(r), expected)
-
-    r1 = dpt.minimum(ar1[::-2], ar2[::2])
-    expected1 = np.minimum(dpt.asnumpy(ar1[::-2]), dpt.asnumpy(ar2[::2]))
-    assert _compare_dtypes(r.dtype, expected1.dtype, sycl_queue=q)
-    assert r1.shape == expected1.shape
-    assert_array_equal(dpt.asnumpy(r1), expected1)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_maximum_minimum_real_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, 5.0, -3.0]
-    x = list(itertools.product(x, repeat=2))
-    Xnp = np.array([tup[0] for tup in x], dtype=dtype)
-    Ynp = np.array([tup[1] for tup in x], dtype=dtype)
-    X = dpt.asarray(Xnp, dtype=dtype)
-    Y = dpt.asarray(Ynp, dtype=dtype)
-
-    R = dpt.maximum(X, Y)
-    Rnp = np.maximum(Xnp, Ynp)
-    assert_array_equal(dpt.asnumpy(R), Rnp)
-
-    R = dpt.minimum(X, Y)
-    Rnp = np.minimum(Xnp, Ynp)
-    assert_array_equal(dpt.asnumpy(R), Rnp)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_maximum_minimum_complex_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, -np.inf, -np.inf, +2.0, -1.0]
-    x = [complex(*val) for val in itertools.product(x, repeat=2)]
-    x = list(itertools.product(x, repeat=2))
-
-    Xnp = np.array([tup[0] for tup in x], dtype=dtype)
-    Ynp = np.array([tup[1] for tup in x], dtype=dtype)
-    X = dpt.asarray(Xnp, dtype=dtype, sycl_queue=q)
-    Y = dpt.asarray(Ynp, dtype=dtype, sycl_queue=q)
-
-    R = dpt.maximum(X, Y)
-    Rnp = np.maximum(Xnp, Ynp)
-    assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp))
-    assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp))
-
-    R = dpt.minimum(X, Y)
-    Rnp = np.minimum(Xnp, Ynp)
-    assert_array_equal(dpt.asnumpy(dpt.real(R)), np.real(Rnp))
-    assert_array_equal(dpt.asnumpy(dpt.imag(R)), np.imag(Rnp))
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_maximum_minimum_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1_np = np.arange(sz, dtype="i4")
-    np.random.shuffle(ar1_np)
-    ar1 = dpt.asarray(ar1_np, usm_type=op1_usm_type)
-    ar2_np = np.arange(sz, dtype="i4")
-    np.random.shuffle(ar2_np)
-    ar2 = dpt.asarray(ar2_np, usm_type=op2_usm_type)
-
-    r = dpt.maximum(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-    r = dpt.minimum(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_maximum_minimum_order():
-    get_queue_or_skip()
-
-    ar1_np = np.arange(20 * 20, dtype="i4").reshape(20, 20)
-    np.random.shuffle(ar1_np)
-    ar1 = dpt.asarray(ar1_np, order="C")
-    ar2_np = np.arange(20 * 20, dtype="i4").reshape(20, 20)
-    np.random.shuffle(ar2_np)
-    ar2 = dpt.asarray(ar2_np, order="C")
-
-    r1 = dpt.maximum(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.maximum(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.maximum(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.maximum(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.asarray(ar1_np, order="F")
-    ar2 = dpt.asarray(ar2_np, order="F")
-    r1 = dpt.maximum(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.maximum(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.maximum(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.maximum(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1_np = np.arange(40 * 40, dtype="i4").reshape(40, 40)
-    np.random.shuffle(ar1_np)
-    ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2]
-    ar2_np = np.arange(40 * 40, dtype="i4").reshape(40, 40)
-    np.random.shuffle(ar2_np)
-    ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2]
-    r4 = dpt.maximum(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.asarray(ar1_np, order="C")[:20, ::-2].mT
-    ar2 = dpt.asarray(ar2_np, order="C")[:20, ::-2].mT
-    r4 = dpt.maximum(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_maximum_minimum_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.maximum(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.maximum(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-        R = dpt.minimum(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.minimum(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_maximum_minimum_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.maximum(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-    r = dpt.minimum(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_maximum_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.maximum(a, c)
-
-    with pytest.raises(ValueError):
-        dpt.minimum(a, c)
diff --git a/dpctl/tests/elementwise/test_multiply.py b/dpctl/tests/elementwise/test_multiply.py
deleted file mode 100644
index 3b2ed0fd96..0000000000
--- a/dpctl/tests/elementwise/test_multiply.py
+++ /dev/null
@@ -1,234 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_multiply_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.multiply(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.multiply(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.multiply(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.multiply(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_multiply_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.multiply(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_multiply_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.multiply(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.multiply(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.multiply(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.multiply(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.multiply(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.multiply(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.multiply(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.multiply(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.multiply(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.multiply(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_multiply_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(1, 6, dtype="i4")
-
-    r = dpt.multiply(m, v)
-
-    expected = np.multiply(
-        np.ones((100, 5), dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.multiply(v, m)
-    expected2 = np.multiply(
-        np.arange(1, 6, dtype="i4"), np.ones((100, 5), dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_multiply_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.multiply(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.multiply(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-@pytest.mark.parametrize("sc", [bool(1), int(1), float(1), complex(1)])
-def test_multiply_python_scalar_gh1219(arr_dt, sc):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    Xnp = np.ones(4, dtype=arr_dt)
-
-    X = dpt.ones(4, dtype=arr_dt, sycl_queue=q)
-
-    R = dpt.multiply(X, sc)
-    Rnp = np.multiply(Xnp, sc)
-    assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q)
-
-    # symmetric case
-    R = dpt.multiply(sc, X)
-    Rnp = np.multiply(sc, Xnp)
-    assert _compare_dtypes(R.dtype, Rnp.dtype, sycl_queue=q)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_multiply_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X *= int(1)
-    elif dt_kind == "f":
-        X *= float(1)
-    elif dt_kind == "c":
-        X *= complex(1)
-    elif dt_kind == "b":
-        X *= bool(1)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_multiply_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 *= ar2
-        assert (
-            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
-        ).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-        ar3[::-1] *= ar4[::2]
-        assert (
-            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
-        ).all()
-
-    else:
-        with pytest.raises(ValueError):
-            ar1 *= ar2
-
-
-def test_multiply_inplace_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    m *= v
-    assert (dpt.asnumpy(m) == np.arange(0, 5, dtype="i4")[np.newaxis, :]).all()
diff --git a/dpctl/tests/elementwise/test_negative.py b/dpctl/tests/elementwise/test_negative.py
deleted file mode 100644
index 117fa2a69c..0000000000
--- a/dpctl/tests/elementwise/test_negative.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_negative_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
-    assert dpt.negative(X).dtype == arg_dt
-
-    r = dpt.empty_like(X, dtype=arg_dt)
-    dpt.negative(X, out=r)
-    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.negative(X)))
-
-
-def test_negative_bool():
-    get_queue_or_skip()
-    x = dpt.ones(64, dtype="?")
-    with pytest.raises(ValueError):
-        dpt.negative(x)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_negative_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("i4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    Y = dpt.negative(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.negative(dpt.asnumpy(X))
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_negative_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.negative(np.ones(U.shape, dtype=U.dtype))
-        expected_Y[..., 1::2] = 0
-        expected_Y = np.transpose(expected_Y, perms)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.negative(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpctl/tests/elementwise/test_nextafter.py b/dpctl/tests/elementwise/test_nextafter.py
deleted file mode 100644
index 3e9b636be0..0000000000
--- a/dpctl/tests/elementwise/test_nextafter.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_nextafter_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.nextafter(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.nextafter(
-        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)
-
-    r = dpt.nextafter(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.nextafter(
-        np.ones(sz, dtype=op1_dtype), np.ones(sz, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:])
-def test_nextafter_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.nextafter(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.nextafter(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_nextafter_special_cases_nan(dt):
-    """If either x1_i or x2_i is NaN, the result is NaN."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([2.0, dpt.nan, dpt.nan], dtype=dt)
-    x2 = dpt.asarray([dpt.nan, 2.0, dpt.nan], dtype=dt)
-
-    y = dpt.nextafter(x1, x2)
-    assert dpt.all(dpt.isnan(y))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_nextafter_special_cases_zero(dt):
-    """If x1_i is equal to x2_i, the result is x2_i."""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x1 = dpt.asarray([-0.0, 0.0, -0.0, 0.0], dtype=dt)
-    x2 = dpt.asarray([0.0, -0.0, -0.0, 0.0], dtype=dt)
-
-    y = dpt.nextafter(x1, x2)
-    assert dpt.all(y == 0)
-
-    skip_checking_signs = (
-        x1.dtype == dpt.float16
-        and x1.sycl_device.backend == dpctl.backend_type.cuda
-    )
-    if skip_checking_signs:
-        pytest.skip(
-            "Skipped checking signs for nextafter due to "
-            "known issue in DPC++ support for CUDA devices"
-        )
-    else:
-        assert dpt.all(dpt.signbit(y) == dpt.signbit(x2))
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_nextafter_basic(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    s = 10
-    x1 = dpt.ones(s, dtype=dt, sycl_queue=q)
-    x2 = dpt.full(s, 2, dtype=dt, sycl_queue=q)
-
-    r = dpt.nextafter(x1, x2)
-    expected_diff = dpt.asarray(dpt.finfo(dt).eps, dtype=dt, sycl_queue=q)
-
-    assert dpt.all(r > 0)
-    assert dpt.all(r - x1 == expected_diff)
-
-    x3 = dpt.zeros(s, dtype=dt, sycl_queue=q)
-
-    r = dpt.nextafter(x3, x1)
-    assert dpt.all(r > 0)
-
-    r = dpt.nextafter(x1, x3)
-    assert dpt.all((r - x1) < 0)
-
-    r = dpt.nextafter(x1, 0)
-    assert dpt.all(x1 - r == (expected_diff) / 2)
-
-    r = dpt.nextafter(x3, dpt.inf)
-    assert dpt.all(r > 0)
-
-    r = dpt.nextafter(x3, -dpt.inf)
-    assert dpt.all(r < 0)
diff --git a/dpctl/tests/elementwise/test_not_equal.py b/dpctl/tests/elementwise/test_not_equal.py
deleted file mode 100644
index 3954ec7c71..0000000000
--- a/dpctl/tests/elementwise/test_not_equal.py
+++ /dev/null
@@ -1,208 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes)
-@pytest.mark.parametrize("op2_dtype", _all_dtypes)
-def test_not_equal_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.not_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.not_equal(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.not_equal(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.not_equal(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, False, dtype=r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_not_equal_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.not_equal(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_not_equal_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.not_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.not_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.not_equal(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.not_equal(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.not_equal(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.not_equal(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.not_equal(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.not_equal(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.not_equal(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.not_equal(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_not_equal_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    r = dpt.not_equal(m, v)
-    expected = np.full((100, 5), [True, False, True, True, True], dtype="?")
-
-    assert (dpt.asnumpy(r) == expected).all()
-
-    r2 = dpt.not_equal(v, m)
-    assert (dpt.asnumpy(r2) == expected).all()
-
-    r3 = dpt.empty_like(m, dtype="?")
-    dpt.not_equal(m, v, out=r3)
-    assert (dpt.asnumpy(r3) == expected).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_not_equal_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        R = dpt.not_equal(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        assert not dpt.all(R)
-        R = dpt.not_equal(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-        assert not dpt.all(R)
-
-
-class MockArray:
-    def __init__(self, arr):
-        self.data_ = arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self.data_.__sycl_usm_array_interface__
-
-
-def test_not_equal_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-    b = dpt.ones(10)
-    c = MockArray(b)
-    r = dpt.not_equal(a, c)
-    assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_not_equal_canary_mock_array():
-    get_queue_or_skip()
-    a = dpt.arange(10)
-
-    class Canary:
-        def __init__(self):
-            pass
-
-        @property
-        def __sycl_usm_array_interface__(self):
-            return None
-
-    c = Canary()
-    with pytest.raises(ValueError):
-        dpt.not_equal(a, c)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_not_equal_alignment(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n = 256
-    s = dpt.concat((dpt.zeros(n, dtype=dtype), dpt.zeros(n, dtype=dtype)))
-
-    mask = s[:-1] != s[1:]
-    (pos,) = dpt.nonzero(mask)
-    assert dpt.all(pos == n)
-
-    out_arr = dpt.zeros(2 * n, dtype=mask.dtype)
-    dpt.not_equal(s[:-1], s[1:], out=out_arr[1:])
-    (pos,) = dpt.nonzero(mask)
-    assert dpt.all(pos == (n + 1))
diff --git a/dpctl/tests/elementwise/test_positive.py b/dpctl/tests/elementwise/test_positive.py
deleted file mode 100644
index 5bdb90d9cb..0000000000
--- a/dpctl/tests/elementwise/test_positive.py
+++ /dev/null
@@ -1,79 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_positive_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
-    assert dpt.positive(X).dtype == arg_dt
-
-    r = dpt.empty_like(X, dtype=arg_dt)
-    dpt.positive(X, out=r)
-    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.positive(X)))
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_positive_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("i4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    Y = dpt.positive(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = dpt.asnumpy(X)
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_positive_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.ones(U.shape, dtype=U.dtype)
-        expected_Y[..., 1::2] = 0
-        expected_Y = np.transpose(expected_Y, perms)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.positive(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py
deleted file mode 100644
index 7bf4002eaf..0000000000
--- a/dpctl/tests/elementwise/test_pow.py
+++ /dev/null
@@ -1,212 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
-def test_power_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.pow(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.power(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.pow(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.power(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_power_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.pow(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_pow_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.pow(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.pow(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.pow(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.pow(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.pow(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.pow(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.pow(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.pow(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.pow(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.pow(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-def test_pow_broadcasting():
-    get_queue_or_skip()
-
-    v = dpt.arange(1, 6, dtype="i4")
-    m = dpt.full((100, 5), 2, dtype="i4")
-
-    r = dpt.pow(m, v)
-
-    expected = np.power(
-        np.full((100, 5), 2, dtype="i4"), np.arange(1, 6, dtype="i4")
-    )
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-    r2 = dpt.pow(v, m)
-    expected2 = np.power(
-        np.arange(1, 6, dtype="i4"), np.full((100, 5), 2, dtype="i4")
-    )
-    assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_pow_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        complex(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.pow(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.pow(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_pow_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X **= int(1)
-    elif dt_kind == "f":
-        X **= float(1)
-    elif dt_kind == "c":
-        X **= complex(1)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
-def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 **= ar2
-        assert (
-            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
-        ).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-        ar3[::-1] *= ar4[::2]
-        assert (
-            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
-        ).all()
-
-    else:
-        with pytest.raises(ValueError):
-            ar1 **= ar2
-
-
-def test_pow_inplace_basic():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    expected = dpt.square(x)
-    x **= 2
-
-    assert dpt.all(x == expected)
diff --git a/dpctl/tests/elementwise/test_reciprocal.py b/dpctl/tests/elementwise/test_reciprocal.py
deleted file mode 100644
index f74c07bca5..0000000000
--- a/dpctl/tests/elementwise/test_reciprocal.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _complex_fp_dtypes
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_reciprocal_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    one = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    expected_dtype = dpt.divide(one, x).dtype
-    assert dpt.reciprocal(x).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_reciprocal_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    res = dpt.reciprocal(x)
-    expected = 1 / x
-    tol = 8 * dpt.finfo(res.dtype).resolution
-    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_reciprocal_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2054
-
-    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    res = dpt.reciprocal(x)
-    expected = 1 / x
-    tol = 8 * dpt.finfo(res.dtype).resolution
-    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
-
-
-def test_reciprocal_special_cases():
-    get_queue_or_skip()
-
-    x = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    res = dpt.reciprocal(x)
-    expected = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf, 0.0, -0.0], dtype="f4")
-    assert dpt.allclose(res, expected, equal_nan=True)
-
-
-@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
-def test_reciprocal_complex_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    nans_ = [dpt.nan, -dpt.nan]
-    infs_ = [dpt.inf, -dpt.inf]
-    finites_ = [-1.0, -0.0, 0.0, 1.0]
-    inps_ = nans_ + infs_ + finites_
-    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
-
-    z = dpt.asarray(c_, dtype=dtype)
-    r = dpt.reciprocal(z)
-
-    expected = 1 / z
-
-    tol = dpt.finfo(r.dtype).resolution
-
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_remainder.py b/dpctl/tests/elementwise/test_remainder.py
deleted file mode 100644
index 1abeb94a73..0000000000
--- a/dpctl/tests/elementwise/test_remainder.py
+++ /dev/null
@@ -1,260 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
-def test_remainder_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.remainder(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.remainder(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.remainder(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected = np.remainder(
-        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
-    )
-    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_remainder_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.remainder(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_remainder_order():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="C")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="C")
-    r1 = dpt.remainder(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.remainder(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.remainder(ar1, ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.remainder(ar1, ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones((20, 20), dtype="i4", order="F")
-    ar2 = dpt.ones((20, 20), dtype="i4", order="F")
-    r1 = dpt.remainder(ar1, ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.remainder(ar1, ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.remainder(ar1, ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.remainder(ar1, ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2]
-    r4 = dpt.remainder(ar1, ar2, order="K")
-    assert r4.strides == (20, -1)
-
-    ar1 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones((40, 40), dtype="i4", order="C")[:20, ::-2].mT
-    r4 = dpt.remainder(ar1, ar2, order="K")
-    assert r4.strides == (-1, 20)
-
-
-@pytest.mark.parametrize("dt", _no_complex_dtypes[1:8:2])
-def test_remainder_negative_integers(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.arange(-5, -1, 1, dtype=dt, sycl_queue=q)
-    x_np = np.arange(-5, -1, 1, dtype=dt)
-    val = 3
-
-    r1 = dpt.remainder(x, val)
-    expected = np.remainder(x_np, val)
-    assert (dpt.asnumpy(r1) == expected).all()
-
-    r2 = dpt.remainder(val, x)
-    expected = np.remainder(val, x_np)
-    assert (dpt.asnumpy(r2) == expected).all()
-
-
-def test_remainder_integer_zero():
-    get_queue_or_skip()
-
-    for dt in ["i4", "u4"]:
-        x = dpt.ones(1, dtype=dt)
-        y = dpt.zeros_like(x)
-
-        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
-
-        x = dpt.astype(x, dt)
-        y = dpt.zeros_like(x)
-
-        assert (dpt.asnumpy(dpt.remainder(x, y)) == np.zeros(1, dtype=dt)).all()
-
-
-@pytest.mark.parametrize("dt", _no_complex_dtypes[9:])
-def test_remainder_negative_floats(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.linspace(-5, 5, 20, dtype=dt, sycl_queue=q)
-    x_np = np.linspace(-5, 5, 20, dtype=dt)
-    val = 3
-
-    tol = 8 * dpt.finfo(dt).resolution
-
-    r1 = dpt.remainder(x, val)
-    expected = np.remainder(x_np, val)
-    with np.errstate(invalid="ignore"):
-        np.allclose(
-            dpt.asnumpy(r1), expected, rtol=tol, atol=tol, equal_nan=True
-        )
-
-    r2 = dpt.remainder(val, x)
-    expected = np.remainder(val, x_np)
-    with np.errstate(invalid="ignore"):
-        np.allclose(
-            dpt.asnumpy(r2), expected, rtol=tol, atol=tol, equal_nan=True
-        )
-
-
-def test_remainder_special_cases():
-    get_queue_or_skip()
-
-    lhs = [dpt.nan, dpt.inf, 0.0, -0.0, -0.0, 1.0, dpt.inf, -dpt.inf]
-    rhs = [dpt.nan, dpt.inf, -0.0, 1.0, 1.0, 0.0, 1.0, -1.0]
-
-    x, y = dpt.asarray(lhs, dtype="f4"), dpt.asarray(rhs, dtype="f4")
-
-    x_np, y_np = np.asarray(lhs, dtype="f4"), np.asarray(rhs, dtype="f4")
-
-    res = dpt.remainder(x, y)
-
-    with np.errstate(invalid="ignore"):
-        np.allclose(dpt.asnumpy(res), np.remainder(x_np, y_np))
-
-
-@pytest.mark.parametrize("arr_dt", _no_complex_dtypes)
-def test_remainder_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_ones = (
-        bool(1),
-        int(1),
-        float(1),
-        np.float32(1),
-        ctypes.c_int(1),
-    )
-    for sc in py_ones:
-        R = dpt.remainder(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.remainder(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
-def test_remainder_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X %= int(1)
-    elif dt_kind == "f":
-        X %= float(1)
-
-
-@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
-def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 %= ar2
-        assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-        ar3[::-1] %= ar4[::2]
-        assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype))
-
-    else:
-        with pytest.raises(ValueError):
-            ar1 %= ar2
-
-
-def test_remainder_inplace_basic():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    expected = x & 1
-    x %= 2
-
-    assert dpt.all(x == expected)
diff --git a/dpctl/tests/elementwise/test_round.py b/dpctl/tests/elementwise/test_round.py
deleted file mode 100644
index 466e015490..0000000000
--- a/dpctl/tests/elementwise/test_round.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_array_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_round_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0.1, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.round(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.round(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_round_real_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    Xnp = np.linspace(0.01, 88.1, num=n_seq, dtype=dtype)
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt.round(X)
-    Ynp = np.round(Xnp)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(Y), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt.round(X, out=Z)
-
-    assert_allclose(dpt.asnumpy(Z), np.repeat(Ynp, n_rep), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_round_complex_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    low = -88.0
-    high = 88.0
-    x1 = np.random.uniform(low=low, high=high, size=n_seq)
-    x2 = np.random.uniform(low=low, high=high, size=n_seq)
-    Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt.round(X)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(Y), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt.round(X, out=Z)
-
-    assert_allclose(
-        dpt.asnumpy(Z), np.repeat(np.round(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_round_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 16.2
-    X[..., 1::2] = 23.7
-
-    Y = dpt.round(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.round(np.float32(16.2))
-    expected_Y[..., 1::2] = np.round(np.float32(23.7))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_round_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 8.8
-    X[..., 1::2] = 11.3
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.round(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.round(U, order=ord)
-            assert_allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_round_real_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
-    Xnp = np.array(x, dtype=dtype)
-    X = dpt.asarray(x, dtype=dtype)
-
-    Y = dpt.asnumpy(dpt.round(X))
-    Ynp = np.round(Xnp)
-    assert_allclose(Y, Ynp, atol=tol, rtol=tol)
-    assert_array_equal(np.signbit(Y), np.signbit(Ynp))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_round_real_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    for ii in sizes:
-        Xnp = np.random.uniform(low=0.01, high=88.1, size=ii)
-        Xnp.astype(dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np.round(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt.round(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_round_complex_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    low = -88.0
-    high = 88.0
-    for ii in sizes:
-        x1 = np.random.uniform(low=low, high=high, size=ii)
-        x2 = np.random.uniform(low=low, high=high, size=ii)
-        Xnp = np.array([complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np.round(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt.round(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_round_complex_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, 1.5, 2.5, -1.5, -2.5, 0.0, -0.0]
-    xc = [complex(*val) for val in itertools.product(x, repeat=2)]
-
-    Xc_np = np.array(xc, dtype=dtype)
-    Xc = dpt.asarray(Xc_np, dtype=dtype, sycl_queue=q)
-
-    Ynp = np.round(Xc_np)
-    Y = dpt.round(Xc)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(dpt.real(Y)), np.real(Ynp), atol=tol, rtol=tol)
-    assert_allclose(dpt.asnumpy(dpt.imag(Y)), np.imag(Ynp), atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py
deleted file mode 100644
index e265e8e426..0000000000
--- a/dpctl/tests/elementwise/test_rsqrt.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
-
-
-@pytest.mark.parametrize("dtype", _no_complex_dtypes)
-def test_rsqrt_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.rsqrt(x).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_rsqrt_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    res = dpt.rsqrt(x)
-    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
-    tol = 8 * dpt.finfo(res.dtype).resolution
-    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_rsqrt_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2054
-
-    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    res = dpt.rsqrt(x)
-    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
-    tol = 8 * dpt.finfo(res.dtype).resolution
-    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
-
-
-def test_rsqrt_special_cases():
-    get_queue_or_skip()
-
-    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
-    res = dpt.rsqrt(x)
-    expected = dpt.asarray(
-        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
-    )
-    assert dpt.allclose(res, expected, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_sign.py b/dpctl/tests/elementwise/test_sign.py
deleted file mode 100644
index 1150eeab25..0000000000
--- a/dpctl/tests/elementwise/test_sign.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_sign_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.asarray(0, dtype=arg_dt, sycl_queue=q)
-    assert dpt.sign(X).dtype == arg_dt
-
-    r = dpt.empty_like(X, dtype=arg_dt)
-    dpt.sign(X, out=r)
-    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.sign(X)))
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_sign_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("i4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    Y = dpt.sign(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = dpt.asnumpy(X)
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_sign_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    expected_dt = np.sign(np.ones(tuple(), dtype=arg_dt)).dtype
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.ones(U.shape, dtype=expected_dt)
-        expected_Y[..., 1::2] = 0
-        expected_Y = np.transpose(expected_Y, perms)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.sign(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_sign_complex(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    Xnp = np.random.standard_normal(
-        size=input_shape
-    ) + 1j * np.random.standard_normal(size=input_shape)
-    Xnp = Xnp.astype(arg_dt)
-    X[...] = Xnp
-
-    for ord in ["C", "F", "A", "K"]:
-        for perms in itertools.permutations(range(4)):
-            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-            Y = dpt.sign(U, order=ord)
-            X_t = np.transpose(Xnp[:, ::-1, ::-1, :], perms)
-            expected_Y = X_t / np.abs(X_t)
-            tol = dpt.finfo(Y.dtype).resolution
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
-
-
-# test for all signed real data types
-@pytest.mark.parametrize(
-    "dt", _no_complex_dtypes[1:8:2] + _no_complex_dtypes[9:]
-)
-def test_sign_negative(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.arange(-20, 20, 1, dtype=dt, sycl_queue=q)
-    x_np = np.arange(-20, 20, 1, dtype=dt)
-    res = dpt.sign(x)
-
-    assert (dpt.asnumpy(res) == np.sign(x_np)).all()
diff --git a/dpctl/tests/elementwise/test_signbit.py b/dpctl/tests/elementwise/test_signbit.py
deleted file mode 100644
index 58249f9398..0000000000
--- a/dpctl/tests/elementwise/test_signbit.py
+++ /dev/null
@@ -1,108 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_signbit_out_type_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
-    sb = dpt.signbit(x)
-    assert sb.dtype == dpt.bool
-
-    assert not dpt.any(sb)
-
-    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
-    sb2 = dpt.signbit(x2)
-    assert dpt.all(sb2)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_signbit_out_type_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    x = dpt.linspace(1, 10, num=256, dtype=arg_dt)
-    sb = dpt.signbit(x[::-3])
-    assert sb.dtype == dpt.bool
-
-    assert not dpt.any(sb)
-
-    x2 = dpt.linspace(-10, -1, num=256, dtype=arg_dt)
-    sb2 = dpt.signbit(x2[::-3])
-    assert dpt.all(sb2)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_signbit_special_cases_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    n = 63
-    x1 = dpt.full(n, -dpt.inf, dtype=arg_dt)
-    x2 = dpt.full(n, -0.0, dtype=arg_dt)
-    x3 = dpt.full(n, 0.0, dtype=arg_dt)
-    x4 = dpt.full(n, dpt.inf, dtype=arg_dt)
-
-    x = dpt.concat((x1, x2, x3, x4))
-    actual = dpt.signbit(x)
-
-    expected = dpt.concat(
-        (
-            dpt.full(x1.size, True),
-            dpt.full(x2.size, True),
-            dpt.full(x3.size, False),
-            dpt.full(x4.size, False),
-        )
-    )
-
-    assert dpt.all(dpt.equal(actual, expected))
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_signbit_special_cases_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    x1 = dpt.full(63, -dpt.inf, dtype=arg_dt)
-    x2 = dpt.full(63, -0.0, dtype=arg_dt)
-    x3 = dpt.full(63, 0.0, dtype=arg_dt)
-    x4 = dpt.full(63, dpt.inf, dtype=arg_dt)
-
-    x = dpt.concat((x1, x2, x3, x4))
-    actual = dpt.signbit(x[::-1])
-
-    expected = dpt.concat(
-        (
-            dpt.full(x4.size, False),
-            dpt.full(x3.size, False),
-            dpt.full(x2.size, True),
-            dpt.full(x1.size, True),
-        )
-    )
-
-    assert dpt.all(dpt.equal(actual, expected))
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
deleted file mode 100644
index 8a4d013476..0000000000
--- a/dpctl/tests/elementwise/test_sqrt.py
+++ /dev/null
@@ -1,192 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-import warnings
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose, assert_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import (
-    _all_dtypes,
-    _complex_fp_dtypes,
-    _map_to_device_dtype,
-    _real_fp_dtypes,
-    _usm_types,
-)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_sqrt_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np.sqrt(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt.sqrt(X).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_sqrt_output_contig(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 1027
-
-    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.sqrt(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
-def test_sqrt_output_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 2054
-
-    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
-    Xnp = dpt.asnumpy(X)
-
-    Y = dpt.sqrt(X)
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_sqrt_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("f4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 16.0
-    X[..., 1::2] = 23.0
-
-    Y = dpt.sqrt(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = np.empty(input_shape, dtype=arg_dt)
-    expected_Y[..., 0::2] = np.sqrt(np.float32(16.0))
-    expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
-    tol = 8 * dpt.finfo(Y.dtype).resolution
-
-    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_sqrt_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 16.0
-    X[..., 1::2] = 23.0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.sqrt(dpt.asnumpy(U))
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.sqrt(U, order=ord)
-            tol = 8 * max(
-                dpt.finfo(Y.dtype).resolution,
-                np.finfo(expected_Y.dtype).resolution,
-            )
-            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
-
-
-@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
-def test_sqrt_special_cases():
-    q = get_queue_or_skip()
-
-    X = dpt.asarray(
-        [dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q
-    )
-    Xnp = dpt.asnumpy(X)
-
-    assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
-
-
-@pytest.mark.parametrize("dtype", _real_fp_dtypes)
-def test_sqrt_real_fp_special_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    nans_ = [dpt.nan, -dpt.nan]
-    infs_ = [dpt.inf, -dpt.inf]
-    finites_ = [-1.0, -0.0, 0.0, 1.0]
-    inps_ = nans_ + infs_ + finites_
-
-    x = dpt.asarray(inps_, dtype=dtype)
-    r = dpt.sqrt(x)
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        expected_np = np.sqrt(np.asarray(inps_, dtype=dtype))
-
-    expected = dpt.asarray(expected_np, dtype=dtype)
-    tol = dpt.finfo(r.dtype).resolution
-
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True)
-
-
-@pytest.mark.parametrize("dtype", _complex_fp_dtypes)
-def test_sqrt_complex_fp_special_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    nans_ = [dpt.nan, -dpt.nan]
-    infs_ = [dpt.inf, -dpt.inf]
-    finites_ = [-1.0, -0.0, 0.0, 1.0]
-    inps_ = nans_ + infs_ + finites_
-    c_ = [complex(*v) for v in itertools.product(inps_, repeat=2)]
-
-    z = dpt.asarray(c_, dtype=dtype)
-    r = dpt.sqrt(z)
-
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        expected_np = np.sqrt(np.asarray(c_, dtype=dtype))
-
-    expected = dpt.asarray(expected_np, dtype=dtype)
-    tol = dpt.finfo(r.dtype).resolution
-
-    if not dpt.allclose(r, expected, atol=tol, rtol=tol, equal_nan=True):
-        for i in range(r.shape[0]):
-            failure_data = []
-            if not dpt.allclose(
-                r[i], expected[i], atol=tol, rtol=tol, equal_nan=True
-            ):
-                msg = (
-                    f"Test failed for input {z[i]}, i.e. {c_[i]} for index {i}"
-                )
-                msg += f", results were {r[i]} vs. {expected[i]}"
-                failure_data.extend(msg)
-        pytest.skip(reason=msg)
diff --git a/dpctl/tests/elementwise/test_square.py b/dpctl/tests/elementwise/test_square.py
deleted file mode 100644
index 4513aeac70..0000000000
--- a/dpctl/tests/elementwise/test_square.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_square_out_type(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    X = dpt.arange(5, dtype=arg_dt, sycl_queue=q)
-    assert dpt.square(X).dtype == arg_dt
-
-    r = dpt.empty_like(X, dtype=arg_dt)
-    dpt.square(X, out=r)
-    assert np.allclose(dpt.asnumpy(r), dpt.asnumpy(dpt.square(X)))
-
-
-@pytest.mark.parametrize("usm_type", _usm_types)
-def test_square_usm_type(usm_type):
-    q = get_queue_or_skip()
-
-    arg_dt = np.dtype("i4")
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
-    X[..., 0::2] = 1
-    X[..., 1::2] = 0
-
-    Y = dpt.square(X)
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == X.sycl_queue
-    assert Y.flags.c_contiguous
-
-    expected_Y = dpt.asnumpy(X)
-    assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_square_order(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    arg_dt = np.dtype(dtype)
-    input_shape = (10, 10, 10, 10)
-    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
-    X[..., 0::2] = 2
-    X[..., 1::2] = 0
-
-    for perms in itertools.permutations(range(4)):
-        U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
-        expected_Y = np.full(U.shape, 4, dtype=U.dtype)
-        expected_Y[..., 1::2] = 0
-        expected_Y = np.transpose(expected_Y, perms)
-        for ord in ["C", "F", "A", "K"]:
-            Y = dpt.square(U, order=ord)
-            assert np.allclose(dpt.asnumpy(Y), expected_Y)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_square_special_cases(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    vals = [np.nan, np.inf, -np.inf, 0.0, -0.0]
-    X = dpt.asarray(vals, dtype=dtype, sycl_queue=q)
-    X_np = dpt.asnumpy(X)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    with np.errstate(all="ignore"):
-        assert np.allclose(
-            dpt.asnumpy(dpt.square(X)),
-            np.square(X_np),
-            atol=tol,
-            rtol=tol,
-            equal_nan=True,
-        )
diff --git a/dpctl/tests/elementwise/test_subtract.py b/dpctl/tests/elementwise/test_subtract.py
deleted file mode 100644
index 0268586f6b..0000000000
--- a/dpctl/tests/elementwise/test_subtract.py
+++ /dev/null
@@ -1,235 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _can_cast
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _compare_dtypes, _usm_types
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
-def test_subtract_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    r = dpt.subtract(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.subtract(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar1.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
-    assert r.sycl_queue == ar1.sycl_queue
-
-    r2 = dpt.empty_like(ar1, dtype=r.dtype)
-    dpt.subtract(ar1, ar2, out=r2)
-    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
-
-    ar3 = dpt.ones(sz, dtype=op1_dtype)
-    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-    r = dpt.subtract(ar3[::-1], ar4[::2])
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_dtype = np.subtract(
-        np.zeros(1, dtype=op1_dtype), np.zeros(1, dtype=op2_dtype)
-    ).dtype
-    assert _compare_dtypes(r.dtype, expected_dtype, sycl_queue=q)
-    assert r.shape == ar3.shape
-    assert (dpt.asnumpy(r) == np.full(r.shape, 0, dtype=r.dtype)).all()
-
-    r2 = dpt.empty_like(ar1, dtype=r.dtype)
-    dpt.subtract(ar3[::-1], ar4[::2], out=r2)
-    assert (dpt.asnumpy(r2) == np.full(r2.shape, 0, dtype=r2.dtype)).all()
-
-
-def test_subtract_bool():
-    get_queue_or_skip()
-    ar1 = dpt.ones(127, dtype="?")
-    ar2 = dpt.ones_like(ar1, dtype="?")
-    with pytest.raises(ValueError):
-        dpt.subtract(ar1, ar2)
-
-
-@pytest.mark.parametrize("op1_usm_type", _usm_types)
-@pytest.mark.parametrize("op2_usm_type", _usm_types)
-def test_subtract_usm_type_matrix(op1_usm_type, op2_usm_type):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=op1_usm_type)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=op2_usm_type)
-
-    r = dpt.subtract(ar1, ar2)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (op1_usm_type, op2_usm_type)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-def test_subtract_order():
-    get_queue_or_skip()
-
-    test_shape = (
-        20,
-        20,
-    )
-    test_shape2 = tuple(2 * dim for dim in test_shape)
-    n = test_shape[-1]
-
-    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="C")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="C")
-        r1 = dpt.subtract(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.subtract(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.subtract(ar1, ar2, order="A")
-        assert r3.flags.c_contiguous
-        r4 = dpt.subtract(ar1, ar2, order="K")
-        assert r4.flags.c_contiguous
-
-        ar1 = dpt.ones(test_shape, dtype=dt1, order="F")
-        ar2 = dpt.ones(test_shape, dtype=dt2, order="F")
-        r1 = dpt.subtract(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.subtract(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.subtract(ar1, ar2, order="A")
-        assert r3.flags.f_contiguous
-        r4 = dpt.subtract(ar1, ar2, order="K")
-        assert r4.flags.f_contiguous
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
-        r4 = dpt.subtract(ar1, ar2, order="K")
-        assert r4.strides == (n, -1)
-        r5 = dpt.subtract(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-        ar1 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
-        ar2 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
-        r4 = dpt.subtract(ar1, ar2, order="K")
-        assert r4.strides == (-1, n)
-        r5 = dpt.subtract(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-
-def test_subtract_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    r = dpt.subtract(m, v)
-    assert (
-        dpt.asnumpy(r) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
-    ).all()
-
-    r2 = dpt.subtract(v, m)
-    assert (
-        dpt.asnumpy(r2) == np.arange(-1, 4, dtype="i4")[np.newaxis, :]
-    ).all()
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes[1:])
-def test_subtract_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    X = dpt.zeros((10, 10), dtype=arr_dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        R = dpt.subtract(X, sc)
-        assert isinstance(R, dpt.usm_ndarray)
-        R = dpt.subtract(sc, X)
-        assert isinstance(R, dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_subtract_inplace_python_scalar(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
-    dt_kind = X.dtype.kind
-    if dt_kind in "ui":
-        X -= int(0)
-    elif dt_kind == "f":
-        X -= float(0)
-    elif dt_kind == "c":
-        X -= complex(0)
-
-
-@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
-@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
-def test_subtract_inplace_dtype_matrix(op1_dtype, op2_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(op1_dtype, q)
-    skip_if_dtype_not_supported(op2_dtype, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=op1_dtype)
-    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64, casting="same_kind"):
-        ar1 -= ar2
-        assert (dpt.asnumpy(ar1) == np.zeros(ar1.shape, dtype=ar1.dtype)).all()
-
-        ar3 = dpt.ones(sz, dtype=op1_dtype)
-        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
-
-        ar3[::-1] -= ar4[::2]
-        assert (dpt.asnumpy(ar3) == np.zeros(ar3.shape, dtype=ar3.dtype)).all()
-
-    else:
-        with pytest.raises(ValueError):
-            ar1 -= ar2
-
-
-def test_subtract_inplace_broadcasting():
-    get_queue_or_skip()
-
-    m = dpt.ones((100, 5), dtype="i4")
-    v = dpt.arange(5, dtype="i4")
-
-    m -= v
-    assert (
-        dpt.asnumpy(m) == np.arange(1, -4, step=-1, dtype="i4")[np.newaxis, :]
-    ).all()
diff --git a/dpctl/tests/elementwise/test_trigonometric.py b/dpctl/tests/elementwise/test_trigonometric.py
deleted file mode 100644
index 7fc3d84a56..0000000000
--- a/dpctl/tests/elementwise/test_trigonometric.py
+++ /dev/null
@@ -1,216 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-from .utils import _all_dtypes, _map_to_device_dtype
-
-_trig_funcs = [(np.sin, dpt.sin), (np.cos, dpt.cos), (np.tan, dpt.tan)]
-_inv_trig_funcs = [
-    (np.arcsin, dpt.asin),
-    (np.arccos, dpt.acos),
-    (np.arctan, dpt.atan),
-]
-_all_funcs = _trig_funcs + _inv_trig_funcs
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_trig_out_type(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.asarray(0, dtype=dtype, sycl_queue=q)
-    expected_dtype = np_call(np.array(0, dtype=dtype)).dtype
-    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
-    assert dpt_call(x).dtype == expected_dtype
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_trig_real_contig(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 100
-    n_rep = 137
-    if np_call in _trig_funcs:
-        Xnp = np.linspace(
-            -np.pi / 2 * 0.99, np.pi / 2 * 0.99, num=n_seq, dtype=dtype
-        )
-    if np_call == np.arctan:
-        Xnp = np.linspace(-100.0, 100.0, num=n_seq, dtype=dtype)
-    else:
-        Xnp = np.linspace(-1.0, 1.0, num=n_seq, dtype=dtype)
-
-    X = dpt.asarray(np.repeat(Xnp, n_rep), dtype=dtype, sycl_queue=q)
-    Y = dpt_call(X)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(Y), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(
-        dpt.asnumpy(Z), np.repeat(np_call(Xnp), n_rep), atol=tol, rtol=tol
-    )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_trig_complex_contig(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n_seq = 256
-    n_rep = 137
-    low = -9.0
-    high = 9.0
-    x1 = np.random.uniform(low=low, high=high, size=n_seq)
-    x2 = np.random.uniform(low=low, high=high, size=n_seq)
-    Xnp = x1 + 1j * x2
-
-    # stay away from poles and branch lines
-    modulus = np.abs(Xnp)
-    sel = np.logical_or(
-        modulus < 0.9,
-        np.logical_and(
-            modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
-        ),
-    )
-    Xnp = Xnp[sel]
-
-    X = dpt.repeat(dpt.asarray(Xnp, dtype=dtype, sycl_queue=q), n_rep)
-    Y = dpt_call(X)
-
-    expected = np.repeat(np_call(Xnp), n_rep)
-
-    tol = 50 * dpt.finfo(dtype).resolution
-    assert_allclose(dpt.asnumpy(Y), expected, atol=tol, rtol=tol)
-
-    Z = dpt.empty_like(X, dtype=dtype)
-    dpt_call(X, out=Z)
-
-    assert_allclose(dpt.asnumpy(Z), expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_trig_real_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 3, 4, 6, 8, 9, 24, 50, 72]
-    tol = 8 * dpt.finfo(dtype).resolution
-
-    low = -100.0
-    high = 100.0
-    if np_call in [np.arccos, np.arcsin]:
-        low = -1.0
-        high = 1.0
-    elif np_call in [np.tan]:
-        low = -np.pi / 2 * (0.99)
-        high = np.pi / 2 * (0.99)
-
-    for ii in sizes:
-        Xnp = np.random.uniform(low=low, high=high, size=ii)
-        Xnp.astype(dtype)
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_trig_complex_strided(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    np.random.seed(42)
-    strides = np.array([-4, -3, -2, -1, 1, 2, 3, 4])
-    sizes = [2, 4, 6, 8, 9, 24, 72]
-    tol = 50 * dpt.finfo(dtype).resolution
-
-    low = -9.0
-    high = 9.0
-    while True:
-        x1 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
-        x2 = np.random.uniform(low=low, high=high, size=2 * sum(sizes))
-        Xnp_all = np.array(
-            [complex(v1, v2) for v1, v2 in zip(x1, x2)], dtype=dtype
-        )
-
-        # stay away from poles and branch lines
-        modulus = np.abs(Xnp_all)
-        sel = np.logical_or(
-            modulus < 0.9,
-            np.logical_and(
-                modulus > 1.2, np.minimum(np.abs(x2), np.abs(x1)) > 0.05
-            ),
-        )
-        Xnp_all = Xnp_all[sel]
-        if Xnp_all.size > sum(sizes):
-            break
-
-    pos = 0
-    for ii in sizes:
-        pos = pos + ii
-        Xnp = Xnp_all[:pos]
-        Xnp = Xnp[-ii:]
-        X = dpt.asarray(Xnp)
-        Ynp = np_call(Xnp)
-        for jj in strides:
-            assert_allclose(
-                dpt.asnumpy(dpt_call(X[::jj])),
-                Ynp[::jj],
-                atol=tol,
-                rtol=tol,
-            )
-
-
-@pytest.mark.parametrize("np_call, dpt_call", _all_funcs)
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_trig_real_special_cases(np_call, dpt_call, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = [np.nan, np.inf, -np.inf, 2.0, -2.0, +0.0, -0.0, +1.0, -1.0]
-
-    xf = np.array(x, dtype=dtype)
-    yf = dpt.asarray(xf, dtype=dtype, sycl_queue=q)
-
-    with np.errstate(all="ignore"):
-        Y_np = np_call(xf)
-
-    tol = 8 * dpt.finfo(dtype).resolution
-    Y = dpt_call(yf)
-    assert_allclose(dpt.asnumpy(Y), Y_np, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_type_utils.py b/dpctl/tests/elementwise/test_type_utils.py
deleted file mode 100644
index 04b8629db4..0000000000
--- a/dpctl/tests/elementwise/test_type_utils.py
+++ /dev/null
@@ -1,239 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._type_utils as tu
-
-from .utils import _all_dtypes, _map_to_device_dtype
-
-
-class MockDevice:
-    def __init__(self, fp16: bool, fp64: bool):
-        self.has_aspect_fp16 = fp16
-        self.has_aspect_fp64 = fp64
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_type_utils_map_to_device_type(dtype):
-    for fp64 in [
-        True,
-        False,
-    ]:
-        for fp16 in [True, False]:
-            dev = MockDevice(fp16, fp64)
-            dt_in = dpt.dtype(dtype)
-            dt_out = _map_to_device_dtype(dt_in, dev)
-            assert isinstance(dt_out, dpt.dtype)
-
-
-def test_type_util_all_data_types():
-    for fp64 in [
-        True,
-        False,
-    ]:
-        for fp16 in [True, False]:
-            r = tu._all_data_types(fp16, fp64)
-            assert isinstance(r, list)
-            # 11: bool + 4 signed + 4 unsigned inegral + float32 + complex64
-            assert len(r) == 11 + int(fp16) + 2 * int(fp64)
-
-
-def test_type_util_can_cast():
-    for fp64 in [
-        True,
-        False,
-    ]:
-        for fp16 in [True, False]:
-            for from_ in _all_dtypes:
-                for to_ in _all_dtypes:
-                    r = tu._can_cast(
-                        dpt.dtype(from_), dpt.dtype(to_), fp16, fp64
-                    )
-                    assert isinstance(r, bool)
-
-
-def test_type_utils_find_buf_dtype():
-    def _denier_fn(dt):
-        return False
-
-    for fp64 in [
-        True,
-        False,
-    ]:
-        for fp16 in [True, False]:
-            dev = MockDevice(fp16, fp64)
-            arg_dt = dpt.float64
-            r = tu._find_buf_dtype(
-                arg_dt, _denier_fn, dev, tu._acceptance_fn_default_unary
-            )
-            assert r == (
-                None,
-                None,
-            )
-
-
-def test_type_utils_get_device_default_type():
-    with pytest.raises(RuntimeError):
-        tu._get_device_default_dtype("-", MockDevice(True, True))
-    try:
-        dev = dpctl.SyclDevice()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    for k in ["b", "i", "u", "f", "c"]:
-        dt = tu._get_device_default_dtype(k, dev)
-        assert isinstance(dt, dpt.dtype)
-        assert dt.kind == k
-
-
-def test_type_utils_find_buf_dtype2():
-    def _denier_fn(dt1, dt2):
-        return False
-
-    for fp64 in [
-        True,
-        False,
-    ]:
-        for fp16 in [True, False]:
-            dev = MockDevice(fp16, fp64)
-            arg1_dt = dpt.float64
-            arg2_dt = dpt.complex64
-            r = tu._find_buf_dtype2(
-                arg1_dt,
-                arg2_dt,
-                _denier_fn,
-                dev,
-                tu._acceptance_fn_default_binary,
-            )
-            assert r == (
-                None,
-                None,
-                None,
-            )
-
-
-def test_unary_func_arg_validation():
-    with pytest.raises(TypeError):
-        dpt.abs([1, 2, 3])
-    try:
-        a = dpt.arange(8)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    dpt.abs(a, order="invalid")
-
-
-def test_binary_func_arg_validation():
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
-        dpt.add([1, 2, 3], 1)
-    try:
-        a = dpt.arange(8)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.add(a, Ellipsis)
-    dpt.add(a, a, order="invalid")
-
-
-def test_all_data_types():
-    fp16_fp64_types = set([dpt.float16, dpt.float64, dpt.complex128])
-    fp64_types = set([dpt.float64, dpt.complex128])
-
-    all_dts = tu._all_data_types(True, True)
-    assert fp16_fp64_types.issubset(all_dts)
-
-    all_dts = tu._all_data_types(True, False)
-    assert dpt.float16 in all_dts
-    assert not fp64_types.issubset(all_dts)
-
-    all_dts = tu._all_data_types(False, True)
-    assert dpt.float16 not in all_dts
-    assert fp64_types.issubset(all_dts)
-
-    all_dts = tu._all_data_types(False, False)
-    assert not fp16_fp64_types.issubset(all_dts)
-
-
-@pytest.mark.parametrize("fp16", [True, False])
-@pytest.mark.parametrize("fp64", [True, False])
-def test_maximal_inexact_types(fp16, fp64):
-    assert not tu._is_maximal_inexact_type(dpt.int32, fp16, fp64)
-    assert fp64 == tu._is_maximal_inexact_type(dpt.float64, fp16, fp64)
-    assert fp64 == tu._is_maximal_inexact_type(dpt.complex128, fp16, fp64)
-    assert fp64 != tu._is_maximal_inexact_type(dpt.float32, fp16, fp64)
-    assert fp64 != tu._is_maximal_inexact_type(dpt.complex64, fp16, fp64)
-
-
-def test_can_cast_device():
-    assert tu._can_cast(dpt.int64, dpt.float64, True, True)
-    # if f8 is available, can't cast i8 to f4
-    assert not tu._can_cast(dpt.int64, dpt.float32, True, True)
-    assert not tu._can_cast(dpt.int64, dpt.float32, False, True)
-    # should be able to cast to f8 when f2 unavailable
-    assert tu._can_cast(dpt.int64, dpt.float64, False, True)
-    # casting to f4 acceptable when f8 unavailable
-    assert tu._can_cast(dpt.int64, dpt.float32, True, False)
-    assert tu._can_cast(dpt.int64, dpt.float32, False, False)
-    # can't safely cast inexact type to inexact type of lesser precision
-    assert not tu._can_cast(dpt.float32, dpt.float16, True, False)
-    assert not tu._can_cast(dpt.float64, dpt.float32, False, True)
-
-
-def test_acceptance_fns():
-    """Check type promotion acceptance functions"""
-    try:
-        dev = dpctl.SyclDevice()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("Default device is not available")
-    assert tu._acceptance_fn_reciprocal(
-        dpt.float32, dpt.float32, dpt.float32, dev
-    )
-    assert tu._acceptance_fn_negative(dpt.int8, dpt.int16, dpt.int16, dev)
-
-
-def test_weak_types():
-    wbt = tu.WeakBooleanType(True)
-    assert wbt.get()
-    assert tu._weak_type_num_kind(wbt) == 0
-
-    wit = tu.WeakIntegralType(7)
-    assert wit.get() == 7
-    assert tu._weak_type_num_kind(wit) == 1
-
-    wft = tu.WeakFloatingType(3.1415926)
-    assert wft.get() == 3.1415926
-    assert tu._weak_type_num_kind(wft) == 2
-
-    wct = tu.WeakComplexType(2.0 + 3.0j)
-    assert wct.get() == 2 + 3j
-    assert tu._weak_type_num_kind(wct) == 3
-
-
-def test_arg_validation():
-    with pytest.raises(TypeError):
-        tu._weak_type_num_kind(dict())
-
-    with pytest.raises(TypeError):
-        tu._strong_dtype_num_kind(Ellipsis)
-
-    with pytest.raises(ValueError):
-        tu._strong_dtype_num_kind(np.dtype("O"))
-
-    wt = tu.WeakFloatingType(2.0)
-    with pytest.raises(ValueError):
-        tu._resolve_weak_types(wt, wt, None)
diff --git a/dpctl/tests/elementwise/utils.py b/dpctl/tests/elementwise/utils.py
deleted file mode 100644
index 3d140e7043..0000000000
--- a/dpctl/tests/elementwise/utils.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import dpctl
-import dpctl.tensor._type_utils as tu
-
-_integral_dtypes = [
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-]
-_real_fp_dtypes = ["f2", "f4", "f8"]
-_complex_fp_dtypes = [
-    "c8",
-    "c16",
-]
-_real_value_dtypes = _integral_dtypes + _real_fp_dtypes
-_no_complex_dtypes = [
-    "b1",
-] + _real_value_dtypes
-_all_dtypes = _no_complex_dtypes + _complex_fp_dtypes
-
-_usm_types = ["device", "shared", "host"]
-
-
-def _map_to_device_dtype(dt, dev):
-    return tu._to_device_supported_dtype(dt, dev)
-
-
-def _compare_dtypes(dt, ref_dt, sycl_queue=None):
-    assert isinstance(sycl_queue, dpctl.SyclQueue)
-    dev = sycl_queue.sycl_device
-    expected_dt = _map_to_device_dtype(ref_dt, dev)
-    return dt == expected_dt
-
-
-__all__ = [
-    "_no_complex_dtypes",
-    "_all_dtypes",
-    "_usm_types",
-    "_map_to_device_dtype",
-    "_compare_dtypes",
-]
diff --git a/dpctl/tests/test_tensor_accumulation.py b/dpctl/tests/test_tensor_accumulation.py
deleted file mode 100644
index 9c8eec91d1..0000000000
--- a/dpctl/tests/test_tensor_accumulation.py
+++ /dev/null
@@ -1,435 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from random import randrange
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.utils import ExecutionPlacementError
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-sint_types = [
-    dpt.int8,
-    dpt.int16,
-    dpt.int32,
-    dpt.int64,
-]
-uint_types = [
-    dpt.uint8,
-    dpt.uint16,
-    dpt.uint32,
-    dpt.uint64,
-]
-rfp_types = [
-    dpt.float16,
-    dpt.float32,
-    dpt.float64,
-]
-cfp_types = [
-    dpt.complex64,
-    dpt.complex128,
-]
-
-no_complex_types = [dpt.bool] + sint_types + uint_types + rfp_types
-
-all_types = [dpt.bool] + sint_types + uint_types + rfp_types + cfp_types
-
-
-@pytest.mark.parametrize("dt", sint_types)
-def test_contig_cumsum_sint(dt):
-    get_queue_or_skip()
-    n = 10000
-    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n)
-
-    res = dpt.cumulative_sum(x, dtype=dt)
-
-    ar = dpt.arange(n, dtype=dt)
-    expected = dpt.concat((1 + ar, dpt.flip(ar)))
-    assert dpt.all(res == expected)
-
-
-@pytest.mark.parametrize("dt", sint_types)
-def test_strided_cumsum_sint(dt):
-    get_queue_or_skip()
-    n = 10000
-    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n)[1::2]
-
-    res = dpt.cumulative_sum(x, dtype=dt)
-
-    ar = dpt.arange(n, dtype=dt)
-    expected = dpt.concat((1 + ar, dpt.flip(ar)))
-    assert dpt.all(res == expected)
-
-    x2 = dpt.repeat(dpt.asarray([-1, 1], dtype=dt), 2 * n)[-1::-2]
-
-    res = dpt.cumulative_sum(x2, dtype=dt)
-
-    ar = dpt.arange(n, dtype=dt)
-    expected = dpt.concat((1 + ar, dpt.flip(ar)))
-    assert dpt.all(res == expected)
-
-
-@pytest.mark.parametrize("dt", sint_types)
-def test_contig_cumsum_axis_sint(dt):
-    get_queue_or_skip()
-    n0, n1 = 1000, 173
-    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), n0)
-    m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))
-
-    res = dpt.cumulative_sum(m, dtype=dt, axis=0)
-
-    ar = dpt.arange(n0, dtype=dt)
-    expected = dpt.concat((1 + ar, dpt.flip(ar)))
-    assert dpt.all(res == dpt.expand_dims(expected, axis=1))
-
-
-@pytest.mark.parametrize("dt", sint_types)
-def test_strided_cumsum_axis_sint(dt):
-    get_queue_or_skip()
-    n0, n1 = 1000, 173
-    x = dpt.repeat(dpt.asarray([1, -1], dtype=dt), 2 * n0)
-    m = dpt.tile(dpt.expand_dims(x, axis=1), (1, n1))[1::2, ::-1]
-
-    res = dpt.cumulative_sum(m, dtype=dt, axis=0)
-
-    ar = dpt.arange(n0, dtype=dt)
-    expected = dpt.concat((1 + ar, dpt.flip(ar)))
-    assert dpt.all(res == dpt.expand_dims(expected, axis=1))
-
-
-def test_accumulate_scalar():
-    get_queue_or_skip()
-
-    s = dpt.asarray(1, dtype="i8")
-    r = dpt.cumulative_sum(s)
-    assert r == s
-    assert r.ndim == s.ndim
-
-    r = dpt.cumulative_sum(s, include_initial=True)
-    r_expected = dpt.asarray([0, 1], dtype="i8")
-    assert dpt.all(r == r_expected)
-
-
-def test_cumulative_sum_include_initial():
-    get_queue_or_skip()
-
-    n0, n1 = 3, 5
-    x = dpt.ones((n0, n1), dtype="i4")
-    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
-    assert dpt.all(r[0, :] == 0)
-
-    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
-    assert dpt.all(r[:, 0] == 0)
-
-    x = dpt.ones(n1, dtype="i4")
-    r = dpt.cumulative_sum(x, include_initial=True)
-    assert r.shape == (n1 + 1,)
-    assert r[0] == 0
-
-
-def test_cumulative_prod_identity():
-    get_queue_or_skip()
-
-    x = dpt.zeros(5, dtype="i4")
-    r = dpt.cumulative_prod(x, include_initial=True)
-    assert r[0] == 1
-
-
-def test_cumulative_logsumexp_identity():
-    get_queue_or_skip()
-
-    x = dpt.ones(5, dtype="f4")
-    r = dpt.cumulative_logsumexp(x, include_initial=True)
-    assert r[0] == -dpt.inf
-
-
-def test_accumulate_zero_size_dims():
-    get_queue_or_skip()
-
-    n0, n1, n2 = 3, 0, 5
-    x = dpt.ones((n0, n1, n2), dtype="i8")
-    r = dpt.cumulative_sum(x, axis=1)
-    assert r.shape == x.shape
-    assert r.size == 0
-
-    r = dpt.cumulative_sum(x, axis=0)
-    assert r.shape == x.shape
-    assert r.size == 0
-
-    r = dpt.cumulative_sum(x, axis=1, include_initial=True)
-    assert r.shape == (n0, n1 + 1, n2)
-    assert r.size == (n0 * n2)
-
-    r = dpt.cumulative_sum(x, axis=0, include_initial=True)
-    assert r.shape == (n0 + 1, n1, n2)
-    assert r.size == 0
-
-
-@pytest.mark.parametrize("arg_dtype", all_types)
-def test_cumsum_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    n = 100
-    x = dpt.ones(n, dtype=arg_dtype)
-    r = dpt.cumulative_sum(x)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    if x.dtype.kind == "i":
-        assert r.dtype.kind == "i"
-    elif x.dtype.kind == "u":
-        assert r.dtype.kind == "u"
-    elif x.dtype.kind == "fc":
-        assert r.dtype == arg_dtype
-
-    r_expected = dpt.arange(1, n + 1, dtype=r.dtype)
-
-    assert dpt.all(r == r_expected)
-
-
-@pytest.mark.parametrize("arg_dtype", all_types)
-@pytest.mark.parametrize("out_dtype", all_types)
-def test_cumsum_arg_out_dtype_matrix(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    n = 100
-    x = dpt.ones(n, dtype=arg_dtype)
-    r = dpt.cumulative_sum(x, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-    if out_dtype == dpt.bool:
-        assert dpt.all(r)
-    else:
-        r_expected = dpt.arange(1, n + 1, dtype=out_dtype)
-        assert dpt.all(r == r_expected)
-
-
-def test_accumulator_out_kwarg():
-    q = get_queue_or_skip()
-
-    n = 100
-
-    expected = dpt.arange(1, n + 1, dtype="i4", sycl_queue=q)
-    x = dpt.ones(n, dtype="i4", sycl_queue=q)
-    out = dpt.empty_like(x, dtype="i4")
-    dpt.cumulative_sum(x, dtype="i4", out=out)
-    assert dpt.all(expected == out)
-
-    # overlap
-    x = dpt.ones(n, dtype="i4", sycl_queue=q)
-    dpt.cumulative_sum(x, dtype="i4", out=x)
-    assert dpt.all(x == expected)
-
-    # axis before final axis
-    expected = dpt.broadcast_to(
-        dpt.arange(1, n + 1, dtype="i4", sycl_queue=q), (n, n)
-    ).mT
-    x = dpt.ones((n, n), dtype="i4", sycl_queue=q)
-    out = dpt.empty_like(x, dtype="i4")
-    dpt.cumulative_sum(x, axis=0, dtype="i4", out=out)
-    assert dpt.all(expected == out)
-
-    # scalar
-    x = dpt.asarray(3, dtype="i4")
-    out = dpt.empty((), dtype="i4")
-    expected = 3
-    dpt.cumulative_sum(x, dtype="i4", out=out)
-    assert expected == out
-
-
-def test_accumulator_arg_validation():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    n = 5
-    x1 = dpt.ones((n, n), dtype="f4", sycl_queue=q1)
-    x2 = dpt.ones(n, dtype="f4", sycl_queue=q1)
-
-    # must be usm_ndarray
-    with pytest.raises(TypeError):
-        dpt.cumulative_sum(dict())
-
-    # axis must be specified when input not 1D
-    with pytest.raises(ValueError):
-        dpt.cumulative_sum(x1)
-
-    # out must be usm_ndarray
-    with pytest.raises(TypeError):
-        dpt.cumulative_sum(x2, out=dict())
-
-    # out must be writable
-    out_not_writable = dpt.empty_like(x2)
-    out_not_writable.flags.writable = False
-    with pytest.raises(ValueError):
-        dpt.cumulative_sum(x2, out=out_not_writable)
-
-    # out must be expected shape
-    out_wrong_shape = dpt.ones(n + 1, dtype=x2.dtype, sycl_queue=q1)
-    with pytest.raises(ValueError):
-        dpt.cumulative_sum(x2, out=out_wrong_shape)
-
-    # out must be expected dtype
-    out_wrong_dtype = dpt.empty_like(x2, dtype="i4")
-    with pytest.raises(ValueError):
-        dpt.cumulative_sum(x2, out=out_wrong_dtype)
-
-    # compute follows data
-    out_wrong_queue = dpt.empty_like(x2, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.cumulative_sum(x2, out=out_wrong_queue)
-
-
-def test_cumsum_nan_propagation():
-    get_queue_or_skip()
-
-    n = 100
-    x = dpt.ones(n, dtype="f4")
-    i = randrange(n)
-    x[i] = dpt.nan
-
-    r = dpt.cumulative_sum(x)
-    assert dpt.all(dpt.isnan(r[i:]))
-
-
-def test_cumprod_nan_propagation():
-    get_queue_or_skip()
-
-    n = 100
-    x = dpt.ones(n, dtype="f4")
-    i = randrange(n)
-    x[i] = dpt.nan
-
-    r = dpt.cumulative_prod(x)
-    assert dpt.all(dpt.isnan(r[i:]))
-
-
-def test_logcumsumexp_nan_propagation():
-    get_queue_or_skip()
-
-    n = 100
-    x = dpt.ones(n, dtype="f4")
-    i = randrange(n)
-    x[i] = dpt.nan
-
-    r = dpt.cumulative_logsumexp(x)
-    assert dpt.all(dpt.isnan(r[i:]))
-
-
-@pytest.mark.parametrize("arg_dtype", no_complex_types)
-def test_logcumsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    x = dpt.ones(10, dtype=arg_dtype, sycl_queue=q)
-    r = dpt.cumulative_logsumexp(x)
-
-    if arg_dtype.kind in "biu":
-        assert r.dtype.kind == "f"
-    else:
-        assert r.dtype == arg_dtype
-
-
-def test_logcumsumexp_complex_error():
-    get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="c8")
-    with pytest.raises(ValueError):
-        dpt.cumulative_logsumexp(x)
-
-
-def test_cumprod_basic():
-    get_queue_or_skip()
-
-    n = 50
-    val = 2
-    x = dpt.full(n, val, dtype="i8")
-    r = dpt.cumulative_prod(x)
-    expected = dpt.pow(val, dpt.arange(1, n + 1, dtype="i8"))
-
-    assert dpt.all(r == expected)
-
-    x = dpt.tile(dpt.asarray([2, 0.5], dtype="f4"), 10000)
-    expected = dpt.tile(dpt.asarray([2, 1], dtype="f4"), 10000)
-    r = dpt.cumulative_prod(x)
-    assert dpt.all(r == expected)
-
-
-def test_logcumsumexp_basic():
-    get_queue_or_skip()
-
-    dt = dpt.float32
-    x = dpt.ones(1000, dtype=dt)
-    r = dpt.cumulative_logsumexp(x)
-
-    expected = 1 + dpt.log(dpt.arange(1, 1001, dtype=dt))
-
-    tol = 4 * dpt.finfo(dt).resolution
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
-
-
-def geometric_series_closed_form(n, dtype=None, device=None):
-    """Closed form for cumulative_logsumexp(dpt.arange(-n, 0))
-
-    :math:`r[k] == -n + k + log(1 - exp(-k-1)) - log(1-exp(-1))`
-    """
-    x = dpt.arange(-n, 0, dtype=dtype, device=device)
-    y = dpt.arange(-1, -n - 1, step=-1, dtype=dtype, device=device)
-    y = dpt.exp(y, out=y)
-    y = dpt.negative(y, out=y)
-    y = dpt.log1p(y, out=y)
-    y -= y[0]
-    return x + y
-
-
-@pytest.mark.parametrize("fpdt", rfp_types)
-def test_cumulative_logsumexp_closed_form(fpdt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(fpdt, q)
-
-    n = 128
-    r = dpt.cumulative_logsumexp(dpt.arange(-n, 0, dtype=fpdt, device=q))
-    expected = geometric_series_closed_form(n, dtype=fpdt, device=q)
-
-    tol = 4 * dpt.finfo(fpdt).eps
-    assert dpt.allclose(r, expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("p", [257, 260, 273, 280, 509, 512])
-def test_cumulative_sum_gh_1901(p):
-    get_queue_or_skip()
-
-    n = p * p
-    dt = dpt.int32
-    inp = dpt.ones(n, dtype=dt)
-    r = dpt.cumulative_sum(inp, dtype=dt)
-    assert dpt.all(r == dpt.arange(1, n + 1, dtype=dt))
-
-
-@pytest.mark.parametrize(
-    "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"]
-)
-def test_gh_2017(dt):
-    "See https://github.com/IntelPython/dpctl/issues/2017"
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-    x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q)
-    r = dpt.cumulative_sum(x, dtype="?")
-    assert dpt.all(r)
diff --git a/dpctl/tests/test_tensor_array_api_inspection.py b/dpctl/tests/test_tensor_array_api_inspection.py
deleted file mode 100644
index bd6d14a528..0000000000
--- a/dpctl/tests/test_tensor_array_api_inspection.py
+++ /dev/null
@@ -1,226 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._tensor_impl import (
-    default_device_complex_type,
-    default_device_fp_type,
-    default_device_index_type,
-    default_device_int_type,
-)
-
-_dtypes_no_fp16_fp64 = {
-    "bool": dpt.bool,
-    "float32": dpt.float32,
-    "complex64": dpt.complex64,
-    "int8": dpt.int8,
-    "int16": dpt.int16,
-    "int32": dpt.int32,
-    "int64": dpt.int64,
-    "uint8": dpt.uint8,
-    "uint16": dpt.uint16,
-    "uint32": dpt.uint32,
-    "uint64": dpt.uint64,
-}
-
-
-def test_array_api_inspection_methods():
-    info = dpt.__array_namespace_info__()
-    assert info.capabilities()
-    try:
-        assert info.default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    assert info.default_dtypes()
-    assert info.devices()
-    assert info.dtypes()
-
-
-def test_array_api_inspection_default_device():
-    try:
-        dev = dpctl.select_default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    assert dpt.__array_namespace_info__().default_device() == dev
-
-
-def test_array_api_inspection_devices():
-    try:
-        devices2 = dpctl.get_devices()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    devices1 = dpt.__array_namespace_info__().devices()
-    assert len(devices1) == len(devices2)
-    assert devices1 == devices2
-
-
-def test_array_api_inspection_capabilities():
-    capabilities = dpt.__array_namespace_info__().capabilities()
-    assert capabilities["boolean indexing"]
-    assert capabilities["data-dependent shapes"]
-    assert capabilities["max dimensions"] is None
-
-
-def test_array_api_inspection_default_dtypes():
-    try:
-        dev = dpctl.select_default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    int_dt = default_device_int_type(dev)
-    ind_dt = default_device_index_type(dev)
-    fp_dt = default_device_fp_type(dev)
-    cm_dt = default_device_complex_type(dev)
-
-    info = dpt.__array_namespace_info__()
-    default_dts_nodev = info.default_dtypes()
-    default_dts_dev = info.default_dtypes(device=dev)
-
-    assert (
-        int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"]
-    )
-    assert (
-        ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"]
-    )
-    assert (
-        fp_dt
-        == default_dts_nodev["real floating"]
-        == default_dts_dev["real floating"]
-    )
-    assert (
-        cm_dt
-        == default_dts_nodev["complex floating"]
-        == default_dts_dev["complex floating"]
-    )
-
-
-def test_array_api_inspection_default_device_dtypes():
-    try:
-        dev = dpctl.select_default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    dtypes = _dtypes_no_fp16_fp64.copy()
-    if dev.has_aspect_fp64:
-        dtypes["float64"] = dpt.float64
-        dtypes["complex128"] = dpt.complex128
-
-    assert dtypes == dpt.__array_namespace_info__().dtypes()
-
-
-def test_array_api_inspection_device_dtypes():
-    info = dpt.__array_namespace_info__()
-    try:
-        dev = info.default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    dtypes = _dtypes_no_fp16_fp64.copy()
-    if dev.has_aspect_fp64:
-        dtypes["float64"] = dpt.float64
-        dtypes["complex128"] = dpt.complex128
-
-    assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev)
-
-
-def test_array_api_inspection_dtype_kind():
-    info = dpt.__array_namespace_info__()
-    try:
-        info.default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    f_dtypes = info.dtypes(kind="real floating")
-    assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()])
-
-    i_dtypes = info.dtypes(kind="signed integer")
-    assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()])
-
-    u_dtypes = info.dtypes(kind="unsigned integer")
-    assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()])
-
-    ui_dtypes = info.dtypes(kind="unsigned integer")
-    assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()])
-
-    c_dtypes = info.dtypes(kind="complex floating")
-    assert all([_dt[1].kind == "c" for _dt in c_dtypes.items()])
-
-    assert info.dtypes(kind="bool") == {"bool": dpt.bool}
-
-    _signed_ints = {
-        "int8": dpt.int8,
-        "int16": dpt.int16,
-        "int32": dpt.int32,
-        "int64": dpt.int64,
-    }
-    assert (
-        info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints
-    )
-    assert (
-        info.dtypes(
-            kind=("integral", "bool", "real floating", "complex floating")
-        )
-        == info.dtypes()
-    )
-    assert info.dtypes(
-        kind=("integral", "real floating", "complex floating")
-    ) == info.dtypes(kind="numeric")
-
-
-def test_array_api_inspection_dtype_kind_errors():
-    info = dpt.__array_namespace_info__()
-    try:
-        info.default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    with pytest.raises(ValueError):
-        info.dtypes(kind="error")
-
-    with pytest.raises(TypeError):
-        info.dtypes(kind={0: "real floating"})
-
-
-def test_array_api_inspection_device_types():
-    info = dpt.__array_namespace_info__()
-    try:
-        dev = info.default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    q = dpctl.SyclQueue(dev)
-    assert info.default_dtypes(device=q)
-    assert info.dtypes(device=q)
-
-    dev_dpt = dpt.Device.create_device(dev)
-    assert info.default_dtypes(device=dev_dpt)
-    assert info.dtypes(device=dev_dpt)
-
-    filter = dev.get_filter_string()
-    assert info.default_dtypes(device=filter)
-    assert info.dtypes(device=filter)
-
-
-def test_array_api_inspection_device_errors():
-    info = dpt.__array_namespace_info__()
-
-    bad_dev = dict()
-    with pytest.raises(TypeError):
-        info.dtypes(device=bad_dev)
-
-    with pytest.raises(TypeError):
-        info.default_dtypes(device=bad_dev)
diff --git a/dpctl/tests/test_tensor_asarray.py b/dpctl/tests/test_tensor_asarray.py
deleted file mode 100644
index ae6afa5287..0000000000
--- a/dpctl/tests/test_tensor_asarray.py
+++ /dev/null
@@ -1,649 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-@pytest.mark.parametrize(
-    "src_usm_type, dst_usm_type",
-    [
-        ("device", "shared"),
-        ("device", "host"),
-        ("shared", "device"),
-        ("shared", "host"),
-        ("host", "device"),
-        ("host", "shared"),
-    ],
-)
-def test_asarray_change_usm_type(src_usm_type, dst_usm_type):
-    try:
-        d = dpctl.SyclDevice()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X = dpt.empty(10, dtype="u1", usm_type=src_usm_type)
-    Y = dpt.asarray(X, usm_type=dst_usm_type)
-    assert X.shape == Y.shape
-    assert X.usm_type == src_usm_type
-    assert Y.usm_type == dst_usm_type
-
-    with pytest.raises(ValueError):
-        # zero copy is not possible
-        dpt.asarray(X, usm_type=dst_usm_type, copy=False)
-
-    Y = dpt.asarray(X, usm_type=dst_usm_type, sycl_queue=X.sycl_queue)
-    assert X.shape == Y.shape
-    assert Y.usm_type == dst_usm_type
-
-    Y = dpt.asarray(
-        X,
-        usm_type=dst_usm_type,
-        sycl_queue=X.sycl_queue,
-        device=d.get_filter_string(),
-    )
-    assert X.shape == Y.shape
-    assert Y.usm_type == dst_usm_type
-
-
-def test_asarray_from_numpy():
-    Xnp = np.arange(10)
-    try:
-        Y = dpt.asarray(Xnp, usm_type="device")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.shape == Xnp.shape
-    assert Y.dtype == Xnp.dtype
-    # Fortran contiguous case
-    Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="f4", order="F")
-    Y = dpt.asarray(Xnp, usm_type="shared")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.shape == Xnp.shape
-    assert Y.dtype == Xnp.dtype
-    # general strided case
-    Xnp = np.array([[1, 2, 3], [4, 5, 6]], dtype="i8")
-    Y = dpt.asarray(Xnp[::-1, ::-1], usm_type="host")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.shape == Xnp.shape
-    assert Y.dtype == Xnp.dtype
-
-
-def test_asarray_from_sequence():
-    X = [1, 2, 3]
-    try:
-        Y = dpt.asarray(X, usm_type="device")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert type(Y) is dpt.usm_ndarray
-
-    X = [(1, 1), (2.0, 2.0 + 1.0j), range(4, 6), np.array([3, 4], dtype="c16")]
-    Y = dpt.asarray(X, usm_type="device")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.ndim == 2
-    assert Y.shape == (len(X), 2)
-
-    X = []
-    Y = dpt.asarray(X, usm_type="device")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.shape == (0,)
-
-    X = [[], []]
-    Y = dpt.asarray(X, usm_type="device")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.shape == (2, 0)
-
-    X = [True, False]
-    Y = dpt.asarray(X, usm_type="device")
-    assert type(Y) is dpt.usm_ndarray
-    assert Y.dtype.kind == "b"
-
-
-def test_asarray_from_object_with_suai():
-    """Test that asarray can deal with opaque objects implementing SUAI"""
-
-    class Dummy:
-        def __init__(self, obj, iface):
-            self.obj = obj
-            self.__sycl_usm_array_interface__ = iface
-
-    try:
-        X = dpt.empty((2, 3, 4), dtype="f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Y = dpt.asarray(Dummy(X, X.__sycl_usm_array_interface__))
-    assert Y.shape == X.shape
-    assert X.usm_type == Y.usm_type
-    assert X.dtype == Y.dtype
-    assert X.sycl_device == Y.sycl_device
-
-
-def test_asarray_input_validation():
-    with pytest.raises(TypeError):
-        # copy keyword is not of right type
-        dpt.asarray([1], copy="invalid")
-    with pytest.raises(TypeError):
-        # order keyword is not valid
-        dpt.asarray([1], order=1)
-    with pytest.raises(TypeError):
-        # dtype is not valid
-        dpt.asarray([1], dtype="invalid")
-    with pytest.raises(ValueError):
-        # unexpected value of order
-        dpt.asarray([1], order="Z")
-    with pytest.raises(TypeError):
-        # usm_type is of wrong type
-        dpt.asarray([1], usm_type=dict())
-    with pytest.raises(ValueError):
-        # usm_type has wrong value
-        dpt.asarray([1], usm_type="mistake")
-    try:
-        wrong_queue_type = dpctl.SyclContext()
-    except dpctl.SyclContextCreationError:
-        # use any other type
-        wrong_queue_type = Ellipsis
-    with pytest.raises(TypeError):
-        # sycl_queue type is not right
-        dpt.asarray([1], sycl_queue=wrong_queue_type)
-    with pytest.raises(ValueError):
-        # sequence is not rectangular
-        dpt.asarray([[1], 2])
-    with pytest.raises(OverflowError):
-        # Python int too large for type
-        dpt.asarray(-9223372036854775809, dtype="i4")
-    with pytest.raises(ValueError):
-        # buffer to usm_ndarray requires a copy
-        dpt.asarray(memoryview(np.arange(5)), copy=False)
-    with pytest.raises(ValueError):
-        # Numpy array to usm_ndarray requires a copy
-        dpt.asarray(np.arange(5), copy=False)
-    with pytest.raises(ValueError):
-        # Python sequence to usm_ndarray requires a copy
-        dpt.asarray([1, 2, 3], copy=False)
-    with pytest.raises(ValueError):
-        # Python scalar to usm_ndarray requires a copy
-        dpt.asarray(5, copy=False)
-
-
-def test_asarray_input_validation2():
-    d = dpctl.get_devices()
-    if len(d) < 2:
-        pytest.skip("Not enough SYCL devices available")
-
-    d0, d1 = d[:2]
-    try:
-        q0 = dpctl.SyclQueue(d0)
-    except dpctl.SyclQueueCreationError:
-        pytest.skip(f"SyclQueue could not be created for {d0}")
-    try:
-        q1 = dpctl.SyclQueue(d1)
-    except dpctl.SyclQueueCreationError:
-        pytest.skip(f"SyclQueue could not be created for {d1}")
-    with pytest.raises(TypeError):
-        dpt.asarray([1, 2], sycl_queue=q0, device=q1)
-
-
-def test_asarray_scalars():
-    import ctypes
-
-    try:
-        Y = dpt.asarray(5)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert Y.dtype == dpt.dtype(int)
-    Y = dpt.asarray(5.2)
-    if Y.sycl_device.has_aspect_fp64:
-        assert Y.dtype == dpt.dtype(float)
-    else:
-        assert Y.dtype == dpt.dtype(dpt.float32)
-    Y = dpt.asarray(np.float32(2.3))
-    assert Y.dtype == dpt.dtype(dpt.float32)
-    Y = dpt.asarray(1.0j)
-    if Y.sycl_device.has_aspect_fp64:
-        assert Y.dtype == dpt.dtype(complex)
-    else:
-        assert Y.dtype == dpt.dtype(dpt.complex64)
-    Y = dpt.asarray(ctypes.c_int(8))
-    assert Y.dtype == dpt.dtype(ctypes.c_int)
-
-
-def test_asarray_copy_false():
-    q = get_queue_or_skip()
-    rng = np.random.default_rng()
-    Xnp = rng.integers(low=-255, high=255, size=(10, 4), dtype=np.int64)
-    X = dpt.from_numpy(Xnp, usm_type="device", sycl_queue=q)
-    Y1 = dpt.asarray(X, copy=False, order="K")
-    assert Y1 is X
-    Y1c = dpt.asarray(X, copy=True, order="K")
-    assert not (Y1c is X)
-    Y2 = dpt.asarray(X, copy=False, order="C")
-    assert Y2 is X
-    Y3 = dpt.asarray(X, copy=False, order="A")
-    assert Y3 is X
-    with pytest.raises(ValueError):
-        Y1 = dpt.asarray(X, copy=False, order="F")
-    Xf = dpt.empty(
-        X.shape,
-        dtype=X.dtype,
-        usm_type="device",
-        sycl_queue=X.sycl_queue,
-        order="F",
-    )
-    Xf[:] = X
-    Y4 = dpt.asarray(Xf, copy=False, order="K")
-    assert Y4 is Xf
-    Y5 = dpt.asarray(Xf, copy=False, order="F")
-    assert Y5 is Xf
-    Y6 = dpt.asarray(Xf, copy=False, order="A")
-    assert Y6 is Xf
-    with pytest.raises(ValueError):
-        dpt.asarray(Xf, copy=False, order="C")
-
-
-def test_asarray_invalid_dtype():
-    q = get_queue_or_skip()
-    Xnp = np.array([1, 2, 3], dtype=object)
-    with pytest.raises(TypeError):
-        dpt.asarray(Xnp, sycl_queue=q)
-
-
-def test_asarray_cross_device():
-    q = get_queue_or_skip()
-    qprof = dpctl.SyclQueue(property="enable_profiling")
-    x = dpt.empty(10, dtype="i8", sycl_queue=q)
-    y = dpt.asarray(x, sycl_queue=qprof)
-    assert y.sycl_queue == qprof
-
-
-def test_asarray_seq_of_arrays_simple():
-    get_queue_or_skip()
-    r = dpt.arange(10)
-    m = dpt.asarray(
-        [
-            r,
-        ]
-        * 4
-    )
-    assert m.shape == (4,) + r.shape
-    assert m.dtype == r.dtype
-    assert m.device == r.device
-
-
-def test_asarray_seq_of_arrays():
-    get_queue_or_skip()
-    m = dpt.ones((2, 4), dtype="i4")
-    w = dpt.zeros(4)
-    v = dpt.full(4, -1)
-    ar = dpt.asarray([m, [w, v]])
-    assert ar.shape == (2, 2, 4)
-    assert ar.device == m.device
-    assert ar.device == w.device
-    assert ar.device == v.device
-
-
-def test_asarray_seq_of_array_different_queue():
-    get_queue_or_skip()
-    m = dpt.ones((2, 4), dtype="i4")
-    w = dpt.zeros(4)
-    v = dpt.full(4, -1)
-    qprof = dpctl.SyclQueue(property="enable_profiling")
-    ar = dpt.asarray([m, [w, v]], sycl_queue=qprof)
-    assert ar.shape == (2, 2, 4)
-    assert ar.sycl_queue == qprof
-
-
-def test_asarray_seq_of_suai():
-    get_queue_or_skip()
-
-    class Dummy:
-        def __init__(self, obj, iface):
-            self.obj = obj
-            self.__sycl_usm_array_interface__ = iface
-
-    o = dpt.empty(0, usm_type="shared")
-    d = Dummy(o, o.__sycl_usm_array_interface__)
-    x = dpt.asarray(d)
-    assert x.shape == (0,)
-    assert x.usm_type == o.usm_type
-    assert x._pointer == o._pointer
-    assert x.sycl_queue == o.sycl_queue
-
-    x = dpt.asarray([d, d])
-    assert x.shape == (2, 0)
-    assert x.usm_type == o.usm_type
-    assert x.sycl_queue == o.sycl_queue
-
-
-def test_asarray_seq_of_suai_different_queue():
-    q = get_queue_or_skip()
-
-    class Dummy:
-        def __init__(self, obj, iface):
-            self.obj = obj
-            self.__sycl_usm_array_interface__ = iface
-
-        @property
-        def shape(self):
-            return self.__sycl_usm_array_interface__["shape"]
-
-    q2 = dpctl.SyclQueue()
-    assert q != q2
-    o = dpt.empty((2, 2), usm_type="shared", sycl_queue=q2)
-    d = Dummy(o, o.__sycl_usm_array_interface__)
-
-    x = dpt.asarray(d, sycl_queue=q)
-    assert x.sycl_queue == q
-    assert x.shape == d.shape
-    x = dpt.asarray([d], sycl_queue=q)
-    assert x.sycl_queue == q
-    assert x.shape == (1,) + d.shape
-    x = dpt.asarray([d, d], sycl_queue=q)
-    assert x.sycl_queue == q
-    assert x.shape == (2,) + d.shape
-
-
-def test_asarray_seq_of_arrays_on_different_queues():
-    q = get_queue_or_skip()
-
-    m = dpt.empty((2, 4), dtype="i2", sycl_queue=q)
-    q2 = dpctl.SyclQueue()
-    w = dpt.empty(4, dtype="i1", sycl_queue=q2)
-    q3 = dpctl.SyclQueue()
-    py_seq = [
-        0,
-    ] * w.shape[0]
-    res = dpt.asarray([m, [w, py_seq]], sycl_queue=q3)
-    assert res.sycl_queue == q3
-    assert dpt.isdtype(res.dtype, "integral")
-
-    res = dpt.asarray([m, [w, range(w.shape[0])]], sycl_queue=q3)
-    assert res.sycl_queue == q3
-    assert dpt.isdtype(res.dtype, "integral")
-
-    res = dpt.asarray([m, [w, w]], sycl_queue=q)
-    assert res.sycl_queue == q
-    assert dpt.isdtype(res.dtype, "integral")
-
-    res = dpt.asarray([m, [w, dpt.asnumpy(w)]], sycl_queue=q2)
-    assert res.sycl_queue == q2
-    assert dpt.isdtype(res.dtype, "integral")
-
-    res = dpt.asarray([w, dpt.asnumpy(w)])
-    assert res.sycl_queue == w.sycl_queue
-    assert dpt.isdtype(res.dtype, "integral")
-
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
-        dpt.asarray([m, [w, py_seq]])
-
-
-def test_ulonglong_gh_1167():
-    get_queue_or_skip()
-    x = dpt.asarray(9223372036854775807, dtype="u8")
-    assert x.dtype == dpt.uint64
-    x = dpt.asarray(9223372036854775808, dtype="u8")
-    assert x.dtype == dpt.uint64
-
-
-def test_orderK_gh_1350():
-    get_queue_or_skip()
-    a = dpt.empty((2, 3, 4), dtype="u1")
-    b = dpt.permute_dims(a, (2, 0, 1))
-    c = dpt.asarray(b, copy=True, order="K")
-
-    assert c.shape == b.shape
-    assert c.strides == b.strides
-    assert c._element_offset == 0
-    assert not c._pointer == b._pointer
-
-
-def _typesafe_arange(n: int, dtype_: dpt.dtype, device: object):
-    n_half = n // 2
-    if dtype_.kind in "ui":
-        ii = dpt.iinfo(dtype_)
-        m0 = max(ii.min, -n_half)
-        m1 = min(m0 + n, ii.max)
-        n_tiles = (n + m1 - m0 - 1) // (m1 - m0)
-        res = dpt.arange(m0, m1, dtype=dtype_, device=device)
-    elif dtype_.kind == "b":
-        n_tiles = (n + 1) // 2
-        res = dpt.asarray([False, True], dtype=dtype_, device=device)
-    else:
-        m0 = -n_half
-        m1 = m0 + n
-        n_tiles = 1
-        res = dpt.linspace(m0, m1, num=n, dtype=dtype_, device=device)
-    if n_tiles > 1:
-        res = dpt.tile(res, n_tiles)[:n]
-    return res
-
-
-_all_dtypes = [
-    "b1",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_as_c_contig_rect(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    dtype_ = dpt.dtype(dt)
-    n0, n1, n2 = 6, 35, 37
-
-    arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q)
-    x = dpt.reshape(arr_flat, (n0, n1, n2)).mT
-
-    y = dpt.asarray(x, order="C")
-    assert dpt.all(x == y)
-
-    x2 = x[0]
-    y2 = dpt.asarray(x2, order="C")
-    assert dpt.all(x2 == y2)
-
-    x3 = dpt.flip(x, axis=1)
-    y3 = dpt.asarray(x3, order="C")
-    assert dpt.all(x3 == y3)
-
-    x4 = dpt.reshape(arr_flat, (2, 3, n1, n2)).mT
-    x5 = x4[:, :2]
-    y5 = dpt.asarray(x5, order="C")
-    assert dpt.all(x5 == y5)
-
-    x6 = dpt.reshape(arr_flat, (n0, n1, n2), order="F")
-    y6 = dpt.asarray(x6, order="C")
-    assert dpt.all(x6 == y6)
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_as_f_contig_rect(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    dtype_ = dpt.dtype(dt)
-    n0, n1, n2 = 6, 35, 37
-
-    arr_flat = _typesafe_arange(n0 * n1 * n2, dtype_, q)
-    x = dpt.reshape(arr_flat, (n0, n1, n2))
-
-    y = dpt.asarray(x, order="F")
-    assert dpt.all(x == y)
-
-    x2 = x[0]
-    y2 = dpt.asarray(x2, order="F")
-    assert dpt.all(x2 == y2)
-
-    x3 = dpt.flip(x, axis=1)
-    y3 = dpt.asarray(x3, order="F")
-    assert dpt.all(x3 == y3)
-
-    x4 = dpt.reshape(arr_flat, (2, 3, n1, n2))
-    x5 = dpt.moveaxis(x4[:, :2], (2, 3), (0, 1))
-    y5 = dpt.asarray(x5, order="F")
-    assert dpt.all(x5 == y5)
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_as_c_contig_square(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    dtype_ = dpt.dtype(dt)
-    n0, n1 = 4, 53
-
-    arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q)
-    x = dpt.reshape(arr_flat, (n0, n1, n1)).mT
-
-    y = dpt.asarray(x, order="C")
-    assert dpt.all(x == y)
-
-    x2 = x[0]
-    y2 = dpt.asarray(x2, order="C")
-    assert dpt.all(x2 == y2)
-
-    x3 = dpt.flip(x, axis=1)
-    y3 = dpt.asarray(x3, order="C")
-    assert dpt.all(x3 == y3)
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_as_f_contig_square(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    dtype_ = dpt.dtype(dt)
-    n0, n1 = 6, 53
-
-    arr_flat = _typesafe_arange(n0 * n1 * n1, dtype_, q)
-    x = dpt.moveaxis(dpt.reshape(arr_flat, (n0, n1, n1)), (1, 2), (0, 1))
-
-    y = dpt.asarray(x, order="F")
-    assert dpt.all(x == y)
-
-    x2 = x[..., 0]
-    y2 = dpt.asarray(x2, order="F")
-    assert dpt.all(x2 == y2)
-
-    x3 = dpt.flip(x, axis=1)
-    y3 = dpt.asarray(x3, order="F")
-    assert dpt.all(x3 == y3)
-
-
-class MockArrayWithBothProtocols:
-    """
-    Object that implements both __sycl_usm_array_interface__
-    and __usm_ndarray__ properties.
-    """
-
-    def __init__(self, usm_ar):
-        if not isinstance(usm_ar, dpt.usm_ndarray):
-            raise TypeError
-        self._arr = usm_ar
-
-    @property
-    def __usm_ndarray__(self):
-        return self._arr
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self._arr.__sycl_usm_array_interface__
-
-
-class MockArrayWithSUAIOnly:
-    """
-    Object that implements only the
-    __sycl_usm_array_interface__ property.
-    """
-
-    def __init__(self, usm_ar):
-        if not isinstance(usm_ar, dpt.usm_ndarray):
-            raise TypeError
-        self._arr = usm_ar
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        return self._arr.__sycl_usm_array_interface__
-
-
-@pytest.mark.parametrize("usm_type", ["shared", "device", "host"])
-def test_asarray_support_for_usm_ndarray_protocol(usm_type):
-    get_queue_or_skip()
-
-    x = dpt.arange(256, dtype="i4", usm_type=usm_type)
-
-    o1 = MockArrayWithBothProtocols(x)
-    o2 = MockArrayWithSUAIOnly(x)
-
-    y1 = dpt.asarray(o1)
-    assert x.sycl_queue == y1.sycl_queue
-    assert x.usm_type == y1.usm_type
-    assert x.dtype == y1.dtype
-    assert y1.usm_data.reference_obj is None
-    assert dpt.all(x == y1)
-
-    y2 = dpt.asarray(o2)
-    assert x.sycl_queue == y2.sycl_queue
-    assert x.usm_type == y2.usm_type
-    assert x.dtype == y2.dtype
-    assert not (y2.usm_data.reference_obj is None)
-    assert dpt.all(x == y2)
-
-    y3 = dpt.asarray([o1, o2])
-    assert x.sycl_queue == y3.sycl_queue
-    assert x.usm_type == y3.usm_type
-    assert x.dtype == y3.dtype
-    assert y3.usm_data.reference_obj is None
-    assert dpt.all(x[dpt.newaxis, :] == y3)
-
-
-@pytest.mark.parametrize("dt", [dpt.float16, dpt.float64, dpt.complex128])
-def test_asarray_to_device_with_unsupported_dtype(dt):
-    aspect = "fp16" if dt == dpt.float16 else "fp64"
-    try:
-        d0 = dpctl.select_device_with_aspects(aspect)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No device with aspect for test")
-    d1 = None
-    for d in dpctl.get_devices():
-        if d.default_selector_score < 0:
-            pass
-        try:
-            d1 = dpctl.select_device_with_aspects(
-                d.device_type.name, excluded_aspects=[aspect]
-            )
-        except dpctl.SyclDeviceCreationError:
-            pass
-    if d1 is None:
-        pytest.skip("No device with missing aspect for test")
-    x = dpt.ones(10, dtype=dt, device=d0)
-    y = dpt.asarray(x, device=d1)
-    assert y.sycl_device == d1
diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py
deleted file mode 100644
index 2fdb91af62..0000000000
--- a/dpctl/tests/test_tensor_clip.py
+++ /dev/null
@@ -1,778 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-from numpy.testing import assert_raises_regex
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._elementwise_common import _get_dtype
-from dpctl.tensor._type_utils import (
-    _can_cast,
-    _strong_dtype_num_kind,
-    _weak_type_num_kind,
-)
-from dpctl.utils import ExecutionPlacementError
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "?",
-    "u1",
-    "i1",
-    "u2",
-    "i2",
-    "u4",
-    "i4",
-    "u8",
-    "i8",
-    "e",
-    "f",
-    "d",
-    "F",
-    "D",
-]
-
-_usm_types = ["device", "shared", "host"]
-
-
-@pytest.mark.parametrize("dt1", _all_dtypes)
-@pytest.mark.parametrize("dt2", _all_dtypes)
-def test_clip_dtypes(dt1, dt2):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt1, q)
-    skip_if_dtype_not_supported(dt2, q)
-
-    sz = 127
-    ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q)
-    ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q)
-    ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q)
-
-    dev = q.sycl_device
-    _fp16 = dev.has_aspect_fp16
-    _fp64 = dev.has_aspect_fp64
-    # also covers cases where dt1 == dt2
-    if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64):
-        r = dpt.clip(ar1, ar2, ar3)
-        assert isinstance(r, dpt.usm_ndarray)
-        assert r.dtype == ar1.dtype
-        assert r.shape == ar1.shape
-        assert dpt.all(r == ar1)
-        assert r.sycl_queue == ar1.sycl_queue
-
-        r = dpt.clip(ar1, min=ar3, max=None)
-        assert isinstance(r, dpt.usm_ndarray)
-        assert r.dtype == ar1.dtype
-        assert r.shape == ar1.shape
-        assert dpt.all(r == ar1)
-        assert r.sycl_queue == ar1.sycl_queue
-
-        r = dpt.clip(ar1, min=None, max=ar3)
-        assert isinstance(r, dpt.usm_ndarray)
-        assert r.dtype == ar1.dtype
-        assert r.shape == ar1.shape
-        assert dpt.all(r == ar1)
-        assert r.sycl_queue == ar1.sycl_queue
-    else:
-        with pytest.raises(ValueError):
-            dpt.clip(ar1, ar2, ar3)
-        with pytest.raises(ValueError):
-            dpt.clip(ar1, min=ar3, max=None)
-        with pytest.raises(ValueError):
-            dpt.clip(ar1, min=None, max=ar3)
-
-
-def test_clip_empty():
-    get_queue_or_skip()
-
-    x = dpt.empty((2, 0, 3), dtype="i4")
-    a_min = dpt.ones((2, 0, 3), dtype="i4")
-    a_max = dpt.ones((2, 0, 3), dtype="i4")
-
-    r = dpt.clip(x, a_min, a_max)
-    assert r.size == 0
-    assert r.shape == x.shape
-
-
-def test_clip_python_scalars():
-    get_queue_or_skip()
-
-    arrs = [
-        dpt.ones(1, dtype="?"),
-        dpt.ones(1, dtype="i4"),
-        dpt.ones(1, dtype="f4"),
-        dpt.ones(1, dtype="c8"),
-    ]
-
-    py_zeros = [
-        False,
-        0,
-        0.0,
-        complex(0, 0),
-    ]
-
-    py_ones = [
-        True,
-        1,
-        1.0,
-        complex(1, 0),
-    ]
-
-    for zero, one, arr in zip(py_zeros, py_ones, arrs):
-        r = dpt.clip(arr, zero, one)
-        assert isinstance(r, dpt.usm_ndarray)
-        r = dpt.clip(arr, min=zero)
-        assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_clip_in_place():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    a_min = dpt.arange(1, 11, dtype="i4")
-    a_max = dpt.arange(2, 12, dtype="i4")
-    dpt.clip(x, a_min, a_max, out=x)
-    assert dpt.all(x == a_min)
-
-    x = dpt.arange(10, dtype="i4")
-    dpt.clip(x, min=a_min, max=None, out=x)
-    assert dpt.all(x == a_min)
-
-    x = dpt.arange(10, dtype="i4")
-    dpt.clip(x, a_min, a_max, out=a_max)
-    assert dpt.all(a_max == a_min)
-
-    a_min = dpt.arange(1, 11, dtype="i4")
-    dpt.clip(x, min=a_min, max=None, out=a_min[::-1])
-    assert dpt.all((x + 1)[::-1] == a_min)
-
-
-def test_clip_special_cases():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="f4")
-    r = dpt.clip(x, -dpt.inf, dpt.inf)
-    assert dpt.all(r == x)
-    r = dpt.clip(x, dpt.nan, dpt.inf)
-    assert dpt.all(dpt.isnan(r))
-    r = dpt.clip(x, -dpt.inf, dpt.nan)
-    assert dpt.all(dpt.isnan(r))
-
-
-def test_clip_out_need_temporary():
-    get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="i4")
-    a_min = dpt.asarray(2, dtype="i4")
-    a_max = dpt.asarray(3, dtype="i4")
-    dpt.clip(x[:6], 2, 3, out=x[-6:])
-    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
-
-    x = dpt.ones(10, dtype="i4")
-    a_min = dpt.asarray(2, dtype="i4")
-    a_max = dpt.asarray(3, dtype="i2")
-    dpt.clip(x[:6], 2, 3, out=x[-6:])
-    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
-
-    x = dpt.ones(10, dtype="i4")
-    a_min = dpt.asarray(2, dtype="i2")
-    a_max = dpt.asarray(3, dtype="i4")
-    dpt.clip(x[:6], 2, 3, out=x[-6:])
-    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
-
-    x = dpt.ones(10, dtype="i4")
-    a_min = dpt.asarray(2, dtype="i2")
-    a_max = dpt.asarray(3, dtype="i1")
-    dpt.clip(x[:6], 2, 3, out=x[-6:])
-    assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2)
-
-    x = dpt.arange(12, dtype="i4")
-    dpt.clip(x[:6], out=x[-6:])
-    expected = dpt.arange(6, dtype="i4")
-    assert dpt.all(x[:-6] == expected) and dpt.all(x[-6:] == expected)
-
-    x = dpt.ones(10, dtype="i4")
-    dpt.clip(x, out=x)
-    assert dpt.all(x == 1)
-
-    x = dpt.full(6, 3, dtype="i4")
-    a_min = dpt.full(10, 2, dtype="i4")
-    a_max = dpt.asarray(4, dtype="i4")
-    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
-    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
-
-    x = dpt.full(6, 3, dtype="i4")
-    a_min = dpt.full(10, 2, dtype="i4")
-    a_max = dpt.asarray(4, dtype="i2")
-    dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:])
-    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
-
-
-def test_clip_out_need_temporary_none():
-    get_queue_or_skip()
-
-    x = dpt.full(6, 3, dtype="i4")
-    # with min/max == None
-    a_min = dpt.full(10, 2, dtype="i4")
-    dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:])
-    assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3)
-
-
-def test_clip_arg_validation():
-    get_queue_or_skip()
-
-    check = dict()
-    x1 = dpt.empty((1,), dtype="i4")
-    x2 = dpt.empty((1,), dtype="i4")
-
-    with pytest.raises(TypeError):
-        dpt.clip(check, x1, x2)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x1, check, x2)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x1, check)
-
-    with pytest.raises(TypeError):
-        dpt.clip(x1, x1, x2, out=check)
-
-    with pytest.raises(TypeError):
-        dpt.clip(x1, x2, out=check)
-
-    with pytest.raises(TypeError):
-        dpt.clip(x1, out=check)
-
-
-@pytest.mark.parametrize(
-    "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")]
-)
-def test_clip_order(dt1, dt2):
-    get_queue_or_skip()
-
-    test_shape = (
-        20,
-        20,
-    )
-    test_shape2 = tuple(2 * dim for dim in test_shape)
-    n = test_shape[-1]
-
-    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
-    ar2 = dpt.ones(test_shape, dtype=dt1, order="C")
-    ar3 = dpt.ones(test_shape, dtype=dt2, order="C")
-    r1 = dpt.clip(ar1, ar2, ar3, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.clip(ar1, ar2, ar3, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.clip(ar1, ar2, ar3, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.clip(ar1, ar2, ar3, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
-    ar2 = dpt.ones(test_shape, dtype=dt1, order="F")
-    ar3 = dpt.ones(test_shape, dtype=dt2, order="F")
-    r1 = dpt.clip(ar1, ar2, ar3, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.clip(ar1, ar2, ar3, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.clip(ar1, ar2, ar3, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.clip(ar1, ar2, ar3, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2]
-    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2]
-    r4 = dpt.clip(ar1, ar2, ar3, order="K")
-    assert r4.strides == (n, -1)
-    r5 = dpt.clip(ar1, ar2, ar3, order="C")
-    assert r5.strides == (n, 1)
-
-    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT
-    ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT
-    r4 = dpt.clip(ar1, ar2, ar3, order="K")
-    assert r4.strides == (-1, n)
-    r5 = dpt.clip(ar1, ar2, ar3, order="C")
-    assert r5.strides == (n, 1)
-
-
-@pytest.mark.parametrize("dt", ["i4", "i2"])
-def test_clip_none_order(dt):
-    get_queue_or_skip()
-
-    test_shape = (
-        20,
-        20,
-    )
-    test_shape2 = tuple(2 * dim for dim in test_shape)
-    n = test_shape[-1]
-
-    ar1 = dpt.ones(test_shape, dtype="i4", order="C")
-    ar2 = dpt.ones(test_shape, dtype=dt, order="C")
-
-    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
-    assert r3.flags.c_contiguous
-    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
-    assert r4.flags.c_contiguous
-
-    ar1 = dpt.ones(test_shape, dtype="i4", order="F")
-    ar2 = dpt.ones(test_shape, dtype=dt, order="F")
-
-    r1 = dpt.clip(ar1, min=None, max=ar2, order="C")
-    assert r1.flags.c_contiguous
-    r2 = dpt.clip(ar1, min=None, max=ar2, order="F")
-    assert r2.flags.f_contiguous
-    r3 = dpt.clip(ar1, min=None, max=ar2, order="A")
-    assert r3.flags.f_contiguous
-    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
-    assert r4.flags.f_contiguous
-
-    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2]
-    ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2]
-
-    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
-    assert r4.strides == (n, -1)
-    r5 = dpt.clip(ar1, min=None, max=ar2, order="C")
-    assert r5.strides == (n, 1)
-
-    ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT
-    ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT
-
-    r4 = dpt.clip(ar1, min=None, max=ar2, order="K")
-    assert r4.strides == (-1, n)
-    r5 = dpt.clip(ar1, min=None, max=ar2, order="C")
-    assert r5.strides == (n, 1)
-
-
-@pytest.mark.parametrize("usm_type1", _usm_types)
-@pytest.mark.parametrize("usm_type2", _usm_types)
-@pytest.mark.parametrize("usm_type3", _usm_types)
-def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2)
-    ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3)
-
-    r = dpt.clip(ar1, ar2, ar3)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type(
-        (usm_type1, usm_type2, usm_type3)
-    )
-    assert r.usm_type == expected_usm_type
-
-
-@pytest.mark.parametrize("usm_type1", _usm_types)
-@pytest.mark.parametrize("usm_type2", _usm_types)
-def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2):
-    get_queue_or_skip()
-
-    sz = 128
-    ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1)
-    ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2)
-
-    r = dpt.clip(ar1, min=ar2, max=None)
-    assert isinstance(r, dpt.usm_ndarray)
-    expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2))
-    assert r.usm_type == expected_usm_type
-
-
-def test_clip_dtype_error():
-    get_queue_or_skip()
-
-    ar1 = dpt.ones(1, dtype="i4")
-    ar2 = dpt.ones(1, dtype="i4")
-    ar3 = dpt.ones(1, dtype="i4")
-    ar4 = dpt.empty_like(ar1, dtype="f4")
-
-    assert_raises_regex(
-        ValueError,
-        "Output array of type.*is needed",
-        dpt.clip,
-        ar1,
-        ar2,
-        ar3,
-        ar4,
-    )
-    assert_raises_regex(
-        ValueError,
-        "Output array of type.*is needed",
-        dpt.clip,
-        ar1,
-        ar2,
-        None,
-        ar4,
-    )
-
-
-def test_clip_errors():
-    get_queue_or_skip()
-    try:
-        gpu_queue = dpctl.SyclQueue("gpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('gpu') failed, skipping")
-    try:
-        cpu_queue = dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("SyclQueue('cpu') failed, skipping")
-
-    ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue)
-    ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
-    ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue)
-    ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue)
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Input and output allocation queues are not compatible",
-        dpt.clip,
-        ar1,
-        ar2,
-        ar3,
-        ar4,
-    )
-
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Input and output allocation queues are not compatible",
-        dpt.clip,
-        ar1,
-        None,
-        ar3,
-        ar4,
-    )
-
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Execution placement can not be unambiguously inferred from input "
-        "arguments.",
-        dpt.clip,
-        ar1,
-        ar4,
-        ar2,
-        ar3,
-    )
-
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Execution placement can not be unambiguously inferred from input "
-        "arguments.",
-        dpt.clip,
-        ar1,
-        ar4,
-        1,
-        ar3,
-    )
-
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Execution placement can not be unambiguously inferred from input "
-        "arguments.",
-        dpt.clip,
-        ar1,
-        1,
-        ar4,
-        ar3,
-    )
-
-    assert_raises_regex(
-        ExecutionPlacementError,
-        "Execution placement can not be unambiguously inferred from input "
-        "arguments.",
-        dpt.clip,
-        ar1,
-        ar4,
-        None,
-        ar2,
-    )
-
-    ar1 = dpt.ones(2, dtype="float32")
-    ar2 = dpt.ones_like(ar1, dtype="float32")
-    ar3 = dpt.ones_like(ar1, dtype="float32")
-    ar4 = dpt.empty(3, dtype="float32")
-    assert_raises_regex(
-        ValueError,
-        "The shape of input and output arrays are inconsistent",
-        dpt.clip,
-        ar1,
-        ar2,
-        ar3,
-        ar4,
-    )
-
-    assert_raises_regex(
-        ValueError,
-        "The shape of input and output arrays are inconsistent",
-        dpt.clip,
-        ar1,
-        ar2,
-        None,
-        ar4,
-    )
-
-    ar1 = np.ones(2, dtype="f4")
-    ar2 = dpt.ones(2, dtype="f4")
-    ar3 = dpt.ones(2, dtype="f4")
-    assert_raises_regex(
-        TypeError,
-        "Expected `x` to be of dpctl.tensor.usm_ndarray type*",
-        dpt.clip,
-        ar1,
-        ar2,
-        ar3,
-    )
-
-    ar1 = dpt.ones(2, dtype="i4")
-    ar2 = dpt.ones_like(ar1, dtype="i4")
-    ar3 = dpt.ones_like(ar1, dtype="i4")
-    ar4 = np.empty(ar1.shape, dtype=ar1.dtype)
-    assert_raises_regex(
-        TypeError,
-        "output array must be of usm_ndarray type",
-        dpt.clip,
-        ar1,
-        ar2,
-        ar3,
-        ar4,
-    )
-
-    assert_raises_regex(
-        TypeError,
-        "output array must be of usm_ndarray type",
-        dpt.clip,
-        ar1,
-        ar2,
-        None,
-        ar4,
-    )
-
-
-def test_clip_out_type_check():
-    get_queue_or_skip()
-
-    x1 = dpt.ones(10)
-    x2 = dpt.ones(10)
-    x3 = dpt.ones(10)
-
-    out = range(10)
-
-    with pytest.raises(TypeError):
-        dpt.clip(x1, x2, x3, out=out)
-
-
-@pytest.mark.parametrize("dt", ["i4", "f4", "c8"])
-def test_clip_basic(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    sz = 1026
-    x = dpt.arange(sz, dtype=dt, sycl_queue=q)
-    r = dpt.clip(x, min=100, max=500)
-    expected = dpt.arange(sz, dtype=dt, sycl_queue=q)
-    expected[:100] = 100
-    expected[500:] = 500
-    assert dpt.all(expected == r)
-
-    x = dpt.zeros(sz, dtype=dt, sycl_queue=q)
-    a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q)
-    a_max[::2] = -2
-    r = dpt.clip(x, min=-3, max=a_max)
-    assert dpt.all(a_max == r)
-
-
-@pytest.mark.parametrize("dt", ["i4", "f4", "c8"])
-def test_clip_strided(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    sz = 2 * 1026
-    x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2]
-    r = dpt.clip(x, min=100, max=500)
-    expected = dpt.arange(sz, dtype=dt, sycl_queue=q)
-    expected[:100] = 100
-    expected[500:] = 500
-    expected = expected[::-2]
-    assert dpt.all(expected == r)
-
-    x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2]
-    a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q)
-    a_max[::2] = -2
-    a_max = a_max[::-2]
-    r = dpt.clip(x, min=-3, max=a_max)
-    assert dpt.all(a_max == r)
-
-
-def test_clip_max_less_than_min():
-    get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="i4")
-    res = dpt.clip(x, 5, 0)
-    assert dpt.all(res == 0)
-
-
-@pytest.mark.parametrize("dt", ["?", "i4", "f4", "c8"])
-def test_clip_minmax_weak_types(dt):
-    get_queue_or_skip()
-
-    x = dpt.zeros(10, dtype=dt)
-    min_list = [False, 0, 0.0, 0.0 + 0.0j]
-    max_list = [True, 1, 1.0, 1.0 + 0.0j]
-
-    for min_v, max_v in zip(min_list, max_list):
-        st_dt = _strong_dtype_num_kind(dpt.dtype(dt))
-        wk_dt1 = _weak_type_num_kind(_get_dtype(min_v, x.sycl_device))
-        wk_dt2 = _weak_type_num_kind(_get_dtype(max_v, x.sycl_device))
-
-        if st_dt >= wk_dt1 and st_dt >= wk_dt2:
-            r = dpt.clip(x, min_v, max_v)
-            assert isinstance(r, dpt.usm_ndarray)
-        else:
-            with pytest.raises(ValueError):
-                dpt.clip(x, min_v, max_v)
-
-        if st_dt >= wk_dt1:
-            r = dpt.clip(x, min_v)
-            assert isinstance(r, dpt.usm_ndarray)
-
-            r = dpt.clip(x, None, min_v)
-            assert isinstance(r, dpt.usm_ndarray)
-        else:
-            with pytest.raises(ValueError):
-                dpt.clip(x, min_v)
-            with pytest.raises(ValueError):
-                dpt.clip(x, None, max_v)
-
-
-def test_clip_max_weak_type_errors():
-    get_queue_or_skip()
-
-    x = dpt.zeros(10, dtype="i4")
-    m = dpt.ones(10, dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, m, 2.5)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, 2.5, m)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, 2.5)
-
-    with pytest.raises(ValueError):
-        dpt.clip(dpt.astype(x, "?"), 2)
-
-    with pytest.raises(ValueError):
-        dpt.clip(dpt.astype(x, "f4"), complex(2))
-
-
-def test_clip_unaligned():
-    get_queue_or_skip()
-
-    x = dpt.full(513, 5, dtype="i4")
-    a_min = dpt.zeros(512, dtype="i4")
-    a_max = dpt.full(512, 2, dtype="i4")
-
-    expected = dpt.full(512, 2, dtype="i4")
-    assert dpt.all(dpt.clip(x[1:], a_min, a_max) == expected)
-
-
-def test_clip_none_args():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    r = dpt.clip(x)
-    assert dpt.all(x == r)
-
-
-def test_clip_shape_errors():
-    get_queue_or_skip()
-
-    x = dpt.ones((4, 4), dtype="i4")
-    a_min = dpt.ones(5, dtype="i4")
-    a_max = dpt.ones(5, dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, a_min, a_max)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, a_min)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, 0, 1, out=a_min)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, 0, out=a_min)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, out=a_min)
-
-
-def test_clip_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="i4", sycl_queue=q1)
-    a_min = dpt.ones(10, dtype="i4", sycl_queue=q2)
-    a_max = dpt.ones(10, dtype="i4", sycl_queue=q1)
-    res = dpt.empty_like(x, sycl_queue=q2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.clip(x, a_min, a_max)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.clip(x, dpt.ones_like(x), a_max, out=res)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.clip(x, a_min)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.clip(x, None, a_max, out=res)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.clip(x, out=res)
-
-
-def test_clip_readonly_out():
-    get_queue_or_skip()
-    x = dpt.arange(32, dtype=dpt.int32)
-    r = dpt.empty_like(x)
-    r.flags["W"] = False
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, min=0, max=10, out=r)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, max=10, out=r)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, min=0, out=r)
-
-    with pytest.raises(ValueError):
-        dpt.clip(x, out=r)
-
-
-def test_clip_gh_1744():
-    get_queue_or_skip()
-    x = dpt.asarray([0, 255], dtype=dpt.uint8)
-    y = dpt.clip(x, -300, 300)
-
-    assert dpt.all(x == y)
diff --git a/dpctl/tests/test_tensor_copy_utils.py b/dpctl/tests/test_tensor_copy_utils.py
deleted file mode 100644
index 61ca3ec87c..0000000000
--- a/dpctl/tests/test_tensor_copy_utils.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-import dpctl.tensor._copy_utils as cu
-from dpctl.tests.helper import get_queue_or_skip
-
-
-def test_copy_utils_empty_like_orderK():
-    get_queue_or_skip()
-    a = dpt.empty((10, 10), dtype=dpt.int32, order="F")
-    X = cu._empty_like_orderK(a, dpt.int32, a.usm_type, a.device)
-    assert X.flags["F"]
-
-
-def test_copy_utils_empty_like_orderK_invalid_args():
-    get_queue_or_skip()
-    with pytest.raises(TypeError):
-        cu._empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
-    with pytest.raises(TypeError):
-        cu._empty_like_pair_orderK(
-            [1, 2, 3],
-            (
-                1,
-                2,
-                3,
-            ),
-            dpt.int32,
-            (3,),
-            "device",
-            None,
-        )
-
-    a = dpt.empty(10, dtype=dpt.int32)
-    with pytest.raises(TypeError):
-        cu._empty_like_pair_orderK(
-            a,
-            (
-                1,
-                2,
-                3,
-            ),
-            dpt.int32,
-            (10,),
-            "device",
-            None,
-        )
-
-
-def test_copy_utils_from_numpy_empty_like_orderK():
-    q = get_queue_or_skip()
-
-    a = np.empty((10, 10), dtype=np.int32, order="C")
-    r0 = cu._from_numpy_empty_like_orderK(a, dpt.int32, "device", q)
-    assert r0.flags["C"]
-
-    b = np.empty((10, 10), dtype=np.int32, order="F")
-    r1 = cu._from_numpy_empty_like_orderK(b, dpt.int32, "device", q)
-    assert r1.flags["F"]
-
-    c = np.empty((2, 3, 4), dtype=np.int32, order="C")
-    c = np.transpose(c, (1, 0, 2))
-    r2 = cu._from_numpy_empty_like_orderK(c, dpt.int32, "device", q)
-    assert not r2.flags["C"] and not r2.flags["F"]
-
-
-def test_copy_utils_from_numpy_empty_like_orderK_invalid_args():
-    with pytest.raises(TypeError):
-        cu._from_numpy_empty_like_orderK([1, 2, 3], dpt.int32, "device", None)
-
-
-def test_gh_2055():
-    """
-    Test that `dpt.asarray` works on contiguous NumPy arrays with `order="K"`
-    when dimensions are permuted.
-
-    See: https://github.com/IntelPython/dpctl/issues/2055
-    """
-    get_queue_or_skip()
-
-    a = np.ones((2, 3, 4), dtype=dpt.int32)
-    a_t = np.transpose(a, (2, 0, 1))
-    r = dpt.asarray(a_t)
-    assert not r.flags["C"] and not r.flags["F"]
diff --git a/dpctl/tests/test_tensor_diff.py b/dpctl/tests/test_tensor_diff.py
deleted file mode 100644
index 3b7c000fae..0000000000
--- a/dpctl/tests/test_tensor_diff.py
+++ /dev/null
@@ -1,329 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from math import prod
-
-import pytest
-from numpy.testing import assert_raises_regex
-
-import dpctl.tensor as dpt
-from dpctl.tensor._type_utils import _to_device_supported_dtype
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
-
-_all_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_diff_basic(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype=dt, sycl_queue=q)
-    op = dpt.not_equal if x.dtype is dpt.bool else dpt.subtract
-
-    # test both n=2 and n>2 branches
-    for n in [1, 2, 5]:
-        res = dpt.diff(x, n=n)
-        expected_res = x
-        for _ in range(n):
-            expected_res = op(expected_res[1:], expected_res[:-1])
-        if dpt.dtype(dt).kind in "fc":
-            assert dpt.allclose(res, expected_res)
-        else:
-            assert dpt.all(res == expected_res)
-
-
-def test_diff_axis():
-    get_queue_or_skip()
-
-    x = dpt.tile(
-        dpt.asarray([9, 12, 7, 17, 10, 18, 15, 9, 8, 8], dtype="i4"), (3, 4, 1)
-    )
-    x[:, ::2, :] = 0
-
-    for n in [1, 2, 3]:
-        res = dpt.diff(x, n=n, axis=1)
-        expected_res = x
-        for _ in range(n):
-            expected_res = dpt.subtract(
-                expected_res[:, 1:, :], expected_res[:, :-1, :]
-            )
-        assert dpt.all(res == expected_res)
-
-
-def test_diff_prepend_append_type_promotion():
-    get_queue_or_skip()
-
-    dts = [
-        ("i1", "u1", "i8"),
-        ("i1", "u8", "u1"),
-        ("u4", "i4", "f4"),
-        ("i8", "c8", "u8"),
-    ]
-
-    for dt0, dt1, dt2 in dts:
-        x = dpt.ones(10, dtype=dt1)
-        prepend = dpt.full(1, 2, dtype=dt0)
-        append = dpt.full(1, 3, dtype=dt2)
-
-        res = dpt.diff(x, prepend=prepend, append=append)
-        assert res.dtype == _to_device_supported_dtype(
-            dpt.result_type(prepend, x, append),
-            x.sycl_queue.sycl_device,
-        )
-
-        res = dpt.diff(x, prepend=prepend)
-        assert res.dtype == _to_device_supported_dtype(
-            dpt.result_type(prepend, x),
-            x.sycl_queue.sycl_device,
-        )
-
-        res = dpt.diff(x, append=append)
-        assert res.dtype == _to_device_supported_dtype(
-            dpt.result_type(x, append),
-            x.sycl_queue.sycl_device,
-        )
-
-
-def test_diff_0d():
-    get_queue_or_skip()
-
-    x = dpt.ones(())
-    with pytest.raises(ValueError):
-        dpt.diff(x)
-
-
-def test_diff_empty_array():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 0, 5))
-    res = dpt.diff(x, axis=1)
-    assert res.shape == x.shape
-
-    res = dpt.diff(x, axis=0)
-    assert res.shape == (2, 0, 5)
-
-    append = dpt.ones((3, 2, 5))
-    res = dpt.diff(x, axis=1, append=append)
-    assert res.shape == (3, 1, 5)
-
-    prepend = dpt.ones((3, 2, 5))
-    res = dpt.diff(x, axis=1, prepend=prepend)
-    assert res.shape == (3, 1, 5)
-
-
-def test_diff_no_op():
-    get_queue_or_skip()
-
-    x = dpt.ones(10, dtype="i4")
-    res = dpt.diff(x, n=0)
-    assert dpt.all(x == res)
-
-    x = dpt.reshape(x, (2, 5))
-    res = dpt.diff(x, n=0, axis=0)
-    assert dpt.all(x == res)
-
-
-@pytest.mark.parametrize("sh,axis", [((1,), 0), ((3, 4, 5), 1)])
-def test_diff_prepend_append_py_scalars(sh, axis):
-    get_queue_or_skip()
-
-    n = 1
-
-    arr = dpt.ones(sh, dtype="i4")
-    zero = 0
-
-    # first and last elements along axis
-    # will be checked for correctness
-    sl1 = [slice(None)] * arr.ndim
-    sl1[axis] = slice(1)
-    sl1 = tuple(sl1)
-
-    sl2 = [slice(None)] * arr.ndim
-    sl2[axis] = slice(-1, None, None)
-    sl2 = tuple(sl2)
-
-    r = dpt.diff(arr, axis=axis, prepend=zero, append=zero)
-    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
-    assert r.shape[axis] == arr.shape[axis] + 2 - n
-    assert dpt.all(r[sl1] == 1)
-    assert dpt.all(r[sl2] == -1)
-
-    r = dpt.diff(arr, axis=axis, prepend=zero)
-    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
-    assert r.shape[axis] == arr.shape[axis] + 1 - n
-    assert dpt.all(r[sl1] == 1)
-
-    r = dpt.diff(arr, axis=axis, append=zero)
-    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
-    assert r.shape[axis] == arr.shape[axis] + 1 - n
-    assert dpt.all(r[sl2] == -1)
-
-    r = dpt.diff(arr, axis=axis, prepend=dpt.asarray(zero), append=zero)
-    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
-    assert r.shape[axis] == arr.shape[axis] + 2 - n
-    assert dpt.all(r[sl1] == 1)
-    assert dpt.all(r[sl2] == -1)
-
-    r = dpt.diff(arr, axis=axis, prepend=zero, append=dpt.asarray(zero))
-    assert all(r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis)
-    assert r.shape[axis] == arr.shape[axis] + 2 - n
-    assert dpt.all(r[sl1] == 1)
-    assert dpt.all(r[sl2] == -1)
-
-
-def test_tensor_diff_append_prepend_arrays():
-    get_queue_or_skip()
-
-    n = 1
-    axis = 0
-
-    for sh in [(5,), (3, 4, 5)]:
-        sz = prod(sh)
-        arr = dpt.reshape(dpt.arange(sz, 2 * sz, dtype="i4"), sh)
-        prepend = dpt.reshape(dpt.arange(sz, dtype="i4"), sh)
-        append = dpt.reshape(dpt.arange(2 * sz, 3 * sz, dtype="i4"), sh)
-        const_diff = sz / sh[axis]
-
-        r = dpt.diff(arr, axis=axis, prepend=prepend, append=append)
-        assert all(
-            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
-        )
-        assert (
-            r.shape[axis]
-            == arr.shape[axis] + prepend.shape[axis] + append.shape[axis] - n
-        )
-        assert dpt.all(r == const_diff)
-
-        r = dpt.diff(arr, axis=axis, prepend=prepend)
-        assert all(
-            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
-        )
-        assert r.shape[axis] == arr.shape[axis] + prepend.shape[axis] - n
-        assert dpt.all(r == const_diff)
-
-        r = dpt.diff(arr, axis=axis, append=append)
-        assert all(
-            r.shape[i] == arr.shape[i] for i in range(arr.ndim) if i != axis
-        )
-        assert r.shape[axis] == arr.shape[axis] + append.shape[axis] - n
-        assert dpt.all(r == const_diff)
-
-
-def test_diff_wrong_append_prepend_shape():
-    get_queue_or_skip()
-
-    arr = dpt.ones((3, 4, 5), dtype="i4")
-    arr_bad_sh = dpt.ones(2, dtype="i4")
-
-    assert_raises_regex(
-        ValueError,
-        ".*shape.*is invalid.*",
-        dpt.diff,
-        arr,
-        prepend=arr_bad_sh,
-        append=arr_bad_sh,
-    )
-
-    assert_raises_regex(
-        ValueError,
-        ".*shape.*is invalid.*",
-        dpt.diff,
-        arr,
-        prepend=arr,
-        append=arr_bad_sh,
-    )
-
-    assert_raises_regex(
-        ValueError,
-        ".*shape.*is invalid.*",
-        dpt.diff,
-        arr,
-        prepend=arr_bad_sh,
-    )
-
-    assert_raises_regex(
-        ValueError,
-        ".*shape.*is invalid.*",
-        dpt.diff,
-        arr,
-        append=arr_bad_sh,
-    )
-
-
-def test_diff_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-    q3 = get_queue_or_skip()
-
-    ar1 = dpt.ones(1, dtype="i4", sycl_queue=q1)
-    ar2 = dpt.ones(1, dtype="i4", sycl_queue=q2)
-    ar3 = dpt.ones(1, dtype="i4", sycl_queue=q3)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.diff(ar1, prepend=ar2, append=ar3)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.diff(ar1, prepend=ar2, append=0)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.diff(ar1, prepend=0, append=ar2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.diff(ar1, prepend=ar2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.diff(ar1, append=ar2)
-
-
-def test_diff_input_validation():
-    bad_in = dict()
-    assert_raises_regex(
-        TypeError,
-        "Expecting dpctl.tensor.usm_ndarray type, got.*",
-        dpt.diff,
-        bad_in,
-    )
-
-
-def test_diff_positive_order():
-    get_queue_or_skip()
-
-    x = dpt.ones(1, dtype="i4")
-    n = -1
-    assert_raises_regex(
-        ValueError,
-        ".*must be positive.*",
-        dpt.diff,
-        x,
-        n=n,
-    )
diff --git a/dpctl/tests/test_tensor_dtype_routines.py b/dpctl/tests/test_tensor_dtype_routines.py
deleted file mode 100644
index 0e0ae689a0..0000000000
--- a/dpctl/tests/test_tensor_dtype_routines.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-
-list_dtypes = [
-    "bool",
-    "int8",
-    "int16",
-    "int32",
-    "int64",
-    "uint8",
-    "uint16",
-    "uint32",
-    "uint64",
-    "float16",
-    "float32",
-    "float64",
-    "complex64",
-    "complex128",
-]
-
-
-dtype_categories = {
-    "bool": ["bool"],
-    "signed integer": ["int8", "int16", "int32", "int64"],
-    "unsigned integer": ["uint8", "uint16", "uint32", "uint64"],
-    "integral": [
-        "int8",
-        "int16",
-        "int32",
-        "int64",
-        "uint8",
-        "uint16",
-        "uint32",
-        "uint64",
-    ],
-    "real floating": ["float16", "float32", "float64"],
-    "complex floating": ["complex64", "complex128"],
-    "numeric": [d for d in list_dtypes if d != "bool"],
-}
-
-
-@pytest.mark.parametrize("kind_str", dtype_categories.keys())
-@pytest.mark.parametrize("dtype_str", list_dtypes)
-def test_isdtype_kind_str(dtype_str, kind_str):
-    dt = dpt.dtype(dtype_str)
-    is_in_kind = dpt.isdtype(dt, kind_str)
-    expected = dtype_str in dtype_categories[kind_str]
-    assert is_in_kind == expected
-
-
-@pytest.mark.parametrize("dtype_str", list_dtypes)
-def test_isdtype_kind_tuple(dtype_str):
-    dt = dpt.dtype(dtype_str)
-    if dtype_str.startswith("bool"):
-        assert dpt.isdtype(dt, ("real floating", "bool"))
-        assert not dpt.isdtype(
-            dt, ("integral", "real floating", "complex floating")
-        )
-    elif dtype_str.startswith("int"):
-        assert dpt.isdtype(dt, ("real floating", "signed integer"))
-        assert not dpt.isdtype(
-            dt, ("bool", "unsigned integer", "real floating")
-        )
-    elif dtype_str.startswith("uint"):
-        assert dpt.isdtype(dt, ("bool", "unsigned integer"))
-        assert not dpt.isdtype(dt, ("real floating", "complex floating"))
-    elif dtype_str.startswith("float"):
-        assert dpt.isdtype(dt, ("complex floating", "real floating"))
-        assert not dpt.isdtype(dt, ("integral", "complex floating", "bool"))
-    else:
-        assert dpt.isdtype(dt, ("integral", "complex floating"))
-        assert not dpt.isdtype(dt, ("bool", "integral", "real floating"))
-
-
-@pytest.mark.parametrize("dtype_str", list_dtypes)
-def test_isdtype_kind_tuple_dtypes(dtype_str):
-    dt = dpt.dtype(dtype_str)
-    if dtype_str.startswith("bool"):
-        assert dpt.isdtype(dt, (dpt.int32, dpt.bool))
-        assert not dpt.isdtype(dt, (dpt.int16, dpt.uint32, dpt.float64))
-
-    elif dtype_str.startswith("int"):
-        assert dpt.isdtype(dt, (dpt.int8, dpt.int16, dpt.int32, dpt.int64))
-        assert not dpt.isdtype(dt, (dpt.bool, dpt.float32, dpt.complex64))
-
-    elif dtype_str.startswith("uint"):
-        assert dpt.isdtype(dt, (dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64))
-        assert not dpt.isdtype(dt, (dpt.bool, dpt.int32, dpt.float32))
-
-    elif dtype_str.startswith("float"):
-        assert dpt.isdtype(dt, (dpt.float16, dpt.float32, dpt.float64))
-        assert not dpt.isdtype(dt, (dpt.bool, dpt.complex64, dpt.int8))
-
-    else:
-        assert dpt.isdtype(dt, (dpt.complex64, dpt.complex128))
-        assert not dpt.isdtype(dt, (dpt.bool, dpt.uint64, dpt.int8))
-
-
-@pytest.mark.parametrize(
-    "kind",
-    [
-        [dpt.int32, dpt.bool],
-        "f4",
-        float,
-        123,
-        "complex",
-    ],
-)
-def test_isdtype_invalid_kind(kind):
-    with pytest.raises((TypeError, ValueError)):
-        dpt.isdtype(dpt.int32, kind)
-
-
-def test_finfo_array():
-    try:
-        x = dpt.empty(tuple(), dtype="f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("Default-selected SYCL device unavailable")
-    o = dpt.finfo(x)
-    assert o.dtype == dpt.float32
-
-
-def test_iinfo_array():
-    try:
-        x = dpt.empty(tuple(), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("Default-selected SYCL device unavailable")
-    o = dpt.iinfo(x)
-    assert o.dtype == dpt.int32
-
-
-def test_iinfo_validation():
-    with pytest.raises(ValueError):
-        dpt.iinfo("O")
-
-
-def test_finfo_validation():
-    with pytest.raises(ValueError):
-        dpt.iinfo("O")
diff --git a/dpctl/tests/test_tensor_isin.py b/dpctl/tests/test_tensor_isin.py
deleted file mode 100644
index eef029ffd0..0000000000
--- a/dpctl/tests/test_tensor_isin.py
+++ /dev/null
@@ -1,266 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
-
-_numeric_dtypes = [
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-_all_dtypes = ["?"] + _numeric_dtypes
-
-
-@pytest.mark.parametrize("dtype", _numeric_dtypes)
-def test_isin_basic(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n = 100
-    x = dpt.arange(n, dtype=dtype, sycl_queue=q)
-    test = dpt.arange(n - 1, dtype=dtype, sycl_queue=q)
-    r1 = dpt.isin(x, test)
-    assert dpt.all(r1[:-1])
-    assert not r1[-1]
-    assert r1.shape == x.shape
-
-    # test with invert keyword
-    r2 = dpt.isin(x, test, invert=True)
-    assert not dpt.any(r2[:-1])
-    assert r2[-1]
-    assert r2.shape == x.shape
-
-
-def test_isin_basic_bool():
-    dt = dpt.bool
-    n = 100
-    x = dpt.zeros(n, dtype=dt)
-    x[-1] = True
-    test = dpt.zeros((), dtype=dt)
-    r1 = dpt.isin(x, test)
-    assert dpt.all(r1[:-1])
-    assert not r1[-1]
-    assert r1.shape == x.shape
-
-    r2 = dpt.isin(x, test, invert=True)
-    assert not dpt.any(r2[:-1])
-    assert r2[-1]
-    assert r2.shape == x.shape
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_isin_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, m = 100, 20
-    x = dpt.zeros((n, m), dtype=dtype, order="F", sycl_queue=q)
-    x[:, ::2] = dpt.arange(1, (m / 2) + 1, dtype=dtype, sycl_queue=q)
-    x_s = x[:, ::2]
-    test = dpt.arange(1, (m / 2), dtype=dtype, sycl_queue=q)
-    r1 = dpt.isin(x_s, test)
-    assert dpt.all(r1[:, :-1])
-    assert not dpt.any(r1[:, -1])
-    assert not dpt.any(x[:, 1::2])
-    assert r1.shape == x_s.shape
-    assert r1.flags.c_contiguous
-
-    # test with invert keyword
-    r2 = dpt.isin(x_s, test, invert=True)
-    assert not dpt.any(r2[:, :-1])
-    assert dpt.all(r2[:, -1])
-    assert not dpt.any(x[:, 1:2])
-    assert r2.shape == x_s.shape
-    assert r2.flags.c_contiguous
-
-
-def test_isin_strided_bool():
-    dt = dpt.bool
-
-    n, m = 100, 20
-    x = dpt.zeros((n, m), dtype=dt, order="F")
-    x[:, :-2:2] = True
-    x_s = x[:, ::2]
-    test = dpt.ones((), dtype=dt)
-    r1 = dpt.isin(x_s, test)
-    assert dpt.all(r1[:, :-1])
-    assert not dpt.any(r1[:, -1])
-    assert not dpt.any(x[:, 1::2])
-    assert r1.shape == x_s.shape
-    assert r1.flags.c_contiguous
-
-    # test with invert keyword
-    r2 = dpt.isin(x_s, test, invert=True)
-    assert not dpt.any(r2[:, :-1])
-    assert dpt.all(r2[:, -1])
-    assert not dpt.any(x[:, 1:2])
-    assert r2.shape == x_s.shape
-    assert r2.flags.c_contiguous
-
-
-@pytest.mark.parametrize("dt1", _numeric_dtypes)
-@pytest.mark.parametrize("dt2", _numeric_dtypes)
-def test_isin_dtype_matrix(dt1, dt2):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt1, q)
-    skip_if_dtype_not_supported(dt2, q)
-
-    sz = 10
-    x = dpt.asarray([0, 1, 11], dtype=dt1, sycl_queue=q)
-    test1 = dpt.arange(sz, dtype=dt2, sycl_queue=q)
-
-    r1 = dpt.isin(x, test1)
-    assert isinstance(r1, dpt.usm_ndarray)
-    assert r1.dtype == dpt.bool
-    assert r1.shape == x.shape
-    assert not r1[-1]
-    assert dpt.all(r1[0:-1])
-    assert r1.sycl_queue == x.sycl_queue
-
-    test2 = dpt.tile(dpt.asarray([[0, 1]], dtype=dt2, sycl_queue=q).mT, 2)
-    r2 = dpt.isin(x, test2)
-    assert isinstance(r2, dpt.usm_ndarray)
-    assert r2.dtype == dpt.bool
-    assert r2.shape == x.shape
-    assert not r2[-1]
-    assert dpt.all(r1[0:-1])
-    assert r2.sycl_queue == x.sycl_queue
-
-
-def test_isin_empty_inputs():
-    get_queue_or_skip()
-
-    x = dpt.ones((10, 0, 1), dtype="i4")
-    test = dpt.ones((), dtype="i4")
-    res1 = dpt.isin(x, test)
-    assert isinstance(res1, dpt.usm_ndarray)
-    assert res1.size == 0
-    assert res1.shape == x.shape
-    assert res1.dtype == dpt.bool
-
-    res2 = dpt.isin(x, test, invert=True)
-    assert isinstance(res2, dpt.usm_ndarray)
-    assert res2.size == 0
-    assert res2.shape == x.shape
-    assert res2.dtype == dpt.bool
-
-    x = dpt.ones((3, 3), dtype="i4")
-    test = dpt.ones(0, dtype="i4")
-    res3 = dpt.isin(x, test)
-    assert isinstance(res3, dpt.usm_ndarray)
-    assert res3.shape == x.shape
-    assert res3.dtype == dpt.bool
-    assert not dpt.all(res3)
-
-    res4 = dpt.isin(x, test, invert=True)
-    assert isinstance(res4, dpt.usm_ndarray)
-    assert res4.shape == x.shape
-    assert res4.dtype == dpt.bool
-    assert dpt.all(res4)
-
-
-def test_isin_validation():
-    get_queue_or_skip()
-    with pytest.raises(ExecutionPlacementError):
-        dpt.isin(1, 1)
-    not_bool = dict()
-    with pytest.raises(TypeError):
-        dpt.isin(dpt.ones([1]), dpt.ones([1]), invert=not_bool)
-
-
-def test_isin_special_floating_point_vals():
-    get_queue_or_skip()
-
-    # real and complex nans compare false
-    x = dpt.asarray(dpt.nan, dtype="f4")
-    test = dpt.asarray(dpt.nan, dtype="f4")
-    assert not dpt.isin(x, test)
-
-    x = dpt.asarray(dpt.nan, dtype="c8")
-    test = dpt.asarray(dpt.nan, dtype="c8")
-    assert not dpt.isin(x, test)
-
-    # -0.0 compares equal to +0.0
-    x = dpt.asarray(-0.0, dtype="f4")
-    test = dpt.asarray(0.0, dtype="f4")
-    assert dpt.isin(x, test)
-    assert dpt.isin(test, x)
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_isin_py_scalars(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.zeros((10, 10), dtype=dt, sycl_queue=q)
-    py_zeros = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_zeros:
-        r1 = dpt.isin(x, sc)
-        assert isinstance(r1, dpt.usm_ndarray)
-        r2 = dpt.isin(sc, x)
-        assert isinstance(r2, dpt.usm_ndarray)
-
-
-def test_isin_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.ones(10, sycl_queue=q1)
-    test = dpt.ones_like(x, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.isin(x, test)
diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py
deleted file mode 100644
index 03231de305..0000000000
--- a/dpctl/tests/test_tensor_statistical_functions.py
+++ /dev/null
@@ -1,255 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tensor._tensor_impl import default_device_fp_type
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_no_complex_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-]
-
-
-@pytest.mark.parametrize("dt", _no_complex_dtypes)
-def test_mean_dtypes(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.ones(10, dtype=dt)
-    res = dpt.mean(x)
-    assert res == 1
-    if x.dtype.kind in "biu":
-        assert res.dtype == dpt.dtype(default_device_fp_type(q))
-    else:
-        assert res.dtype == x.dtype
-
-
-@pytest.mark.parametrize("dt", _no_complex_dtypes)
-@pytest.mark.parametrize("py_zero", [float(0), int(0)])
-def test_std_var_dtypes(dt, py_zero):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    x = dpt.ones(10, dtype=dt)
-    res = dpt.std(x, correction=py_zero)
-    assert res == 0
-    if x.dtype.kind in "biu":
-        assert res.dtype == dpt.dtype(default_device_fp_type(q))
-    else:
-        assert res.dtype == x.dtype
-
-    res = dpt.var(x, correction=py_zero)
-    assert res == 0
-    if x.dtype.kind in "biu":
-        assert res.dtype == dpt.dtype(default_device_fp_type(q))
-    else:
-        assert res.dtype == x.dtype
-
-
-def test_stat_fns_axis():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
-    m = dpt.mean(x, axis=(1, 2, -1))
-
-    assert isinstance(m, dpt.usm_ndarray)
-    assert m.shape == (3, 6)
-    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
-
-    s = dpt.var(x, axis=(1, 2, -1))
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 6)
-    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
-
-
-@pytest.mark.parametrize("fn", [dpt.mean, dpt.var])
-def test_stat_fns_empty(fn):
-    get_queue_or_skip()
-    x = dpt.empty((0,), dtype="f4")
-    r = fn(x)
-    assert r.shape == tuple()
-    assert dpt.isnan(r)
-
-    x = dpt.empty((10, 0, 2), dtype="f4")
-    r = fn(x, axis=1)
-    assert r.shape == (10, 2)
-    assert dpt.all(dpt.isnan(r))
-
-    r = fn(x, axis=0)
-    assert r.shape == (0, 2)
-    assert r.size == 0
-
-
-def test_stat_fns_keepdims():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
-    m = dpt.mean(x, axis=(1, 2, -1), keepdims=True)
-
-    assert isinstance(m, dpt.usm_ndarray)
-    assert m.shape == (3, 1, 1, 6, 1)
-    assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype))
-
-    s = dpt.var(x, axis=(1, 2, -1), keepdims=True)
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 1, 1, 6, 1)
-    assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype))
-
-
-def test_stat_fns_empty_axis():
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
-    m = dpt.mean(x, axis=())
-
-    assert x.shape == m.shape
-    assert dpt.all(x == m)
-
-    s = dpt.var(x, axis=())
-    assert x.shape == s.shape
-    assert dpt.all(s == 0)
-
-    d = dpt.std(x, axis=())
-    assert x.shape == d.shape
-    assert dpt.all(d == 0)
-
-
-def test_mean():
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
-    m = dpt.mean(x)
-    expected = dpt.asarray(4, dtype="f4")
-    assert dpt.allclose(m, expected)
-
-    m = dpt.mean(x, axis=0)
-    expected = dpt.arange(3, 6, dtype="f4")
-    assert dpt.allclose(m, expected)
-
-    m = dpt.mean(x, axis=1)
-    expected = dpt.asarray([1, 4, 7], dtype="f4")
-    assert dpt.allclose(m, expected)
-
-
-def test_var_std():
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
-    r = dpt.var(x)
-    expected = dpt.asarray(6.666666507720947, dtype="f4")
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.var(x, correction=3)
-    expected1 = dpt.asarray(10.0, dtype="f4")
-    assert dpt.allclose(r1, expected1)
-
-    r = dpt.std(x)
-    expected = dpt.sqrt(expected)
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.std(x, correction=3)
-    expected1 = dpt.sqrt(expected1)
-    assert dpt.allclose(r1, expected1)
-
-    r = dpt.var(x, axis=0)
-    expected = dpt.full(x.shape[1], 6, dtype="f4")
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.var(x, axis=0, correction=1)
-    expected1 = dpt.full(x.shape[1], 9, dtype="f4")
-    assert dpt.allclose(r1, expected1)
-
-    r = dpt.std(x, axis=0)
-    expected = dpt.sqrt(expected)
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.std(x, axis=0, correction=1)
-    expected1 = dpt.sqrt(expected1)
-    assert dpt.allclose(r1, expected1)
-
-    r = dpt.var(x, axis=1)
-    expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4")
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.var(x, axis=1, correction=1)
-    expected1 = dpt.ones(x.shape[0], dtype="f4")
-    assert dpt.allclose(r1, expected1)
-
-    r = dpt.std(x, axis=1)
-    expected = dpt.sqrt(expected)
-    assert dpt.allclose(r, expected)
-
-    r1 = dpt.std(x, axis=1, correction=1)
-    expected1 = dpt.sqrt(expected1)
-    assert dpt.allclose(r1, expected1)
-
-
-def test_var_axis_length_correction():
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
-
-    r = dpt.var(x, correction=x.size)
-    assert dpt.isnan(r)
-
-    r = dpt.var(x, axis=0, correction=x.shape[0])
-    assert dpt.all(dpt.isnan(r))
-
-    r = dpt.var(x, axis=1, correction=x.shape[1])
-    assert dpt.all(dpt.isnan(r))
-
-
-def test_stat_function_errors():
-    d = dict()
-    with pytest.raises(TypeError):
-        dpt.var(d)
-    with pytest.raises(TypeError):
-        dpt.std(d)
-    with pytest.raises(TypeError):
-        dpt.mean(d)
-
-    get_queue_or_skip()
-    x = dpt.empty(1, dtype="f4")
-    with pytest.raises(TypeError):
-        dpt.var(x, axis=d)
-    with pytest.raises(TypeError):
-        dpt.std(x, axis=d)
-    with pytest.raises(TypeError):
-        dpt.mean(x, axis=d)
-
-    with pytest.raises(TypeError):
-        dpt.var(x, correction=d)
-    with pytest.raises(TypeError):
-        dpt.std(x, correction=d)
-
-    x = dpt.empty(1, dtype="c8")
-    with pytest.raises(ValueError):
-        dpt.var(x)
-    with pytest.raises(ValueError):
-        dpt.std(x)
diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py
deleted file mode 100644
index 29ee3abb1b..0000000000
--- a/dpctl/tests/test_tensor_sum.py
+++ /dev/null
@@ -1,332 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize("arg_dtype", _all_dtypes)
-def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    # test reduction for C-contiguous input
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.sum(m)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    if m.dtype.kind == "i":
-        assert r.dtype.kind == "i"
-    elif m.dtype.kind == "u":
-        assert r.dtype.kind == "u"
-    elif m.dtype.kind == "f":
-        assert r.dtype.kind == "f"
-    elif m.dtype.kind == "c":
-        assert r.dtype.kind == "c"
-
-    assert dpt.all(r == 100)
-
-    # test reduction for strided input
-    m = dpt.ones(200, dtype=arg_dtype)[:1:-2]
-    r = dpt.sum(m)
-    assert dpt.all(r == 99)
-
-    # test reduction for strided input which can be simplified
-    # to contiguous computation
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.sum(dpt.flip(m))
-    assert dpt.all(r == 100)
-
-
-@pytest.mark.parametrize("arg_dtype", _all_dtypes)
-@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
-def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.sum(m, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-    assert dpt.all(r == 100)
-
-
-def test_sum_empty():
-    get_queue_or_skip()
-    x = dpt.empty((0,), dtype="u1")
-    y = dpt.sum(x)
-    assert y.shape == tuple()
-    assert int(y) == 0
-
-
-def test_sum_axis():
-    get_queue_or_skip()
-
-    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
-    s = dpt.sum(m, axis=(1, 2, -1))
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 6)
-    assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4"))
-
-
-def test_sum_keepdims():
-    get_queue_or_skip()
-
-    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
-    s = dpt.sum(m, axis=(1, 2, -1), keepdims=True)
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 1, 1, 6, 1)
-    assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype))
-
-
-def test_sum_scalar():
-    get_queue_or_skip()
-
-    m = dpt.ones(())
-    s = dpt.sum(m)
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert m.sycl_queue == s.sycl_queue
-    assert s.shape == ()
-    assert s == dpt.full((), 1)
-
-
-@pytest.mark.parametrize("arg_dtype", _all_dtypes)
-@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
-def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    m = dpt.ones((), dtype=arg_dtype)
-    r = dpt.sum(m, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-    assert r == 1
-
-
-def test_sum_keepdims_zero_size():
-    """See gh-1293"""
-    get_queue_or_skip()
-    n = 10
-    a = dpt.ones((n, 0, n))
-
-    s1 = dpt.sum(a, keepdims=True)
-    assert s1.shape == (1, 1, 1)
-
-    s2 = dpt.sum(a, axis=(0, 1), keepdims=True)
-    assert s2.shape == (1, 1, n)
-
-    s3 = dpt.sum(a, axis=(1, 2), keepdims=True)
-    assert s3.shape == (n, 1, 1)
-
-    s4 = dpt.sum(a, axis=(0, 2), keepdims=True)
-    assert s4.shape == (1, 0, 1)
-
-    a0 = a[0]
-    s5 = dpt.sum(a0, keepdims=True)
-    assert s5.shape == (1, 1)
-
-
-@pytest.mark.parametrize("arg_dtype", ["i8", "f4", "c8"])
-@pytest.mark.parametrize("n", [1023, 1024, 1025])
-def test_largish_reduction(arg_dtype, n):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    m = 5
-    x = dpt.ones((m, n, m), dtype=arg_dtype)
-
-    y1 = dpt.sum(x, axis=(0, 1))
-    y2 = dpt.sum(x, axis=(1, 2))
-
-    assert dpt.all(dpt.equal(y1, y2))
-    assert dpt.all(dpt.equal(y1, n * m))
-
-
-@pytest.mark.parametrize("n", [1023, 1024, 1025])
-def test_largish_reduction_axis1_axis0(n):
-    get_queue_or_skip()
-
-    m = 25
-    x1 = dpt.ones((m, n), dtype="f4")
-    x2 = dpt.ones((n, m), dtype="f4")
-
-    y1 = dpt.sum(x1, axis=1)
-    y2 = dpt.sum(x2, axis=0)
-
-    assert dpt.all(y1 == n)
-    assert dpt.all(y2 == n)
-
-
-def test_axis0_bug():
-    "gh-1391"
-    get_queue_or_skip()
-
-    sh = (1, 2, 3)
-    a = dpt.arange(sh[0] * sh[1] * sh[2], dtype="i4")
-    a = dpt.reshape(a, sh)
-    aT = dpt.permute_dims(a, (2, 1, 0))
-
-    s = dpt.sum(aT, axis=2)
-    expected = dpt.asarray([[0, 3], [1, 4], [2, 5]])
-
-    assert dpt.all(s == expected)
-
-
-def test_sum_axis1_axis0():
-    """See gh-1455"""
-    get_queue_or_skip()
-
-    # The atomic case is checked in `test_usm_ndarray_reductions`
-    # This test checks the tree reduction path for correctness
-    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
-
-    m = dpt.sum(x, axis=0)
-    expected = dpt.asarray(
-        [
-            [60, 63, 66, 69, 72],
-            [75, 78, 81, 84, 87],
-            [90, 93, 96, 99, 102],
-            [105, 108, 111, 114, 117],
-        ],
-        dtype="f4",
-    )
-    tol = dpt.finfo(m.dtype).resolution
-    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
-
-    x = dpt.flip(x, axis=2)
-    m = dpt.sum(x, axis=2)
-    expected = dpt.asarray(
-        [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]],
-        dtype="f4",
-    )
-    assert dpt.allclose(m, expected, atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:])
-def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    arg_dtype = dpt.dtype(arg_dtype)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.prod(m)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    if m.dtype.kind == "i":
-        assert r.dtype.kind == "i"
-    elif m.dtype.kind == "u":
-        assert r.dtype.kind == "u"
-    elif m.dtype.kind == "f":
-        assert r.dtype.kind == "f"
-    elif m.dtype.kind == "c":
-        assert r.dtype.kind == "c"
-    assert dpt.all(r == 1)
-
-    if dpt.isdtype(m.dtype, "unsigned integer"):
-        m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2]
-        r = dpt.prod(m)
-        assert dpt.all(r == dpt.asarray(512, dtype=r.dtype))
-    else:
-        m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2]
-        r = dpt.prod(m)
-        assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype))
-
-
-def test_prod_empty():
-    get_queue_or_skip()
-    x = dpt.empty((0,), dtype="u1")
-    y = dpt.prod(x)
-    assert y.shape == tuple()
-    assert int(y) == 1
-
-
-def test_prod_axis():
-    get_queue_or_skip()
-
-    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
-    s = dpt.prod(m, axis=(1, 2, -1))
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 6)
-    assert dpt.all(s == dpt.asarray(1, dtype="i4"))
-
-
-@pytest.mark.parametrize("arg_dtype", _all_dtypes)
-@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
-def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    out_dtype = dpt.dtype(out_dtype)
-    arg_dtype = dpt.dtype(arg_dtype)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.prod(m, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-    assert dpt.all(r == 1)
-
-
-def test_gh_1468():
-    "See https://github.com/IntelPython/dpctl/issues/1468"
-    get_queue_or_skip()
-
-    a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32)
-    t = dpt.sum(a, dtype="f4")
-    assert t > 0
-
-
-@pytest.mark.parametrize(
-    "dt", ["i1", "i2", "i4", "i8", "f2", "f4", "f8", "c8", "c16"]
-)
-def test_gh_1944(dt):
-    "See https://github.com/IntelPython/dpctl/issues/1944"
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-    x = dpt.asarray([-1, 1], dtype=dpt.dtype(dt), sycl_queue=q)
-    r = dpt.sum(x, dtype="?")
-    # reduction must be performed in the requested dtype
-    # if performed in the input type, result is False
-    assert r
diff --git a/dpctl/tests/test_tensor_testing.py b/dpctl/tests/test_tensor_testing.py
deleted file mode 100644
index d1cb4df4ab..0000000000
--- a/dpctl/tests/test_tensor_testing.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import itertools
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_allclose(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    a1 = dpt.ones(10, dtype=dtype)
-    a2 = dpt.ones(10, dtype=dtype)
-
-    assert dpt.allclose(a1, a2)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_allclose_real_fp(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
-    a1 = dpt.asarray(v[2:], dtype=dtype)
-    a2 = dpt.asarray(v[2:], dtype=dtype)
-
-    tol = dpt.finfo(a1.dtype).resolution
-    assert dpt.allclose(a1, a2, atol=tol, rtol=tol)
-
-    a1 = dpt.asarray(v, dtype=dtype)
-    a2 = dpt.asarray(v, dtype=dtype)
-
-    assert not dpt.allclose(a1, a2, atol=tol, rtol=tol)
-    assert dpt.allclose(a1, a2, atol=tol, rtol=tol, equal_nan=True)
-
-
-@pytest.mark.parametrize("dtype", ["c8", "c16"])
-def test_allclose_complex_fp(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    v = [dpt.nan, -dpt.nan, dpt.inf, -dpt.inf, -0.0, 0.0, 1.0, -1.0]
-
-    not_nans = [complex(*xy) for xy in itertools.product(v[2:], repeat=2)]
-    z1 = dpt.asarray(not_nans, dtype=dtype)
-    z2 = dpt.asarray(not_nans, dtype=dtype)
-
-    tol = dpt.finfo(z1.dtype).resolution
-    assert dpt.allclose(z1, z2, atol=tol, rtol=tol)
-
-    both = [complex(*xy) for xy in itertools.product(v, repeat=2)]
-    z1 = dpt.asarray(both, dtype=dtype)
-    z2 = dpt.asarray(both, dtype=dtype)
-
-    tol = dpt.finfo(z1.dtype).resolution
-    assert not dpt.allclose(z1, z2, atol=tol, rtol=tol)
-    assert dpt.allclose(z1, z2, atol=tol, rtol=tol, equal_nan=True)
-
-
-def test_allclose_validation():
-    with pytest.raises(TypeError):
-        dpt.allclose(True, False)
-
-    get_queue_or_skip()
-    x = dpt.asarray(True)
-    with pytest.raises(TypeError):
-        dpt.allclose(x, False)
-
-
-def test_allclose_type_promotion():
-    get_queue_or_skip()
-
-    x1 = dpt.ones(10, dtype="i4")
-    x2 = dpt.ones(10, dtype="i8")
-
-    assert dpt.allclose(x1, x2)
-
-
-def test_allclose_tolerance():
-    get_queue_or_skip()
-
-    x = dpt.zeros(10, dtype="f4")
-    atol = 1e-5
-    y = dpt.full_like(x, atol)
-    assert dpt.allclose(x, y, atol=atol, rtol=0)
-
-    # about 8e-6
-    tol = float.fromhex("0x1.0p-17")
-    x = dpt.ones(10, dtype="f4")
-    y = x - tol
-    assert dpt.allclose(x, y, atol=0, rtol=tol)
-
-
-def test_allclose_real_fp_early_exists():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="f4")
-    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="f4")
-
-    # early exists, inf positions are different
-    assert not dpt.allclose(x1, x2)
-
-    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="f4")
-
-    # early exists, inf positions are the same, but signs differ
-    assert not dpt.allclose(x1, x2)
-
-
-def test_allclose_complex_fp_early_exists():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray([0.0, dpt.inf, -dpt.inf], dtype="c8")
-    x2 = dpt.asarray([dpt.inf, 0.0, -dpt.inf], dtype="c8")
-
-    # early exists, inf positions of real parts are different
-    assert not dpt.allclose(x1, x2)
-
-    x2 = dpt.asarray([0.0, -dpt.inf, dpt.inf], dtype="c8")
-
-    # early exists, inf positions of real parts are the same, but signs differ
-    assert not dpt.allclose(x1, x2)
-
-    x1 = dpt.asarray([0.0, dpt.inf * 1j, -dpt.inf * 1j], dtype="c8")
-    x2 = dpt.asarray([dpt.inf * 1j, 0.0, -dpt.inf * 1j], dtype="c8")
-
-    # early exists, inf positions of imag parts are different
-    assert not dpt.allclose(x1, x2)
-
-    x2 = dpt.asarray([0.0, -dpt.inf * 1j, dpt.inf * 1j], dtype="c8")
-    assert not dpt.allclose(x1, x2)
diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py
deleted file mode 100644
index bde61394f0..0000000000
--- a/dpctl/tests/test_usm_ndarray_ctor.py
+++ /dev/null
@@ -1,2786 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-import numbers
-from math import prod
-
-import numpy as np
-import pytest
-from numpy.testing import assert_raises_regex
-
-import dpctl
-import dpctl.memory as dpm
-import dpctl.tensor as dpt
-from dpctl.tensor import Device
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "b1",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize(
-    "shape",
-    [
-        (),
-        (4,),
-        (0,),
-        (0, 1),
-        (0, 0),
-        (4, 5),
-        (2, 5, 2),
-        (2, 2, 2, 2, 2, 2, 2, 2),
-        5,
-        np.int32(7),
-    ],
-)
-@pytest.mark.parametrize("usm_type", ["shared", "host", "device"])
-def test_allocate_usm_ndarray(shape, usm_type):
-    q = get_queue_or_skip()
-    X = dpt.usm_ndarray(
-        shape, dtype="i8", buffer=usm_type, buffer_ctor_kwargs={"queue": q}
-    )
-    Xnp = np.ndarray(shape, dtype="i8")
-    assert X.usm_type == usm_type
-    assert X.sycl_context == q.sycl_context
-    assert X.sycl_device == q.sycl_device
-    assert X.size == Xnp.size
-    assert X.shape == Xnp.shape
-    assert X.shape == X.__sycl_usm_array_interface__["shape"]
-
-
-def test_usm_ndarray_flags():
-    get_queue_or_skip()
-    f = dpt.usm_ndarray((5,), dtype="i4").flags
-    assert f.fc
-    assert f.forc
-
-    f = dpt.usm_ndarray((5, 2), dtype="i4").flags
-    assert f.c_contiguous
-    assert f.forc
-
-    f = dpt.usm_ndarray((5, 2), dtype="i4", order="F").flags
-    assert f.f_contiguous
-    assert f.forc
-    assert f.fnc
-
-    f = dpt.usm_ndarray((5,), dtype="i4", strides=(1,)).flags
-    assert f.fc
-    assert f.forc
-
-    f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(2, 0, 1)).flags
-    assert f.c_contiguous
-    assert f.forc
-
-    f = dpt.usm_ndarray((5, 1, 2), dtype="i4", strides=(1, 0, 5)).flags
-    assert f.f_contiguous
-    assert f.forc
-    assert f.fnc
-
-    f = dpt.usm_ndarray((5, 0, 1), dtype="i4", strides=(1, 0, 1)).flags
-    assert f.fc
-    assert f.forc
-    assert not dpt.usm_ndarray(
-        (5, 1, 1), dtype="i4", strides=(2, 0, 1)
-    ).flags.forc
-
-    x = dpt.empty(5, dtype="u2")
-    assert x.flags.writable is True
-    x.flags.writable = False
-    assert x.flags.writable is False
-    with pytest.raises(ValueError):
-        x[:] = 0
-    x.flags["W"] = True
-    assert x.flags.writable is True
-    x.flags["WRITABLE"] = True
-    assert x.flags.writable is True
-    x[:] = 0
-
-    with pytest.raises(TypeError):
-        x.flags.writable = dict()
-    with pytest.raises(ValueError):
-        x.flags["C"] = False
-
-
-def test_usm_ndarray_flags_bug_gh_1334():
-    get_queue_or_skip()
-    a = dpt.ones((2, 3), dtype="u4")
-    r = dpt.reshape(a, (1, 6, 1))
-    assert r.flags["C"] and r.flags["F"]
-
-    a = dpt.ones((2, 3), dtype="u4", order="F")
-    r = dpt.reshape(a, (1, 6, 1), order="F")
-    assert r.flags["C"] and r.flags["F"]
-
-    a = dpt.ones((2, 3, 4), dtype="i8")
-    r = dpt.sum(a, axis=(1, 2), keepdims=True)
-    assert r.flags["C"] and r.flags["F"]
-
-    a = dpt.ones((2, 1), dtype="?")
-    r = a[:, 1::-1]
-    assert r.flags["F"] and r.flags["C"]
-
-
-def test_usm_ndarray_writable_flag_views():
-    get_queue_or_skip()
-    a = dpt.arange(10, dtype="f4")
-    a.flags["W"] = False
-
-    a.shape = (5, 2)
-    assert not a.flags.writable
-    assert not a.T.flags.writable
-    assert not a.mT.flags.writable
-    assert not a.real.flags.writable
-    assert not a[0:3].flags.writable
-
-    a = dpt.arange(10, dtype="c8")
-    a.flags["W"] = False
-
-    assert not a.real.flags.writable
-    assert not a.imag.flags.writable
-
-
-@pytest.mark.parametrize("dt1", _all_dtypes)
-@pytest.mark.parametrize("dt2", _all_dtypes)
-def test_usm_ndarray_from_zero_sized_usm_ndarray(dt1, dt2):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt1, q)
-    skip_if_dtype_not_supported(dt2, q)
-
-    x1 = dpt.ones((0,), dtype=dt1, sycl_queue=q)
-    x2 = dpt.usm_ndarray(x1.shape, dtype=dt2, buffer=x1)
-    assert x2.dtype == dt2
-    assert x2.sycl_queue == q
-    assert x2._pointer == x1._pointer
-    assert x2.shape == x1.shape
-
-
-def test_usm_ndarray_from_usm_ndarray_readonly():
-    get_queue_or_skip()
-
-    x1 = dpt.arange(10, dtype="f4")
-    x1.flags["W"] = False
-    x2 = dpt.usm_ndarray(x1.shape, dtype="f4", buffer=x1)
-    assert not x2.flags.writable
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes
-    + [
-        b"float32",
-        dpt.dtype("d"),
-        np.half,
-    ],
-)
-def test_dtypes(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    Xusm = dpt.usm_ndarray((1,), dtype=dtype)
-    assert Xusm.itemsize == dpt.dtype(dtype).itemsize
-    expected_fmt = (dpt.dtype(dtype).str)[1:]
-    actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:]
-    assert expected_fmt == actual_fmt
-
-
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-@pytest.mark.parametrize("buffer_ctor_kwargs", [dict(), {"queue": None}])
-def test_default_dtype(usm_type, buffer_ctor_kwargs):
-    q = get_queue_or_skip()
-    dev = q.get_sycl_device()
-    if buffer_ctor_kwargs:
-        buffer_ctor_kwargs["queue"] = q
-    Xusm = dpt.usm_ndarray(
-        (1,), buffer=usm_type, buffer_ctor_kwargs=buffer_ctor_kwargs
-    )
-    if dev.has_aspect_fp64:
-        expected_dtype = "f8"
-    else:
-        expected_dtype = "f4"
-    assert Xusm.itemsize == dpt.dtype(expected_dtype).itemsize
-    expected_fmt = (dpt.dtype(expected_dtype).str)[1:]
-    actual_fmt = Xusm.__sycl_usm_array_interface__["typestr"][1:]
-    assert expected_fmt == actual_fmt
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "",
-        ">f4",
-        "invalid",
-        123,
-        np.dtype(">f4"),
-        np.dtype([("a", ">f4"), ("b", "i4")]),
-    ],
-)
-def test_dtypes_invalid(dtype):
-    with pytest.raises((TypeError, ValueError)):
-        dpt.usm_ndarray((1,), dtype=dtype)
-
-
-@pytest.mark.parametrize("dt", ["f", "c8"])
-def test_properties(dt):
-    """
-    Test that properties execute
-    """
-    try:
-        X = dpt.usm_ndarray((3, 4, 5), dtype=dt)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert isinstance(X.sycl_queue, dpctl.SyclQueue)
-    assert isinstance(X.sycl_device, dpctl.SyclDevice)
-    assert isinstance(X.sycl_context, dpctl.SyclContext)
-    assert isinstance(X.dtype, dpt.dtype)
-    assert isinstance(X.__sycl_usm_array_interface__, dict)
-    assert isinstance(X.mT, dpt.usm_ndarray)
-    assert isinstance(X.imag, dpt.usm_ndarray)
-    assert isinstance(X.real, dpt.usm_ndarray)
-    assert isinstance(X.shape, tuple)
-    assert isinstance(X.strides, tuple)
-    assert X.usm_type in ("shared", "device", "host")
-    assert isinstance(X.size, numbers.Integral)
-    assert isinstance(X.nbytes, numbers.Integral)
-    assert isinstance(X.ndim, numbers.Integral)
-    assert isinstance(X._pointer, numbers.Integral)
-    assert isinstance(X.device, Device)
-    with pytest.raises(ValueError):
-        # array-API mandates exception for .ndim != 2
-        X.T
-    Y = dpt.usm_ndarray((2, 3), dtype=dt)
-    assert isinstance(Y.mT, dpt.usm_ndarray)
-    V = dpt.usm_ndarray((3,), dtype=dt)
-    with pytest.raises(ValueError):
-        # array-API mandates exception for .ndim != 2
-        V.mT
-
-
-@pytest.mark.parametrize("shape", [tuple(), (1,), (1, 1), (1, 1, 1)])
-@pytest.mark.parametrize("dtype", ["|b1", "|u2", "|f4", "|i8"])
-class TestCopyScalar:
-    @pytest.mark.parametrize("func", [bool, float, int, complex])
-    def test_copy_scalar_with_func(self, func, shape, dtype):
-        try:
-            X = dpt.usm_ndarray(shape, dtype=dtype)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-        Y = np.arange(1, X.size + 1, dtype=dtype)
-        X.usm_data.copy_from_host(Y.view("|u1"))
-        Y = Y.reshape(())
-        # Non-0D numeric arrays must not be convertible to Python scalars
-        if len(shape) != 0:
-            assert_raises_regex(TypeError, "only 0-dimensional arrays", func, X)
-        else:
-            # 0D arrays are allowed to convert
-            assert func(X) == func(Y)
-
-    @pytest.mark.parametrize(
-        "method", ["__bool__", "__float__", "__int__", "__complex__"]
-    )
-    def test_copy_scalar_with_method(self, method, shape, dtype):
-        try:
-            X = dpt.usm_ndarray(shape, dtype=dtype)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-        Y = np.arange(1, X.size + 1, dtype=dtype)
-        X.usm_data.copy_from_host(Y.view("|u1"))
-        Y = Y.reshape(())
-        if len(shape) != 0:
-            assert_raises_regex(
-                TypeError, "only 0-dimensional arrays", getattr(X, method)
-            )
-        else:
-            assert getattr(X, method)() == getattr(Y, method)()
-
-
-@pytest.mark.parametrize("func", [bool, float, int, complex])
-@pytest.mark.parametrize("shape", [(2,), (1, 2), (3, 4, 5), (0,)])
-def test_copy_scalar_invalid_shape(func, shape):
-    try:
-        X = dpt.usm_ndarray(shape, dtype="i8")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        func(X)
-
-
-def test_index_noninteger():
-    import operator
-
-    try:
-        X = dpt.usm_ndarray(1, "f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(IndexError):
-        operator.index(X)
-
-
-@pytest.mark.parametrize(
-    "ind",
-    [
-        tuple(),
-        (None,),
-        (
-            None,
-            Ellipsis,
-            None,
-        ),
-        (2, 2, None, 3, 4),
-        (Ellipsis,),
-        (None, slice(0, None, 2), Ellipsis, slice(0, None, 3)),
-        (None, slice(1, None, 2), Ellipsis, slice(1, None, 3)),
-        (None, slice(None, -1, -2), Ellipsis, slice(2, None, 3)),
-        (
-            slice(None, None, -1),
-            slice(None, None, -1),
-            slice(0, None, 3),
-            slice(1, None, 2),
-        ),
-    ],
-)
-def test_basic_slice(ind):
-    try:
-        X = dpt.usm_ndarray((2 * 3, 2 * 4, 3 * 5, 2 * 7), dtype="u1")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Xnp = np.empty(X.shape, dtype=X.dtype)
-    S = X[ind]
-    Snp = Xnp[ind]
-    assert S.shape == Snp.shape
-    assert S.strides == Snp.strides
-    assert S.dtype == X.dtype
-
-
-def test_empty_slice():
-    # see gh801
-    try:
-        X = dpt.empty((1, 0, 1), dtype="u1")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Y = X[:, ::-1, :]
-    assert Y.shape == X.shape
-    Z = X[:, ::2, :]
-    assert Z.shape == X.shape
-    X = dpt.empty(0)
-    Y = X[::-1]
-    assert Y.shape == X.shape
-    Z = X[::2]
-    assert Z.shape == X.shape
-    X = dpt.empty((0, 4), dtype="u1")
-    assert X[:, 1].shape == (0,)
-    assert X[:, 1:3].shape == (0, 2)
-
-
-def test_slice_constructor_1d():
-    Xh = np.arange(37, dtype="i4")
-    try:
-        Xusm = dpt.arange(Xh.size, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    for ind in [
-        slice(1, None, 2),
-        slice(0, None, 3),
-        slice(1, None, 3),
-        slice(2, None, 3),
-        slice(None, None, -1),
-        slice(-2, 2, -2),
-        slice(-1, 1, -2),
-        slice(None, None, -13),
-    ]:
-        assert np.array_equal(
-            dpt.asnumpy(Xusm[ind]), Xh[ind]
-        ), "Failed for {}".format(ind)
-
-
-def test_slice_constructor_3d():
-    Xh = np.ones((37, 24, 35), dtype="i4")
-    try:
-        Xusm = dpt.ones(Xh.shape, dtype=Xh.dtype)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    for ind in [
-        slice(1, None, 2),
-        slice(0, None, 3),
-        slice(1, None, 3),
-        slice(2, None, 3),
-        slice(None, None, -1),
-        slice(-2, 2, -2),
-        slice(-1, 1, -2),
-        slice(None, None, -13),
-        (slice(None, None, -2), Ellipsis, None, 15),
-    ]:
-        assert np.array_equal(
-            dpt.to_numpy(Xusm[ind]), Xh[ind]
-        ), "Failed for {}".format(ind)
-
-
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_slice_suai(usm_type):
-    Xh = np.arange(0, 10, dtype="u1")
-    try:
-        Xusm = dpt.arange(0, 10, dtype="u1", usm_type=usm_type)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    for ind in [slice(2, 3, None), slice(5, 7, None), slice(3, 9, None)]:
-        assert np.array_equal(
-            dpm.as_usm_memory(Xusm[ind]).copy_to_host(), Xh[ind]
-        ), "Failed for {}".format(ind)
-
-
-def test_slicing_basic():
-    try:
-        Xusm = dpt.usm_ndarray((10, 5), dtype="c8")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Xusm[None]
-    Xusm[...]
-    Xusm[8]
-    Xusm[-3]
-    with pytest.raises(IndexError):
-        Xusm[..., ...]
-    with pytest.raises(IndexError):
-        Xusm[1, 1, :, 1]
-    Xusm[:, -4]
-    with pytest.raises(IndexError):
-        Xusm[:, -128]
-    with pytest.raises(IndexError):
-        Xusm[{1, 2, 3, 4, 5, 6, 7}]
-    X = dpt.usm_ndarray(10, "u1")
-    X.usm_data.copy_from_host(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09")
-    int(
-        X[X[2]]
-    )  # check that objects with __index__ method can be used as indices
-    Xh = dpm.as_usm_memory(X[X[2] : X[5]]).copy_to_host()
-    Xnp = np.arange(0, 10, dtype="u1")
-    assert np.array_equal(Xh, Xnp[Xnp[2] : Xnp[5]])
-
-
-def test_slicing_empty():
-    try:
-        X = dpt.usm_ndarray((0, 10), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    x = dpt.moveaxis(X, 1, 0)
-    # this used to raise ValueError
-    y = x[1]
-    assert y.ndim == 1
-    assert y.shape == (0,)
-    assert y.dtype == X.dtype
-    assert y.usm_type == X.usm_type
-    assert y.sycl_queue == X.sycl_queue
-    w = x[1:3]
-    assert w.ndim == 2
-    assert w.shape == (
-        2,
-        0,
-    )
-    assert w.dtype == X.dtype
-    assert w.usm_type == X.usm_type
-    assert w.sycl_queue == X.sycl_queue
-
-
-def test_ctor_invalid_shape():
-    with pytest.raises(TypeError):
-        dpt.usm_ndarray(dict())
-
-
-def test_ctor_invalid_order():
-    get_queue_or_skip()
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray((5, 5, 3), order="Z")
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray((10), strides=(1,), order="Z")
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray((), order="Z")
-
-
-def test_ctor_buffer_kwarg():
-    try:
-        dpt.usm_ndarray(10, dtype="i8", buffer=b"device")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray(10, buffer="invalid_param")
-    Xusm = dpt.usm_ndarray((10, 5), dtype="c8")
-    Xusm[...] = 1
-    X2 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm, dtype=Xusm.dtype)
-    Horig_copy = Xusm.usm_data.copy_to_host()
-    H2_copy = X2.usm_data.copy_to_host()
-    assert np.array_equal(Horig_copy, H2_copy)
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray(10, dtype="i4", buffer=dict())
-    # use device-specific default fp data type
-    X3 = dpt.usm_ndarray(Xusm.shape, buffer=Xusm)
-    assert np.array_equal(Horig_copy, X3.usm_data.copy_to_host())
-
-
-def test_usm_ndarray_props():
-    try:
-        Xusm = dpt.usm_ndarray((10, 5), dtype="c8", order="F")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Xusm.ndim
-    repr(Xusm)
-    Xusm.flags
-    Xusm.__sycl_usm_array_interface__
-    Xusm.device
-    Xusm.strides
-    Xusm.real
-    Xusm.imag
-    try:
-        dpctl.SyclQueue("cpu")
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Sycl device CPU was not detected")
-    Xusm.to_device("cpu")
-
-
-def test_datapi_device():
-    try:
-        X = dpt.usm_ndarray(1, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    dev_t = type(X.device)
-    with pytest.raises(TypeError):
-        dev_t()
-    dev_t.create_device(X.device)
-    dev_t.create_device(X.sycl_queue)
-    d1 = dev_t.create_device(X.sycl_device)
-    d2 = dev_t.create_device(X.sycl_device.filter_string)
-    d3 = dev_t.create_device(None)
-    assert d1.sycl_queue == d2.sycl_queue
-    assert d1.sycl_queue == d3.sycl_queue
-    X.device.sycl_context
-    X.device.sycl_queue
-    X.device.sycl_device
-    repr(X.device)
-    X.device.print_device_info()
-
-
-def _pyx_capi_fnptr_to_callable(
-    X,
-    pyx_capi_name,
-    caps_name,
-    fn_restype=ctypes.c_void_p,
-    fn_argtypes=(ctypes.py_object,),
-):
-    import sys
-
-    mod = sys.modules[X.__class__.__module__]
-    cap = mod.__pyx_capi__.get(pyx_capi_name, None)
-    if cap is None:
-        raise ValueError(
-            "__pyx_capi__ does not export {} capsule".format(pyx_capi_name)
-        )
-    # construct Python callable to invoke these functions
-    cap_ptr_fn = ctypes.pythonapi.PyCapsule_GetPointer
-    cap_ptr_fn.restype = ctypes.c_void_p
-    cap_ptr_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
-    fn_ptr = cap_ptr_fn(cap, caps_name)
-    callable_maker_ptr = ctypes.PYFUNCTYPE(fn_restype, *fn_argtypes)
-    return callable_maker_ptr(fn_ptr)
-
-
-def test_pyx_capi_get_data():
-    try:
-        X = dpt.usm_ndarray(17, dtype="i8")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_data_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetData",
-        b"char *(struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_void_p,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    r1 = get_data_fn(X)
-    sua_iface = X.__sycl_usm_array_interface__
-    assert r1 == sua_iface["data"][0] + sua_iface.get("offset") * X.itemsize
-
-
-def test_pyx_capi_get_shape():
-    try:
-        X = dpt.usm_ndarray(17, dtype="u4")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_shape_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetShape",
-        b"Py_ssize_t *(struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_void_p,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    c_longlong_p = ctypes.POINTER(ctypes.c_longlong)
-    shape0 = ctypes.cast(get_shape_fn(X), c_longlong_p).contents.value
-    assert shape0 == X.shape[0]
-
-
-def test_pyx_capi_get_strides():
-    try:
-        X = dpt.usm_ndarray(17, dtype="f4")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_strides_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetStrides",
-        b"Py_ssize_t *(struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_void_p,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    c_longlong_p = ctypes.POINTER(ctypes.c_longlong)
-    strides0_p = get_strides_fn(X)
-    if strides0_p:
-        strides0_p = ctypes.cast(strides0_p, c_longlong_p).contents
-        strides0_p = strides0_p.value
-    assert strides0_p == 0 or strides0_p == X.strides[0]
-
-
-def test_pyx_capi_get_ndim():
-    try:
-        X = dpt.usm_ndarray(17, dtype="?")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_ndim_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetNDim",
-        b"int (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_int,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    assert get_ndim_fn(X) == X.ndim
-
-
-def test_pyx_capi_get_typenum():
-    try:
-        X = dpt.usm_ndarray(17, dtype="c8")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_typenum_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetTypenum",
-        b"int (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_int,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    typenum = get_typenum_fn(X)
-    assert type(typenum) is int
-    assert typenum == X.dtype.num
-
-
-def test_pyx_capi_get_elemsize():
-    try:
-        X = dpt.usm_ndarray(17, dtype="u8")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_elemsize_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetElementSize",
-        b"int (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_int,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    itemsize = get_elemsize_fn(X)
-    assert type(itemsize) is int
-    assert itemsize == X.itemsize
-
-
-def test_pyx_capi_get_flags():
-    try:
-        X = dpt.usm_ndarray(17, dtype="i8")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_flags_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetFlags",
-        b"int (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_int,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    flags = get_flags_fn(X)
-    assert type(flags) is int and X.flags == flags
-
-
-def test_pyx_capi_get_offset():
-    try:
-        X = dpt.usm_ndarray(17, dtype="u2")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_offset_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetOffset",
-        b"Py_ssize_t (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_longlong,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    offset = get_offset_fn(X)
-    assert type(offset) is int
-    assert offset == X.__sycl_usm_array_interface__["offset"]
-
-
-def test_pyx_capi_get_usmdata():
-    try:
-        X = dpt.usm_ndarray(17, dtype="u2")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_usmdata_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetUSMData",
-        b"PyObject *(struct PyUSMArrayObject *)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    capi_usm_data = get_usmdata_fn(X)
-    assert isinstance(capi_usm_data, dpm._memory._Memory)
-    assert capi_usm_data.nbytes == X.usm_data.nbytes
-    assert capi_usm_data._pointer == X.usm_data._pointer
-    assert capi_usm_data.sycl_queue == X.usm_data.sycl_queue
-
-
-def test_pyx_capi_get_queue_ref():
-    try:
-        X = dpt.usm_ndarray(17, dtype="i2")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    get_queue_ref_fn = _pyx_capi_fnptr_to_callable(
-        X,
-        "UsmNDArray_GetQueueRef",
-        b"DPCTLSyclQueueRef (struct PyUSMArrayObject *)",
-        fn_restype=ctypes.c_void_p,
-        fn_argtypes=(ctypes.py_object,),
-    )
-    queue_ref = get_queue_ref_fn(X)  # address of a copy, should be unequal
-    assert queue_ref != X.sycl_queue.addressof_ref()
-
-
-def test_pyx_capi_make_from_memory():
-    q = get_queue_or_skip()
-    n0, n1 = 4, 6
-    c_tuple = (ctypes.c_ssize_t * 2)(n0, n1)
-    mem = dpm.MemoryUSMShared(n0 * n1 * 4, queue=q)
-    typenum = dpt.dtype("single").num
-    any_usm_ndarray = dpt.empty(tuple(), dtype="i4", sycl_queue=q)
-    make_from_memory_fn = _pyx_capi_fnptr_to_callable(
-        any_usm_ndarray,
-        "UsmNDArray_MakeSimpleFromMemory",
-        b"PyObject *(int, Py_ssize_t const *, int, "
-        b"struct Py_MemoryObject *, Py_ssize_t, char)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(
-            ctypes.c_int,
-            ctypes.POINTER(ctypes.c_ssize_t),
-            ctypes.c_int,
-            ctypes.py_object,
-            ctypes.c_ssize_t,
-            ctypes.c_char,
-        ),
-    )
-    r = make_from_memory_fn(
-        ctypes.c_int(2),
-        c_tuple,
-        ctypes.c_int(typenum),
-        mem,
-        ctypes.c_ssize_t(0),
-        ctypes.c_char(b"C"),
-    )
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.ndim == 2
-    assert r.shape == (n0, n1)
-    assert r._pointer == mem._pointer
-    assert r.usm_type == "shared"
-    assert r.sycl_queue == q
-    assert r.flags["C"]
-    r2 = make_from_memory_fn(
-        ctypes.c_int(2),
-        c_tuple,
-        ctypes.c_int(typenum),
-        mem,
-        ctypes.c_ssize_t(0),
-        ctypes.c_char(b"F"),
-    )
-    ptr = mem._pointer
-    del mem
-    del r
-    assert isinstance(r2, dpt.usm_ndarray)
-    assert r2._pointer == ptr
-    assert r2.usm_type == "shared"
-    assert r2.sycl_queue == q
-    assert r2.flags["F"]
-
-
-def test_pyx_capi_set_writable_flag():
-    q = get_queue_or_skip()
-    usm_ndarray = dpt.empty((4, 5), dtype="i4", sycl_queue=q)
-    assert isinstance(usm_ndarray, dpt.usm_ndarray)
-    assert usm_ndarray.flags["WRITABLE"] is True
-    set_writable = _pyx_capi_fnptr_to_callable(
-        usm_ndarray,
-        "UsmNDArray_SetWritableFlag",
-        b"void (struct PyUSMArrayObject *, int)",
-        fn_restype=None,
-        fn_argtypes=(ctypes.py_object, ctypes.c_int),
-    )
-    set_writable(usm_ndarray, ctypes.c_int(0))
-    assert isinstance(usm_ndarray, dpt.usm_ndarray)
-    assert usm_ndarray.flags["WRITABLE"] is False
-    set_writable(usm_ndarray, ctypes.c_int(1))
-    assert isinstance(usm_ndarray, dpt.usm_ndarray)
-    assert usm_ndarray.flags["WRITABLE"] is True
-
-
-def test_pyx_capi_make_from_ptr():
-    q = get_queue_or_skip()
-    usm_ndarray = dpt.empty(tuple(), dtype="i4", sycl_queue=q)
-    make_from_ptr = _pyx_capi_fnptr_to_callable(
-        usm_ndarray,
-        "UsmNDArray_MakeSimpleFromPtr",
-        b"PyObject *(size_t, int, DPCTLSyclUSMRef, "
-        b"DPCTLSyclQueueRef, PyObject *)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(
-            ctypes.c_size_t,
-            ctypes.c_int,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.py_object,
-        ),
-    )
-    nelems = 10
-    dt = dpt.int64
-    mem = dpm.MemoryUSMDevice(nelems * dt.itemsize, queue=q)
-    arr = make_from_ptr(
-        ctypes.c_size_t(nelems),
-        dt.num,
-        mem._pointer,
-        mem.sycl_queue.addressof_ref(),
-        mem,
-    )
-    assert isinstance(arr, dpt.usm_ndarray)
-    assert arr.shape == (nelems,)
-    assert arr.dtype == dt
-    assert arr.sycl_queue == q
-    assert arr._pointer == mem._pointer
-    del mem
-    assert isinstance(arr.__repr__(), str)
-
-
-def test_pyx_capi_make_general():
-    q = get_queue_or_skip()
-    usm_ndarray = dpt.empty(tuple(), dtype="i4", sycl_queue=q)
-    make_from_ptr = _pyx_capi_fnptr_to_callable(
-        usm_ndarray,
-        "UsmNDArray_MakeFromPtr",
-        b"PyObject *(int, Py_ssize_t const *, int, Py_ssize_t const *, "
-        b"DPCTLSyclUSMRef, DPCTLSyclQueueRef, Py_ssize_t, PyObject *)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(
-            ctypes.c_int,
-            ctypes.POINTER(ctypes.c_ssize_t),
-            ctypes.c_int,
-            ctypes.POINTER(ctypes.c_ssize_t),
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_ssize_t,
-            ctypes.py_object,
-        ),
-    )
-    # Create array to view into diagonal of a matrix
-    n = 5
-    mat = dpt.reshape(
-        dpt.arange(n * n, dtype="i4", sycl_queue=q),
-        (
-            n,
-            n,
-        ),
-    )
-    c_shape = (ctypes.c_ssize_t * 1)(
-        n,
-    )
-    c_strides = (ctypes.c_ssize_t * 1)(
-        n + 1,
-    )
-    diag = make_from_ptr(
-        ctypes.c_int(1),
-        c_shape,
-        ctypes.c_int(mat.dtype.num),
-        c_strides,
-        mat._pointer,
-        mat.sycl_queue.addressof_ref(),
-        ctypes.c_ssize_t(0),
-        mat,
-    )
-    assert isinstance(diag, dpt.usm_ndarray)
-    assert diag.shape == (n,)
-    assert diag.strides == (n + 1,)
-    assert diag.dtype == mat.dtype
-    assert diag.sycl_queue == q
-    assert diag._pointer == mat._pointer
-    del mat
-    assert isinstance(diag.__repr__(), str)
-    # create 0d scalar
-    mat = dpt.reshape(
-        dpt.arange(n * n, dtype="i4", sycl_queue=q),
-        (
-            n,
-            n,
-        ),
-    )
-    sc = make_from_ptr(
-        ctypes.c_int(0),
-        None,  # NULL pointer
-        ctypes.c_int(mat.dtype.num),
-        None,  # NULL pointer
-        mat._pointer,
-        mat.sycl_queue.addressof_ref(),
-        ctypes.c_ssize_t(0),
-        mat,
-    )
-    assert isinstance(sc, dpt.usm_ndarray)
-    assert sc.shape == tuple()
-    assert sc.dtype == mat.dtype
-    assert sc.sycl_queue == q
-    assert sc._pointer == mat._pointer
-    c_shape = (ctypes.c_ssize_t * 2)(0, n)
-    c_strides = (ctypes.c_ssize_t * 2)(0, 1)
-    zd_arr = make_from_ptr(
-        ctypes.c_int(2),
-        c_shape,
-        ctypes.c_int(mat.dtype.num),
-        c_strides,
-        mat._pointer,
-        mat.sycl_queue.addressof_ref(),
-        ctypes.c_ssize_t(0),
-        mat,
-    )
-    assert isinstance(zd_arr, dpt.usm_ndarray)
-    assert zd_arr.shape == (
-        0,
-        n,
-    )
-    assert zd_arr.strides == (
-        0,
-        1,
-    )
-    assert zd_arr.dtype == mat.dtype
-    assert zd_arr.sycl_queue == q
-    assert zd_arr._pointer == mat._pointer
-
-
-def test_pyx_capi_make_fns_invalid_typenum():
-    q = get_queue_or_skip()
-    usm_ndarray = dpt.empty(tuple(), dtype="i4", sycl_queue=q)
-
-    make_simple_from_ptr = _pyx_capi_fnptr_to_callable(
-        usm_ndarray,
-        "UsmNDArray_MakeSimpleFromPtr",
-        b"PyObject *(size_t, int, DPCTLSyclUSMRef, "
-        b"DPCTLSyclQueueRef, PyObject *)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(
-            ctypes.c_size_t,
-            ctypes.c_int,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.py_object,
-        ),
-    )
-
-    nelems = 10
-    dtype = dpt.int64
-    arr = dpt.arange(nelems, dtype=dtype, sycl_queue=q)
-
-    with pytest.raises(ValueError):
-        make_simple_from_ptr(
-            ctypes.c_size_t(nelems),
-            -1,
-            arr._pointer,
-            arr.sycl_queue.addressof_ref(),
-            arr,
-        )
-
-    make_from_ptr = _pyx_capi_fnptr_to_callable(
-        usm_ndarray,
-        "UsmNDArray_MakeFromPtr",
-        b"PyObject *(int, Py_ssize_t const *, int, Py_ssize_t const *, "
-        b"DPCTLSyclUSMRef, DPCTLSyclQueueRef, Py_ssize_t, PyObject *)",
-        fn_restype=ctypes.py_object,
-        fn_argtypes=(
-            ctypes.c_int,
-            ctypes.POINTER(ctypes.c_ssize_t),
-            ctypes.c_int,
-            ctypes.POINTER(ctypes.c_ssize_t),
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_ssize_t,
-            ctypes.py_object,
-        ),
-    )
-    c_shape = (ctypes.c_ssize_t * 1)(
-        nelems,
-    )
-    c_strides = (ctypes.c_ssize_t * 1)(
-        1,
-    )
-    with pytest.raises(ValueError):
-        make_from_ptr(
-            ctypes.c_int(1),
-            c_shape,
-            -1,
-            c_strides,
-            arr._pointer,
-            arr.sycl_queue.addressof_ref(),
-            ctypes.c_ssize_t(0),
-            arr,
-        )
-    del arr
-
-
-def _pyx_capi_int(X, pyx_capi_name, caps_name=b"int", val_restype=ctypes.c_int):
-    import sys
-
-    mod = sys.modules[X.__class__.__module__]
-    cap = mod.__pyx_capi__.get(pyx_capi_name, None)
-    if cap is None:
-        raise ValueError(
-            "__pyx_capi__ does not export {} capsule".format(pyx_capi_name)
-        )
-    # construct Python callable to invoke these functions
-    cap_ptr_fn = ctypes.pythonapi.PyCapsule_GetPointer
-    cap_ptr_fn.restype = ctypes.c_void_p
-    cap_ptr_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
-    cap_ptr = cap_ptr_fn(cap, caps_name)
-    val_ptr = ctypes.cast(cap_ptr, ctypes.POINTER(val_restype))
-    return val_ptr.contents.value
-
-
-def test_pyx_capi_check_constants():
-    try:
-        X = dpt.usm_ndarray(17, dtype="i1")[1::2]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    cc_flag = _pyx_capi_int(X, "USM_ARRAY_C_CONTIGUOUS")
-    assert cc_flag > 0 and 0 == (cc_flag & (cc_flag - 1))
-    fc_flag = _pyx_capi_int(X, "USM_ARRAY_F_CONTIGUOUS")
-    assert fc_flag > 0 and 0 == (fc_flag & (fc_flag - 1))
-    w_flag = _pyx_capi_int(X, "USM_ARRAY_WRITABLE")
-    assert w_flag > 0 and 0 == (w_flag & (w_flag - 1))
-
-    bool_typenum = _pyx_capi_int(X, "UAR_BOOL")
-    assert bool_typenum == dpt.dtype("bool_").num
-
-    byte_typenum = _pyx_capi_int(X, "UAR_BYTE")
-    assert byte_typenum == dpt.dtype(np.byte).num
-    ubyte_typenum = _pyx_capi_int(X, "UAR_UBYTE")
-    assert ubyte_typenum == dpt.dtype(np.ubyte).num
-
-    short_typenum = _pyx_capi_int(X, "UAR_SHORT")
-    assert short_typenum == dpt.dtype(np.short).num
-    ushort_typenum = _pyx_capi_int(X, "UAR_USHORT")
-    assert ushort_typenum == dpt.dtype(np.ushort).num
-
-    int_typenum = _pyx_capi_int(X, "UAR_INT")
-    assert int_typenum == dpt.dtype(np.intc).num
-    uint_typenum = _pyx_capi_int(X, "UAR_UINT")
-    assert uint_typenum == dpt.dtype(np.uintc).num
-
-    long_typenum = _pyx_capi_int(X, "UAR_LONG")
-    assert long_typenum == dpt.dtype("l").num
-    ulong_typenum = _pyx_capi_int(X, "UAR_ULONG")
-    assert ulong_typenum == dpt.dtype("L").num
-
-    longlong_typenum = _pyx_capi_int(X, "UAR_LONGLONG")
-    assert longlong_typenum == dpt.dtype(np.longlong).num
-    ulonglong_typenum = _pyx_capi_int(X, "UAR_ULONGLONG")
-    assert ulonglong_typenum == dpt.dtype(np.ulonglong).num
-
-    half_typenum = _pyx_capi_int(X, "UAR_HALF")
-    assert half_typenum == dpt.dtype(np.half).num
-    float_typenum = _pyx_capi_int(X, "UAR_FLOAT")
-    assert float_typenum == dpt.dtype(np.single).num
-    double_typenum = _pyx_capi_int(X, "UAR_DOUBLE")
-    assert double_typenum == dpt.dtype(np.double).num
-
-    cfloat_typenum = _pyx_capi_int(X, "UAR_CFLOAT")
-    assert cfloat_typenum == dpt.dtype(np.csingle).num
-    cdouble_typenum = _pyx_capi_int(X, "UAR_CDOUBLE")
-    assert cdouble_typenum == dpt.dtype(np.cdouble).num
-
-
-@pytest.mark.parametrize(
-    "shape", [tuple(), (1,), (5,), (2, 3), (2, 3, 4), (2, 2, 2, 2, 2)]
-)
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_tofrom_numpy(shape, dtype, usm_type):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    Xusm = dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q)
-    Ynp = np.ones(shape, dtype=dtype)
-    Ynp[(0,) * len(shape)] = 0
-    ind = (slice(None, None, None),) * Ynp.ndim
-    Xusm[ind] = Ynp
-    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_tofrom_numpy_permuted(dtype, usm_type):
-    shape = (3, 5, 7)
-    perm = (1, 2, 0)
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    Xusm = dpt.permute_dims(
-        dpt.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=q), perm
-    )
-    Ynp = np.transpose(np.ones(shape, dtype=dtype), perm)
-    Ynp[:, ::2, ::2] = 0
-    ind = (slice(None, None, None),) * Ynp.ndim
-    # even though Xusm and Ynp are strided, simple memcpy could be done.
-    # This test validates that it is being done correctly
-    Xusm[ind] = Ynp
-    assert np.array_equal(dpt.to_numpy(Xusm), Ynp)
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("src_usm_type", ["device", "shared", "host"])
-@pytest.mark.parametrize("dst_usm_type", ["device", "shared", "host"])
-def test_setitem_same_dtype(dtype, src_usm_type, dst_usm_type):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    shape = (2, 4, 3)
-    Xnp = (
-        np.random.randint(-10, 10, size=prod(shape))
-        .astype(dtype)
-        .reshape(shape)
-    )
-    X = dpt.from_numpy(Xnp, usm_type=src_usm_type)
-    Z = dpt.zeros(shape, dtype=dtype, usm_type=dst_usm_type)
-    Zusm_0d = dpt.copy(Z[0, 0, 0])
-    ind = (-1, -1, -1)
-    Xusm_0d = X[ind]
-    Zusm_0d[Ellipsis] = Xusm_0d
-    assert np.array_equal(dpt.to_numpy(Zusm_0d), Xnp[ind])
-    Zusm_1d = dpt.copy(Z[0, 1:3, 0])
-    ind = (-1, slice(0, 2, None), -1)
-    Xusm_1d = X[ind]
-    Zusm_1d[Ellipsis] = Xusm_1d
-    assert np.array_equal(dpt.to_numpy(Zusm_1d), Xnp[ind])
-    Zusm_2d = dpt.copy(Z[:, 1:3, 0])[::-1]
-    Xusm_2d = X[:, 1:4, -1]
-    Zusm_2d[:] = Xusm_2d[:, 0:2]
-    assert np.array_equal(dpt.to_numpy(Zusm_2d), Xnp[:, 1:3, -1])
-    Zusm_3d = dpt.copy(Z)
-    Xusm_3d = X
-    Zusm_3d[:] = Xusm_3d
-    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
-    Zusm_3d[::-1] = Xusm_3d[::-1]
-    assert np.array_equal(dpt.to_numpy(Zusm_3d), Xnp)
-    Zusm_3d[:] = Xusm_3d[0]
-    R1 = dpt.to_numpy(Zusm_3d)
-    R2 = np.broadcast_to(Xnp[0], R1.shape)
-    assert R1.shape == R2.shape
-    assert np.allclose(R1, R2)
-    Zusm_empty = Zusm_1d[0:0]
-    Zusm_empty[Ellipsis] = Zusm_3d[0, 0, 0:0]
-
-
-def test_setitem_broadcasting():
-    "See gh-1503"
-    get_queue_or_skip()
-    dst = dpt.ones((2, 3, 4), dtype="u4")
-    src = dpt.zeros((3, 1), dtype=dst.dtype)
-    dst[...] = src
-    expected = np.zeros(dst.shape, dtype=dst.dtype)
-    assert np.array_equal(dpt.asnumpy(dst), expected)
-
-
-def test_setitem_broadcasting_offset():
-    get_queue_or_skip()
-    dt = dpt.int32
-    x = dpt.asarray([[1, 2, 3], [6, 7, 8]], dtype=dt)
-    y = dpt.asarray([4, 5], dtype=dt)
-    x[0] = y[1]
-    expected = dpt.asarray([[5, 5, 5], [6, 7, 8]], dtype=dt)
-    assert dpt.all(x == expected)
-
-
-def test_setitem_broadcasting_empty_dst_validation():
-    "Broadcasting rules apply, except exception"
-    get_queue_or_skip()
-    dst = dpt.ones((2, 0, 5, 4), dtype="i8")
-    src = dpt.ones((2, 0, 3, 4), dtype="i8")
-    with pytest.raises(ValueError):
-        dst[...] = src
-
-
-def test_setitem_broadcasting_empty_dst_edge_case():
-    """RHS is shunken to empty array by
-    broadasting rule, hence no exception"""
-    get_queue_or_skip()
-    dst = dpt.ones(1, dtype="i8")[0:0]
-    src = dpt.ones(tuple(), dtype="i8")
-    dst[...] = src
-
-
-def test_setitem_broadcasting_src_ndim_equal_dst_ndim():
-    get_queue_or_skip()
-    dst = dpt.ones((2, 3, 4), dtype="i4")
-    src = dpt.zeros((2, 1, 4), dtype="i4")
-    dst[...] = src
-
-    expected = np.zeros(dst.shape, dtype=dst.dtype)
-    assert np.array_equal(dpt.asnumpy(dst), expected)
-
-
-def test_setitem_broadcasting_src_ndim_greater_than_dst_ndim():
-    get_queue_or_skip()
-    dst = dpt.ones((2, 3, 4), dtype="i4")
-    src = dpt.zeros((1, 2, 1, 4), dtype="i4")
-    dst[...] = src
-
-    expected = np.zeros(dst.shape, dtype=dst.dtype)
-    assert np.array_equal(dpt.asnumpy(dst), expected)
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_setitem_scalar(dtype, usm_type):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.usm_ndarray((6, 6), dtype=dtype, buffer=usm_type)
-    for i in range(X.size):
-        X[np.unravel_index(i, X.shape)] = np.asarray(i, dtype=dtype)
-    assert np.array_equal(
-        dpt.to_numpy(X), np.arange(X.size).astype(dtype).reshape(X.shape)
-    )
-    Y = dpt.usm_ndarray((2, 3), dtype=dtype, buffer=usm_type)
-    for i in range(Y.size):
-        Y[np.unravel_index(i, Y.shape)] = i
-    assert np.array_equal(
-        dpt.to_numpy(Y), np.arange(Y.size).astype(dtype).reshape(Y.shape)
-    )
-
-
-def test_setitem_errors():
-    q = get_queue_or_skip()
-    X = dpt.empty((4,), dtype="u1", sycl_queue=q)
-    Y = dpt.empty((4, 2), dtype="u1", sycl_queue=q)
-    with pytest.raises(ValueError):
-        X[:] = Y
-    with pytest.raises(ValueError):
-        X[:] = Y[:, 0:1]
-    X[:] = Y[None, :, 0]
-
-
-@pytest.mark.parametrize("src_dt,dst_dt", [("i4", "i8"), ("f4", "f8")])
-def test_setitem_different_dtypes(src_dt, dst_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dst_dt, q)
-    X = dpt.ones(10, dtype=src_dt, sycl_queue=q)
-    Y = dpt.zeros(10, dtype=src_dt, sycl_queue=q)
-    Z = dpt.empty((20,), dtype=dst_dt, sycl_queue=q)
-    Z[::2] = X
-    Z[1::2] = Y
-    assert np.allclose(dpt.asnumpy(Z), np.tile(np.array([1, 0], Z.dtype), 10))
-
-
-def test_setitem_wingaps():
-    q = get_queue_or_skip()
-    if dpt.dtype("intc").itemsize == dpt.dtype("int32").itemsize:
-        dpt_dst = dpt.empty(4, dtype="int32", sycl_queue=q)
-        np_src = np.arange(4, dtype="intc")
-        dpt_dst[:] = np_src  # should not raise exceptions
-        assert np.array_equal(dpt.asnumpy(dpt_dst), np_src)
-    if dpt.dtype("long").itemsize == dpt.dtype("longlong").itemsize:
-        dpt_dst = dpt.empty(4, dtype="longlong", sycl_queue=q)
-        np_src = np.arange(4, dtype="long")
-        dpt_dst[:] = np_src  # should not raise exceptions
-        assert np.array_equal(dpt.asnumpy(dpt_dst), np_src)
-
-
-def test_shape_setter():
-    def cc_strides(sh):
-        return np.empty(sh, dtype="u1").strides
-
-    def relaxed_strides_equal(st1, st2, sh):
-        eq_ = True
-        for s1, s2, d in zip(st1, st2, sh):
-            eq_ = eq_ and ((d == 1) or (s1 == s2))
-        return eq_
-
-    sh_s = (2 * 3 * 4 * 5,)
-    sh_f = (
-        2,
-        3,
-        4,
-        5,
-    )
-    try:
-        X = dpt.usm_ndarray(sh_s, dtype="i8")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X.shape = sh_f
-    assert X.shape == sh_f
-    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
-    assert X.flags.c_contiguous, "reshaped array expected to be C-contiguous"
-
-    sh_s = (
-        2,
-        12,
-        5,
-    )
-    sh_f = (
-        2,
-        3,
-        4,
-        5,
-    )
-    X = dpt.usm_ndarray(sh_s, dtype="u4", order="C")
-    X.shape = sh_f
-    assert X.shape == sh_f
-    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
-
-    sh_s = (2, 3, 4, 5)
-    sh_f = (4, 3, 2, 5)
-    X = dpt.usm_ndarray(sh_s, dtype="f4")
-    X.shape = sh_f
-    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
-
-    sh_s = (2, 3, 4, 5)
-    sh_f = (4, 3, 1, 2, 5)
-    X = dpt.usm_ndarray(sh_s, dtype="?")
-    X.shape = sh_f
-    assert relaxed_strides_equal(X.strides, cc_strides(sh_f), sh_f)
-    sz = X.size
-    X.shape = sz
-    assert X.shape == (sz,)
-    assert relaxed_strides_equal(X.strides, (1,), (sz,))
-
-    X = dpt.usm_ndarray(sh_s, dtype="u4")
-    with pytest.raises(TypeError):
-        X.shape = "abcbe"
-    X = dpt.usm_ndarray((4, 4), dtype="u1")[::2, ::2]
-    with pytest.raises(AttributeError):
-        X.shape = (4,)
-    X = dpt.usm_ndarray((0,), dtype="i4")
-    X.shape = (0,)
-    X.shape = (
-        2,
-        0,
-    )
-    X.shape = (
-        0,
-        2,
-    )
-    X.shape = (
-        1,
-        0,
-        1,
-    )
-
-
-def test_len():
-    try:
-        X = dpt.usm_ndarray(1, "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert len(X) == 1
-    X = dpt.usm_ndarray((2, 1), "i4")
-    assert len(X) == 2
-    X = dpt.usm_ndarray(tuple(), "i4")
-    with pytest.raises(TypeError):
-        len(X)
-
-
-def test_array_namespace():
-    try:
-        X = dpt.usm_ndarray(1, "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X.__array_namespace__()
-    X._set_namespace(dpt)
-    assert X.__array_namespace__() is dpt
-    X.__array_namespace__(api_version=dpt.__array_api_version__)
-    assert X.__array_namespace__() is dpt
-
-
-def test_dlpack():
-    try:
-        X = dpt.usm_ndarray(1, "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X.__dlpack_device__()
-    X.__dlpack__(stream=None)
-
-
-def test_to_device():
-    try:
-        X = dpt.usm_ndarray(1, "f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    for dev in dpctl.get_devices():
-        if dev.default_selector_score > 0:
-            Y = X.to_device(dev)
-            assert Y.sycl_device == dev
-
-
-def test_to_device_stream_validation():
-    try:
-        X = dpt.usm_ndarray(1, "f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    # invalid type of stream keyword
-    with pytest.raises(TypeError):
-        X.to_device(X.sycl_queue, stream=dict())
-    # stream is keyword-only arg
-    with pytest.raises(TypeError):
-        X.to_device(X.sycl_queue, X.sycl_queue)
-
-
-def test_to_device_stream_use():
-    try:
-        X = dpt.usm_ndarray(1, "f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    q1 = dpctl.SyclQueue(
-        X.sycl_context, X.sycl_device, property="enable_profiling"
-    )
-    X.to_device(q1, stream=q1)
-
-
-def test_to_device_migration():
-    q1 = get_queue_or_skip()  # two distinct copies of default-constructed queue
-    q2 = get_queue_or_skip()
-    X1 = dpt.empty((5,), dtype="i8", sycl_queue=q1)  # X1 is associated with q1
-    X2 = X1.to_device(q2)  # X2 is reassociated with q2
-    assert X1.sycl_queue == q1
-    assert X2.sycl_queue == q2
-    assert X1.usm_data._pointer == X2.usm_data._pointer
-
-
-def test_astype():
-    try:
-        X = dpt.empty((5, 5), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X[:] = np.full((5, 5), 7, dtype="i4")
-    Y = dpt.astype(X, "c8", order="C")
-    assert np.allclose(dpt.to_numpy(Y), np.full((5, 5), 7, dtype="c8"))
-    if Y.sycl_device.has_aspect_fp16:
-        Y = dpt.astype(X[::2, ::-1], "f2", order="K")
-        assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f2"))
-    Y = dpt.astype(X[::2, ::-1], "f4", order="K")
-    assert np.allclose(dpt.to_numpy(Y), np.full(Y.shape, 7, dtype="f4"))
-    Y = dpt.astype(X[::2, ::-1], "i4", order="K", copy=False)
-    assert Y.usm_data is X.usm_data
-    Y = dpt.astype(X, None, order="K")
-    if X.sycl_queue.sycl_device.has_aspect_fp64:
-        assert Y.dtype is dpt.float64
-    else:
-        assert Y.dtype is dpt.float32
-
-
-def test_astype_invalid_order():
-    try:
-        X = dpt.usm_ndarray(5, "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.astype(X, "i4", order="WRONG")
-
-
-def test_astype_device():
-    get_queue_or_skip()
-    q1 = dpctl.SyclQueue()
-    q2 = dpctl.SyclQueue()
-
-    x = dpt.arange(5, dtype="i4", sycl_queue=q1)
-    r = dpt.astype(x, "f4")
-    assert r.sycl_queue == x.sycl_queue
-    assert r.sycl_device == x.sycl_device
-
-    r = dpt.astype(x, "f4", device=q2)
-    assert r.sycl_queue == q2
-
-
-def test_astype_gh_1926():
-    get_queue_or_skip()
-
-    x = dpt.ones(64)
-    x_ = dpt.astype(x, x.dtype, copy=False, order="C")
-    assert x is x_
-
-    x__ = dpt.astype(x, x.dtype, copy=False, order="F")
-    assert x is x__
-
-
-def test_astype_gh_2121():
-    get_queue_or_skip()
-
-    x_np = np.asarray([0, 3, 1, 2, 0, 1], dtype="u1").view("?")
-    x = dpt.asarray(x_np)
-    res = dpt.astype(x, dpt.uint8)
-    expected = dpt.asarray([0, 1, 1, 1, 0, 1], dtype="u1")
-    assert dpt.all(res == expected)
-
-
-def test_copy():
-    try:
-        X = dpt.usm_ndarray((5, 5), "i4")[2:4, 1:4]
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X[:] = 42
-    Yc = dpt.copy(X, order="C")
-    Yf = dpt.copy(X, order="F")
-    Ya = dpt.copy(X, order="A")
-    Yk = dpt.copy(X, order="K")
-    assert Yc.usm_data is not X.usm_data
-    assert Yf.usm_data is not X.usm_data
-    assert Ya.usm_data is not X.usm_data
-    assert Yk.usm_data is not X.usm_data
-    assert Yc.strides == (3, 1)
-    assert Yf.strides == (1, 2)
-    assert Ya.strides == (3, 1)
-    assert Yk.strides == (3, 1)
-    ref = np.full(X.shape, 42, dtype=X.dtype)
-    assert np.array_equal(dpt.asnumpy(Yc), ref)
-    assert np.array_equal(dpt.asnumpy(Yf), ref)
-    assert np.array_equal(dpt.asnumpy(Ya), ref)
-    assert np.array_equal(dpt.asnumpy(Yk), ref)
-
-
-def test_copy_unaligned():
-    get_queue_or_skip()
-
-    x = dpt.ones(513, dtype="i4")
-    r = dpt.astype(x[1:], "f4")
-
-    assert dpt.all(r == 1)
-
-
-def test_ctor_invalid():
-    try:
-        m = dpm.MemoryUSMShared(12)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray((4,), dtype="i4", buffer=m)
-    m = dpm.MemoryUSMShared(64)
-    with pytest.raises(ValueError):
-        dpt.usm_ndarray((4,), dtype="u1", buffer=m, strides={"not": "valid"})
-
-
-def test_reshape():
-    try:
-        X = dpt.usm_ndarray((5, 5), "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    # can be done as views
-    Y = dpt.reshape(X, (25,))
-    assert Y.shape == (25,)
-    Z = X[::2, ::2]
-    # requires a copy
-    W = dpt.reshape(Z, (Z.size,), order="F")
-    assert W.shape == (Z.size,)
-    with pytest.raises(TypeError):
-        dpt.reshape("invalid")
-    with pytest.raises(ValueError):
-        dpt.reshape(Z, (2, 2, 2, 2, 2))
-    with pytest.raises(ValueError):
-        dpt.reshape(Z, Z.shape, order="invalid")
-    W = dpt.reshape(Z, (-1,), order="C")
-    assert W.shape == (Z.size,)
-
-    X = dpt.usm_ndarray((1,), dtype="i8")
-    Y = dpt.reshape(X, X.shape)
-    assert Y.flags == X.flags
-
-    A = dpt.usm_ndarray((0,), "i4")
-    A1 = dpt.reshape(A, (0,))
-    assert A1.shape == (0,)
-    requested_shape = (
-        2,
-        0,
-    )
-    A2 = dpt.reshape(A, requested_shape)
-    assert A2.shape == requested_shape
-    requested_shape = (
-        0,
-        2,
-    )
-    A3 = dpt.reshape(A, requested_shape)
-    assert A3.shape == requested_shape
-    requested_shape = (
-        1,
-        0,
-        2,
-    )
-    A4 = dpt.reshape(A, requested_shape)
-    assert A4.shape == requested_shape
-
-
-def test_reshape_orderF():
-    try:
-        a = dpt.arange(6 * 3 * 4, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    b = dpt.reshape(a, (6, 2, 6))
-    c = dpt.reshape(b, (9, 8), order="F")
-    assert c.flags.f_contiguous
-    assert c._pointer != b._pointer
-    assert b._pointer == a._pointer
-
-    a_np = np.arange(6 * 3 * 4, dtype="i4")
-    b_np = np.reshape(a_np, (6, 2, 6))
-    c_np = np.reshape(b_np, (9, 8), order="F")
-    assert np.array_equal(c_np, dpt.asnumpy(c))
-
-
-def test_reshape_noop():
-    """Per gh-1664"""
-    try:
-        a = dpt.ones((2, 1))
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    b = dpt.reshape(a, (2, 1))
-    assert b is a
-
-
-def test_reshape_zero_size():
-    try:
-        a = dpt.empty((0,))
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.reshape(a, (-1, 0))
-
-
-def test_reshape_large_ndim():
-    ndim = 32
-    idx = tuple(1 if i + 1 < ndim else ndim for i in range(ndim))
-    try:
-        d = dpt.ones(ndim, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    d = dpt.reshape(d, idx)
-    assert d.shape == idx
-
-
-def test_reshape_copy_kwrd():
-    try:
-        X = dpt.usm_ndarray((2, 3), "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    new_shape = (6,)
-    Z = dpt.reshape(X, new_shape, copy=True)
-    assert Z.shape == new_shape
-    assert Z.usm_data is not X.usm_data
-    X = dpt.usm_ndarray((3, 3), "i4")[::2, ::2]
-    new_shape = (4,)
-    with pytest.raises(ValueError):
-        Z = dpt.reshape(X, new_shape, copy=False)
-    with pytest.raises(ValueError):
-        invalid = Ellipsis
-        Z = dpt.reshape(X, new_shape, copy=invalid)
-
-
-def test_transpose():
-    n, m = 2, 3
-    try:
-        X = dpt.usm_ndarray((n, m), "f4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Xnp = np.arange(n * m, dtype="f4").reshape((n, m))
-    X[:] = Xnp
-    assert np.array_equal(dpt.to_numpy(X.T), Xnp.T)
-    assert np.array_equal(dpt.to_numpy(X[1:].T), Xnp[1:].T)
-
-
-def test_real_imag_views():
-    n, m = 2, 3
-    try:
-        X = dpt.usm_ndarray((n, m), "c8")
-        X_scalar = dpt.usm_ndarray((), dtype="c8")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    Xnp_r = np.arange(n * m, dtype="f4").reshape((n, m))
-    Xnp_i = np.arange(n * m, 2 * n * m, dtype="f4").reshape((n, m))
-    Xnp = Xnp_r + 1j * Xnp_i
-    X[:] = Xnp
-    X_real = X.real
-    X_imag = X.imag
-    assert np.array_equal(dpt.to_numpy(X_real), Xnp.real)
-    assert np.array_equal(dpt.to_numpy(X.imag), Xnp.imag)
-    assert not X_real.flags["C"] and not X_real.flags["F"]
-    assert not X_imag.flags["C"] and not X_imag.flags["F"]
-    assert X_real.strides == X_imag.strides
-    assert np.array_equal(dpt.to_numpy(X[1:].real), Xnp[1:].real)
-    assert np.array_equal(dpt.to_numpy(X[1:].imag), Xnp[1:].imag)
-
-    X_scalar[...] = complex(n * m, 2 * n * m)
-    assert X_scalar.real and X_scalar.imag
-
-    # check that _zero_like works for scalars
-    X_scalar = dpt.usm_ndarray((), dtype="f4")
-    assert isinstance(X_scalar.imag, dpt.usm_ndarray)
-    assert not X_scalar.imag
-    assert X_scalar.real.sycl_queue == X_scalar.imag.sycl_queue
-
-
-def test_real_imag_views_fp16():
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dpt.float16, q)
-
-    X = dpt.usm_ndarray(
-        (3, 4), dtype=dpt.float16, buffer_ctor_kwargs={"queue": q}
-    )
-    assert isinstance(X.real, dpt.usm_ndarray) and isinstance(
-        X.imag, dpt.usm_ndarray
-    )
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-def test_zeros(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.zeros(10, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(X), np.zeros(10, dtype=dtype))
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-def test_ones(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.ones(10, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(X), np.ones(10, dtype=dtype))
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    _all_dtypes,
-)
-def test_full(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    X = dpt.full(10, 4, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(dpt.asnumpy(X), np.full(10, 4, dtype=dtype))
-
-
-def test_full_cmplx128():
-    q = get_queue_or_skip()
-    dtype = "c16"
-    skip_if_dtype_not_supported(dtype, q)
-    fill_v = 1 + 1j
-    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(
-        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
-    )
-    fill_v = 0 + 1j
-    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(
-        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
-    )
-    fill_v = 0 + 0j
-    X = dpt.full(tuple(), fill_value=fill_v, dtype=dtype, sycl_queue=q)
-    assert np.array_equal(
-        dpt.asnumpy(X), np.full(tuple(), fill_value=fill_v, dtype=dtype)
-    )
-
-
-def test_full_dtype_inference():
-    try:
-        X = dpt.full(10, 4)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert np.issubdtype(X.dtype, np.integer)
-    try:
-        X = dpt.full(10, True)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert X.dtype is dpt.dtype(np.bool_)
-    assert np.issubdtype(dpt.full(10, 12.3).dtype, np.floating)
-    try:
-        X = dpt.full(10, 0.3 - 2j)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    cdt = X.dtype
-    assert np.issubdtype(cdt, np.complexfloating)
-
-    assert np.issubdtype(dpt.full(10, 12.3, dtype=int).dtype, np.integer)
-    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=int).dtype, np.integer)
-    rdt = np.finfo(cdt).dtype
-    assert np.issubdtype(dpt.full(10, 0.3 - 2j, dtype=rdt).dtype, np.floating)
-
-
-@pytest.mark.parametrize("dt", ["f2", "f4", "f8"])
-def test_full_special_fp(dt):
-    """See gh-1314"""
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    ar = dpt.full(10, fill_value=dpt.nan)
-    err_msg = f"Failed for fill_value=dpt.nan and dtype {dt}"
-    assert dpt.isnan(ar[0]), err_msg
-
-    ar = dpt.full(10, fill_value=dpt.inf)
-    err_msg = f"Failed for fill_value=dpt.inf and dtype {dt}"
-    assert dpt.isinf(ar[0]) and dpt.greater(ar[0], 0), err_msg
-
-    ar = dpt.full(10, fill_value=-dpt.inf)
-    err_msg = f"Failed for fill_value=-dpt.inf and dtype {dt}"
-    assert dpt.isinf(ar[0]) and dpt.less(ar[0], 0), err_msg
-
-    ar = dpt.full(10, fill_value=dpt.pi)
-    err_msg = f"Failed for fill_value=dpt.pi and dtype {dt}"
-    check = abs(float(ar[0]) - dpt.pi) < 16 * dpt.finfo(ar.dtype).eps
-    assert check, err_msg
-
-
-def test_full_fill_array():
-    q = get_queue_or_skip()
-
-    Xnp = np.array([1, 2, 3], dtype="i4")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    shape = (3, 3)
-    Y = dpt.full(shape, X)
-    Ynp = np.full(shape, Xnp)
-
-    assert Y.dtype == Ynp.dtype
-    assert Y.usm_type == "device"
-    assert np.array_equal(dpt.asnumpy(Y), Ynp)
-
-
-def test_full_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    X = dpt.arange(10, dtype="i4", sycl_queue=q1, usm_type="shared")
-    Y = dpt.full(10, X[3])
-
-    assert Y.dtype == X.dtype
-    assert Y.usm_type == X.usm_type
-    assert dpctl.utils.get_execution_queue((Y.sycl_queue, X.sycl_queue))
-    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="i4"))
-
-    Y = dpt.full(10, X[3], dtype="f4", sycl_queue=q2, usm_type="host")
-
-    assert Y.dtype == dpt.dtype("f4")
-    assert Y.usm_type == "host"
-    assert dpctl.utils.get_execution_queue((Y.sycl_queue, q2))
-    assert np.array_equal(dpt.asnumpy(Y), np.full(10, 3, dtype="f4"))
-
-
-@pytest.mark.parametrize("order1", ["F", "C"])
-@pytest.mark.parametrize("order2", ["F", "C"])
-def test_full_order(order1, order2):
-    q = get_queue_or_skip()
-    Xnp = np.array([1, 2, 3], order=order1)
-    Ynp = np.full((3, 3), Xnp, order=order2)
-    Y = dpt.full((3, 3), Xnp, order=order2, sycl_queue=q)
-    assert Y.flags.c_contiguous == Ynp.flags.c_contiguous
-    assert Y.flags.f_contiguous == Ynp.flags.f_contiguous
-    assert np.array_equal(dpt.asnumpy(Y), Ynp)
-
-
-def test_full_strides():
-    q = get_queue_or_skip()
-    X = dpt.full((3, 3), dpt.arange(3, dtype="i4"), sycl_queue=q)
-    Xnp = np.full((3, 3), np.arange(3, dtype="i4"))
-    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
-    assert np.array_equal(dpt.asnumpy(X), Xnp)
-
-    X = dpt.full((3, 3), dpt.arange(6, dtype="i4")[::2], sycl_queue=q)
-    Xnp = np.full((3, 3), np.arange(6, dtype="i4")[::2])
-    assert X.strides == tuple(el // Xnp.itemsize for el in Xnp.strides)
-    assert np.array_equal(dpt.asnumpy(X), Xnp)
-
-
-@pytest.mark.parametrize("dt", ["i1", "u1", "i2", "u2", "i4", "u4", "i8", "u8"])
-def test_full_gh_1230(dt):
-    get_queue_or_skip()
-    dtype = dpt.dtype(dt)
-    dt_maxint = dpt.iinfo(dtype).max
-
-    if (dtype.itemsize < 8) and (np.lib.NumpyVersion(np.__version__) < "2.0.0"):
-        try:
-            X = dpt.full(1, fill_value=(dt_maxint + 1), dtype=dt)
-        except OverflowError:
-            pytest.skip("Expected OverflowError raised")
-        Y = dpt.full_like(X, fill_value=dpt.iinfo(dt).min)
-        assert dpt.all(X == Y)
-    else:
-        with pytest.raises(OverflowError):
-            dpt.full(1, dt_maxint + 1, dtype=dt)
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes[1:],
-)
-def test_arange(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-    X = dpt.arange(0, 123, dtype=dt, sycl_queue=q)
-    dt = dpt.dtype(dt)
-    if np.issubdtype(dt, np.integer):
-        assert int(X[47]) == 47
-    elif np.issubdtype(dt, np.floating):
-        assert float(X[47]) == 47.0
-    elif np.issubdtype(dt, np.complexfloating):
-        assert complex(X[47]) == 47.0 + 0.0j
-
-    # choose size larger than maximal value that u1/u2 can accommodate
-    sz = int(dpt.iinfo(dpt.int8).max)
-    X1 = dpt.arange(sz + 1, dtype=dt, sycl_queue=q)
-    assert X1.shape == (sz + 1,)
-
-    X2 = dpt.arange(sz, 0, -1, dtype=dt, sycl_queue=q)
-    assert X2.shape == (sz,)
-
-
-def test_arange_fp():
-    q = get_queue_or_skip()
-
-    assert dpt.arange(7, 0, -2, dtype="f4", device=q).shape == (4,)
-    assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,)
-
-    has_fp64 = q.sycl_device.has_aspect_fp64
-    if has_fp64:
-        assert dpt.arange(7, 0, -2, dtype="f8", device=q).shape == (4,)
-    assert dpt.arange(0, 1, 0.25, dtype="f4", device=q).shape == (4,)
-
-    x = dpt.arange(9.7, stop=10, sycl_queue=q)
-    assert x.shape == (1,)
-    assert x.dtype == dpt.float64 if has_fp64 else dpt.float32
-
-
-def test_arange_step_None():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(0, stop=10, step=None, dtype="int32", sycl_queue=q)
-    assert x.shape == (10,)
-
-
-def test_arange_bool():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(0, stop=2, dtype="bool", sycl_queue=q)
-    assert x.shape == (2,)
-    assert x.dtype == dpt.bool
-
-
-def test_arange_mixed_types():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(-2.5, stop=200, step=100, dtype="int32", sycl_queue=q)
-    assert x.shape[0] == 3
-    assert int(x[1]) == 99 + int(x[0])
-
-    x = dpt.arange(+2.5, stop=200, step=100, dtype="int32", device=x.device)
-    assert x.shape[0] == 2
-    assert int(x[1]) == 100 + int(x[0])
-
-    _stop = np.float32(504)
-    x = dpt.arange(0, stop=_stop, step=100, dtype="f4", device=x.device)
-    assert x.shape == (6,)
-
-    # ensure length is determined using uncast parameters
-    x = dpt.arange(-5, stop=10**2, step=2.7, dtype="int64", device=x.device)
-    assert x.shape == (39,)
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes,
-)
-def test_linspace(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-    X = dpt.linspace(0, 1, num=2, dtype=dt, sycl_queue=q)
-    assert np.allclose(dpt.asnumpy(X), np.linspace(0, 1, num=2, dtype=dt))
-
-
-def test_linspace_fp():
-    q = get_queue_or_skip()
-    n = 16
-    X = dpt.linspace(0, n - 1, num=n, sycl_queue=q)
-    if q.sycl_device.has_aspect_fp64:
-        assert X.dtype == dpt.dtype("float64")
-    else:
-        assert X.dtype == dpt.dtype("float32")
-    assert X.shape == (n,)
-    assert X.strides == (1,)
-
-
-@pytest.mark.parametrize("dtype", ["f2", "f4", "f8"])
-def test_linspace_fp_max(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-    n = 16
-    dt = dpt.dtype(dtype)
-    max_ = dpt.finfo(dt).max
-    X = dpt.linspace(max_, max_, endpoint=True, num=n, dtype=dt, sycl_queue=q)
-    assert X.shape == (n,)
-    assert X.strides == (1,)
-    assert np.allclose(
-        dpt.asnumpy(X), np.linspace(max_, max_, endpoint=True, num=n, dtype=dt)
-    )
-
-
-def test_linspace_int():
-    q = get_queue_or_skip()
-    X = dpt.linspace(0.1, 9.1, 11, endpoint=True, dtype=int, sycl_queue=q)
-    Xnp = np.linspace(0.1, 9.1, 11, endpoint=True, dtype=int)
-    assert np.array_equal(dpt.asnumpy(X), Xnp)
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "usm_kind",
-    [
-        "shared",
-        "device",
-        "host",
-    ],
-)
-def test_empty_like(dt, usm_kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.empty_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-
-    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.empty_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-
-
-def test_empty_unexpected_data_type():
-    with pytest.raises(TypeError):
-        try:
-            dpt.empty(1, dtype=np.object_)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "usm_kind",
-    [
-        "shared",
-        "device",
-        "host",
-    ],
-)
-def test_zeros_like(dt, usm_kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.zeros_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.allclose(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype))
-
-    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.zeros_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.array_equal(dpt.asnumpy(Y), np.zeros(X.shape, dtype=X.dtype))
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "usm_kind",
-    [
-        "shared",
-        "device",
-        "host",
-    ],
-)
-def test_ones_like(dt, usm_kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.ones_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
-
-    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.ones_like(X)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
-
-
-@pytest.mark.parametrize(
-    "dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "usm_kind",
-    [
-        "shared",
-        "device",
-        "host",
-    ],
-)
-def test_full_like(dt, usm_kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    fill_v = dpt.dtype(dt).type(1)
-    X = dpt.empty((4, 5), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.full_like(X, fill_v)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.allclose(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
-
-    X = dpt.empty(tuple(), dtype=dt, usm_type=usm_kind, sycl_queue=q)
-    Y = dpt.full_like(X, fill_v)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    assert X.usm_type == Y.usm_type
-    assert X.sycl_queue == Y.sycl_queue
-    assert np.array_equal(dpt.asnumpy(Y), np.ones(X.shape, dtype=X.dtype))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes)
-@pytest.mark.parametrize("usm_kind", ["shared", "device", "host"])
-def test_eye(dtype, usm_kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    X = dpt.eye(4, 5, k=1, dtype=dtype, usm_type=usm_kind, sycl_queue=q)
-    Xnp = np.eye(4, 5, k=1, dtype=dtype)
-    assert X.dtype == Xnp.dtype
-    assert np.array_equal(Xnp, dpt.asnumpy(X))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_tril(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    shape = (2, 3, 4, 5, 5)
-    X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape)
-    Y = dpt.tril(X)
-    Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape)
-    Ynp = np.tril(Xnp)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("dtype", _all_dtypes[1:])
-def test_triu(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    shape = (4, 5)
-    X = dpt.reshape(dpt.arange(prod(shape), dtype=dtype, sycl_queue=q), shape)
-    Y = dpt.triu(X, k=1)
-    Xnp = np.arange(prod(shape), dtype=dtype).reshape(shape)
-    Ynp = np.triu(Xnp, k=1)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("tri_fn", [dpt.tril, dpt.triu])
-@pytest.mark.parametrize("usm_type", ["device", "shared", "host"])
-def test_tri_usm_type(tri_fn, usm_type):
-    q = get_queue_or_skip()
-    dtype = dpt.uint16
-
-    shape = (2, 3, 4, 5, 5)
-    size = prod(shape)
-    X = dpt.reshape(
-        dpt.arange(size, dtype=dtype, usm_type=usm_type, sycl_queue=q), shape
-    )
-    Y = tri_fn(X)  # main execution branch
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == q
-    Y = tri_fn(X, k=-6)  # special case of Y == X
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == q
-    Y = tri_fn(X, k=6)  # special case of Y == 0
-    assert Y.usm_type == X.usm_type
-    assert Y.sycl_queue == q
-
-
-def test_tril_slice():
-    q = get_queue_or_skip()
-
-    shape = (6, 10)
-    X = dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape)[
-        1:, ::-2
-    ]
-    Y = dpt.tril(X)
-    Xnp = np.arange(prod(shape), dtype="int").reshape(shape)[1:, ::-2]
-    Ynp = np.tril(Xnp)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_triu_permute_dims():
-    q = get_queue_or_skip()
-
-    shape = (2, 3, 4, 5)
-    X = dpt.permute_dims(
-        dpt.reshape(dpt.arange(prod(shape), dtype="int", sycl_queue=q), shape),
-        (3, 2, 1, 0),
-    )
-    Y = dpt.triu(X)
-    Xnp = np.transpose(
-        np.arange(prod(shape), dtype="int").reshape(shape), (3, 2, 1, 0)
-    )
-    Ynp = np.triu(Xnp)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_tril_broadcast_to():
-    q = get_queue_or_skip()
-
-    shape = (5, 5)
-    X = dpt.broadcast_to(dpt.ones((1), dtype="int", sycl_queue=q), shape)
-    Y = dpt.tril(X)
-    Xnp = np.broadcast_to(np.ones((1), dtype="int"), shape)
-    Ynp = np.tril(Xnp)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_triu_bool():
-    q = get_queue_or_skip()
-
-    shape = (4, 5)
-    X = dpt.ones((shape), dtype="bool", sycl_queue=q)
-    Y = dpt.triu(X)
-    Xnp = np.ones((shape), dtype="bool")
-    Ynp = np.triu(Xnp)
-    assert Y.dtype == Ynp.dtype
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("order", ["F", "C"])
-@pytest.mark.parametrize("k", [-10, -2, -1, 3, 4, 10])
-def test_triu_order_k(order, k):
-    q = get_queue_or_skip()
-
-    shape = (3, 3)
-    X = dpt.reshape(
-        dpt.arange(prod(shape), dtype="int", sycl_queue=q),
-        shape,
-        order=order,
-    )
-    Y = dpt.triu(X, k=k)
-    Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order)
-    Ynp = np.triu(Xnp, k=k)
-    assert Y.dtype == Ynp.dtype
-    assert X.flags == Y.flags
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("order", ["F", "C"])
-@pytest.mark.parametrize("k", [-10, -4, -3, 1, 2, 10])
-def test_tril_order_k(order, k):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-    shape = (3, 3)
-    X = dpt.reshape(
-        dpt.arange(prod(shape), dtype="int", sycl_queue=q),
-        shape,
-        order=order,
-    )
-    Y = dpt.tril(X, k=k)
-    Xnp = np.arange(prod(shape), dtype="int").reshape(shape, order=order)
-    Ynp = np.tril(Xnp, k=k)
-    assert Y.dtype == Ynp.dtype
-    assert X.flags == Y.flags
-    assert np.array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_meshgrid():
-    q = get_queue_or_skip()
-
-    X = dpt.arange(5, sycl_queue=q)
-    Y = dpt.arange(3, sycl_queue=q)
-    Z = dpt.meshgrid(X, Y)
-    Znp = np.meshgrid(dpt.asnumpy(X), dpt.asnumpy(Y))
-    n = len(Z)
-    assert n == len(Znp)
-    for i in range(n):
-        assert np.array_equal(dpt.asnumpy(Z[i]), Znp[i])
-    assert dpt.meshgrid() == []
-    # dimension > 1 must raise ValueError
-    with pytest.raises(ValueError):
-        dpt.meshgrid(dpt.usm_ndarray((4, 4)))
-    # unknown indexing kwarg must raise ValueError
-    with pytest.raises(ValueError):
-        dpt.meshgrid(X, indexing="ji")
-    # input arrays with different data types must raise ValueError
-    with pytest.raises(ValueError):
-        dpt.meshgrid(X, dpt.asarray(Y, dtype="b1"))
-
-
-def test_meshgrid2():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-    q3 = get_queue_or_skip()
-
-    x1 = dpt.arange(0, 2, dtype="int16", sycl_queue=q1)
-    x2 = dpt.arange(3, 6, dtype="int16", sycl_queue=q2)
-    x3 = dpt.arange(6, 10, dtype="int16", sycl_queue=q3)
-    y1, y2, y3 = dpt.meshgrid(x1, x2, x3, indexing="xy")
-    z1, z2, z3 = dpt.meshgrid(x1, x2, x3, indexing="ij")
-    assert all(
-        x.sycl_queue == y.sycl_queue for x, y in zip((x1, x2, x3), (y1, y2, y3))
-    )
-    assert all(
-        x.sycl_queue == z.sycl_queue for x, z in zip((x1, x2, x3), (z1, z2, z3))
-    )
-    assert y1.shape == y2.shape and y2.shape == y3.shape
-    assert z1.shape == z2.shape and z2.shape == z3.shape
-    assert y1.shape == (len(x2), len(x1), len(x3))
-    assert z1.shape == (len(x1), len(x2), len(x3))
-
-
-def test_common_arg_validation():
-    order = "I"
-    # invalid order must raise ValueError
-    with pytest.raises(ValueError):
-        dpt.empty(10, order=order)
-    with pytest.raises(ValueError):
-        dpt.zeros(10, order=order)
-    with pytest.raises(ValueError):
-        dpt.ones(10, order=order)
-    with pytest.raises(ValueError):
-        dpt.full(10, 1, order=order)
-    with pytest.raises(ValueError):
-        dpt.eye(10, order=order)
-    try:
-        X = dpt.empty(10)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(ValueError):
-        dpt.empty_like(X, order=order)
-    with pytest.raises(ValueError):
-        dpt.zeros_like(X, order=order)
-    with pytest.raises(ValueError):
-        dpt.ones_like(X, order=order)
-    with pytest.raises(ValueError):
-        dpt.full_like(X, 1, order=order)
-    X = dict()
-    # test for type validation
-    with pytest.raises(TypeError):
-        dpt.empty_like(X)
-    with pytest.raises(TypeError):
-        dpt.zeros_like(X)
-    with pytest.raises(TypeError):
-        dpt.ones_like(X)
-    with pytest.raises(TypeError):
-        dpt.full_like(X, 1)
-    with pytest.raises(TypeError):
-        dpt.tril(X)
-    with pytest.raises(TypeError):
-        dpt.triu(X)
-    with pytest.raises(TypeError):
-        dpt.meshgrid(X)
-
-
-def test_flags():
-    try:
-        x = dpt.empty(tuple(), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    f = x.flags
-    # check comparison with generic types
-    assert f != Ellipsis
-    f.__repr__()
-    assert f.c_contiguous == f["C"]
-    assert f.f_contiguous == f["F"]
-    assert f.contiguous == f["CONTIGUOUS"]
-    assert f.fc == f["FC"]
-    assert f.forc == f["FORC"]
-    assert f.fnc == f["FNC"]
-    assert f.writable == f["W"]
-
-
-def test_asarray_uint64():
-    Xnp = np.ndarray(1, dtype=np.uint64)
-    try:
-        X = dpt.asarray(Xnp)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert X.dtype == Xnp.dtype
-
-
-def test_Device():
-    try:
-        dev = dpctl.select_default_device()
-        d1 = dpt.Device.create_device(dev)
-        d2 = dpt.Device.create_device(dev)
-    except (dpctl.SyclQueueCreationError, dpctl.SyclDeviceCreationError):
-        pytest.skip(
-            "Could not create default device, or a queue that targets it"
-        )
-    assert d1 == d2
-    dict = {d1: 1}
-    assert dict[d2] == 1
-    assert d1 == d2.sycl_queue
-    assert not d1 == Ellipsis
-
-
-def test_element_offset():
-    n0, n1 = 3, 8
-    try:
-        x = dpt.empty((n0, n1), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert isinstance(x._element_offset, int)
-    assert x._element_offset == 0
-    y = x[::-1, ::2]
-    assert y._element_offset == (n0 - 1) * n1
-
-
-def test_byte_bounds():
-    n0, n1 = 3, 8
-    try:
-        x = dpt.empty((n0, n1), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert isinstance(x._byte_bounds, tuple)
-    assert len(x._byte_bounds) == 2
-    lo, hi = x._byte_bounds
-    assert hi - lo == n0 * n1 * x.itemsize
-    y = x[::-1, ::2]
-    lo, hi = y._byte_bounds
-    assert hi - lo == (n0 * n1 - 1) * x.itemsize
-
-
-def test_gh_1201():
-    n = 100
-    a = np.flipud(np.arange(n, dtype="i4"))
-    try:
-        b = dpt.asarray(a)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    assert (dpt.asnumpy(b) == a).all()
-    c = dpt.flip(dpt.empty(a.shape, dtype=a.dtype))
-    c[:] = a
-    assert (dpt.asnumpy(c) == a).all()
-
-
-class ObjWithSyclUsmArrayInterface:
-    def __init__(self, ary):
-        self._array_obj = ary
-
-    @property
-    def __sycl_usm_array_interface__(self):
-        _suai = self._array_obj.__sycl_usm_array_interface__
-        return _suai
-
-
-@pytest.mark.parametrize("ro_flag", [True, False])
-def test_asarray_writable_flag(ro_flag):
-    try:
-        a = dpt.empty(8)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-
-    a.flags["W"] = not ro_flag
-    wrapped = ObjWithSyclUsmArrayInterface(a)
-
-    b = dpt.asarray(wrapped)
-
-    assert b.flags["W"] == (not ro_flag)
-    assert b._pointer == a._pointer
-
-
-def test_getitem_validation():
-    """Test based on gh-1785"""
-    try:
-        a = dpt.empty((2, 2, 2))
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(IndexError):
-        a[0.0]
-    with pytest.raises(IndexError):
-        a[1, 0.0, ...]
-    with pytest.raises(IndexError):
-        a[1, 0.0, dpt.newaxis, 1]
-    with pytest.raises(IndexError):
-        a[dpt.newaxis, ..., 0.0]
-    with pytest.raises(IndexError):
-        a[dpt.newaxis, ..., 0.0, dpt.newaxis]
-    with pytest.raises(IndexError):
-        a[..., 0.0, dpt.newaxis]
-    with pytest.raises(IndexError):
-        a[:, 0.0, dpt.newaxis]
-
-
-def test_array_like_ctors_order_K():
-    get_queue_or_skip()
-
-    sh = (10, 10)
-    x1 = dpt.zeros(sh, dtype="i4", order="C")
-    r1 = dpt.full_like(x1, 2, order="K")
-    assert dpt.all(r1 == 2)
-    assert r1.flags.c_contiguous
-    r2 = dpt.empty_like(x1, order="K")
-    assert r2.flags.c_contiguous
-    r3 = dpt.ones_like(x1, order="K")
-    assert dpt.all(r3 == 1)
-    assert r3.flags.c_contiguous
-    r4 = dpt.zeros_like(x1, order="K")
-    assert dpt.all(r4 == 0)
-    assert r4.flags.c_contiguous
-
-    x2 = dpt.zeros(sh, dtype="i4", order="F")
-    r5 = dpt.full_like(x2, 2, order="K")
-    assert dpt.all(r5 == 2)
-    assert r5.flags.f_contiguous
-    r6 = dpt.empty_like(x2, order="K")
-    assert r6.flags.f_contiguous
-    r7 = dpt.ones_like(x2, order="K")
-    assert dpt.all(r7 == 1)
-    assert r7.flags.f_contiguous
-    r8 = dpt.zeros_like(x2, order="K")
-    assert dpt.all(r8 == 0)
-    assert r8.flags.f_contiguous
-
-    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
-    st_expected = (-5, 1)
-    r9 = dpt.full_like(x3, 2, order="K")
-    assert dpt.all(r1 == 2)
-    assert r9.strides == st_expected
-    assert not r9.flags.forc
-    r10 = dpt.empty_like(x3, order="K")
-    assert not r10.flags.forc
-    assert r10.strides == st_expected
-    r11 = dpt.ones_like(x3, order="K")
-    assert dpt.all(r11 == 1)
-    assert not r11.flags.forc
-    assert r11.strides == st_expected
-    r12 = dpt.zeros_like(x3, order="K")
-    assert dpt.all(r12 == 0)
-    assert not r12.flags.forc
-    assert r12.strides == st_expected
-
-
-def test_array_like_ctors_order_A():
-    get_queue_or_skip()
-
-    sh = (10, 10)
-    x1 = dpt.zeros(sh, dtype="i4", order="C")
-    r1 = dpt.full_like(x1, 2, order="A")
-    assert dpt.all(r1 == 2)
-    assert r1.flags.c_contiguous
-    r2 = dpt.empty_like(x1, order="A")
-    assert r2.flags.c_contiguous
-    r3 = dpt.ones_like(x1, order="A")
-    assert dpt.all(r3 == 1)
-    assert r3.flags.c_contiguous
-    r4 = dpt.zeros_like(x1, order="A")
-    assert dpt.all(r4 == 0)
-    assert r4.flags.c_contiguous
-
-    x2 = dpt.zeros(sh, dtype="i4", order="F")
-    r5 = dpt.full_like(x2, 2, order="A")
-    assert dpt.all(r5 == 2)
-    assert r5.flags.f_contiguous
-    r6 = dpt.empty_like(x2, order="A")
-    assert r6.flags.f_contiguous
-    r7 = dpt.ones_like(x2, order="A")
-    assert dpt.all(r7 == 1)
-    assert r7.flags.f_contiguous
-    r8 = dpt.zeros_like(x2, order="A")
-    assert dpt.all(r8 == 0)
-    assert r8.flags.f_contiguous
-
-    x3 = dpt.zeros(sh, dtype="i4", order="C")[::-2, :5]
-    r9 = dpt.full_like(x3, 2, order="A")
-    assert dpt.all(r1 == 2)
-    assert r9.flags.c_contiguous
-    r10 = dpt.empty_like(x3, order="A")
-    assert r10.flags.c_contiguous
-    r11 = dpt.ones_like(x3, order="A")
-    assert dpt.all(r11 == 1)
-    assert r11.flags.c_contiguous
-    r12 = dpt.zeros_like(x3, order="A")
-    assert dpt.all(r12 == 0)
-    assert r12.flags.c_contiguous
-
-
-def test_full_like_order_K_array_fill_v():
-    get_queue_or_skip()
-
-    x = dpt.zeros((10, 10), dtype="i4")
-    fill_v = dpt.asarray(2, dtype="i4")
-
-    r1 = dpt.full_like(x, fill_v, order="K")
-    assert dpt.all(r1 == 2)
-
-    # broadcast behavior
-    fill_v = dpt.arange(10, dtype="i4")[:, dpt.newaxis]
-    r1 = dpt.full_like(x, fill_v, order="K")
-    assert dpt.all(r1 == dpt.tile(fill_v, (1, 10)))
-
-
-def test_full_like_order_K_same_input_output_queues():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.zeros((10, 10), dtype="i4", sycl_queue=q1)
-    fill_v = dpt.asarray(2, dtype="i4", sycl_queue=q2)
-
-    r = dpt.full_like(x, fill_v, order="K")
-    assert r.sycl_queue == x.sycl_queue
-
-
-def test_asarray_from_numpy_contig():
-    get_queue_or_skip()
-
-    i_dt = np.int64
-    Xnp = np.arange(32, dtype=i_dt)
-
-    fp_dt = dpt.float32
-    # Use contig copy kernel
-    Xdpt = dpt.asarray(Xnp, dtype=fp_dt)
-
-    assert dpt.all(Xdpt == dpt.arange(32, dtype=fp_dt))
-
-
-def test_setitem_from_numpy_contig():
-    get_queue_or_skip()
-
-    i_dt = np.int64
-    fp_dt = dpt.float32
-
-    Xnp = np.flip(np.arange(32, dtype=i_dt))
-    Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt))
-    # Use contig copy kernel, after stride simplification
-    Xdpt[:] = Xnp
-
-    expected = dpt.arange(31, stop=-1, step=-1, dtype=fp_dt)
-    assert dpt.all(Xdpt == expected)
-
-    Xnp = np.fliplr(np.reshape(np.arange(-10, 10, dtype=i_dt), (4, 5)))
-    Xdpt = dpt.flip(dpt.empty(Xnp.shape, dtype=fp_dt), axis=-1)
-
-    # after stride simplification, contig kernel is used
-    Xdpt[:] = Xnp
-
-    expected = dpt.reshape(dpt.arange(-10, 10, dtype=fp_dt), (4, 5))
-    assert dpt.all(dpt.flip(Xdpt, axis=-1) == expected)
-
-
-def test_full_functions_raise_type_error():
-    get_queue_or_skip()
-
-    with pytest.raises(TypeError):
-        dpt.full(1, "0")
-
-    x = dpt.ones(1, dtype="i4")
-    with pytest.raises(TypeError):
-        dpt.full_like(x, "0")
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_setitem_copy_as_contig_alignment(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    dtype_ = dpt.dtype(dt)
-    n0, n1 = 8, 23
-
-    x = dpt.zeros((n0, n1), dtype=dtype_, sycl_queue=q)
-
-    vals = dpt.ones(n1, dtype=dtype_, sycl_queue=q)[dpt.newaxis, :]
-    x[1:, ...] = vals
-    assert dpt.all(x[0] == 0)
-    assert dpt.all(x[1:, :] == vals)
-
-
-def test_asarray_property():
-    get_queue_or_skip()
-
-    x = dpt.ones(11, dtype="i4")
-
-    with pytest.raises(TypeError):
-        np.asarray(x)
diff --git a/dpctl/tests/test_usm_ndarray_dlpack.py b/dpctl/tests/test_usm_ndarray_dlpack.py
deleted file mode 100644
index 89539762a8..0000000000
--- a/dpctl/tests/test_usm_ndarray_dlpack.py
+++ /dev/null
@@ -1,902 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import ctypes
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._dlpack as _dlp
-import dpctl.tensor._usmarray as dpt_arr
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-device_CPU = dpt_arr.DLDeviceType.kDLCPU
-device_oneAPI = dpt_arr.DLDeviceType.kDLOneAPI
-
-_usm_types_list = ["shared", "device", "host"]
-
-
-@pytest.fixture(params=_usm_types_list)
-def usm_type(request):
-    return request.param
-
-
-_typestrs_list = [
-    "b1",
-    "u1",
-    "i1",
-    "u2",
-    "i2",
-    "u4",
-    "i4",
-    "u8",
-    "i8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.fixture(params=_typestrs_list)
-def typestr(request):
-    return request.param
-
-
-@pytest.fixture
-def all_root_devices():
-    """
-    Caches root devices. For the sake of speed
-    of test suite execution, keep at most two
-    devices from each platform
-    """
-    devs = dpctl.get_devices()
-    devs_per_platform = collections.defaultdict(list)
-    for dev in devs:
-        devs_per_platform[dev.sycl_platform].append(dev)
-
-    pruned = map(lambda li: li[:2], devs_per_platform.values())
-    return sum(pruned, start=[])
-
-
-def test_dlpack_device(usm_type, all_root_devices):
-    for sycl_dev in all_root_devices:
-        X = dpt.empty((64,), dtype="u1", usm_type=usm_type, device=sycl_dev)
-        dev = X.__dlpack_device__()
-        assert type(dev) is tuple
-        assert len(dev) == 2
-        assert dev[0] == device_oneAPI
-        assert dev[1] == sycl_dev.get_device_id()
-
-
-def test_dlpack_exporter(typestr, usm_type, all_root_devices):
-    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
-    caps_fn.restype = bool
-    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
-    for sycl_dev in all_root_devices:
-        skip_if_dtype_not_supported(typestr, sycl_dev)
-        X = dpt.empty((64,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
-        caps = X.__dlpack__()
-        assert caps_fn(caps, b"dltensor")
-        Y = X[::2]
-        caps2 = Y.__dlpack__()
-        assert caps_fn(caps2, b"dltensor")
-
-
-def test_dlpack_exporter_empty(typestr, usm_type):
-    caps_fn = ctypes.pythonapi.PyCapsule_IsValid
-    caps_fn.restype = bool
-    caps_fn.argtypes = [ctypes.py_object, ctypes.c_char_p]
-    try:
-        sycl_dev = dpctl.select_default_device()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    skip_if_dtype_not_supported(typestr, sycl_dev)
-    X = dpt.empty((0,), dtype=typestr, usm_type=usm_type, device=sycl_dev)
-    caps = X.__dlpack__()
-    assert caps_fn(caps, b"dltensor")
-    Y = dpt.empty(
-        (
-            1,
-            0,
-        ),
-        dtype=typestr,
-        usm_type=usm_type,
-        device=sycl_dev,
-    )
-    caps = Y.__dlpack__()
-    assert caps_fn(caps, b"dltensor")
-
-
-def test_dlpack_exporter_stream():
-    try:
-        q1 = dpctl.SyclQueue()
-        q2 = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Could not create default queues")
-    X = dpt.empty((64,), dtype="u1", sycl_queue=q1)
-    cap1 = X.__dlpack__(stream=q1)
-    cap2 = X.__dlpack__(stream=q2)
-    assert type(cap1) is type(cap2)
-
-
-@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
-def test_from_dlpack(shape, typestr, usm_type, all_root_devices):
-    for sycl_dev in all_root_devices:
-        skip_if_dtype_not_supported(typestr, sycl_dev)
-        X = dpt.empty(shape, dtype=typestr, usm_type=usm_type, device=sycl_dev)
-        Y = dpt.from_dlpack(X)
-        assert X.shape == Y.shape
-        assert X.dtype == Y.dtype
-        assert X.usm_type == Y.usm_type
-        assert X._pointer == Y._pointer
-        # we can only expect device to round-trip for USM-device and
-        # USM-shared allocations, which are made for specific device
-        assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device)
-        if Y.ndim:
-            V = Y[::-1]
-            W = dpt.from_dlpack(V)
-            assert V.strides == W.strides
-
-
-@pytest.mark.parametrize("mod", [2, 5])
-def test_from_dlpack_strides(mod, typestr, usm_type, all_root_devices):
-    for sycl_dev in all_root_devices:
-        skip_if_dtype_not_supported(typestr, sycl_dev)
-        X0 = dpt.empty(
-            3 * mod, dtype=typestr, usm_type=usm_type, device=sycl_dev
-        )
-        for start in range(mod):
-            X = X0[slice(-start - 1, None, -mod)]
-            Y = dpt.from_dlpack(X)
-            assert X.shape == Y.shape
-            assert X.dtype == Y.dtype
-            assert X.usm_type == Y.usm_type
-            assert X._pointer == Y._pointer
-            # we can only expect device to round-trip for USM-device and
-            # USM-shared allocations, which are made for specific device
-            assert (Y.usm_type == "host") or (X.sycl_device == Y.sycl_device)
-            if Y.ndim:
-                V = Y[::-1]
-                W = dpt.from_dlpack(V)
-                assert V.strides == W.strides
-
-
-def test_from_dlpack_input_validation():
-    v = dpt._dlpack.get_build_dlpack_version()
-    assert type(v) is tuple
-    with pytest.raises(TypeError):
-        dpt.from_dlpack(None)
-
-    class DummyWithProperty:
-        @property
-        def __dlpack__(self):
-            return None
-
-    with pytest.raises(TypeError):
-        dpt.from_dlpack(DummyWithProperty())
-
-    class DummyWithMethod:
-        def __dlpack__(self):
-            return None
-
-    with pytest.raises(TypeError):
-        dpt.from_dlpack(DummyWithMethod())
-
-
-def test_from_dlpack_fortran_contig_array_roundtripping():
-    """Based on examples from issue gh-1241"""
-    n0, n1 = 3, 5
-    try:
-        ar1d = dpt.arange(n0 * n1, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    ar2d_c = dpt.reshape(ar1d, (n0, n1), order="C")
-    ar2d_f = dpt.asarray(ar2d_c, order="F")
-    ar2d_r = dpt.from_dlpack(ar2d_f)
-
-    assert dpt.all(dpt.equal(ar2d_f, ar2d_r))
-    assert dpt.all(dpt.equal(ar2d_c, ar2d_r))
-
-
-def test_dlpack_from_subdevice():
-    """
-    This test checks that array allocated on a sub-device,
-    with memory bound to platform-default SyclContext can be
-    exported and imported via DLPack.
-    """
-    n = 64
-    try:
-        dev = dpctl.SyclDevice()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    try:
-        sdevs = dev.create_sub_devices(partition="next_partitionable")
-    except dpctl.SyclSubDeviceCreationError:
-        sdevs = None
-    try:
-        if sdevs is None:
-            sdevs = dev.create_sub_devices(partition=[1, 1])
-    except dpctl.SyclSubDeviceCreationError:
-        pytest.skip("Default device can not be partitioned")
-    assert isinstance(sdevs, list) and len(sdevs) > 0
-    try:
-        ctx = sdevs[0].sycl_platform.default_context
-    except dpctl.SyclContextCreationError:
-        pytest.skip("Platform's default_context is not available")
-    try:
-        q = dpctl.SyclQueue(ctx, sdevs[0])
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-
-    ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q)
-    ar2 = dpt.from_dlpack(ar)
-    assert ar2.sycl_device == sdevs[0]
-
-
-def test_legacy_dlpack_capsule():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    legacy_ver = (0, 8)
-
-    cap = x.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x._pointer == y._pointer
-
-    x = dpt.arange(100, dtype="u4")
-    x2 = dpt.reshape(x, (10, 10)).mT
-    cap = x2.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x2._pointer == y._pointer
-    del x2
-
-    x = dpt.arange(100, dtype="f4")
-    x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F")
-    cap = x2.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x2._pointer == y._pointer
-
-    x = dpt.arange(100, dtype="c8")
-    x3 = x[::-2]
-    cap = x3.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x3._pointer == y._pointer
-    del x3, y, x
-    del cap
-
-    x = dpt.ones(100, dtype="?")
-    x4 = x[::-2]
-    cap = x4.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x4._pointer == y._pointer
-    del x4, y, x
-    del cap
-
-
-def test_versioned_dlpack_capsule():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x._pointer == y._pointer
-
-    x2 = dpt.asarray(dpt.reshape(x, (10, 10)), order="F")
-    cap = x2.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x2._pointer == y._pointer
-    del x2
-
-    x3 = x[::-2]
-    cap = x3.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x3._pointer == y._pointer
-    del x3, y, x
-    del cap
-
-    # read-only array
-    x = dpt.arange(100, dtype="i4")
-    x.flags["W"] = False
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x._pointer == y._pointer
-    assert not y.flags.writable
-
-    # read-only array, and copy
-    cap = x.__dlpack__(max_version=max_supported_ver, copy=True)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x._pointer != y._pointer
-    assert not y.flags.writable
-
-
-def test_from_dlpack_kwargs():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    y = dpt.from_dlpack(x, copy=True)
-    assert x._pointer != y._pointer
-
-    z = dpt.from_dlpack(x, device=x.sycl_device)
-    assert z._pointer == x._pointer
-
-
-def test_dlpack_deleters():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    legacy_ver = (0, 8)
-    cap = x.__dlpack__(max_version=legacy_ver)
-    del cap
-
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    del cap
-
-
-def test_from_dlpack_device():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    out = dpt.from_dlpack(x, device=x.__dlpack_device__())
-    assert x.device == out.device
-    assert x._pointer == out._pointer
-
-    out = dpt.from_dlpack(x, device=x.device)
-    assert x.device == out.device
-    assert x._pointer == out._pointer
-
-    out = dpt.from_dlpack(x, device=x.sycl_device)
-    assert x.device == out.device
-    assert x._pointer == out._pointer
-
-
-def test_used_dlpack_capsule():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    legacy_ver = (0, 8)
-    cap = x.__dlpack__(max_version=legacy_ver)
-    _dlp.from_dlpack_capsule(cap)
-    with pytest.raises(
-        ValueError,
-        match="A DLPack tensor object can not be consumed multiple times",
-    ):
-        _dlp.from_dlpack_capsule(cap)
-    del cap
-
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    _dlp.from_dlpack_capsule(cap)
-    with pytest.raises(
-        ValueError,
-        match="A DLPack tensor object can not be consumed multiple times",
-    ):
-        _dlp.from_dlpack_capsule(cap)
-    del cap
-
-
-def test_dlpack_size_0():
-    try:
-        x = dpt.ones(0, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    legacy_ver = (0, 8)
-    cap = x.__dlpack__(max_version=legacy_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y._pointer == x._pointer
-
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y._pointer == x._pointer
-
-
-def test_dlpack_max_version_validation():
-    try:
-        x = dpt.ones(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    with pytest.raises(
-        TypeError,
-        match=r"`__dlpack__` expects `max_version` to be a "
-        r"2-tuple of integers `\(major, minor\)`, instead "
-        r"got .*",
-    ):
-        x.__dlpack__(max_version=1)
-
-
-def test_dlpack_kwargs():
-    try:
-        q1 = dpctl.SyclQueue()
-        q2 = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Could not create default queues")
-    x = dpt.arange(100, dtype="i4", sycl_queue=q1)
-
-    legacy_ver = (0, 8)
-    cap = x.__dlpack__(stream=q2, max_version=legacy_ver, copy=True)
-    # `copy` ignored for legacy path
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y._pointer == x._pointer
-    del x, y
-    del cap
-
-    x1 = dpt.arange(100, dtype="i4", sycl_queue=q1)
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x1.__dlpack__(stream=q2, max_version=max_supported_ver, copy=False)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y._pointer == x1._pointer
-    del x1, y
-    del cap
-
-    x2 = dpt.arange(100, dtype="i4", sycl_queue=q1)
-    cap = x2.__dlpack__(stream=q2, max_version=max_supported_ver, copy=True)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y._pointer != x2._pointer
-    del x2, y
-    del cap
-
-
-def _is_capsule(o):
-    t = type(o)
-    return t.__module__ == "builtins" and t.__name__ == "PyCapsule"
-
-
-def test_dlpack_dl_device():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap1 = x.__dlpack__(
-        dl_device=x.__dlpack_device__(), max_version=max_supported_ver
-    )
-    assert _is_capsule(cap1)
-    cap2 = x.__dlpack__(dl_device=(1, 0), max_version=max_supported_ver)
-    assert _is_capsule(cap2)
-    cap3 = x.__dlpack__(
-        dl_device=(device_CPU, 0),
-        max_version=max_supported_ver,
-    )
-    assert _is_capsule(cap3)
-    cap4 = x.__dlpack__(dl_device=("kDLCPU", 0), max_version=max_supported_ver)
-    assert _is_capsule(cap4)
-    with pytest.raises(TypeError):
-        # pass method instead of return of its __call__ invocation
-        x.__dlpack__(
-            dl_device=x.__dlpack_device__, max_version=max_supported_ver
-        )
-    with pytest.raises(TypeError):
-        # exercise check for length
-        x.__dlpack__(dl_device=(3,), max_version=max_supported_ver)
-
-
-def test_from_dlpack_kdlcpu_interop_numpy():
-    """
-    Basic test that usm_ndarray can interoperate with NumPy ndarray
-    `__dlpack_device__`.
-    """
-    get_queue_or_skip()
-
-    sh = 5
-    dt = dpt.int32
-
-    X = dpt.empty(sh, dtype=dt)
-    dl_device_np = np.empty(()).__dlpack_device__()
-
-    Y = dpt.from_dlpack(X, device=dl_device_np)
-    assert isinstance(Y, np.ndarray)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-
-    V = dpt.from_dlpack(Y)
-    assert isinstance(V, np.ndarray)
-    assert Y.shape == V.shape
-    assert Y.dtype == V.dtype
-
-
-@pytest.mark.parametrize("shape", [tuple(), (2,), (3, 0, 1), (2, 2, 2)])
-def test_from_dlpack_to_kdlcpu(shape, typestr):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(typestr, q.sycl_device)
-
-    X = dpt.empty(shape, dtype=typestr, sycl_queue=q)
-    Y = dpt.from_dlpack(X, device=(device_CPU, 0))
-    assert isinstance(Y, np.ndarray)
-    assert X.shape == Y.shape
-    assert X.dtype == Y.dtype
-    # NumPy does not treat size 0 arrays consistently
-    # w.r.t. strides, so skip these cases
-    if X.ndim and X.size != 0:
-        V = Y[::-1]
-        W = dpt.from_dlpack(V)
-        assert V.strides == W.strides
-
-
-@pytest.mark.parametrize("mod", [2, 5])
-def test_from_dlpack_to_kdlcpu_strides(mod, typestr):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(typestr, q.sycl_device)
-
-    X0 = dpt.empty(3 * mod, dtype=typestr, sycl_queue=q)
-    for start in range(mod):
-        X = X0[slice(-start - 1, None, -mod)]
-        Y = dpt.from_dlpack(X, device=(device_CPU, 0))
-        assert X.shape == Y.shape
-        assert X.dtype == Y.dtype
-        if Y.ndim:
-            V = Y[::-1]
-            W = dpt.from_dlpack(V)
-            assert V.strides == W.strides
-
-
-def test_dlpack_from_subdevice_to_kdlcpu():
-    """
-    Check that array allocated on a sub-device can be
-    imported via DLPack to kDLCPU device (as a NumPy array).
-    """
-    n = 64
-    try:
-        dev = dpctl.SyclDevice()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-    try:
-        sdevs = dev.create_sub_devices(partition="next_partitionable")
-    except dpctl.SyclSubDeviceCreationError:
-        sdevs = None
-    try:
-        if sdevs is None:
-            sdevs = dev.create_sub_devices(partition=[1, 1])
-    except dpctl.SyclSubDeviceCreationError:
-        pytest.skip("Default device can not be partitioned")
-    assert isinstance(sdevs, list) and len(sdevs) > 0
-    try:
-        ctx = sdevs[0].sycl_platform.default_context
-    except dpctl.SyclContextCreationError:
-        pytest.skip("Platform's default_context is not available")
-    try:
-        q = dpctl.SyclQueue(ctx, sdevs[0])
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be created")
-
-    ar = dpt.arange(n, dtype=dpt.int32, sycl_queue=q)
-    ar2 = dpt.from_dlpack(ar, dl_device=(device_CPU, 0))
-    assert isinstance(ar2, np.ndarray)
-
-
-def test_legacy_dlpack_capsule_from_numpy():
-    """
-    Check that NumPy's exported legacy DLPack capsule
-    will interoperate with from_dlpack_capsule,
-    especially with zero-copy.
-    """
-    x = np.arange(100, dtype="i4")
-    cap = x.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x.ctypes.data == y.ctypes.data
-
-    x = np.arange(100, dtype="u4").reshape((10, 10)).T
-    cap = x.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x.ctypes.data == y.ctypes.data
-    del x
-
-    x = np.arange(100, dtype="f4").reshape((10, 10), order="F")
-    cap = x.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    del cap
-    assert x.ctypes.data == y.ctypes.data
-
-    x = np.arange(100, dtype="c8")
-    x1 = x[::-2]
-    cap = x1.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x1.ctypes.data == y.ctypes.data
-    del x1, y, x
-    del cap
-
-    x = np.ones(100, dtype="?")
-    x1 = x[::-2]
-    cap = x1.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    assert x1.ctypes.data == y.ctypes.data
-    del x1, y, x
-    del cap
-
-
-def test_dlpack_capsule_readonly_array_to_kdlcpu():
-    try:
-        x = dpt.arange(100, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    # read-only array
-    x.flags["W"] = False
-    cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0))
-    y = _dlp.from_dlpack_capsule(cap)
-    assert dpt.all(x == dpt.asarray(y))
-    assert not y.flags["W"]
-
-    cap1 = _dlp.numpy_to_dlpack_versioned_capsule(y, not y.flags["W"])
-    y1 = _dlp.from_dlpack_capsule(cap1)
-    assert not y1.flags["W"]
-
-
-def test_to_dlpack_capsule_c_and_f_contig():
-    try:
-        x = dpt.asarray(np.random.rand(2, 3))
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    cap = _dlp.to_dlpack_capsule(x)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y))
-    assert x.strides == y.strides
-
-    x_f = x.T
-    cap = _dlp.to_dlpack_capsule(x_f)
-    yf = _dlp.from_dlpack_capsule(cap)
-    assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf))
-    assert x_f.strides == yf.strides
-    del cap
-
-
-def test_to_dlpack_versioned_capsule_c_and_f_contig():
-    try:
-        x = dpt.asarray(np.random.rand(2, 3))
-        max_supported_ver = _dlp.get_build_dlpack_version()
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No default device available")
-
-    cap = x.__dlpack__(max_version=max_supported_ver)
-    y = _dlp.from_dlpack_capsule(cap)
-    assert np.allclose(dpt.asnumpy(x), dpt.asnumpy(y))
-    assert x.strides == y.strides
-
-    x_f = x.T
-    cap = x_f.__dlpack__(max_version=max_supported_ver)
-    yf = _dlp.from_dlpack_capsule(cap)
-    assert np.allclose(dpt.asnumpy(x_f), dpt.asnumpy(yf))
-    assert x_f.strides == yf.strides
-    del cap
-
-
-def test_used_dlpack_capsule_from_numpy():
-    get_queue_or_skip()
-
-    x_np = np.arange(100, dtype="i4")
-
-    cap = x_np.__dlpack__()
-    _dlp.from_dlpack_capsule(cap)
-    with pytest.raises(
-        ValueError,
-        match="A DLPack tensor object can not be consumed multiple times",
-    ):
-        _dlp.from_dlpack_capsule(cap)
-    del cap
-
-    x = dpt.asarray(x_np)
-    max_supported_ver = _dlp.get_build_dlpack_version()
-    cap = x.__dlpack__(max_version=max_supported_ver, dl_device=(device_CPU, 0))
-    _dlp.from_dlpack_capsule(cap)
-    with pytest.raises(
-        ValueError,
-        match="A DLPack tensor object can not be consumed multiple times",
-    ):
-        _dlp.from_dlpack_capsule(cap)
-    del cap
-
-
-def test_dlpack_size_0_on_kdlcpu():
-    get_queue_or_skip()
-    x_np = np.ones(0, dtype="i4")
-
-    cap = x_np.__dlpack__()
-    y = _dlp.from_dlpack_capsule(cap)
-    assert y.ctypes.data == x_np.ctypes.data
-
-
-def test_copy_via_host():
-    get_queue_or_skip()
-    x = dpt.ones(1, dtype="i4")
-    x_np = np.ones(1, dtype="i4")
-    x_dl_dev = x.__dlpack_device__()
-    y = dpt.from_dlpack(x_np, device=x_dl_dev)
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.sycl_device == x.sycl_device
-    assert y.usm_type == "device"
-
-    with pytest.raises(ValueError):
-        # uncorrect length of tuple
-        dpt.from_dlpack(x_np, device=(1, 0, 0))
-    with pytest.raises(ValueError):
-        # only kDLCPU and kDLOneAPI are supported
-        dpt.from_dlpack(x, device=(2, 0))
-
-    num_devs = dpctl.get_num_devices()
-    if num_devs > 1:
-        j = [i for i in range(num_devs) if i != x_dl_dev[1]][0]
-        z = dpt.from_dlpack(x, device=(x_dl_dev[0], j))
-        assert isinstance(z, dpt.usm_ndarray)
-        assert z.usm_type == "device"
-
-
-def test_copy_via_host_gh_1789():
-    "Test based on review example from gh-1789"
-    get_queue_or_skip()
-    x_np = np.ones((10, 10), dtype="i4")
-    # strides are no longer multiple of itemsize
-    x_np.strides = (x_np.strides[0] - 1, x_np.strides[1])
-    with pytest.raises(BufferError):
-        dpt.from_dlpack(x_np)
-    with pytest.raises(BufferError):
-        dpt.from_dlpack(x_np, device=(14, 0))
-
-
-class LegacyContainer:
-    "Helper class implementing legacy `__dlpack__` protocol"
-
-    def __init__(self, array):
-        self._array = array
-
-    def __dlpack__(self, stream=None):
-        return self._array.__dlpack__(stream=stream)
-
-    def __dlpack_device__(self):
-        return self._array.__dlpack_device__()
-
-
-class Container:
-    "Helper class implementing `__dlpack__` protocol version 1.0"
-
-    def __init__(self, array):
-        self._array = array
-
-    def __dlpack__(
-        self, max_version=None, dl_device=None, copy=None, stream=None
-    ):
-        return self._array.__dlpack__(
-            max_version=max_version,
-            dl_device=dl_device,
-            copy=copy,
-            stream=stream,
-        )
-
-    def __dlpack_device__(self):
-        return self._array.__dlpack_device__()
-
-
-def test_generic_container_legacy():
-    get_queue_or_skip()
-    C = LegacyContainer(dpt.linspace(0, 100, num=20, dtype="int16"))
-
-    X = dpt.from_dlpack(C)
-    assert isinstance(X, dpt.usm_ndarray)
-    assert X._pointer == C._array._pointer
-    assert X.sycl_device == C._array.sycl_device
-    assert X.dtype == C._array.dtype
-
-    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
-    assert isinstance(Y, np.ndarray)
-    assert Y.dtype == X.dtype
-
-    Z = dpt.from_dlpack(C, device=X.device)
-    assert isinstance(Z, dpt.usm_ndarray)
-    assert Z._pointer == X._pointer
-    assert Z.device == X.device
-
-
-def test_generic_container_legacy_np():
-    get_queue_or_skip()
-    C = LegacyContainer(np.linspace(0, 100, num=20, dtype="int16"))
-
-    X = dpt.from_dlpack(C)
-    assert isinstance(X, np.ndarray)
-    assert X.ctypes.data == C._array.ctypes.data
-    assert X.dtype == C._array.dtype
-
-    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
-    assert isinstance(Y, np.ndarray)
-    assert Y.dtype == X.dtype
-
-    dev = dpt.Device.create_device()
-    Z = dpt.from_dlpack(C, device=dev)
-    assert isinstance(Z, dpt.usm_ndarray)
-    assert Z.device == dev
-
-
-def test_generic_container():
-    get_queue_or_skip()
-    C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
-
-    X = dpt.from_dlpack(C)
-    assert isinstance(X, dpt.usm_ndarray)
-    assert X._pointer == C._array._pointer
-    assert X.sycl_device == C._array.sycl_device
-    assert X.dtype == C._array.dtype
-
-    Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
-    assert isinstance(Y, np.ndarray)
-    assert Y.dtype == X.dtype
-
-    Z = dpt.from_dlpack(C, device=X.device)
-    assert isinstance(Z, dpt.usm_ndarray)
-    assert Z._pointer == X._pointer
-    assert Z.device == X.device
-
-
-def test_sycl_device_to_dldevice(all_root_devices):
-    for sycl_dev in all_root_devices:
-        dev = dpt.sycl_device_to_dldevice(sycl_dev)
-        assert type(dev) is tuple
-        assert len(dev) == 2
-        assert dev[0] == device_oneAPI
-        assert dev[1] == sycl_dev.get_device_id()
-
-
-def test_dldevice_to_sycl_device(all_root_devices):
-    for sycl_dev in all_root_devices:
-        dldev = dpt.empty(0, device=sycl_dev).__dlpack_device__()
-        dev = dpt.dldevice_to_sycl_device(dldev)
-        assert type(dev) is dpctl.SyclDevice
-        assert dev.get_device_id() == sycl_dev.get_device_id()
-
-
-def test_dldevice_conversion_arg_validation():
-    bad_dldevice_type = (dpt.DLDeviceType.kDLCPU, 0)
-    with pytest.raises(ValueError):
-        dpt.dldevice_to_sycl_device(bad_dldevice_type)
-
-    bad_dldevice_len = bad_dldevice_type + (0,)
-    with pytest.raises(ValueError):
-        dpt.dldevice_to_sycl_device(bad_dldevice_len)
-
-    bad_dldevice = dict()
-    with pytest.raises(TypeError):
-        dpt.dldevice_to_sycl_device(bad_dldevice)
-
-    bad_sycldevice = dict()
-    with pytest.raises(TypeError):
-        dpt.sycl_device_to_dldevice(bad_sycldevice)
diff --git a/dpctl/tests/test_usm_ndarray_indexing.py b/dpctl/tests/test_usm_ndarray_indexing.py
deleted file mode 100644
index 4b2c42a631..0000000000
--- a/dpctl/tests/test_usm_ndarray_indexing.py
+++ /dev/null
@@ -1,2041 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_impl as ti
-from dpctl.tensor._copy_utils import _take_multi_index
-from dpctl.utils import ExecutionPlacementError
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "u1",
-    "i1",
-    "u2",
-    "i2",
-    "u4",
-    "i4",
-    "u8",
-    "i8",
-    "e",
-    "f",
-    "d",
-    "F",
-    "D",
-]
-
-_all_int_dtypes = ["u1", "i1", "u2", "i2", "u4", "i4", "u8", "i8"]
-
-
-def test_basic_slice1():
-    q = get_queue_or_skip()
-    x = dpt.empty(10, dtype="u2", sycl_queue=q)
-    y = x[0]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.ndim == 0
-    assert y.shape == tuple()
-    assert y.strides == tuple()
-
-
-def test_basic_slice2():
-    q = get_queue_or_skip()
-    x = dpt.empty(10, dtype="i2", sycl_queue=q)
-    y = x[(0,)]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.ndim == 0
-    assert y.shape == tuple()
-    assert y.strides == tuple()
-
-
-def test_basic_slice3():
-    q = get_queue_or_skip()
-    x = dpt.empty(10, dtype="i2", sycl_queue=q)
-    y = x[:]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.ndim == x.ndim
-    assert y.shape == x.shape
-    assert y.strides == x.strides
-    y = x[(slice(None, None, None),)]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.ndim == x.ndim
-    assert y.shape == x.shape
-    assert y.strides == x.strides
-
-
-def test_basic_slice4():
-    q = get_queue_or_skip()
-    n0, n1 = 5, 3
-    x = dpt.empty((n0, n1), dtype="f4", sycl_queue=q)
-    y = x[::-1]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == x.shape
-    assert y.strides == (-x.strides[0], x.strides[1])
-    actual_offset = y.__sycl_usm_array_interface__["offset"]
-    assert actual_offset == (n0 - 1) * n1
-
-
-def test_basic_slice5():
-    q = get_queue_or_skip()
-    n0, n1 = 5, 3
-    x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q)
-    y = x[:, ::-1]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == x.shape
-    assert y.strides == (x.strides[0], -x.strides[1])
-    actual_offset = y.__sycl_usm_array_interface__["offset"]
-    assert actual_offset == (n1 - 1)
-
-
-def test_basic_slice6():
-    q = get_queue_or_skip()
-    i0, n0, n1 = 2, 4, 3
-    x = dpt.empty((n0, n1), dtype="c8", sycl_queue=q)
-    y = x[i0, ::-1]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (x.shape[1],)
-    assert y.strides == (-x.strides[1],)
-    actual_offset = y.__sycl_usm_array_interface__["offset"]
-    expected_offset = i0 * x.strides[0] + (n1 - 1) * x.strides[1]
-    assert actual_offset == expected_offset
-
-
-def test_basic_slice7():
-    q = get_queue_or_skip()
-    n0, n1, n2 = 5, 3, 2
-    x = dpt.empty((n0, n1, n2), dtype="?", sycl_queue=q)
-    y = x[..., ::-1]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == x.shape
-    assert y.strides == (
-        x.strides[0],
-        x.strides[1],
-        -x.strides[2],
-    )
-    actual_offset = y.__sycl_usm_array_interface__["offset"]
-    expected_offset = (n2 - 1) * x.strides[2]
-    assert actual_offset == expected_offset
-
-
-def test_basic_slice8():
-    q = get_queue_or_skip()
-    n0, n1 = 3, 7
-    x = dpt.empty((n0, n1), dtype="u1", sycl_queue=q)
-    y = x[..., dpt.newaxis]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (n0, n1, 1)
-    assert y.strides == (n1, 1, 0)
-
-
-def test_basic_slice9():
-    q = get_queue_or_skip()
-    n0, n1 = 3, 7
-    x = dpt.empty((n0, n1), dtype="u8", sycl_queue=q)
-    y = x[dpt.newaxis, ...]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (1, n0, n1)
-    assert y.strides == (0, n1, 1)
-
-
-def test_basic_slice10():
-    q = get_queue_or_skip()
-    n0, n1, n2 = 3, 7, 5
-    x = dpt.empty((n0, n1, n2), dtype="u1", sycl_queue=q)
-    y = x[dpt.newaxis, ..., :]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (1, n0, n1, n2)
-    assert y.strides == (0, n1 * n2, n2, 1)
-
-
-def _all_equal(it1, it2):
-    return all(bool(x == y) for x, y in zip(it1, it2))
-
-
-def test_advanced_slice1():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-    y = x[ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape
-    assert y.strides == (1,)
-    assert _all_equal(
-        (x[ii[k]] for k in range(ii.shape[0])),
-        (y[k] for k in range(ii.shape[0])),
-    )
-    y = x[(ii,)]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape
-    assert y.strides == (1,)
-    assert _all_equal(
-        (x[ii[k]] for k in range(ii.shape[0])),
-        (y[k] for k in range(ii.shape[0])),
-    )
-
-
-def test_advanced_slice1_negative_strides():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([0, 1], sycl_queue=q)
-    x = dpt.flip(dpt.arange(5, dtype="i4", sycl_queue=q))
-    y = x[ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape
-    assert y.strides == (1,)
-    assert _all_equal(
-        (x[ii[k]] for k in range(ii.shape[0])),
-        (y[k] for k in range(ii.shape[0])),
-    )
-
-
-def test_advanced_slice2():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-    y = x[ii, dpt.newaxis]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape + (1,)
-    assert y.flags["C"]
-
-
-def test_advanced_slice3():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-    y = x[dpt.newaxis, ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (1,) + ii.shape
-    assert y.flags["C"]
-
-
-def _make_3d(dt, q):
-    return dpt.reshape(
-        dpt.arange(3 * 3 * 3, dtype=dt, sycl_queue=q),
-        (
-            3,
-            3,
-            3,
-        ),
-    )
-
-
-def test_advanced_slice4():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = _make_3d("i4", q)
-    y = x[ii, ii, ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape
-    assert _all_equal(
-        (x[ii[k], ii[k], ii[k]] for k in range(ii.shape[0])),
-        (y[k] for k in range(ii.shape[0])),
-    )
-
-
-def test_advanced_slice5():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = _make_3d("i4", q)
-    y = x[ii, 0, ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    # 0 broadcast to [0, 0] per array API
-    assert y.shape == ii.shape
-    assert _all_equal(
-        (x[ii[i], 0, ii[i]] for i in range(ii.shape[0])),
-        (y[i] for i in range(ii.shape[0])),
-    )
-
-
-def test_advanced_slice6():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = _make_3d("i4", q)
-    y = x[:, ii, ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (
-        x.shape[0],
-        ii.shape[0],
-    )
-    assert _all_equal(
-        (
-            x[i, ii[k], ii[k]]
-            for i in range(x.shape[0])
-            for k in range(ii.shape[0])
-        ),
-        (y[i, k] for i in range(x.shape[0]) for k in range(ii.shape[0])),
-    )
-
-
-def test_advanced_slice7():
-    q = get_queue_or_skip()
-    mask = dpt.asarray(
-        [
-            [[True, True, False], [False, True, True], [True, False, True]],
-            [[True, False, False], [False, False, True], [False, True, False]],
-            [[True, True, True], [False, False, False], [False, False, True]],
-        ],
-        sycl_queue=q,
-    )
-    x = _make_3d("i2", q)
-    y = x[mask]
-    expected = [0, 1, 4, 5, 6, 8, 9, 14, 16, 18, 19, 20, 26]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (len(expected),)
-    assert all(dpt.asnumpy(y[k]) == expected[k] for k in range(len(expected)))
-
-
-def test_advanced_slice8():
-    q = get_queue_or_skip()
-    mask = dpt.asarray(
-        [[True, False, False], [False, True, False], [False, True, False]],
-        sycl_queue=q,
-    )
-    x = _make_3d("u2", q)
-    y = x[mask]
-    expected = dpt.asarray(
-        [[0, 1, 2], [12, 13, 14], [21, 22, 23]], sycl_queue=q
-    )
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == expected.shape
-    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
-
-
-def test_advanced_slice9():
-    q = get_queue_or_skip()
-    mask = dpt.asarray(
-        [[True, False, False], [False, True, False], [False, True, False]],
-        sycl_queue=q,
-    )
-    x = _make_3d("u4", q)
-    y = x[:, mask]
-    expected = dpt.asarray([[0, 4, 7], [9, 13, 16], [18, 22, 25]], sycl_queue=q)
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == expected.shape
-    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
-
-
-def lin_id(i, j, k):
-    """global_linear_id for (3,3,3) range traversed in C-contiguous order"""
-    return 9 * i + 3 * j + k
-
-
-def test_advanced_slice10():
-    q = get_queue_or_skip()
-    x = _make_3d("u8", q)
-    i0 = dpt.asarray([0, 1, 1], device=x.device)
-    i1 = dpt.asarray([1, 1, 2], device=x.device)
-    i2 = dpt.asarray([2, 0, 1], device=x.device)
-    y = x[i0, i1, i2]
-    res_expected = dpt.asarray(
-        [
-            lin_id(0, 1, 2),
-            lin_id(1, 1, 0),
-            lin_id(1, 2, 1),
-        ],
-        sycl_queue=q,
-    )
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == res_expected.shape
-    assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all()
-
-
-def test_advanced_slice11():
-    q = get_queue_or_skip()
-    x = _make_3d("u8", q)
-    i0 = dpt.asarray([0, 1, 1], device=x.device)
-    i2 = dpt.asarray([2, 0, 1], device=x.device)
-    with pytest.raises(IndexError):
-        x[i0, :, i2]
-
-
-def test_advanced_slice12():
-    q = get_queue_or_skip()
-    x = _make_3d("u8", q)
-    i1 = dpt.asarray([1, 1, 2], device=x.device)
-    i2 = dpt.asarray([2, 0, 1], device=x.device)
-    y = x[:, dpt.newaxis, i1, i2, dpt.newaxis]
-    res_expected = dpt.asarray(
-        [
-            [[[lin_id(0, 1, 2)], [lin_id(0, 1, 0)], [lin_id(0, 2, 1)]]],
-            [[[lin_id(1, 1, 2)], [lin_id(1, 1, 0)], [lin_id(1, 2, 1)]]],
-            [[[lin_id(2, 1, 2)], [lin_id(2, 1, 0)], [lin_id(2, 2, 1)]]],
-        ],
-        sycl_queue=q,
-    )
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == res_expected.shape
-    assert (dpt.asnumpy(y) == dpt.asnumpy(res_expected)).all()
-
-
-def test_advanced_slice13():
-    q = get_queue_or_skip()
-    x = _make_3d("u8", q)
-    i1 = dpt.asarray([[1], [2]], device=x.device)
-    i2 = dpt.asarray([[0, 1]], device=x.device)
-    y = x[i1, i2, 0]
-    expected = dpt.asarray(
-        [
-            [lin_id(1, 0, 0), lin_id(1, 1, 0)],
-            [lin_id(2, 0, 0), lin_id(2, 1, 0)],
-        ],
-        device=x.device,
-    )
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == expected.shape
-    assert (dpt.asnumpy(y) == dpt.asnumpy(expected)).all()
-
-
-def test_advanced_slice14():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
-    y = x[ii, 0, ii, 1, :]
-    assert isinstance(y, dpt.usm_ndarray)
-    # integers broadcast to ii.shape per array API
-    assert y.shape == ii.shape + x.shape[-1:]
-    assert _all_equal(
-        (
-            x[ii[i], 0, ii[i], 1, k]
-            for i in range(ii.shape[0])
-            for k in range(x.shape[-1])
-        ),
-        (y[i, k] for i in range(ii.shape[0]) for k in range(x.shape[-1])),
-    )
-
-
-def test_advanced_slice15():
-    q = get_queue_or_skip()
-    ii = dpt.asarray([1, 2], sycl_queue=q)
-    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
-    # : cannot appear between two integral arrays
-    with pytest.raises(IndexError):
-        x[ii, 0, ii, :, ii]
-
-
-def test_advanced_slice16():
-    q = get_queue_or_skip()
-    ii = dpt.asarray(1, sycl_queue=q)
-    i0 = dpt.asarray(False, sycl_queue=q)
-    i1 = dpt.asarray(True, sycl_queue=q)
-    x = dpt.reshape(dpt.arange(3**5, dtype="i4", sycl_queue=q), (3,) * 5)
-    y = x[ii, i0, ii, i1, :]
-    # TODO: add a shape check here when discrepancy with NumPy is investigated
-    assert isinstance(y, dpt.usm_ndarray)
-
-
-def test_integer_indexing_numpy_array():
-    q = get_queue_or_skip()
-    ii = np.asarray([1, 2])
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-    y = x[ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == ii.shape
-    assert dpt.all(x[1:3] == y)
-
-
-def test_boolean_indexing_numpy_array():
-    q = get_queue_or_skip()
-    ii = np.asarray(
-        [False, True, True, False, False, False, False, False, False, False]
-    )
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-    y = x[ii]
-    assert isinstance(y, dpt.usm_ndarray)
-    assert y.shape == (2,)
-    assert dpt.all(x[1:3] == y)
-
-
-def test_boolean_indexing_validation():
-    get_queue_or_skip()
-    x = dpt.zeros(10, dtype="i4")
-    ii = dpt.ones((2, 5), dtype="?")
-    with pytest.raises(IndexError):
-        x[ii]
-    with pytest.raises(IndexError):
-        x[ii[0, :]]
-
-
-def test_boolean_indexing_getitem_empty_mask():
-    get_queue_or_skip()
-    x = dpt.ones((2, 3, 4), dtype="i4")
-    ii = dpt.ones((0,), dtype="?")
-    assert x[ii].size == 0
-    ii1 = dpt.ones((0, 3), dtype="?")
-    assert x[ii1].size == 0
-    ii2 = dpt.ones((0, 3, 4), dtype="?")
-    assert x[ii2].size == 0
-
-
-def test_boolean_indexing_setitem_empty_mask():
-    get_queue_or_skip()
-    x = dpt.ones((2, 3, 4), dtype="i4")
-    ii = dpt.ones((0,), dtype="?")
-    x[ii] = 0
-    assert dpt.all(x == 1)
-    ii1 = dpt.ones((0, 3), dtype="?")
-    x[ii1] = 0
-    assert dpt.all(x == 1)
-    ii2 = dpt.ones((0, 3, 4), dtype="?")
-    x[ii2] = 0
-    assert dpt.all(x == 1)
-
-
-def test_integer_indexing_1d():
-    get_queue_or_skip()
-    x = dpt.arange(10, dtype="i4")
-    ind_1d = dpt.asarray([7, 3, 1], dtype="u2")
-    ind_2d = dpt.asarray([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4")
-
-    y1 = x[ind_1d]
-    assert y1.shape == ind_1d.shape
-    y2 = x[ind_2d]
-    assert y2.shape == ind_2d.shape
-    assert (dpt.asnumpy(y1) == np.array([7, 3, 1], dtype="i4")).all()
-    assert (
-        dpt.asnumpy(y2)
-        == np.array([[2, 3, 4], [3, 4, 5], [5, 6, 7]], dtype="i4")
-    ).all()
-
-
-def test_integer_indexing_2d():
-    get_queue_or_skip()
-    n0, n1 = 5, 7
-    x = dpt.reshape(
-        dpt.arange(n0 * n1, dtype="i4"),
-        (
-            n0,
-            n1,
-        ),
-    )
-    ind0 = dpt.arange(n0)
-    ind1 = dpt.arange(n1)
-
-    y = x[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
-    assert y.dtype == x.dtype
-    assert (dpt.asnumpy(y) == np.array([[5, 6], [12, 13]])).all()
-
-
-def test_integer_strided_indexing():
-    get_queue_or_skip()
-    n0, n1 = 5, 7
-    x = dpt.reshape(
-        dpt.arange(2 * n0 * n1, dtype="i4"),
-        (
-            2 * n0,
-            n1,
-        ),
-    )
-    ind0 = dpt.arange(n0)
-    ind1 = dpt.arange(n1)
-
-    z = x[::-2, :]
-    y = z[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
-    assert y.dtype == x.dtype
-    zc = dpt.copy(z, order="C")
-    yc = zc[ind0[:2, dpt.newaxis], ind1[dpt.newaxis, -2:]]
-    assert (dpt.asnumpy(y) == dpt.asnumpy(yc)).all()
-
-
-def test_TrueFalse_indexing():
-    get_queue_or_skip()
-    n0, n1 = 2, 3
-    x = dpt.ones((n0, n1))
-    for ind in [True, dpt.asarray(True)]:
-        y1 = x[ind]
-        assert y1.shape == (1, n0, n1)
-        assert y1._pointer == x._pointer
-        y2 = x[:, ind]
-        assert y2.shape == (n0, 1, n1)
-        assert y2._pointer == x._pointer
-        y3 = x[..., ind]
-        assert y3.shape == (n0, n1, 1)
-        assert y3._pointer == x._pointer
-    for ind in [False, dpt.asarray(False)]:
-        y1 = x[ind]
-        assert y1.shape == (0, n0, n1)
-        assert y1._pointer == x._pointer
-        y2 = x[:, ind]
-        assert y2.shape == (n0, 0, n1)
-        assert y2._pointer == x._pointer
-        y3 = x[..., ind]
-        assert y3.shape == (n0, n1, 0)
-        assert y3._pointer == x._pointer
-
-
-def test_mixed_index_getitem():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10))
-    i1b = dpt.ones(10, dtype="?")
-    info = x.__array_namespace__().__array_namespace_info__()
-    ind_dt = info.default_dtypes(device=x.device)["indexing"]
-    i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis]
-    i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis]
-    y = x[i0, i1b, i2]
-    assert y.shape == (3, dpt.sum(i1b, dtype="i8"))
-
-
-def test_mixed_index_setitem():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(1000, dtype="i4"), (10, 10, 10))
-    i1b = dpt.ones(10, dtype="?")
-    info = x.__array_namespace__().__array_namespace_info__()
-    ind_dt = info.default_dtypes(device=x.device)["indexing"]
-    i0 = dpt.asarray([0, 2, 3], dtype=ind_dt)[:, dpt.newaxis]
-    i2 = dpt.asarray([3, 4, 7], dtype=ind_dt)[:, dpt.newaxis]
-    v_shape = (3, int(dpt.sum(i1b, dtype="i8")))
-    canary = 7
-    x[i0, i1b, i2] = dpt.full(v_shape, canary, dtype=x.dtype)
-    assert x[0, 0, 3] == canary
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_take_basic(data_dt, ind_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.arange(10, dtype=data_dt)
-    ind = dpt.arange(2, 5, dtype=ind_dt)
-    y = dpt.take(x, ind)
-    assert y.dtype == x.dtype
-    assert (dpt.asnumpy(y) == np.arange(2, 5, dtype=data_dt)).all()
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_put_basic(data_dt, ind_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.arange(10, dtype=data_dt)
-    ind = dpt.arange(2, 5, dtype=ind_dt)
-    val = dpt.ones(3, dtype=data_dt)
-    dpt.put(x, ind, val)
-    assert (
-        dpt.asnumpy(x)
-        == np.array([0, 1, 1, 1, 1, 5, 6, 7, 8, 9], dtype=data_dt)
-    ).all()
-
-
-def test_take_basic_axis():
-    get_queue_or_skip()
-
-    n0, n1 = 5, 7
-    x = dpt.reshape(
-        dpt.arange(n0 * n1, dtype="i4"),
-        (
-            n0,
-            n1,
-        ),
-    )
-    ind = dpt.arange(2, 4)
-    y0 = dpt.take(x, ind, axis=0)
-    y1 = dpt.take(x, ind, axis=1)
-    assert y0.shape == (2, n1)
-    assert y1.shape == (n0, 2)
-
-
-def test_put_basic_axis():
-    get_queue_or_skip()
-
-    n0, n1 = 5, 7
-    x = dpt.reshape(
-        dpt.arange(n0 * n1, dtype="i4"),
-        (
-            n0,
-            n1,
-        ),
-    )
-    ind = dpt.arange(2, 4)
-    v0 = dpt.zeros((2, n1), dtype=x.dtype)
-    v1 = dpt.zeros((n0, 2), dtype=x.dtype)
-    dpt.put(x, ind, v0, axis=0)
-    dpt.put(x, ind, v1, axis=1)
-    expected = np.arange(n0 * n1, dtype="i4").reshape((n0, n1))
-    expected[[2, 3], :] = 0
-    expected[:, [2, 3]] = 0
-    assert (expected == dpt.asnumpy(x)).all()
-
-
-@pytest.mark.parametrize("data_dt", _all_dtypes)
-def test_put_0d_val(data_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.arange(5, dtype=data_dt, sycl_queue=q)
-    ind = dpt.asarray([0], dtype="i8", sycl_queue=q)
-    val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q)
-    x[ind] = val
-    assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x[0]))
-
-    x = dpt.asarray(5, dtype=data_dt, sycl_queue=q)
-    dpt.put(x, ind, val)
-    assert_array_equal(np.asarray(2, dtype=data_dt), dpt.asnumpy(x))
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-def test_take_0d_data(data_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.asarray(0, dtype=data_dt, sycl_queue=q)
-    ind = dpt.arange(5, dtype="i8", sycl_queue=q)
-
-    y = dpt.take(x, ind)
-    assert (
-        dpt.asnumpy(y)
-        == np.broadcast_to(np.asarray(0, dtype=data_dt), ind.shape)
-    ).all()
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-def test_put_0d_data(data_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.asarray(0, dtype=data_dt, sycl_queue=q)
-    ind = dpt.arange(5, dtype="i8", sycl_queue=q)
-    val = dpt.asarray(2, dtype=data_dt, sycl_queue=q)
-
-    dpt.put(x, ind, val, axis=0)
-    assert (
-        dpt.asnumpy(x)
-        == np.broadcast_to(np.asarray(2, dtype=data_dt), ind.shape)
-    ).all()
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_indexing_0d_ind(ind_dt):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4", sycl_queue=q)
-    ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q)
-
-    y = x[ind]
-    assert dpt.asnumpy(x[3]) == dpt.asnumpy(y)
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_put_0d_ind(ind_dt):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4", sycl_queue=q)
-    ind = dpt.asarray(3, dtype=ind_dt, sycl_queue=q)
-    val = dpt.asarray(5, dtype=x.dtype, sycl_queue=q)
-
-    x[ind] = val
-    assert dpt.asnumpy(x[3]) == dpt.asnumpy(val)
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-def test_take_strided_1d_source(data_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.arange(27, dtype=data_dt, sycl_queue=q)
-    ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        assert_array_equal(
-            np.take(x_np[s], ind_np, axis=0),
-            dpt.asnumpy(dpt.take(x[s], ind, axis=0)),
-        )
-
-    # 0-strided
-    x = dpt.usm_ndarray(
-        (27,),
-        dtype=data_dt,
-        strides=(0,),
-        buffer_ctor_kwargs={"queue": q},
-    )
-    x[0] = x_np[0]
-    assert_array_equal(
-        np.broadcast_to(x_np[0], ind.shape),
-        dpt.asnumpy(dpt.take(x, ind, axis=0)),
-    )
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_take_strided(data_dt, order):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
-    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        for sgn in (-1, 1):
-            xs = x[s, ::sgn]
-            xs_np = x_np[s, ::sgn]
-            assert_array_equal(
-                np.take(xs_np, ind_np, axis=0),
-                dpt.asnumpy(dpt.take(xs, ind, axis=0)),
-            )
-            assert_array_equal(
-                np.take(xs_np, ind_np, axis=1),
-                dpt.asnumpy(dpt.take(xs, ind, axis=1)),
-            )
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_take_strided_1d_indices(ind_dt):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(27, dtype="i4", sycl_queue=q)
-    ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind).astype("i8")
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        assert_array_equal(
-            np.take(x_np, ind_np[s], axis=0),
-            dpt.asnumpy(dpt.take(x, ind[s], axis=0)),
-        )
-
-    # 0-strided
-    ind = dpt.usm_ndarray(
-        (12,),
-        dtype=ind_dt,
-        strides=(0,),
-        buffer_ctor_kwargs={"queue": q},
-    )
-    ind[0] = ind_np[0]
-    assert_array_equal(
-        np.broadcast_to(x_np[ind_np[0]], ind.shape),
-        dpt.asnumpy(dpt.take(x, ind, axis=0)),
-    )
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_take_strided_indices(ind_dt, order):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(27, dtype="i4", sycl_queue=q)
-    ind = dpt.reshape(
-        dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order
-    )
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind).astype("i8")
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        for sgn in [-1, 1]:
-            inds = ind[s, ::sgn]
-            inds_np = ind_np[s, ::sgn]
-            assert_array_equal(
-                np.take(x_np, inds_np, axis=0),
-                dpt.asnumpy(x[inds]),
-            )
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_put_strided_1d_destination(data_dt, order):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.arange(27, dtype=data_dt, sycl_queue=q)
-    ind = dpt.arange(4, 9, dtype="i8", sycl_queue=q)
-    val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind)
-    val_np = dpt.asnumpy(val)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        x_np1 = x_np.copy()
-        x_np1[s][ind_np] = val_np
-
-        x1 = dpt.copy(x)
-        dpt.put(x1[s], ind, val, axis=0)
-
-        assert_array_equal(x_np1, dpt.asnumpy(x1))
-
-
-@pytest.mark.parametrize(
-    "data_dt",
-    _all_dtypes,
-)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_put_strided_destination(data_dt, order):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
-    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
-    val = dpt.asarray(9, dtype=x.dtype, sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind)
-    val_np = dpt.asnumpy(val)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        for sgn in [-1, 1]:
-            xs = x[s, ::sgn]
-            xs_np = x_np[s, ::sgn]
-
-            x_np1 = xs_np.copy()
-            x_np1[ind_np] = val_np
-
-            x1 = dpt.copy(xs)
-            dpt.put(x1, ind, val, axis=0)
-            assert_array_equal(x_np1, dpt.asnumpy(x1))
-
-            x_np1 = xs_np.copy()
-            x_np1[:, ind_np] = val_np
-
-            x1 = dpt.copy(xs)
-            dpt.put(x1, ind, val, axis=1)
-            assert_array_equal(x_np1, dpt.asnumpy(x1))
-
-            x_np1 = xs_np.copy()
-            x_np1[ind_np, ind_np] = val_np
-
-            x1 = dpt.copy(xs)
-            x1[ind, ind] = val
-            assert_array_equal(x_np1, dpt.asnumpy(x1))
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-def test_put_strided_1d_indices(ind_dt):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(27, dtype="i4", sycl_queue=q)
-    ind = dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q)
-    val = dpt.asarray(-1, dtype=x.dtype, sycl_queue=q)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind).astype("i8")
-    val_np = dpt.asnumpy(val)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        x_copy = dpt.copy(x)
-        dpt.put(x_copy, ind[s], val, axis=0)
-
-        x_np_copy = x_np.copy()
-        x_np_copy[ind_np[s]] = val_np
-
-        assert_array_equal(x_np_copy, dpt.asnumpy(x_copy))
-
-
-@pytest.mark.parametrize(
-    "ind_dt",
-    _all_int_dtypes,
-)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_put_strided_indices(ind_dt, order):
-    q = get_queue_or_skip()
-
-    x = dpt.arange(27, dtype="i4", sycl_queue=q)
-    ind = dpt.reshape(
-        dpt.arange(12, 24, dtype=ind_dt, sycl_queue=q), (4, 3), order=order
-    )
-    val = dpt.asarray(-1, sycl_queue=q, dtype=x.dtype)
-
-    x_np = dpt.asnumpy(x)
-    ind_np = dpt.asnumpy(ind).astype("i8")
-    val_np = dpt.asnumpy(val)
-
-    for s in (
-        slice(None, None, 2),
-        slice(None, None, -2),
-    ):
-        for sgn in [-1, 1]:
-            inds = ind[s, ::sgn]
-            inds_np = ind_np[s, ::sgn]
-
-            x_copy = dpt.copy(x)
-            x_copy[inds] = val
-
-            x_np_copy = x_np.copy()
-            x_np_copy[inds_np] = val_np
-
-            assert_array_equal(x_np_copy, dpt.asnumpy(x_copy))
-
-
-def test_integer_indexing_modes():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(5, sycl_queue=q)
-    x_np = dpt.asnumpy(x)
-
-    # wrapping negative indices
-    ind = dpt.asarray([-4, -3, 0, 2, 4], dtype="i8", sycl_queue=q)
-
-    res = dpt.take(x, ind, mode="wrap")
-    expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="raise")
-
-    assert (dpt.asnumpy(res) == expected_arr).all()
-
-    # clipping to 0 (disabling negative indices)
-    ind = dpt.asarray([-6, -3, 0, 2, 6], dtype="i8", sycl_queue=q)
-
-    res = dpt.take(x, ind, mode="clip")
-    expected_arr = np.take(x_np, dpt.asnumpy(ind), mode="clip")
-
-    assert (dpt.asnumpy(res) == expected_arr).all()
-
-
-def test_take_arg_validation():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(4, dtype="i4", sycl_queue=q)
-    ind0 = dpt.arange(4, dtype="i8", sycl_queue=q)
-    ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q)
-
-    with pytest.raises(TypeError):
-        dpt.take(dict(), ind0, axis=0)
-    with pytest.raises(TypeError):
-        dpt.take(x, dict(), axis=0)
-    with pytest.raises(IndexError):
-        x[[]]
-    with pytest.raises(IndexError):
-        dpt.take(x, ind1, axis=0)
-    with pytest.raises(IndexError):
-        x[ind1]
-
-    with pytest.raises(ValueError):
-        dpt.take(dpt.reshape(x, (2, 2)), ind0)
-    with pytest.raises(ValueError):
-        dpt.take(x, ind0, mode=0)
-    with pytest.raises(ValueError):
-        dpt.take(dpt.reshape(x, (2, 2)), ind0, axis=None)
-    with pytest.raises(ValueError):
-        dpt.take(x, dpt.reshape(ind0, (2, 2)))
-    with pytest.raises(ValueError):
-        dpt.take(x[0], ind0, axis=2)
-    with pytest.raises(ValueError):
-        dpt.take(x[:, dpt.newaxis, dpt.newaxis], ind0, axis=None)
-
-
-def test_put_arg_validation():
-    q = get_queue_or_skip()
-
-    x = dpt.arange(4, dtype="i4", sycl_queue=q)
-    ind0 = dpt.arange(4, dtype="i8", sycl_queue=q)
-    ind1 = dpt.arange(2.0, dtype="f", sycl_queue=q)
-    val = dpt.asarray(2, dtype=x.dtype, sycl_queue=q)
-
-    with pytest.raises(TypeError):
-        dpt.put(dict(), ind0, val, axis=0)
-    with pytest.raises(TypeError):
-        dpt.put(x, dict(), val, axis=0)
-    with pytest.raises(IndexError):
-        x[[]] = val
-    with pytest.raises(IndexError):
-        dpt.put(x, ind1, val, axis=0)
-    with pytest.raises(IndexError):
-        x[ind1] = val
-    with pytest.raises(TypeError):
-        dpt.put(x, ind0, dict(), axis=0)
-    with pytest.raises(TypeError):
-        x[ind0] = dict()
-
-    with pytest.raises(ValueError):
-        dpt.put(x, ind0, val, mode=0)
-    with pytest.raises(ValueError):
-        dpt.put(x, dpt.reshape(ind0, (2, 2)), val)
-    with pytest.raises(ValueError):
-        dpt.put(x[0], ind0, val, axis=2)
-    with pytest.raises(ValueError):
-        dpt.put(x[:, dpt.newaxis, dpt.newaxis], ind0, val, axis=None)
-
-
-def test_advanced_indexing_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.arange(4, sycl_queue=q1)
-    ind0 = dpt.asarray([0], sycl_queue=q1)
-    ind1 = dpt.asarray([0], sycl_queue=q2)
-    val0 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q1)
-    val1 = dpt.asarray(2, dtype=x.dtype, sycl_queue=q2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.take(x, ind1, axis=0)
-    with pytest.raises(ExecutionPlacementError):
-        x[ind1]
-    with pytest.raises(ExecutionPlacementError):
-        dpt.put(x, ind1, val0, axis=0)
-    with pytest.raises(ExecutionPlacementError):
-        x[ind1] = val0
-    with pytest.raises(ExecutionPlacementError):
-        dpt.put(x, ind0, val1, axis=0)
-    with pytest.raises(ExecutionPlacementError):
-        x[ind0] = val1
-
-
-def test_extract_all_1d():
-    get_queue_or_skip()
-    x = dpt.arange(30, dtype="i4")
-    sel = dpt.ones(30, dtype="?")
-    sel[::2] = False
-
-    res = x[sel]
-    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
-    assert (dpt.asnumpy(res) == expected_res).all()
-
-    res2 = dpt.extract(sel, x)
-    assert (dpt.asnumpy(res2) == expected_res).all()
-
-    # test strided case
-    x = dpt.arange(15, dtype="i4")
-    sel_np = np.zeros(15, dtype="?")
-    np.put(sel_np, np.random.choice(sel_np.size, size=7), True)
-    sel = dpt.asarray(sel_np)
-
-    res = x[sel[::-1]]
-    expected_res = dpt.asnumpy(x)[sel_np[::-1]]
-    assert (dpt.asnumpy(res) == expected_res).all()
-
-    res2 = dpt.extract(sel[::-1], x)
-    assert (dpt.asnumpy(res2) == expected_res).all()
-
-
-def test_extract_all_2d():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
-    sel = dpt.ones(30, dtype="?")
-    sel[::2] = False
-    sel = dpt.reshape(sel, x.shape)
-
-    res = x[sel]
-    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
-    assert (dpt.asnumpy(res) == expected_res).all()
-
-    res2 = dpt.extract(sel, x)
-    assert (dpt.asnumpy(res2) == expected_res).all()
-
-
-def test_extract_2D_axis0():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
-    sel = dpt.ones(x.shape[0], dtype="?")
-    sel[::2] = False
-
-    res = x[sel]
-    expected_res = dpt.asnumpy(x)[dpt.asnumpy(sel)]
-    assert (dpt.asnumpy(res) == expected_res).all()
-
-
-def test_extract_2D_axis1():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(30, dtype="i4"), (5, 6))
-    sel = dpt.ones(x.shape[1], dtype="?")
-    sel[::2] = False
-
-    res = x[:, sel]
-    expected = dpt.asnumpy(x)[:, dpt.asnumpy(sel)]
-    assert (dpt.asnumpy(res) == expected).all()
-
-
-def test_extract_begin():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((3, 3), dtype="?")
-    sel[0, 0] = True
-    sel[1, 1] = True
-    z = y[sel]
-    expected = dpt.asnumpy(y)[[0, 1], [0, 1]]
-    assert (dpt.asnumpy(z) == expected).all()
-
-
-def test_extract_end():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((4, 4), dtype="?")
-    sel[0, 0] = True
-    z = y[..., sel]
-    expected = dpt.asnumpy(y)[..., [0], [0]]
-    assert (dpt.asnumpy(z) == expected).all()
-
-
-def test_extract_middle():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((3, 4), dtype="?")
-    sel[0, 0] = True
-    z = y[:, sel]
-    expected = dpt.asnumpy(y)[:, [0], [0], :]
-    assert (dpt.asnumpy(z) == expected).all()
-
-
-def test_extract_empty_result():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((3, 4), dtype="?")
-    z = y[:, sel]
-    assert z.shape == (
-        y.shape[0],
-        0,
-        y.shape[3],
-    )
-
-
-def test_place_all_1d():
-    get_queue_or_skip()
-    x = dpt.arange(10, dtype="i2")
-    sel = dpt.zeros(10, dtype="?")
-    sel[0::2] = True
-    val = dpt.zeros(5, dtype=x.dtype)
-    x[sel] = val
-    assert (dpt.asnumpy(x) == np.array([0, 1, 0, 3, 0, 5, 0, 7, 0, 9])).all()
-    dpt.place(x, sel, dpt.asarray([2]))
-    assert (dpt.asnumpy(x) == np.array([2, 1, 2, 3, 2, 5, 2, 7, 2, 9])).all()
-
-
-def test_place_2d_axis0():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
-    sel = dpt.asarray([True, False, True])
-    val = dpt.zeros((2, 4), dtype=x.dtype)
-    x[sel] = val
-    expected_x = np.stack(
-        (
-            np.zeros(4, dtype="i2"),
-            np.arange(4, 8, dtype="i2"),
-            np.zeros(4, dtype="i2"),
-        )
-    )
-    assert (dpt.asnumpy(x) == expected_x).all()
-
-
-def test_place_2d_axis1():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
-    sel = dpt.asarray([True, False, True, False])
-    val = dpt.zeros((3, 2), dtype=x.dtype)
-    x[:, sel] = val
-    expected_x = np.array(
-        [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2"
-    )
-    assert (dpt.asnumpy(x) == expected_x).all()
-
-
-def test_place_2d_axis1_scalar():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
-    sel = dpt.asarray([True, False, True, False])
-    val = dpt.zeros(tuple(), dtype=x.dtype)
-    x[:, sel] = val
-    expected_x = np.array(
-        [[0, 1, 0, 3], [0, 5, 0, 7], [0, 9, 0, 11]], dtype="i2"
-    )
-    assert (dpt.asnumpy(x) == expected_x).all()
-
-
-def test_place_all_slices():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(12, dtype="i2"), (3, 4))
-    sel = dpt.asarray(
-        [
-            [False, True, True, False],
-            [True, True, False, False],
-            [False, False, True, True],
-        ],
-        dtype="?",
-    )
-    y = dpt.ones_like(x)
-    y[sel] = x[sel]
-
-
-def test_place_some_slices_begin():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((3, 3), dtype="?")
-    sel[0, 0] = True
-    sel[1, 1] = True
-    z = y[sel]
-    w = dpt.zeros_like(y)
-    w[sel] = z
-
-
-def test_place_some_slices_mid():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((3, 4), dtype="?")
-    sel[0, 0] = True
-    sel[1, 1] = True
-    z = y[:, sel]
-    w = dpt.zeros_like(y)
-    w[:, sel] = z
-
-
-def test_place_some_slices_end():
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(3 * 3 * 4 * 4, dtype="i2"), (3, 4, 3, 4))
-    y = dpt.permute_dims(x, (2, 0, 3, 1))
-    sel = dpt.zeros((4, 4), dtype="?")
-    sel[0, 0] = True
-    sel[1, 1] = True
-    z = y[:, :, sel]
-    w = dpt.zeros_like(y)
-    w[:, :, sel] = z
-
-
-def test_place_cycling():
-    get_queue_or_skip()
-    x = dpt.zeros(10, dtype="f4")
-    y = dpt.asarray([2, 3])
-    sel = dpt.ones(x.size, dtype="?")
-    dpt.place(x, sel, y)
-    expected = np.array(
-        [
-            2,
-            3,
-        ]
-        * 5,
-        dtype=x.dtype,
-    )
-    assert (dpt.asnumpy(x) == expected).all()
-
-
-def test_place_subset():
-    get_queue_or_skip()
-    x = dpt.zeros(10, dtype="f4")
-    y = dpt.ones_like(x)
-    sel = dpt.ones(x.size, dtype="?")
-    sel[::2] = False
-    dpt.place(x, sel, y)
-    expected = np.array([0, 1, 0, 1, 0, 1, 0, 1, 0, 1], dtype=x.dtype)
-    assert (dpt.asnumpy(x) == expected).all()
-
-
-def test_place_empty_vals_error():
-    get_queue_or_skip()
-    x = dpt.zeros(10, dtype="f4")
-    y = dpt.empty((0,), dtype=x.dtype)
-    sel = dpt.ones(x.size, dtype="?")
-    sel[::2] = False
-    with pytest.raises(ValueError):
-        dpt.place(x, sel, y)
-
-
-def test_place_empty_vals_full_false_mask():
-    get_queue_or_skip()
-    x = dpt.ones(10, dtype="f4")
-    y = dpt.empty((0,), dtype=x.dtype)
-    sel = dpt.zeros(x.size, dtype="?")
-    expected = np.ones(10, dtype=x.dtype)
-    dpt.place(x, sel, y)
-    assert (dpt.asnumpy(x) == expected).all()
-
-
-def test_nonzero():
-    get_queue_or_skip()
-    x = dpt.concat((dpt.zeros(3), dpt.ones(4), dpt.zeros(3)))
-    (i,) = dpt.nonzero(x)
-    assert (dpt.asnumpy(i) == np.array([3, 4, 5, 6])).all()
-
-
-def test_nonzero_f_contig():
-    "See gh-1370"
-    get_queue_or_skip()
-
-    mask = dpt.zeros((5, 5), dtype="?", order="F")
-    mask[2, 3] = True
-
-    expected_res = np.nonzero(dpt.asnumpy(mask))
-    result = dpt.nonzero(mask)
-
-    for exp, res in zip(expected_res, result):
-        assert_array_equal(dpt.asnumpy(res), exp)
-    assert dpt.asnumpy(mask[result]).all()
-
-
-def test_nonzero_compacting():
-    """See gh-1370.
-    Test with input where dimensionality
-    of iteration space is compacted from 3d to 2d
-    """
-    get_queue_or_skip()
-
-    mask = dpt.zeros((5, 5, 5), dtype="?", order="F")
-    mask[3, 2, 1] = True
-    mask_view = mask[..., :3]
-
-    expected_res = np.nonzero(dpt.asnumpy(mask_view))
-    result = dpt.nonzero(mask_view)
-
-    for exp, res in zip(expected_res, result):
-        assert_array_equal(dpt.asnumpy(res), exp)
-    assert dpt.asnumpy(mask_view[result]).all()
-
-
-def test_assign_scalar():
-    get_queue_or_skip()
-    x = dpt.arange(-5, 5, dtype="i8")
-    cond = dpt.asarray(
-        [True, True, True, True, True, False, False, False, False, False]
-    )
-    x[cond] = 0  # no error expected
-    x[dpt.nonzero(cond)] = -1
-    expected = np.array([-1, -1, -1, -1, -1, 0, 1, 2, 3, 4], dtype=x.dtype)
-    assert (dpt.asnumpy(x) == expected).all()
-
-
-def test_nonzero_large():
-    get_queue_or_skip()
-    m = dpt.full((60, 80), True)
-    assert m[m].size == m.size
-
-    m = dpt.full((30, 60, 80), True)
-    assert m[m].size == m.size
-
-
-def test_extract_arg_validation():
-    get_queue_or_skip()
-    with pytest.raises(TypeError):
-        dpt.extract(None, None)
-    cond = dpt.ones(10, dtype="?")
-    with pytest.raises(TypeError):
-        dpt.extract(cond, None)
-    q1 = dpctl.SyclQueue()
-    with pytest.raises(ExecutionPlacementError):
-        dpt.extract(cond.to_device(q1), dpt.zeros_like(cond, dtype="u1"))
-    with pytest.raises(ValueError):
-        dpt.extract(dpt.ones((2, 3), dtype="?"), dpt.ones((3, 2), dtype="i1"))
-
-
-def test_place_arg_validation():
-    get_queue_or_skip()
-    with pytest.raises(TypeError):
-        dpt.place(None, None, None)
-    arr = dpt.zeros(8, dtype="i1")
-    with pytest.raises(TypeError):
-        dpt.place(arr, None, None)
-    cond = dpt.ones(8, dtype="?")
-    with pytest.raises(TypeError):
-        dpt.place(arr, cond, None)
-    vals = dpt.ones_like(arr)
-    q1 = dpctl.SyclQueue()
-    with pytest.raises(ExecutionPlacementError):
-        dpt.place(arr.to_device(q1), cond, vals)
-    with pytest.raises(ValueError):
-        dpt.place(dpt.reshape(arr, (2, 2, 2)), cond, vals)
-
-
-def test_nonzero_arg_validation():
-    get_queue_or_skip()
-    with pytest.raises(TypeError):
-        dpt.nonzero(list())
-    with pytest.raises(ValueError):
-        dpt.nonzero(dpt.asarray(1))
-
-
-def test_nonzero_dtype():
-    "See gh-1322"
-    get_queue_or_skip()
-    x = dpt.ones((3, 4))
-    idx, idy = dpt.nonzero(x)
-    # create array using device's
-    # default index data type
-    index_dt = dpt.dtype(ti.default_device_index_type(x.sycl_queue))
-    assert idx.dtype == index_dt
-    assert idy.dtype == index_dt
-
-
-def test_take_empty_axes():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
-    inds = dpt.ones(1, dtype="i4")
-
-    with pytest.raises(IndexError):
-        dpt.take(x, inds, axis=1)
-
-    inds = dpt.ones(0, dtype="i4")
-    r = dpt.take(x, inds, axis=1)
-    assert r.shape == x.shape
-
-
-def test_put_empty_axes():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
-    inds = dpt.ones(1, dtype="i4")
-    vals = dpt.zeros((3, 1, 4, 5, 6), dtype="f4")
-
-    with pytest.raises(IndexError):
-        dpt.put(x, inds, vals, axis=1)
-
-    inds = dpt.ones(0, dtype="i4")
-    vals = dpt.zeros_like(x)
-
-    with pytest.raises(ValueError):
-        dpt.put(x, inds, vals, axis=1)
-
-
-def test_put_cast_vals():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    inds = dpt.arange(7, 10, dtype="i4")
-    vals = dpt.zeros_like(inds, dtype="f4")
-
-    dpt.put(x, inds, vals)
-    assert dpt.all(x[7:10] == 0)
-
-
-def test_advanced_integer_indexing_cast_vals():
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    inds = dpt.arange(7, 10, dtype="i4")
-    vals = dpt.zeros_like(inds, dtype="f4")
-
-    x[inds] = vals
-    assert dpt.all(x[7:10] == 0)
-
-
-def test_advanced_integer_indexing_empty_axis():
-    get_queue_or_skip()
-
-    # getting
-    x = dpt.ones((3, 0, 4, 5, 6), dtype="f4")
-    inds = dpt.ones(1, dtype="i4")
-    with pytest.raises(IndexError):
-        x[:, inds, ...]
-    with pytest.raises(IndexError):
-        x[inds, inds, inds, ...]
-
-    # setting
-    with pytest.raises(IndexError):
-        x[:, inds, ...] = 2
-    with pytest.raises(IndexError):
-        x[inds, inds, inds, ...] = 2
-
-    # empty inds
-    inds = dpt.ones(0, dtype="i4")
-    assert x[:, inds, ...].shape == x.shape
-    assert x[inds, inds, inds, ...].shape == (0, 5, 6)
-
-    vals = dpt.zeros_like(x)
-    x[:, inds, ...] = vals
-    vals = dpt.zeros((0, 5, 6), dtype="f4")
-    x[inds, inds, inds, ...] = vals
-
-
-def test_advanced_integer_indexing_cast_indices():
-    get_queue_or_skip()
-
-    inds0 = dpt.asarray([0, 1], dtype="i1")
-    for ind_dts in (("i1", "i2", "i4"), ("i1", "u4", "i4"), ("u1", "u2", "u8")):
-        x = dpt.ones((3, 4, 5, 6), dtype="i4")
-        inds0 = dpt.asarray([0, 1], dtype=ind_dts[0])
-        inds1 = dpt.astype(inds0, ind_dts[1])
-        x[inds0, inds1, ...] = 2
-        assert dpt.all(x[inds0, inds1, ...] == 2)
-        inds2 = dpt.astype(inds0, ind_dts[2])
-        x[inds0, inds1, ...] = 2
-        assert dpt.all(x[inds0, inds1, inds2, ...] == 2)
-
-    # fail when float would be required per type promotion
-    inds0 = dpt.asarray([0, 1], dtype="i1")
-    inds1 = dpt.astype(inds0, "u4")
-    inds2 = dpt.astype(inds0, "u8")
-    x = dpt.ones((3, 4, 5, 6), dtype="i4")
-    # test getitem
-    with pytest.raises(ValueError):
-        x[inds0, inds1, inds2, ...]
-    # test setitem
-    with pytest.raises(ValueError):
-        x[inds0, inds1, inds2, ...] = 1
-
-
-def test_take_along_axis():
-    get_queue_or_skip()
-
-    n0, n1, n2 = 3, 5, 7
-    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
-    ind_dt = dpt.__array_namespace_info__().default_dtypes(
-        device=x.sycl_device
-    )["indexing"]
-    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
-    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
-    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
-
-    y0 = dpt.take_along_axis(x, ind0, axis=0)
-    assert y0.shape == ind0.shape
-    y1 = dpt.take_along_axis(x, ind1, axis=1)
-    assert y1.shape == ind1.shape
-    y2 = dpt.take_along_axis(x, ind2, axis=2)
-    assert y2.shape == ind2.shape
-
-
-def test_take_along_axis_validation():
-    # type check on the first argument
-    with pytest.raises(TypeError):
-        dpt.take_along_axis(tuple(), list())
-    get_queue_or_skip()
-    n1, n2 = 2, 5
-    x = dpt.ones(n1 * n2)
-    # type check on the second argument
-    with pytest.raises(TypeError):
-        dpt.take_along_axis(x, list())
-    x_dev = x.sycl_device
-    info_ = dpt.__array_namespace_info__()
-    def_dtypes = info_.default_dtypes(device=x_dev)
-    ind_dt = def_dtypes["indexing"]
-    ind = dpt.zeros(1, dtype=ind_dt)
-    # axis validation
-    with pytest.raises(ValueError):
-        dpt.take_along_axis(x, ind, axis=1)
-    # mode validation
-    with pytest.raises(ValueError):
-        dpt.take_along_axis(x, ind, axis=0, mode="invalid")
-    # same array-ranks validation
-    with pytest.raises(ValueError):
-        dpt.take_along_axis(dpt.reshape(x, (n1, n2)), ind)
-    # check compute-follows-data
-    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
-    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.take_along_axis(x, ind2)
-
-
-def test_put_along_axis():
-    get_queue_or_skip()
-
-    n0, n1, n2 = 3, 5, 7
-    x = dpt.reshape(dpt.arange(n0 * n1 * n2), (n0, n1, n2))
-    ind_dt = dpt.__array_namespace_info__().default_dtypes(
-        device=x.sycl_device
-    )["indexing"]
-    ind0 = dpt.ones((1, n1, n2), dtype=ind_dt)
-    ind1 = dpt.ones((n0, 1, n2), dtype=ind_dt)
-    ind2 = dpt.ones((n0, n1, 1), dtype=ind_dt)
-
-    xc = dpt.copy(x)
-    vals = dpt.ones(ind0.shape, dtype=x.dtype)
-    dpt.put_along_axis(xc, ind0, vals, axis=0)
-    assert dpt.all(dpt.take_along_axis(xc, ind0, axis=0) == vals)
-
-    xc = dpt.copy(x)
-    vals = dpt.ones(ind1.shape, dtype=x.dtype)
-    dpt.put_along_axis(xc, ind1, vals, axis=1)
-    assert dpt.all(dpt.take_along_axis(xc, ind1, axis=1) == vals)
-
-    xc = dpt.copy(x)
-    vals = dpt.ones(ind2.shape, dtype=x.dtype)
-    dpt.put_along_axis(xc, ind2, vals, axis=2)
-    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
-
-    xc = dpt.copy(x)
-    vals = dpt.ones(ind2.shape, dtype=x.dtype)
-    dpt.put_along_axis(xc, ind2, dpt.asnumpy(vals), axis=2)
-    assert dpt.all(dpt.take_along_axis(xc, ind2, axis=2) == vals)
-
-
-def test_put_along_axis_validation():
-    # type check on the first argument
-    with pytest.raises(TypeError):
-        dpt.put_along_axis(tuple(), list(), list())
-    get_queue_or_skip()
-    n1, n2 = 2, 5
-    x = dpt.ones(n1 * n2)
-    # type check on the second argument
-    with pytest.raises(TypeError):
-        dpt.put_along_axis(x, list(), list())
-    x_dev = x.sycl_device
-    info_ = dpt.__array_namespace_info__()
-    def_dtypes = info_.default_dtypes(device=x_dev)
-    ind_dt = def_dtypes["indexing"]
-    ind = dpt.zeros(1, dtype=ind_dt)
-    vals = dpt.zeros(1, dtype=x.dtype)
-    # axis validation
-    with pytest.raises(ValueError):
-        dpt.put_along_axis(x, ind, vals, axis=1)
-    # mode validation
-    with pytest.raises(ValueError):
-        dpt.put_along_axis(x, ind, vals, axis=0, mode="invalid")
-    # same array-ranks validation
-    with pytest.raises(ValueError):
-        dpt.put_along_axis(dpt.reshape(x, (n1, n2)), ind, vals)
-    # check compute-follows-data
-    q2 = dpctl.SyclQueue(x_dev, property="enable_profiling")
-    ind2 = dpt.zeros(1, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.put_along_axis(x, ind2, vals)
-
-
-def test_put_along_axis_application():
-    get_queue_or_skip()
-    info_ = dpt.__array_namespace_info__()
-    def_dtypes = info_.default_dtypes(device=None)
-    ind_dt = def_dtypes["indexing"]
-    all_perms = dpt.asarray(
-        [
-            [0, 1, 2, 3],
-            [0, 2, 1, 3],
-            [2, 0, 1, 3],
-            [2, 1, 0, 3],
-            [1, 0, 2, 3],
-            [1, 2, 0, 3],
-            [0, 1, 3, 2],
-            [0, 2, 3, 1],
-            [2, 0, 3, 1],
-            [2, 1, 3, 0],
-            [1, 0, 3, 2],
-            [1, 2, 3, 0],
-            [0, 3, 1, 2],
-            [0, 3, 2, 1],
-            [2, 3, 0, 1],
-            [2, 3, 1, 0],
-            [1, 3, 0, 2],
-            [1, 3, 2, 0],
-            [3, 0, 1, 2],
-            [3, 0, 2, 1],
-            [3, 2, 0, 1],
-            [3, 2, 1, 0],
-            [3, 1, 0, 2],
-            [3, 1, 2, 0],
-        ],
-        dtype=ind_dt,
-    )
-    p_mats = dpt.zeros((24, 4, 4), dtype=dpt.int64)
-    vals = dpt.ones((24, 4, 1), dtype=p_mats.dtype)
-    # form 24 permutation matrices
-    dpt.put_along_axis(p_mats, all_perms[..., dpt.newaxis], vals, axis=2)
-    p2 = p_mats @ p_mats
-    p4 = p2 @ p2
-    p8 = p4 @ p4
-    expected = dpt.eye(4, dtype=p_mats.dtype)[dpt.newaxis, ...]
-    assert dpt.all(p8 @ p4 == expected)
-
-
-def check__extract_impl_validation(fn):
-    x = dpt.ones(10)
-    ind = dpt.ones(10, dtype="?")
-    with pytest.raises(TypeError):
-        fn(list(), ind)
-    with pytest.raises(TypeError):
-        fn(x, list())
-    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
-    ind2 = dpt.ones(10, dtype="?", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        fn(x, ind2)
-    with pytest.raises(ValueError):
-        fn(x, ind, 1)
-
-
-def check__nonzero_impl_validation(fn):
-    with pytest.raises(TypeError):
-        fn(list())
-
-
-def check__take_multi_index(fn):
-    x = dpt.ones(10)
-    x_dev = x.sycl_device
-    info_ = dpt.__array_namespace_info__()
-    def_dtypes = info_.default_dtypes(device=x_dev)
-    ind_dt = def_dtypes["indexing"]
-    ind = dpt.arange(10, dtype=ind_dt)
-    with pytest.raises(TypeError):
-        fn(list(), tuple(), 1)
-    with pytest.raises(ValueError):
-        fn(x, (ind,), 0, mode=2)
-    with pytest.raises(ValueError):
-        fn(x, (None,), 1)
-    with pytest.raises(IndexError):
-        fn(x, (x,), 1)
-    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
-    ind2 = dpt.arange(10, dtype=ind_dt, sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        fn(x, (ind2,), 0)
-    m = dpt.ones((10, 10))
-    ind_1 = dpt.arange(10, dtype="i8")
-    ind_2 = dpt.arange(10, dtype="u8")
-    with pytest.raises(ValueError):
-        fn(m, (ind_1, ind_2), 0)
-
-
-def check__place_impl_validation(fn):
-    with pytest.raises(TypeError):
-        fn(list(), list(), list())
-    x = dpt.ones(10)
-    with pytest.raises(TypeError):
-        fn(x, list(), list())
-    q2 = dpctl.SyclQueue(x.sycl_device, property="enable_profiling")
-    mask2 = dpt.ones(10, dtype="?", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        fn(x, mask2, 1)
-    x2 = dpt.ones((5, 5))
-    mask2 = dpt.ones((5, 5), dtype="?")
-    with pytest.raises(ValueError):
-        fn(x2, mask2, x2, axis=1)
-
-
-def check__put_multi_index_validation(fn):
-    with pytest.raises(TypeError):
-        fn(list(), list(), 0, list())
-    x = dpt.ones(10)
-    inds = dpt.arange(10, dtype="i8")
-    vals = dpt.zeros(10)
-    # test inds which is not a tuple/list
-    fn(x, inds, 0, vals)
-    x2 = dpt.ones((5, 5))
-    ind1 = dpt.arange(5, dtype="i8")
-    ind2 = dpt.arange(5, dtype="u8")
-    with pytest.raises(ValueError):
-        fn(x2, (ind1, ind2), 0, x2)
-    with pytest.raises(TypeError):
-        # invalid index type
-        fn(x2, (ind1, list()), 0, x2)
-    with pytest.raises(ValueError):
-        # invalid mode keyword value
-        fn(x, inds, 0, vals, mode=100)
-
-
-def test__copy_utils():
-    import dpctl.tensor._copy_utils as cu
-
-    get_queue_or_skip()
-
-    check__extract_impl_validation(cu._extract_impl)
-    check__nonzero_impl_validation(cu._nonzero_impl)
-    check__take_multi_index(cu._take_multi_index)
-    check__place_impl_validation(cu._place_impl)
-    check__put_multi_index_validation(cu._put_multi_index)
-
-
-@pytest.mark.parametrize("mode", ["wrap", "clip"])
-def test_take_indices_oob_py_ssize_t(mode):
-    get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4")
-    inds1 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
-    inds2 = dpt.full(5, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
-
-    # sweep through a small range of indices
-    # to check that OOB indices are well-behaved
-    for i in range(1, 10):
-        inds2 -= i
-        r1 = dpt.take(x, inds1, mode=mode)
-        r2 = dpt.take(x, inds2, mode=mode)
-
-        assert dpt.all(r1 == r2)
-
-
-@pytest.mark.parametrize("mode", ["wrap", "clip"])
-def test_put_indices_oob_py_ssize_t(mode):
-    get_queue_or_skip()
-
-    x = dpt.full(10, -1, dtype="i4")
-    inds = dpt.full(1, dpt.iinfo(dpt.uint64).max, dtype=dpt.uint64)
-
-    # OOB inds are positive, so always
-    # clip to the top of range
-    for i in range(1, 10):
-        inds -= i
-        dpt.put(x, inds, i, mode=mode)
-
-        assert dpt.all(x[:-1] == -1)
-        assert x[-1] == i
-
-
-def test_take_along_axis_uint64_indices():
-    get_queue_or_skip()
-
-    inds = dpt.arange(1, 10, 2, dtype="u8")
-    x = dpt.tile(dpt.asarray([0, -1], dtype="i4"), 5)
-    res = dpt.take_along_axis(x, inds)
-    assert dpt.all(res == -1)
-
-    sh0 = 2
-    inds = dpt.broadcast_to(inds, (sh0,) + inds.shape)
-    x = dpt.broadcast_to(x, (sh0,) + x.shape)
-    res = dpt.take_along_axis(x, inds, axis=1)
-    assert dpt.all(res == -1)
-
-
-def test_put_along_axis_uint64_indices():
-    get_queue_or_skip()
-
-    inds = dpt.arange(1, 10, 2, dtype="u8")
-    x = dpt.zeros(10, dtype="i4")
-    dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype))
-    expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), 5)
-    assert dpt.all(x == expected)
-
-    sh0 = 2
-    inds = dpt.broadcast_to(inds, (sh0,) + inds.shape)
-    x = dpt.zeros((sh0,) + x.shape, dtype="i4")
-    dpt.put_along_axis(x, inds, dpt.asarray(2, dtype=x.dtype), axis=1)
-    expected = dpt.tile(dpt.asarray([0, 2], dtype="i4"), (2, 5))
-    assert dpt.all(expected == x)
-
-
-@pytest.mark.parametrize("data_dt", _all_dtypes)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_take_out(data_dt, order):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    axis = 0
-    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
-    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
-    out_sh = x.shape[:axis] + ind.shape + x.shape[axis + 1 :]
-    out = dpt.empty(out_sh, dtype=data_dt, sycl_queue=q)
-
-    expected = dpt.take(x, ind, axis=axis)
-
-    dpt.take(x, ind, axis=axis, out=out)
-
-    assert dpt.all(out == expected)
-
-
-@pytest.mark.parametrize("data_dt", _all_dtypes)
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_take_out_overlap(data_dt, order):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(data_dt, q)
-
-    axis = 0
-    x = dpt.reshape(_make_3d(data_dt, q), (9, 3), order=order)
-    ind = dpt.arange(2, dtype="i8", sycl_queue=q)
-    out = x[x.shape[axis] - ind.shape[axis] : x.shape[axis], :]
-
-    expected = dpt.take(x, ind, axis=axis)
-
-    dpt.take(x, ind, axis=axis, out=out)
-
-    assert dpt.all(out == expected)
-    assert dpt.all(x[x.shape[0] - ind.shape[0] : x.shape[0], :] == out)
-
-
-def test_take_out_errors():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    x = dpt.arange(10, dtype="i4", sycl_queue=q1)
-    ind = dpt.arange(2, dtype="i4", sycl_queue=q1)
-
-    with pytest.raises(TypeError):
-        dpt.take(x, ind, out=dict())
-
-    out_read_only = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q1)
-    out_read_only.flags["W"] = False
-    with pytest.raises(ValueError):
-        dpt.take(x, ind, out=out_read_only)
-
-    out_bad_shape = dpt.empty(0, dtype=x.dtype, sycl_queue=q1)
-    with pytest.raises(ValueError):
-        dpt.take(x, ind, out=out_bad_shape)
-
-    out_bad_dt = dpt.empty(ind.shape, dtype="i8", sycl_queue=q1)
-    with pytest.raises(ValueError):
-        dpt.take(x, ind, out=out_bad_dt)
-
-    out_bad_q = dpt.empty(ind.shape, dtype=x.dtype, sycl_queue=q2)
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
-        dpt.take(x, ind, out=out_bad_q)
-
-
-def test_getitem_impl_fn_invalid_inp():
-    get_queue_or_skip()
-
-    x = dpt.ones((10, 10), dtype="i4")
-
-    bad_ind_type = (dpt.ones((), dtype="i4"), 2.0)
-    with pytest.raises(TypeError):
-        _take_multi_index(x, bad_ind_type, 0, 0)
-
-    no_array_inds = (2, 3)
-    with pytest.raises(TypeError):
-        _take_multi_index(x, no_array_inds, 0, 0)
diff --git a/dpctl/tests/test_usm_ndarray_linalg.py b/dpctl/tests/test_usm_ndarray_linalg.py
deleted file mode 100644
index 74da303aa2..0000000000
--- a/dpctl/tests/test_usm_ndarray_linalg.py
+++ /dev/null
@@ -1,1015 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
-
-_numeric_types = [
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-def _map_int_to_type(n, dt):
-    assert isinstance(n, int)
-    assert n > 0
-    if dt == dpt.int8:
-        return ((n + 128) % 256) - 128
-    elif dt == dpt.uint8:
-        return n % 256
-    elif dt == dpt.int16:
-        return ((n + 32768) % 65536) - 32768
-    elif dt == dpt.uint16:
-        return n % 65536
-    return n
-
-
-def test_matrix_transpose():
-    get_queue_or_skip()
-
-    X = dpt.reshape(dpt.arange(2 * 3, dtype="i4"), (2, 3))
-    res = dpt.matrix_transpose(X)
-    expected_res = X.mT
-
-    assert expected_res.shape == res.shape
-    assert expected_res.flags["C"] == res.flags["C"]
-    assert expected_res.flags["F"] == res.flags["F"]
-    assert dpt.all(X.mT == res)
-
-
-def test_matrix_transpose_arg_validation():
-    get_queue_or_skip()
-
-    X = dpt.empty(5, dtype="i4")
-    with pytest.raises(ValueError):
-        dpt.matrix_transpose(X)
-
-    X = dict()
-    with pytest.raises(TypeError):
-        dpt.matrix_transpose(X)
-
-    X = dpt.empty((5, 5), dtype="i4")
-    assert isinstance(dpt.matrix_transpose(X), dpt.usm_ndarray)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_matmul_simple(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, m = 235, 17
-    m1 = dpt.zeros((m, n), dtype=dtype)
-    m2 = dpt.zeros((n, m), dtype=dtype)
-
-    dt = m1.dtype
-    if dt.kind in "ui":
-        n1 = min(n, dpt.iinfo(dt).max)
-    else:
-        n1 = n
-    m1[:, :n1] = dpt.ones((m, n1), dtype=dt)
-    m2[:n1, :] = dpt.ones((n1, m), dtype=dt)
-
-    for k in [1, 2, 3, 4, 7, 8, 9, 15, 16, 17]:
-        r = dpt.matmul(m1[:k, :], m2[:, :k])
-        assert dpt.all(r == dpt.full((k, k), fill_value=n1, dtype=dt))
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_matmul_nilpotent1(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n = 77
-    N_mat = dpt.eye(n, k=1, dtype=dtype)
-    I_mat = dpt.eye(n, dtype=dtype)
-    R_mat = dpt.eye(n, dtype=dtype)
-    for _ in range(n + 1):
-        R_mat = I_mat + dpt.matmul(N_mat, R_mat)
-
-    assert dpt.allclose(dpt.matmul(I_mat - N_mat, R_mat), I_mat)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_matmul_nilpotent2(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n = 128
-    u = dpt.ones((n, 1), dtype=dtype)
-    v = dpt.ones((1, n), dtype=dtype)
-
-    uv = dpt.matmul(u, v)
-    uv_ref = u * v
-
-    assert dpt.allclose(uv, uv_ref)
-
-
-def test_matmul_null_axis():
-    get_queue_or_skip()
-    n = 3
-
-    A_mat = dpt.ones((n, 0), dtype="f4")
-    B_mat = dpt.ones((0, 1), dtype="f4")
-
-    R_mat = dpt.matmul(A_mat, B_mat)
-    assert R_mat.shape == (n, 1)
-
-    R_mat = dpt.matmul(A_mat, B_mat[:, :0])
-    assert R_mat.shape == (n, 0)
-
-
-@pytest.mark.parametrize("dtype", ["i4", "f4"])
-def test_matmul_dims(dtype):
-    get_queue_or_skip()
-
-    n, m, k, b = 4, 5, 7, 3
-    v = dpt.ones(k, dtype=dtype)
-    m1 = dpt.ones((n, k), dtype=dtype)
-    m2 = dpt.ones((k, m), dtype=dtype)
-    st1 = dpt.ones((b, n, k), dtype=dtype)
-    st2 = dpt.ones((b, k, m), dtype=dtype)
-
-    r = dpt.matmul(v, v)
-    assert r.shape == tuple()
-    assert dpt.round(r) == k
-
-    r = dpt.matmul(m1, v)
-    assert r.shape == (n,)
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(v, m2)
-    assert r.shape == (m,)
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(m1, m2)
-    assert r.shape == (
-        n,
-        m,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(v, st2)
-    assert r.shape == (
-        b,
-        m,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(st1, v)
-    assert r.shape == (
-        b,
-        n,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(st1, m2)
-    assert r.shape == (
-        b,
-        n,
-        m,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(m1, st2)
-    assert r.shape == (
-        b,
-        n,
-        m,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-    r = dpt.matmul(st1, st2)
-    assert r.shape == (
-        b,
-        n,
-        m,
-    )
-    assert dpt.all(dpt.round(r) == k)
-
-
-def test_matmul_arg_validation():
-    get_queue_or_skip()
-
-    s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple())
-    v1, v2 = dpt.ones(16), dpt.zeros(16)
-
-    with pytest.raises(ValueError):
-        dpt.matmul(s1, v2)
-
-    with pytest.raises(ValueError):
-        dpt.matmul(v1, s2)
-
-    with pytest.raises(TypeError):
-        dpt.matmul(dict(), v2)
-
-    with pytest.raises(TypeError):
-        dpt.matmul(v2, None)
-
-
-def test_matmul_dims_validation():
-    get_queue_or_skip()
-
-    m1 = dpt.ones((16, 16))
-    m2 = dpt.ones((16, 16))
-
-    # contraction dimensions mismatch
-    with pytest.raises(ValueError):
-        dpt.matmul(m1[:, :7], m2[:3, :])
-
-    m1 = dpt.ones((3, 4, 5))
-    m2 = dpt.ones((2, 5, 3))
-    # broadcasting dimensions mismatch
-    with pytest.raises(ValueError):
-        dpt.matmul(m1, m2)
-
-
-def test_matmul_broadcasting():
-    get_queue_or_skip()
-
-    for dt1, dt2 in [
-        (dpt.int16, dpt.int32),
-        (dpt.float32, dpt.int16),
-        (dpt.int32, dpt.uint32),
-    ]:
-        m1 = dpt.ones((7, 11, 16), dtype=dt1)
-        m2 = dpt.ones((16, 13), dtype=dt2)
-
-        r = dpt.matmul(m1, m2[dpt.newaxis, ...])
-
-        assert r.shape == (7, 11, 13)
-
-
-@pytest.mark.parametrize("dtype", ["i4", "i8", "f4", "c8"])
-def test_matmul_strided(dtype):
-    get_queue_or_skip()
-
-    m1_shape = (14, 22, 32)
-    m1_size = 1
-    for el in m1_shape:
-        m1_size = m1_size * el
-
-    m1 = dpt.remainder(dpt.arange(1, m1_size + 1, dtype="i8"), 13)
-    m1_orig = dpt.reshape(dpt.astype(m1, dtype), m1_shape)
-    m2_orig = dpt.ones((14, 16, 13), dtype=dtype)
-
-    m1 = m1_orig[::2, ::-2, ::2]
-    m2 = m2_orig[::2, :, :]
-    r = dpt.matmul(m1, m2)
-
-    assert r.shape == m1.shape[:2] + m2.shape[-1:]
-    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
-    assert np.allclose(dpt.asnumpy(r), ref)
-
-    m1 = m1_orig[::2, ::2, ::-2]
-    m2 = m2_orig[::2, :, :]
-    r = dpt.matmul(m1, m2)
-
-    assert r.shape == m1.shape[:2] + m2.shape[-1:]
-    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
-    assert np.allclose(dpt.asnumpy(r), ref)
-
-    m1 = m1_orig[::-2, ::2, ::2]
-    m2 = m2_orig[::-2, :, :]
-    r = dpt.matmul(m1, m2)
-
-    assert r.shape == m1.shape[:2] + m2.shape[-1:]
-    ref = np.matmul(dpt.asnumpy(m1), dpt.asnumpy(m2))
-    assert np.allclose(dpt.asnumpy(r), ref)
-
-
-def test_matmul_out():
-    get_queue_or_skip()
-
-    m1 = (
-        dpt.arange(14, dtype="f4")[:, dpt.newaxis, dpt.newaxis]
-        + dpt.arange(17, dtype="f4")[dpt.newaxis, :, dpt.newaxis]
-        + dpt.arange(128, dtype="f4")[dpt.newaxis, dpt.newaxis, :]
-    )
-    assert m1.shape == (14, 17, 128)
-    m2 = dpt.tile(
-        dpt.reshape(dpt.asarray([1, 2], dtype="f4"), (2, 1, 1)), (7, 128, 13)
-    )
-    assert m2.shape == (14, 128, 13)
-
-    buf = dpt.zeros((2 * 14, 3 * 17, 13), dtype="f4")
-    res = dpt.matmul(m1, m2, out=buf[::-2, 1::3, :])
-
-    assert dpt.allclose(res, buf[::-2, 1::3, :])
-    assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 0::3, :])
-    assert dpt.allclose(dpt.zeros_like(res), buf[::-2, 2::3, :])
-
-    m1_np = dpt.asnumpy(m1)
-    ref = np.matmul(m1_np, dpt.asnumpy(m2))
-    assert np.allclose(ref, dpt.asnumpy(res))
-
-    res = dpt.matmul(m1[:, :10, :10], m1[:, :10, :10].mT, out=m1[:, :10, :10])
-    ref = np.matmul(
-        m1_np[:, :10, :10], np.transpose(m1_np[:, :10, :10], (0, 2, 1))
-    )
-    assert np.allclose(ref, dpt.asnumpy(res))
-
-
-def test_matmul_readonly_out():
-    get_queue_or_skip()
-    m = dpt.ones((10, 10), dtype=dpt.int32)
-    r = dpt.empty_like(m)
-    r.flags["W"] = False
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m, m, out=r)
-
-
-def test_matmul_dtype():
-    get_queue_or_skip()
-
-    for dt1, dt2 in [
-        (dpt.int32, dpt.int16),
-        (dpt.int16, dpt.int32),
-        (dpt.float32, dpt.int16),
-        (dpt.int32, dpt.float32),
-    ]:
-        m1 = dpt.ones((10, 10), dtype=dt1)
-        m2 = dpt.ones((10, 10), dtype=dt2)
-
-        for ord in ["C", "A", "F", "K"]:
-            r = dpt.matmul(m1, m2, dtype=dpt.float32, order=ord)
-            assert r.dtype == dpt.float32
-
-
-@pytest.mark.parametrize("dt1", _numeric_types)
-@pytest.mark.parametrize("dt2", _numeric_types)
-@pytest.mark.parametrize("order", ["C", "K"])
-def test_matmul_type_promotion(dt1, dt2, order):
-    get_queue_or_skip()
-
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt1, q)
-    skip_if_dtype_not_supported(dt2, q)
-
-    b, n, k, m = 8, 10, 17, 10
-    m1 = dpt.ones((1, n, k), dtype=dt1)
-    m2 = dpt.ones((b, k, m), dtype=dt2)
-    expected_dt = dpt.result_type(m1, m2)
-
-    r = dpt.matmul(m1, m2, order=order)
-    assert r.shape == (b, n, m)
-    assert r.dtype == expected_dt
-
-    m1 = dpt.ones((b, n, k), dtype=dt1)
-    m2 = dpt.ones((1, k, m), dtype=dt2)
-
-    r = dpt.matmul(m1, m2, order=order)
-    assert r.shape == (b, n, m)
-    assert r.dtype == expected_dt
-
-    m1 = dpt.ones((n, k), dtype=dt1)
-    m2 = dpt.ones((k, m), dtype=dt2)
-
-    r = dpt.matmul(m1, m2, order=order)
-    assert r.shape == (n, m)
-    assert r.dtype == expected_dt
-
-
-def test_matmul_invalid_dtype():
-    get_queue_or_skip()
-
-    m1 = dpt.zeros((10, 10), dtype="f4")
-    m2 = dpt.zeros((10, 10), dtype="f4")
-    m3 = dpt.zeros((10, 10), dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m1, m2, dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m1, m3, dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m3, m1, dtype="i4")
-
-
-def test_matmul_out_errors():
-    q1 = get_queue_or_skip()
-    q2 = dpctl.SyclQueue()
-
-    sh = (10, 10)
-    dt = "i4"
-    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
-    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
-
-    with pytest.raises(TypeError):
-        dpt.matmul(m1, m2, out=dict())
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m1, m2, out=dpt.empty((10,), dtype=dt, sycl_queue=q1))
-
-    with pytest.raises(ValueError):
-        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype="f4", sycl_queue=q1))
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.matmul(m1, m2, out=dpt.empty(sh, dtype=dt, sycl_queue=q2))
-
-
-def test_matmul_order():
-    get_queue_or_skip()
-
-    sh = (
-        10,
-        10,
-    )
-    sh2 = tuple(2 * dim for dim in sh)
-    n = sh[-1]
-
-    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
-        ar1 = dpt.ones(sh, dtype=dt1, order="C")
-        ar2 = dpt.ones(sh, dtype=dt2, order="C")
-        r1 = dpt.matmul(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.matmul(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.matmul(ar1, ar2, order="A")
-        assert r3.flags.c_contiguous
-        r4 = dpt.matmul(ar1, ar2, order="K")
-        assert r4.flags.c_contiguous
-
-        ar1 = dpt.ones(sh, dtype=dt1, order="F")
-        ar2 = dpt.ones(sh, dtype=dt2, order="F")
-        r1 = dpt.matmul(ar1, ar2, order="C")
-        assert r1.flags.c_contiguous
-        r2 = dpt.matmul(ar1, ar2, order="F")
-        assert r2.flags.f_contiguous
-        r3 = dpt.matmul(ar1, ar2, order="A")
-        assert r3.flags.f_contiguous
-        r4 = dpt.matmul(ar1, ar2, order="K")
-        assert r4.flags.f_contiguous
-
-        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2]
-        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2]
-        r4 = dpt.matmul(ar1, ar2, order="K")
-        assert r4.strides == (n, -1)
-        r5 = dpt.matmul(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-        ar1 = dpt.ones(sh2, dtype=dt1, order="C")[:10, ::-2].mT
-        ar2 = dpt.ones(sh2, dtype=dt2, order="C")[:10, ::-2].mT
-        r4 = dpt.matmul(ar1, ar2, order="K")
-        assert r4.strides == (-1, n)
-        r5 = dpt.matmul(ar1, ar2, order="C")
-        assert r5.strides == (n, 1)
-
-
-def test_matmul_invalid_order():
-    get_queue_or_skip()
-
-    sh = (
-        10,
-        10,
-    )
-    dt = "i4"
-
-    ar1 = dpt.ones(sh, dtype=dt, order="C")
-    ar2 = dpt.ones(sh, dtype=dt, order="C")
-    r = dpt.matmul(ar1, ar2, order="invalid")
-    assert r.flags.c_contiguous
-
-    ar1 = dpt.ones(sh, dtype=dt, order="F")
-    ar2 = dpt.ones(sh, dtype=dt, order="F")
-    r = dpt.matmul(ar1, ar2, order="invalid")
-    assert r.flags.f_contiguous
-
-
-def test_matmul_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = dpctl.SyclQueue()
-
-    sh = (
-        10,
-        10,
-    )
-    dt = "i4"
-    m1 = dpt.zeros(sh, dtype=dt, sycl_queue=q1)
-    m2 = dpt.zeros(sh, dtype=dt, sycl_queue=q2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.matmul(m1, m2)
-
-
-def test_matmul_inplace_broadcasting():
-    get_queue_or_skip()
-
-    sh = (3, 5, 5)
-    dt = "i4"
-
-    m1 = dpt.ones((3, 5, 5), dtype=dt)
-    m2 = dpt.ones((1, 5, 5), dtype=dt)
-    m1 @= m2
-    assert dpt.all(m1 == dpt.full(sh, 5, dtype=dt))
-
-
-def test_matmul_prepend_dims():
-    get_queue_or_skip()
-
-    n = 5
-    for dt1, dt2 in [
-        (dpt.int32, dpt.int32),
-        (dpt.int32, dpt.int64),
-        (dpt.int64, dpt.int32),
-        (dpt.int32, dpt.uint32),
-    ]:
-        m = dpt.ones((n, 4), dtype=dt1)
-        v = dpt.ones((4,), dtype=dt2)
-        r = dpt.matmul(m, v)
-        assert r.shape == (n,)
-
-        r = dpt.matmul(v, m.mT)
-        assert r.shape == (n,)
-
-
-def test_matmul_inplace_same_tensors():
-    get_queue_or_skip()
-
-    n = 5
-    sh = (
-        n,
-        n,
-    )
-
-    ar1 = dpt.ones(sh, dtype="i4")
-    ar1 @= ar1
-    assert dpt.all(ar1 == dpt.full(sh, n, dtype="i4"))
-
-    ar1 = dpt.ones(sh, dtype="i8")
-    ar2 = dpt.ones(sh, dtype="i4")
-    dpt.matmul(ar1, ar2, out=ar1)
-    assert dpt.all(ar1 == dpt.full(sh, n, dtype=ar1.dtype))
-
-    ar1 = dpt.ones(sh, dtype="i4")
-    ar2 = dpt.ones(sh, dtype="i8")
-    dpt.matmul(ar1, ar2, out=ar2)
-    assert dpt.all(ar2 == dpt.full(sh, n, dtype=ar2.dtype))
-
-
-@pytest.fixture
-def random_matrix():
-    rs = np.random.RandomState(seed=123456)
-    m_np = rs.randint(low=0, high=6, size=(400, 400))
-    return m_np
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_matmul_largish_square(dtype, random_matrix):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m_np = random_matrix.astype(dtype)
-    x_np = np.matmul(m_np.T, m_np)
-
-    m = dpt.asarray(m_np)
-    mT = dpt.asarray(m.mT, copy=True, order="C")
-    x1 = dpt.matmul(m.mT, m)
-    x2 = dpt.matmul(mT, m)
-
-    tol = 0
-    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
-        tol = 32 * dpt.finfo(x2.dtype).eps
-
-    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
-    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
-
-    # check stided input
-    m_np = m_np[:-1, :-1]
-    x_np = np.matmul(m_np.T, m_np)
-
-    m = m[:-1, :-1]
-    mT = dpt.asarray(m.mT, copy=True, order="C")
-    x1 = dpt.matmul(m.mT, m)
-    x2 = dpt.matmul(mT, m)
-
-    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
-    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_matmul_largish_rect(dtype, random_matrix):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m_np = random_matrix.astype(dtype)[:, :-1]
-    x_np = np.matmul(m_np.T[:-2, :], m_np)
-
-    m = dpt.asarray(m_np)
-    mmT = m.mT[:-2, :]
-    mT = dpt.asarray(mmT, copy=True, order="C")
-    x1 = dpt.matmul(mmT, m)
-    x2 = dpt.matmul(mT, m)
-
-    tol = 0
-    if dpt.isdtype(x2.dtype, ("real floating", "complex floating")):
-        tol = 32 * dpt.finfo(x2.dtype).eps
-
-    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
-    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
-
-    m_np = m_np[:-1, :-1]
-    x_np = np.matmul(m_np.T[:-2, :], m_np)
-
-    m = m[:-1, :-1]
-    mmT = m.mT[:-2, :]
-    mT = dpt.asarray(mmT, copy=True, order="C")
-    x1 = dpt.matmul(mmT, m)
-    x2 = dpt.matmul(mT, m)
-
-    assert dpt.allclose(x1, x2, atol=tol, rtol=tol)
-    assert dpt.allclose(x1, dpt.asarray(x_np), atol=tol, rtol=tol)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_tensordot_outer(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    t1 = dpt.ones((3, 8), dtype=dtype)
-    t2 = dpt.ones((4, 12), dtype=dtype)
-
-    r = dpt.tensordot(t1, t2, axes=0)
-    assert r.shape == t1.shape + t2.shape
-    assert dpt.allclose(r, dpt.ones_like(r))
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_tensordot_inner(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    t1 = dpt.ones((3, 8), dtype=dtype)
-    t2 = dpt.ones((4, 8), dtype=dtype)
-
-    r = dpt.tensordot(t1, t2.mT, axes=1)
-    assert r.shape == t1.shape[:1] + t2.shape[:1]
-    assert dpt.allclose(r, dpt.full_like(r, fill_value=t1.shape[1]))
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_tensordot_double(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    t1 = dpt.ones((2, 4, 8), dtype=dtype)
-    t2 = dpt.ones((3, 4, 8), dtype=dtype)
-
-    r = dpt.tensordot(t1, dpt.permute_dims(t2, (1, 2, 0)), axes=2)
-    assert r.shape == t1.shape[:1] + t2.shape[:1]
-    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
-    assert dpt.allclose(r, dpt.full_like(r, fill_value=expected))
-
-
-@pytest.mark.parametrize("dtype", ["i4", "f4"])
-def test_tensordot_axes_sequence(dtype):
-    get_queue_or_skip()
-
-    r = 4
-    t1 = dpt.ones((2, 2, 4, 3), dtype=dtype)
-    t2 = dpt.ones((3, 2, 4, 3), dtype=dtype)
-
-    assert len(t1.shape) == r
-    assert len(t2.shape) == r
-
-    expected = dpt.prod(dpt.asarray(t1.shape[1:]))
-    ps1 = itertools.permutations(range(r))
-    ps2 = itertools.permutations(range(r))
-
-    for p1 in ps1:
-        assert len(p1) == r
-        inv_p1 = sorted(range(r), key=p1.__getitem__)
-        u1 = dpt.permute_dims(t1, p1)
-        x1_axes = inv_p1[1:]
-        for p2 in ps2:
-            inv_p2 = sorted(range(r), key=p2.__getitem__)
-            u2 = dpt.permute_dims(t2, p2)
-            x2_axes = inv_p2[1:]
-
-            tdr = dpt.tensordot(u1, u2, axes=(x1_axes, x2_axes))
-            assert tdr.shape == t1.shape[:1] + t2.shape[:1]
-            assert dpt.allclose(tdr, dpt.full_like(tdr, fill_value=expected))
-
-
-def test_tensordot_validation():
-    get_queue_or_skip()
-
-    with pytest.raises(TypeError):
-        dpt.tensordot(dict(), dict())
-
-    t1 = dpt.empty((10, 10, 10))
-    with pytest.raises(TypeError):
-        dpt.tensordot(t1, dict())
-
-    t2 = dpt.empty((10, 10, 10))
-    q = dpctl.SyclQueue(t2.sycl_context, t2.sycl_device, property="in_order")
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
-        dpt.tensordot(t1, t2.to_device(q))
-
-    invalid_axes = (
-        1,
-        2,
-        3,
-    )
-    with pytest.raises(ValueError):
-        dpt.tensordot(t1, t2, axes=invalid_axes)
-
-    invalid_axes = 5.2
-    with pytest.raises(TypeError):
-        dpt.tensordot(t1, t2, axes=invalid_axes)
-
-    invalid_axes = (
-        (1,),
-        (
-            0,
-            2,
-        ),
-    )
-    with pytest.raises(ValueError):
-        dpt.tensordot(t1, t2, axes=invalid_axes)
-
-    with pytest.raises(ValueError):
-        dpt.tensordot(t1[..., :5], t2)
-
-
-def test_tensordot_promotion():
-    get_queue_or_skip()
-
-    t1 = dpt.zeros((10, 10), dtype="i4")
-    t2 = dpt.zeros((10, 10), dtype="i8")
-
-    r1 = dpt.tensordot(t1, t2)
-    assert r1.dtype == t2.dtype
-
-    r2 = dpt.tensordot(t2, t1)
-    assert r2.dtype == t2.dtype
-
-    t3 = dpt.zeros((10, 10), dtype="u4")
-    r3 = dpt.tensordot(t1, t3)
-    assert r3.dtype == dpt.result_type(t1, t3)
-
-
-def test_tensordot_axes_errors():
-    get_queue_or_skip()
-
-    m1 = dpt.zeros((10, 10), dtype="i4")
-    m2 = dpt.zeros((10, 10), dtype="i4")
-
-    with pytest.raises(ValueError):
-        dpt.tensordot(m1, m2, axes=-1)
-
-
-# tests for gh-1570
-def test_tensordot_gemm_small_k_m():
-    get_queue_or_skip()
-
-    x1 = dpt.asarray(1, dtype="i2")
-    x2 = dpt.asarray([0, 1, 0, 0], dtype="i2")
-
-    res = dpt.tensordot(x1, x2, axes=0)
-    assert dpt.all(x2 == res)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_vecdot_1d(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n = 511
-    v1 = dpt.ones(n, dtype=dtype)
-
-    v2 = dpt.ones(n, dtype=dtype)
-
-    r = dpt.vecdot(v1, v2)
-    expected_value = _map_int_to_type(n, r.dtype)
-    assert r == expected_value
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_vecdot_3d(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m1, m2, n = 7, 3, 511
-    v1 = dpt.ones((m1, m2, n), dtype=dtype)
-
-    v2 = dpt.ones((m1, m2, n), dtype=dtype)
-
-    r = dpt.vecdot(v1, v2)
-
-    assert r.shape == (
-        m1,
-        m2,
-    )
-    expected_value = _map_int_to_type(n, r.dtype)
-    assert dpt.all(r == expected_value)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_vecdot_axis(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m1, m2, n = 7, 3, 511
-    v1 = dpt.ones((m1, n, m2), dtype=dtype)
-
-    v2 = dpt.ones((m1, n, m2), dtype=dtype)
-
-    r = dpt.vecdot(v1, v2, axis=-2)
-
-    assert r.shape == (
-        m1,
-        m2,
-    )
-    expected_value = _map_int_to_type(n, r.dtype)
-    assert dpt.all(r == expected_value)
-
-
-@pytest.mark.parametrize("dtype", _numeric_types)
-def test_vecdot_strided(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m1, m2, n = 7, 3, 511
-    list1 = [1, 0, 2, 0]
-    pattern1 = dpt.asarray(list1, dtype=dtype)
-    n_padded1 = pattern1.size * (1 + ((n - 1) // pattern1.size))
-    v1 = dpt.tile(dpt.reshape(pattern1, (1, -1, 1)), (m1, n_padded1, m2))[
-        ::-1, :n, :
-    ]
-
-    list2 = [1, 2, 1, 2]
-    pattern2 = dpt.asarray(list2, dtype=dtype)
-    n_padded2 = pattern2.size * (1 + ((n - 1) // pattern2.size))
-    v2 = dpt.tile(dpt.reshape(pattern2, (1, -1, 1)), (m1, n_padded2, m2))[
-        :, :n, ::-1
-    ]
-
-    r = dpt.vecdot(v1, v2, axis=-2)
-
-    ref = sum(
-        el1 * el2
-        for el1, el2 in zip((list1 * n_padded1)[:n], (list2 * n_padded1)[:n])
-    )
-
-    assert r.shape == (
-        m1,
-        m2,
-    )
-    ref = _map_int_to_type(ref, r.dtype)
-    assert dpt.all(r == ref)
-
-
-def test_vector_arg_validation():
-    get_queue_or_skip()
-
-    s1, s2 = dpt.ones(tuple()), dpt.zeros(tuple())
-    v1, v2 = dpt.ones(16), dpt.zeros(16)
-
-    with pytest.raises(ValueError):
-        dpt.vecdot(s1, v2)
-
-    with pytest.raises(ValueError):
-        dpt.vecdot(v1, s2)
-
-    with pytest.raises(TypeError):
-        dpt.vecdot(dict(), v2)
-
-    with pytest.raises(TypeError):
-        dpt.vecdot(v2, None)
-
-    with pytest.raises(ValueError):
-        dpt.vecdot(v1[:5], v2[:4])
-
-    with pytest.raises(ValueError):
-        dpt.vecdot(v1, v2, axis=2)
-
-    with pytest.raises(ValueError):
-        dpt.vecdot(v1, v2, axis=-2)
-
-    q = dpctl.SyclQueue(
-        v2.sycl_context, v2.sycl_device, property="enable_profiling"
-    )
-    with pytest.raises(dpctl.utils.ExecutionPlacementError):
-        dpt.vecdot(v1, v2.to_device(q))
-
-    m1 = dpt.empty((10, 5))
-    m2 = dpt.empty((5, 5))
-    with pytest.raises(ValueError):
-        dpt.vecdot(m1, m2, axis=-1)
-
-
-def test_vecdot_broadcast():
-    get_queue_or_skip()
-
-    for dt1, dt2 in [
-        (dpt.int32, dpt.int32),
-        (dpt.int32, dpt.int64),
-        (dpt.int64, dpt.int32),
-        (dpt.int32, dpt.uint32),
-    ]:
-        m1 = dpt.zeros((1, 5), dtype=dt1)
-        m2 = dpt.zeros((5, 5), dtype=dt2)
-        r1 = dpt.vecdot(m1, m2, axis=-1)
-        r2 = dpt.vecdot(m2, m1, axis=-1)
-        assert r1.shape == r2.shape
-
-
-@pytest.mark.parametrize("dt1", _numeric_types)
-@pytest.mark.parametrize("dt2", _numeric_types)
-def test_vecdot_type_promotion(dt1, dt2):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt1, q)
-    skip_if_dtype_not_supported(dt2, q)
-
-    v1 = dpt.ones(128, dtype=dt1)
-    v2 = dpt.ones(128, dtype=dt2)
-
-    r = dpt.vecdot(v1, v2)
-    mul = v1 * v2
-    assert r.shape == tuple()
-    assert r.dtype == mul.dtype
-    assert dpt.allclose(r, dpt.sum(mul, dtype=mul.dtype))
-
-
-def test_vecdot_broadcast_o1_buffer():
-    get_queue_or_skip()
-
-    v1 = dpt.arange(10, dtype="i2")
-    v2 = dpt.ones((5, 10), dtype="i4")
-
-    res1 = dpt.vecdot(v1, v2)
-    assert res1.shape == (5,)
-
-    res2 = dpt.vecdot(v2, v1)
-    assert res2.shape == (5,)
-
-
-def test_vecdot_contig_small():
-    get_queue_or_skip()
-
-    n = 1
-    for dt in [dpt.int16, dpt.int32, dpt.complex64]:
-        v1 = dpt.zeros((10, n), dtype=dt)
-        v2 = dpt.ones_like(v1, dtype=dt)
-        v1[-1] = 1
-        res = dpt.vecdot(v1, v2)
-        assert dpt.all(res[:-1] == 0)
-        assert res[-1] == n
-
-
-def test_matmul_out_appended_axes():
-    get_queue_or_skip()
-
-    n0, n1, n2 = 4, 10, 5
-    # vm
-    x1 = dpt.ones(n1, dtype="i4")
-    x2 = dpt.ones((n0, n1, n2), dtype="i4")
-    out = dpt.empty((n0, n2), dtype="i4")
-
-    dpt.matmul(x1, x2, out=out)
-    assert dpt.all(out == n1)
-
-    # mv
-    x2 = x2.mT
-    x1, x2 = x2, x1
-    dpt.matmul(x1, x2, out=out)
-    assert dpt.all(out == n1)
-
-    # vv
-    x1 = dpt.ones(n1, dtype="i4")
-    out = dpt.empty((), dtype="i4")
-    dpt.matmul(x1, x2, out=out)
-    assert out == n1
diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py
deleted file mode 100644
index b278761811..0000000000
--- a/dpctl/tests/test_usm_ndarray_manipulation.py
+++ /dev/null
@@ -1,1597 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_, assert_array_equal, assert_raises_regex
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tensor._numpy_helper import AxisError
-from dpctl.tests.helper import get_queue_or_skip
-from dpctl.utils import ExecutionPlacementError
-
-
-def test_permute_dims_incorrect_type():
-    X_list = list([[1, 2, 3], [4, 5, 6]])
-    X_tuple = tuple(X_list)
-    Xnp = np.array(X_list)
-
-    pytest.raises(TypeError, dpt.permute_dims, X_list, (1, 0))
-    pytest.raises(TypeError, dpt.permute_dims, X_tuple, (1, 0))
-    pytest.raises(TypeError, dpt.permute_dims, Xnp, (1, 0))
-
-
-def test_permute_dims_empty_array():
-    q = get_queue_or_skip()
-
-    Xnp = np.empty((10, 0))
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.permute_dims(X, (1, 0))
-    Ynp = np.transpose(Xnp, (1, 0))
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_permute_dims_0d_1d():
-    q = get_queue_or_skip()
-
-    Xnp_0d = np.array(1, dtype="int64")
-    X_0d = dpt.asarray(Xnp_0d, sycl_queue=q)
-    Y_0d = dpt.permute_dims(X_0d, ())
-    assert_array_equal(dpt.asnumpy(Y_0d), dpt.asnumpy(X_0d))
-
-    Xnp_1d = np.random.randint(0, 2, size=6, dtype="int64")
-    X_1d = dpt.asarray(Xnp_1d, sycl_queue=q)
-    Y_1d = dpt.permute_dims(X_1d, (0))
-    assert_array_equal(dpt.asnumpy(Y_1d), dpt.asnumpy(X_1d))
-
-    pytest.raises(ValueError, dpt.permute_dims, X_1d, ())
-    pytest.raises(AxisError, dpt.permute_dims, X_1d, (1))
-    pytest.raises(ValueError, dpt.permute_dims, X_1d, (1, 0))
-    pytest.raises(
-        ValueError, dpt.permute_dims, dpt.reshape(X_1d, (2, 3)), (1, 1)
-    )
-
-
-@pytest.mark.parametrize("shapes", [(2, 2), (1, 4), (3, 3, 3), (4, 1, 3)])
-def test_permute_dims_2d_3d(shapes):
-    q = get_queue_or_skip()
-
-    Xnp_size = np.prod(shapes)
-
-    Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    X_ndim = X.ndim
-    if X_ndim == 2:
-        Y = dpt.permute_dims(X, (1, 0))
-        Ynp = np.transpose(Xnp, (1, 0))
-    elif X_ndim == 3:
-        X = dpt.asarray(Xnp, sycl_queue=q)
-        Y = dpt.permute_dims(X, (2, 0, 1))
-        Ynp = np.transpose(Xnp, (2, 0, 1))
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_expand_dims_incorrect_type():
-    X_list = [1, 2, 3, 4, 5]
-    with pytest.raises(TypeError):
-        dpt.permute_dims(X_list, axis=1)
-
-
-def test_expand_dims_0d():
-    q = get_queue_or_skip()
-
-    Xnp = np.array(1, dtype="int64")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Y = dpt.expand_dims(X, axis=0)
-    Ynp = np.expand_dims(Xnp, axis=0)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Y = dpt.expand_dims(X, axis=-1)
-    Ynp = np.expand_dims(Xnp, axis=-1)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    pytest.raises(AxisError, dpt.expand_dims, X, axis=1)
-    pytest.raises(AxisError, dpt.expand_dims, X, axis=-2)
-
-
-@pytest.mark.parametrize("shapes", [(3,), (3, 3), (3, 3, 3)])
-def test_expand_dims_1d_3d(shapes):
-    q = get_queue_or_skip()
-
-    Xnp_size = np.prod(shapes)
-
-    Xnp = np.random.randint(0, 2, size=Xnp_size, dtype="int64").reshape(shapes)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    shape_len = len(shapes)
-    for axis in range(-shape_len - 1, shape_len):
-        Y = dpt.expand_dims(X, axis=axis)
-        Ynp = np.expand_dims(Xnp, axis=axis)
-        assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    pytest.raises(AxisError, dpt.expand_dims, X, axis=shape_len + 1)
-    pytest.raises(AxisError, dpt.expand_dims, X, axis=-shape_len - 2)
-
-
-@pytest.mark.parametrize(
-    "axes", [(0, 1, 2), (0, -1, -2), (0, 3, 5), (0, -3, -5)]
-)
-def test_expand_dims_tuple(axes):
-    q = get_queue_or_skip()
-
-    Xnp = np.empty((3, 3, 3), dtype="u1")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.expand_dims(X, axis=axes)
-    Ynp = np.expand_dims(Xnp, axis=axes)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_expand_dims_incorrect_tuple():
-    try:
-        X = dpt.empty((3, 3, 3), dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    with pytest.raises(AxisError):
-        dpt.expand_dims(X, axis=(0, -6))
-    with pytest.raises(AxisError):
-        dpt.expand_dims(X, axis=(0, 5))
-
-    with pytest.raises(ValueError):
-        dpt.expand_dims(X, axis=(1, 1))
-
-
-def test_squeeze_incorrect_type():
-    X_list = [1, 2, 3, 4, 5]
-    with pytest.raises(TypeError):
-        dpt.permute_dims(X_list, 1)
-
-
-def test_squeeze_0d():
-    q = get_queue_or_skip()
-
-    Xnp = np.array(1)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.squeeze(X)
-    Ynp = Xnp.squeeze()
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Y = dpt.squeeze(X, 0)
-    Ynp = Xnp.squeeze(0)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Y = dpt.squeeze(X, (0))
-    Ynp = Xnp.squeeze((0))
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Y = dpt.squeeze(X, -1)
-    Ynp = Xnp.squeeze(-1)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    pytest.raises(AxisError, dpt.squeeze, X, 1)
-    pytest.raises(AxisError, dpt.squeeze, X, -2)
-    pytest.raises(AxisError, dpt.squeeze, X, (1))
-    pytest.raises(AxisError, dpt.squeeze, X, (-2))
-    pytest.raises(ValueError, dpt.squeeze, X, (0, 0))
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        (0),
-        (1),
-        (1, 2),
-        (2, 1),
-        (1, 1),
-        (2, 2),
-        (1, 0),
-        (0, 1),
-        (1, 2, 1),
-        (2, 1, 2),
-        (2, 2, 2),
-        (1, 1, 1),
-        (1, 0, 1),
-        (0, 1, 0),
-    ],
-)
-def test_squeeze_without_axes(shapes):
-    q = get_queue_or_skip()
-
-    Xnp = np.empty(shapes, dtype="u1")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.squeeze(X)
-    Ynp = Xnp.squeeze()
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("axes", [0, 2, (0), (2), (0, 2)])
-def test_squeeze_axes_arg(axes):
-    q = get_queue_or_skip()
-
-    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.squeeze(X, axes)
-    Ynp = Xnp.squeeze(axes)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize("axes", [1, -2, (1), (-2), (0, 0), (1, 1)])
-def test_squeeze_axes_arg_error(axes):
-    q = get_queue_or_skip()
-
-    Xnp = np.array([[[1], [2], [3]]], dtype="u1")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    pytest.raises(ValueError, dpt.squeeze, X, axes)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [np.array(0, dtype="u1"), (0,)],
-        [np.array(0, dtype="u1"), (1,)],
-        [np.array(0, dtype="u1"), (3,)],
-        [np.ones(1, dtype="u1"), (1,)],
-        [np.ones(1, dtype="u1"), (2,)],
-        [np.ones(1, dtype="u1"), (1, 2, 3)],
-        [np.arange(3, dtype="u1"), (3,)],
-        [np.arange(3, dtype="u1"), (1, 3)],
-        [np.arange(3, dtype="u1"), (2, 3)],
-        [np.ones(0, dtype="u1"), 0],
-        [np.ones(1, dtype="u1"), 1],
-        [np.ones(1, dtype="u1"), 2],
-        [np.ones(1, dtype="u1"), (0,)],
-        [np.ones((1, 2), dtype="u1"), (0, 2)],
-        [np.ones((2, 1), dtype="u1"), (2, 0)],
-    ],
-)
-def test_broadcast_to_succeeds(data):
-    q = get_queue_or_skip()
-
-    Xnp, target_shape = data
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.broadcast_to(X, target_shape)
-    Ynp = np.broadcast_to(Xnp, target_shape)
-    assert_array_equal(dpt.asnumpy(Y), Ynp)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(0,), ()],
-        [(1,), ()],
-        [(3,), ()],
-        [(3,), (1,)],
-        [(3,), (2,)],
-        [(3,), (4,)],
-        [(1, 2), (2, 1)],
-        [(1, 1), (1,)],
-        [(1,), -1],
-        [(1,), (-1,)],
-        [(1, 2), (-1, 2)],
-    ],
-)
-def test_broadcast_to_raises(data):
-    q = get_queue_or_skip()
-
-    orig_shape, target_shape = data
-    Xnp = np.zeros(orig_shape, dtype="i1")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    pytest.raises(ValueError, dpt.broadcast_to, X, target_shape)
-
-
-def assert_broadcast_correct(input_shapes):
-    q = get_queue_or_skip()
-    np_arrays = [np.zeros(s, dtype="i1") for s in input_shapes]
-    out_np_arrays = np.broadcast_arrays(*np_arrays)
-    usm_arrays = [dpt.asarray(Xnp, sycl_queue=q) for Xnp in np_arrays]
-    out_usm_arrays = dpt.broadcast_arrays(*usm_arrays)
-    for Xnp, X in zip(out_np_arrays, out_usm_arrays):
-        assert_array_equal(
-            Xnp, dpt.asnumpy(X), err_msg=f"Failed for {input_shapes})"
-        )
-
-
-def assert_broadcast_arrays_raise(input_shapes):
-    q = get_queue_or_skip()
-    usm_arrays = [dpt.asarray(np.zeros(s), sycl_queue=q) for s in input_shapes]
-    pytest.raises(ValueError, dpt.broadcast_arrays, *usm_arrays)
-
-
-def test_broadcast_arrays_same():
-    q = get_queue_or_skip()
-    Xnp = np.arange(10)
-    Ynp = np.arange(10)
-    res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-    res_X, res_Y = dpt.broadcast_arrays(X, Y)
-    assert_array_equal(res_Xnp, dpt.asnumpy(res_X))
-    assert_array_equal(res_Ynp, dpt.asnumpy(res_Y))
-
-
-def test_broadcast_arrays_one_off():
-    q = get_queue_or_skip()
-    Xnp = np.array([[1, 2, 3]])
-    Ynp = np.array([[1], [2], [3]])
-    res_Xnp, res_Ynp = np.broadcast_arrays(Xnp, Ynp)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-    res_X, res_Y = dpt.broadcast_arrays(X, Y)
-    assert_array_equal(res_Xnp, dpt.asnumpy(res_X))
-    assert_array_equal(res_Ynp, dpt.asnumpy(res_Y))
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        (),
-        (1,),
-        (3,),
-        (0, 1),
-        (0, 3),
-        (1, 0),
-        (3, 0),
-        (1, 3),
-        (3, 1),
-        (3, 3),
-    ],
-)
-def test_broadcast_arrays_same_shapes(shapes):
-    for shape in shapes:
-        single_input_shapes = [shape]
-        assert_broadcast_correct(single_input_shapes)
-        double_input_shapes = [shape, shape]
-        assert_broadcast_correct(double_input_shapes)
-        triple_input_shapes = [shape, shape, shape]
-        assert_broadcast_correct(triple_input_shapes)
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        [[(1,), (3,)]],
-        [[(1, 3), (3, 3)]],
-        [[(3, 1), (3, 3)]],
-        [[(1, 3), (3, 1)]],
-        [[(1, 1), (3, 3)]],
-        [[(1, 1), (1, 3)]],
-        [[(1, 1), (3, 1)]],
-        [[(1, 0), (0, 0)]],
-        [[(0, 1), (0, 0)]],
-        [[(1, 0), (0, 1)]],
-        [[(1, 1), (0, 0)]],
-        [[(1, 1), (1, 0)]],
-        [[(1, 1), (0, 1)]],
-    ],
-)
-def test_broadcast_arrays_same_len_shapes(shapes):
-    # Check that two different input shapes of the same length, but some have
-    # ones, broadcast to the correct shape.
-
-    for input_shapes in shapes:
-        assert_broadcast_correct(input_shapes)
-        assert_broadcast_correct(input_shapes[::-1])
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        [[(), (3,)]],
-        [[(3,), (3, 3)]],
-        [[(3,), (3, 1)]],
-        [[(1,), (3, 3)]],
-        [[(), (3, 3)]],
-        [[(1, 1), (3,)]],
-        [[(1,), (3, 1)]],
-        [[(1,), (1, 3)]],
-        [[(), (1, 3)]],
-        [[(), (3, 1)]],
-        [[(), (0,)]],
-        [[(0,), (0, 0)]],
-        [[(0,), (0, 1)]],
-        [[(1,), (0, 0)]],
-        [[(), (0, 0)]],
-        [[(1, 1), (0,)]],
-        [[(1,), (0, 1)]],
-        [[(1,), (1, 0)]],
-        [[(), (1, 0)]],
-        [[(), (0, 1)]],
-    ],
-)
-def test_broadcast_arrays_different_len_shapes(shapes):
-    # Check that two different input shapes (of different lengths) broadcast
-    # to the correct shape.
-
-    for input_shapes in shapes:
-        assert_broadcast_correct(input_shapes)
-        assert_broadcast_correct(input_shapes[::-1])
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        [[(3,), (4,)]],
-        [[(2, 3), (2,)]],
-        [[(3,), (3,), (4,)]],
-        [[(1, 3, 4), (2, 3, 3)]],
-    ],
-)
-def test_incompatible_shapes_raise_valueerror(shapes):
-    for input_shapes in shapes:
-        assert_broadcast_arrays_raise(input_shapes)
-        assert_broadcast_arrays_raise(input_shapes[::-1])
-
-
-def test_broadcast_arrays_no_args():
-    with pytest.raises(ValueError):
-        dpt.broadcast_arrays()
-
-
-def test_flip_axis_incorrect():
-    q = get_queue_or_skip()
-
-    X_np = np.ones((4, 4))
-    X = dpt.asarray(X_np, sycl_queue=q)
-
-    pytest.raises(AxisError, dpt.flip, dpt.asarray(np.ones(4)), axis=1)
-    pytest.raises(AxisError, dpt.flip, X, axis=2)
-    pytest.raises(AxisError, dpt.flip, X, axis=-3)
-    pytest.raises(AxisError, dpt.flip, X, axis=(0, 3))
-
-
-def test_flip_0d():
-    q = get_queue_or_skip()
-
-    Xnp = np.array(1, dtype="int64")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Ynp = np.flip(Xnp)
-    Y = dpt.flip(X)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    pytest.raises(AxisError, dpt.flip, X, axis=0)
-    pytest.raises(AxisError, dpt.flip, X, axis=1)
-    pytest.raises(AxisError, dpt.flip, X, axis=-1)
-
-
-def test_flip_1d():
-    q = get_queue_or_skip()
-
-    Xnp = np.arange(6)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    for ax in range(-X.ndim, X.ndim):
-        Ynp = np.flip(Xnp, axis=ax)
-        Y = dpt.flip(X, axis=ax)
-        assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Ynp = np.flip(Xnp, axis=0)
-    Y = dpt.flip(X, axis=0)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        (3, 2),
-        (2, 3),
-        (2, 2),
-        (3, 3),
-        (3, 2, 3),
-        (2, 3, 2),
-        (2, 2, 2),
-        (3, 3, 3),
-    ],
-)
-def test_flip_2d_3d(shapes):
-    q = get_queue_or_skip()
-
-    Xnp_size = np.prod(shapes)
-    Xnp = np.arange(Xnp_size).reshape(shapes)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    for ax in range(-X.ndim, X.ndim):
-        Y = dpt.flip(X, axis=ax)
-        Ynp = np.flip(Xnp, axis=ax)
-        assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        (1,),
-        (3,),
-        (2, 3),
-        (3, 2),
-        (2, 2),
-        (1, 2, 3),
-        (2, 1, 3),
-        (2, 3, 1),
-        (3, 2, 1),
-        (3, 3, 3),
-    ],
-)
-def test_flip_default_axes(shapes):
-    q = get_queue_or_skip()
-
-    Xnp_size = np.prod(shapes)
-    Xnp = np.arange(Xnp_size).reshape(shapes)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.flip(X)
-    Ynp = np.flip(Xnp)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "shapes",
-    [
-        (0),
-        (1),
-        (1, 1),
-        (1, 0),
-        (0, 1),
-        (1, 1, 1),
-        (1, 0, 1),
-        (0, 1, 0),
-    ],
-)
-def test_flip_empty_0_size_dim(shapes):
-    q = get_queue_or_skip()
-
-    X = dpt.empty(shapes, sycl_queue=q)
-    dpt.flip(X)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(2, 3), (0, 1)],
-        [(2, 3), (1, 0)],
-        [(2, 3), ()],
-        [(2, 1, 3), (0, 2)],
-        [(3, 1, 2), (2, 0)],
-        [(3, 3, 3), (2,)],
-        [(1, 2, 3), [0, -2]],
-        [(3, 1, 2), [-1, 0]],
-        [(3, 3, 3), [-2, -1]],
-    ],
-)
-def test_flip_multiple_axes(data):
-    q = get_queue_or_skip()
-
-    shape, axes = data
-    Xnp_size = np.prod(shape)
-    Xnp = np.arange(Xnp_size).reshape(shape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    Y = dpt.flip(X, axis=axes)
-    Ynp = np.flip(Xnp, axis=axes)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_roll_scalar():
-    q = get_queue_or_skip()
-
-    Xnp = np.ones([], dtype="f4")
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Y = dpt.roll(X, 1)
-    Ynp = np.roll(Xnp, 1)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-    with pytest.raises(AxisError):
-        dpt.roll(X, 1, axis=0)
-    with pytest.raises(AxisError):
-        dpt.roll(X, 1, axis=1)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [2, None],
-        [-2, None],
-        [2, 0],
-        [-2, 0],
-        [2, ()],
-        [11, 0],
-    ],
-)
-def test_roll_1d(data):
-    q = get_queue_or_skip()
-
-    Xnp = np.arange(10)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    sh, ax = data
-
-    Y = dpt.roll(X, sh, axis=ax)
-    Ynp = np.roll(Xnp, sh, axis=ax)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Y = dpt.roll(X, sh, axis=ax)
-    Ynp = np.roll(Xnp, sh, axis=ax)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [1, None],
-        [1, 0],
-        [1, 1],
-        [1, ()],
-        # Roll multiple axes at once
-        [1, (0, 1)],
-        [(1, 0), (0, 1)],
-        [(-1, 0), (1, 0)],
-        [(0, 1), (0, 1)],
-        [(0, -1), (0, 1)],
-        [(1, 1), (0, 1)],
-        [(-1, -1), (0, 1)],
-        # Roll the same axis multiple times.
-        [1, (0, 0)],
-        [1, (1, 1)],
-        # Roll more than one turn in either direction.
-        [6, 1],
-        [-4, 1],
-    ],
-)
-def test_roll_2d(data):
-    q = get_queue_or_skip()
-
-    Xnp = np.arange(10).reshape(2, 5)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-    sh, ax = data
-
-    Y = dpt.roll(X, sh, axis=ax)
-    Ynp = np.roll(Xnp, sh, axis=ax)
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-def test_roll_out_bounds_shifts():
-    "See gh-1857"
-    get_queue_or_skip()
-
-    x = dpt.arange(4)
-    y = dpt.roll(x, np.uint64(2**63 + 2))
-    expected = dpt.roll(x, 2)
-    assert dpt.all(y == expected)
-
-    x_empty = x[1:1]
-    y = dpt.roll(x_empty, 11)
-    assert y.size == 0
-
-    x_2d = dpt.reshape(x, (2, 2))
-    y = dpt.roll(x_2d, np.uint64(2**63 + 1), axis=1)
-    expected = dpt.roll(x_2d, 1, axis=1)
-    assert dpt.all(y == expected)
-
-    x_2d_empty = x_2d[:, 1:1]
-    y = dpt.roll(x_2d_empty, 3, axis=1)
-    expected = dpt.empty_like(x_2d_empty)
-    assert dpt.all(y == expected)
-
-
-def test_roll_validation():
-    get_queue_or_skip()
-
-    X = dict()
-    with pytest.raises(TypeError):
-        dpt.roll(X)
-
-    X = dpt.empty((1, 2, 3))
-    shift = ((2, 3, 1), (1, 2, 3))
-    with pytest.raises(ValueError):
-        dpt.roll(X, shift=shift, axis=(0, 1, 2))
-
-
-def test_concat_incorrect_type():
-    Xnp = np.ones((2, 2))
-    with pytest.raises(TypeError):
-        dpt.concat()
-    with pytest.raises(TypeError):
-        dpt.concat([])
-    with pytest.raises(TypeError):
-        dpt.concat(Xnp)
-    with pytest.raises(TypeError):
-        dpt.concat([Xnp, Xnp])
-
-
-def test_concat_incorrect_queue():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    X = dpt.ones((2, 2), sycl_queue=q1)
-    Y = dpt.ones((2, 2), sycl_queue=q2)
-
-    pytest.raises(ValueError, dpt.concat, [X, Y])
-
-
-def test_concat_different_dtype():
-    q = get_queue_or_skip()
-
-    X = dpt.ones((2, 2), dtype=np.int64, sycl_queue=q)
-    Y = dpt.ones((3, 2), dtype=np.uint32, sycl_queue=q)
-
-    XY = dpt.concat([X, Y])
-
-    assert XY.dtype is X.dtype
-    assert XY.shape == (5, 2)
-    assert XY.sycl_queue == q
-
-    X1 = dpt.arange(10, dtype="i2", sycl_queue=q)
-    Y1 = dpt.arange(5, dtype="i4", sycl_queue=q)
-
-    XY1 = dpt.concat([X1[::2], Y1[::-1]], axis=None)
-    assert XY1.shape == (10,)
-    assert XY1.sycl_queue == q
-    assert XY1.dtype == Y1.dtype
-
-
-def test_concat_incorrect_ndim():
-    q = get_queue_or_skip()
-
-    X = dpt.ones((2, 2), sycl_queue=q)
-    Y = dpt.ones((2, 2, 2), sycl_queue=q)
-
-    pytest.raises(ValueError, dpt.concat, [X, Y])
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(2, 2), (3, 3), 0],
-        [(2, 2), (3, 3), 1],
-        [(3, 2), (3, 3), 0],
-        [(2, 3), (3, 3), 1],
-    ],
-)
-def test_concat_incorrect_shape(data):
-    q = get_queue_or_skip()
-
-    Xshape, Yshape, axis = data
-
-    X = dpt.ones(Xshape, sycl_queue=q)
-    Y = dpt.ones(Yshape, sycl_queue=q)
-
-    pytest.raises(ValueError, dpt.concat, [X, Y], axis=axis)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(6,), 0],
-        [(2, 3), 1],
-        [(3, 2), -1],
-        [(1, 6), 0],
-        [(2, 1, 3), 2],
-    ],
-)
-def test_concat_1array(data):
-    q = get_queue_or_skip()
-
-    Xshape, axis = data
-
-    Xnp = np.arange(6).reshape(Xshape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.concatenate([Xnp], axis=axis)
-    Y = dpt.concat([X], axis=axis)
-
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Ynp = np.concatenate((Xnp,), axis=axis)
-    Y = dpt.concat((X,), axis=axis)
-
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(1,), (1,), 0],
-        [(0, 2), (0, 2), 1],
-        [(0, 2), (2, 2), 0],
-        [(2, 1), (2, 2), -1],
-        [(2, 2, 2), (2, 1, 2), 1],
-        [(3, 3, 3), (2, 2), None],
-    ],
-)
-def test_concat_2arrays(data):
-    q = get_queue_or_skip()
-
-    Xshape, Yshape, axis = data
-
-    Xnp = np.ones(Xshape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.zeros(Yshape)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-
-    Znp = np.concatenate([Xnp, Ynp], axis=axis)
-    Z = dpt.concat([X, Y], axis=axis)
-
-    assert_array_equal(Znp, dpt.asnumpy(Z))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(1,), (1,), (1,), 0],
-        [(0, 2), (2, 2), (1, 2), 0],
-        [(2, 1, 2), (2, 2, 2), (2, 4, 2), 1],
-    ],
-)
-def test_concat_3arrays(data):
-    q = get_queue_or_skip()
-
-    Xshape, Yshape, Zshape, axis = data
-
-    Xnp = np.ones(Xshape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.zeros(Yshape)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-
-    Znp = np.full(Zshape, 2.0)
-    Z = dpt.asarray(Znp, sycl_queue=q)
-
-    Rnp = np.concatenate([Xnp, Ynp, Znp], axis=axis)
-    R = dpt.concat([X, Y, Z], axis=axis)
-
-    assert_array_equal(Rnp, dpt.asnumpy(R))
-
-
-def test_concat_axis_none_strides():
-    q = get_queue_or_skip()
-    Xnp = np.arange(0, 18).reshape((6, 3))
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.arange(20, 36).reshape((4, 2, 2))
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-
-    Znp = np.concatenate([Xnp[::2], Ynp[::2]], axis=None)
-    Z = dpt.concat([X[::2], Y[::2]], axis=None)
-
-    assert_array_equal(Znp, dpt.asnumpy(Z))
-
-
-def test_stack_incorrect_shape():
-    q = get_queue_or_skip()
-
-    X = dpt.ones((1,), sycl_queue=q)
-    Y = dpt.ones((2,), sycl_queue=q)
-
-    with pytest.raises(ValueError):
-        dpt.stack([X, Y], axis=0)
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(6,), 0],
-        [(2, 3), 1],
-        [(3, 2), -1],
-        [(1, 6), 2],
-        [(2, 1, 3), 2],
-    ],
-)
-def test_stack_1array(data):
-    q = get_queue_or_skip()
-
-    shape, axis = data
-
-    Xnp = np.arange(6).reshape(shape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.stack([Xnp], axis=axis)
-    Y = dpt.stack([X], axis=axis)
-
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-    Ynp = np.stack((Xnp,), axis=axis)
-    Y = dpt.stack((X,), axis=axis)
-
-    assert_array_equal(Ynp, dpt.asnumpy(Y))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(1,), 0],
-        [(0, 2), 0],
-        [(2, 0), 0],
-        [(2, 3), 0],
-        [(2, 3), 1],
-        [(2, 3), 2],
-        [(2, 3), -1],
-        [(2, 3), -2],
-        [(2, 2, 2), 1],
-    ],
-)
-def test_stack_2arrays(data):
-    q = get_queue_or_skip()
-
-    shape, axis = data
-
-    Xnp = np.ones(shape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.zeros(shape)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-
-    Znp = np.stack([Xnp, Ynp], axis=axis)
-    Z = dpt.stack([X, Y], axis=axis)
-
-    assert_array_equal(Znp, dpt.asnumpy(Z))
-
-
-@pytest.mark.parametrize(
-    "data",
-    [
-        [(1,), 0],
-        [(0, 2), 0],
-        [(2, 1, 2), 1],
-    ],
-)
-def test_stack_3arrays(data):
-    q = get_queue_or_skip()
-
-    shape, axis = data
-
-    Xnp = np.ones(shape)
-    X = dpt.asarray(Xnp, sycl_queue=q)
-
-    Ynp = np.zeros(shape)
-    Y = dpt.asarray(Ynp, sycl_queue=q)
-
-    Znp = np.full(shape, 2.0)
-    Z = dpt.asarray(Znp, sycl_queue=q)
-
-    Rnp = np.stack([Xnp, Ynp, Znp], axis=axis)
-    R = dpt.stack([X, Y, Z], axis=axis)
-
-    assert_array_equal(Rnp, dpt.asnumpy(R))
-
-
-def test_can_cast():
-    q = get_queue_or_skip()
-
-    # incorrect input
-    X = dpt.ones((2, 2), dtype=dpt.int16, sycl_queue=q)
-    pytest.raises(TypeError, dpt.can_cast, X, 1)
-    pytest.raises(TypeError, dpt.can_cast, X, X)
-    X_np = np.ones((2, 2), dtype=np.int16)
-
-    assert dpt.can_cast(X, "float32") == np.can_cast(X_np, "float32")
-    assert dpt.can_cast(X, dpt.int32) == np.can_cast(X_np, np.int32)
-    assert dpt.can_cast(X, dpt.int64) == np.can_cast(X_np, np.int64)
-
-
-def test_result_type():
-    q = get_queue_or_skip()
-
-    usm_ar = dpt.ones((2), dtype=dpt.int16, sycl_queue=q)
-    np_ar = dpt.asnumpy(usm_ar)
-
-    X = [usm_ar, dpt.int32, "int64", usm_ar]
-    X_np = [np_ar, np.int32, "int64", np_ar]
-
-    assert dpt.result_type(*X) == np.result_type(*X_np)
-
-    X = [usm_ar, dpt.int32, "int64", True]
-    X_np = [np_ar, np.int32, "int64", True]
-
-    assert dpt.result_type(*X) == np.result_type(*X_np)
-
-    X = [usm_ar, dpt.int32, "int64", 2]
-    X_np = [np_ar, np.int32, "int64", 2]
-
-    assert dpt.result_type(*X) == np.result_type(*X_np)
-
-    X = [dpt.int32, "int64", 2]
-    X_np = [np.int32, "int64", 2]
-
-    assert dpt.result_type(*X) == np.result_type(*X_np)
-
-    X = [usm_ar, dpt.int32, "int64", 2.0]
-    X_np = [np_ar, np.int32, "int64", 2.0]
-
-    assert dpt.result_type(*X).kind == np.result_type(*X_np).kind
-
-    X = [usm_ar, dpt.int32, "int64", 2.0 + 1j]
-    X_np = [np_ar, np.int32, "int64", 2.0 + 1j]
-
-    assert dpt.result_type(*X).kind == np.result_type(*X_np).kind
-
-
-def test_swapaxes_1d():
-    get_queue_or_skip()
-    x = np.array([[1, 2, 3]])
-    exp = np.swapaxes(x, 0, 1)
-
-    y = dpt.asarray([[1, 2, 3]])
-    res = dpt.swapaxes(y, 0, 1)
-
-    assert_array_equal(exp, dpt.asnumpy(res))
-
-
-def test_swapaxes_2d():
-    get_queue_or_skip()
-    x = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
-    exp = np.swapaxes(x, 0, 2)
-
-    y = dpt.asarray([[[0, 1], [2, 3]], [[4, 5], [6, 7]]])
-    res = dpt.swapaxes(y, 0, 2)
-
-    assert_array_equal(exp, dpt.asnumpy(res))
-
-
-@pytest.mark.parametrize(
-    "source, expected",
-    [
-        (0, (6, 7, 5)),
-        (1, (5, 7, 6)),
-        (2, (5, 6, 7)),
-        (-1, (5, 6, 7)),
-    ],
-)
-def test_moveaxis_move_to_end(source, expected):
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(5 * 6 * 7), (5, 6, 7))
-    actual = dpt.moveaxis(x, source, -1).shape
-    assert_(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "source, destination, expected",
-    [
-        (0, 1, (2, 1, 3, 4)),
-        (1, 2, (1, 3, 2, 4)),
-        (1, -1, (1, 3, 4, 2)),
-    ],
-)
-def test_moveaxis_new_position(source, destination, expected):
-    get_queue_or_skip()
-    x = dpt.reshape(dpt.arange(24), (1, 2, 3, 4))
-    actual = dpt.moveaxis(x, source, destination).shape
-    assert_(actual, expected)
-
-
-@pytest.mark.parametrize(
-    "source, destination",
-    [
-        (0, 0),
-        (3, -1),
-        (-1, 3),
-        ([0, -1], [0, -1]),
-        ([2, 0], [2, 0]),
-    ],
-)
-def test_moveaxis_preserve_order(source, destination):
-    get_queue_or_skip()
-    x = dpt.zeros((1, 2, 3, 4))
-    actual = dpt.moveaxis(x, source, destination).shape
-    assert_(actual, (1, 2, 3, 4))
-
-
-@pytest.mark.parametrize(
-    "shape, source, destination, expected",
-    [
-        ((0, 1, 2, 3), [0, 1], [2, 3], (2, 3, 0, 1)),
-        ((0, 1, 2, 3), [2, 3], [0, 1], (2, 3, 0, 1)),
-        ((0, 1, 2, 3), [0, 1, 2], [2, 3, 0], (2, 3, 0, 1)),
-        ((0, 1, 2, 3), [3, 0], [1, 0], (0, 3, 1, 2)),
-        ((0, 1, 2, 3), [0, 3], [0, 1], (0, 3, 1, 2)),
-        ((1, 2, 3, 4), range(4), range(4), (1, 2, 3, 4)),
-    ],
-)
-def test_moveaxis_move_multiples(shape, source, destination, expected):
-    get_queue_or_skip()
-    x = dpt.zeros(shape)
-    y = dpt.moveaxis(x, source, destination)
-    actual = y.shape
-    assert_(actual, expected)
-    assert y._pointer == x._pointer
-
-
-def test_moveaxis_errors():
-    try:
-        x_flat = dpt.arange(6)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    x = dpt.reshape(x_flat, (1, 2, 3))
-    assert_raises_regex(
-        AxisError, "source.*out of bounds", dpt.moveaxis, x, 3, 0
-    )
-    assert_raises_regex(
-        AxisError, "source.*out of bounds", dpt.moveaxis, x, -4, 0
-    )
-    assert_raises_regex(
-        AxisError, "destination.*out of bounds", dpt.moveaxis, x, 0, 5
-    )
-    assert_raises_regex(
-        ValueError, "repeated axis in `source`", dpt.moveaxis, x, [0, 0], [0, 1]
-    )
-    assert_raises_regex(
-        ValueError,
-        "repeated axis in `destination`",
-        dpt.moveaxis,
-        x,
-        [0, 1],
-        [1, 1],
-    )
-    assert_raises_regex(
-        ValueError, "must have the same number", dpt.moveaxis, x, 0, [0, 1]
-    )
-    assert_raises_regex(
-        ValueError, "must have the same number", dpt.moveaxis, x, [0, 1], [0]
-    )
-
-
-def test_unstack_axis0():
-    try:
-        x_flat = dpt.arange(6)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    y = dpt.reshape(x_flat, (2, 3))
-    res = dpt.unstack(y)
-
-    assert_array_equal(dpt.asnumpy(y[0, ...]), dpt.asnumpy(res[0]))
-    assert_array_equal(dpt.asnumpy(y[1, ...]), dpt.asnumpy(res[1]))
-
-
-def test_unstack_axis1():
-    try:
-        x_flat = dpt.arange(6)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    y = dpt.reshape(x_flat, (2, 3))
-    res = dpt.unstack(y, axis=1)
-
-    assert_array_equal(dpt.asnumpy(y[:, 0, ...]), dpt.asnumpy(res[0]))
-    assert_array_equal(dpt.asnumpy(y[:, 1, ...]), dpt.asnumpy(res[1]))
-    assert_array_equal(dpt.asnumpy(y[:, 2, ...]), dpt.asnumpy(res[2]))
-
-
-def test_unstack_axis2():
-    try:
-        x_flat = dpt.arange(60)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    y = dpt.reshape(x_flat, (4, 5, 3))
-    res = dpt.unstack(y, axis=2)
-
-    assert_array_equal(dpt.asnumpy(y[:, :, 0, ...]), dpt.asnumpy(res[0]))
-    assert_array_equal(dpt.asnumpy(y[:, :, 1, ...]), dpt.asnumpy(res[1]))
-    assert_array_equal(dpt.asnumpy(y[:, :, 2, ...]), dpt.asnumpy(res[2]))
-
-
-def test_finfo_object():
-    fi = dpt.finfo(dpt.float32)
-    assert isinstance(fi.bits, int)
-    assert isinstance(fi.max, float)
-    assert isinstance(fi.min, float)
-    assert isinstance(fi.eps, float)
-    assert isinstance(fi.epsneg, float)
-    assert isinstance(fi.smallest_normal, float)
-    assert isinstance(fi.tiny, float)
-    assert isinstance(fi.precision, float)
-    assert isinstance(fi.resolution, float)
-    assert isinstance(fi.dtype, dpt.dtype)
-    assert isinstance(str(fi), str)
-    assert isinstance(repr(fi), str)
-
-
-def test_repeat_scalar_sequence_agreement():
-    get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4")
-    expected_res = dpt.empty(10, dtype="i4")
-    expected_res[1::2], expected_res[::2] = x, x
-
-    # scalar case
-    reps = 2
-    res = dpt.repeat(x, reps)
-    assert dpt.all(res == expected_res)
-
-    # tuple
-    reps = (2, 2, 2, 2, 2)
-    res = dpt.repeat(x, reps)
-    assert dpt.all(res == expected_res)
-
-
-def test_repeat_as_broadcasting():
-    get_queue_or_skip()
-
-    reps = 5
-    x = dpt.arange(reps, dtype="i4")
-    x1 = x[:, dpt.newaxis]
-    expected_res = dpt.broadcast_to(x1, (reps, reps))
-
-    res = dpt.repeat(x1, reps, axis=1)
-    assert dpt.all(res == expected_res)
-
-    x2 = x[dpt.newaxis, :]
-    expected_res = dpt.broadcast_to(x2, (reps, reps))
-
-    res = dpt.repeat(x2, reps, axis=0)
-    assert dpt.all(res == expected_res)
-
-
-def test_repeat_axes():
-    get_queue_or_skip()
-
-    reps = 2
-    x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10))
-    expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), dtype=x.dtype)
-    expected_res[::2, :], expected_res[1::2] = x, x
-    res = dpt.repeat(x, reps, axis=0)
-    assert dpt.all(res == expected_res)
-
-    expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), dtype=x.dtype)
-    expected_res[:, ::2], expected_res[:, 1::2] = x, x
-    res = dpt.repeat(x, reps, axis=1)
-    assert dpt.all(res == expected_res)
-
-    x = dpt.arange(10, dtype="i4")
-    expected_res = dpt.empty(x.shape[0] * reps, dtype=x.dtype)
-    expected_res[::2], expected_res[1::2] = x, x
-    res = dpt.repeat(x, reps, axis=0)
-    assert dpt.all(res == expected_res)
-
-
-def test_repeat_size_0_outputs():
-    get_queue_or_skip()
-
-    x = dpt.ones((3, 0, 5), dtype="i4")
-    reps = 10
-    res = dpt.repeat(x, reps, axis=0)
-    assert res.size == 0
-    assert res.shape == (30, 0, 5)
-
-    res = dpt.repeat(x, reps, axis=1)
-    assert res.size == 0
-    assert res.shape == (3, 0, 5)
-
-    res = dpt.repeat(x, (2, 2, 2), axis=0)
-    assert res.size == 0
-    assert res.shape == (6, 0, 5)
-
-    x = dpt.ones((3, 2, 5))
-    res = dpt.repeat(x, 0, axis=1)
-    assert res.size == 0
-    assert res.shape == (3, 0, 5)
-
-    res = dpt.repeat(x, (0, 0), axis=1)
-    assert res.size == 0
-    assert res.shape == (3, 0, 5)
-
-    # axis=None cases
-    res = dpt.repeat(x, 0)
-    assert res.size == 0
-
-    res = dpt.repeat(x, (0,) * x.size)
-    assert res.size == 0
-
-
-def test_repeat_strides():
-    get_queue_or_skip()
-
-    reps = 2
-    x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10))
-    x1 = x[:, ::-2]
-    expected_res = dpt.empty((10, 10), dtype="i4")
-    expected_res[:, ::2], expected_res[:, 1::2] = x1, x1
-    res = dpt.repeat(x1, reps, axis=1)
-    assert dpt.all(res == expected_res)
-    res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1)
-    assert dpt.all(res == expected_res)
-
-    x1 = x[::-2, :]
-    expected_res = dpt.empty((10, 10), dtype="i4")
-    expected_res[::2, :], expected_res[1::2, :] = x1, x1
-    res = dpt.repeat(x1, reps, axis=0)
-    assert dpt.all(res == expected_res)
-    res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0)
-    assert dpt.all(res == expected_res)
-
-    # axis=None
-    x = dpt.reshape(dpt.arange(10 * 10), (10, 10))
-    x1 = dpt.reshape(x[::-2, :], -1)
-    x2 = x[::-2, :]
-    expected_res = dpt.empty(10 * 10, dtype="i4")
-    expected_res[::2], expected_res[1::2] = x1, x1
-    res = dpt.repeat(x2, reps)
-    assert dpt.all(res == expected_res)
-    res = dpt.repeat(x2, (reps,) * x1.size)
-    assert dpt.all(res == expected_res)
-
-
-def test_repeat_casting():
-    get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4")
-    # i4 is cast to i8
-    reps = dpt.ones(5, dtype="i4")
-    res = dpt.repeat(x, reps)
-    assert res.shape == x.shape
-    assert dpt.all(res == x)
-
-
-def test_repeat_strided_repeats():
-    get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4")
-    reps = dpt.ones(10, dtype="i8")
-    reps[::2] = 0
-    reps = reps[::-2]
-    res = dpt.repeat(x, reps)
-    assert res.shape == x.shape
-    assert dpt.all(res == x)
-
-
-def test_repeat_size1_repeats():
-    get_queue_or_skip()
-
-    x = dpt.arange(5, dtype="i4")
-    expected_res = dpt.repeat(x, 2)
-    # 0D repeats
-    reps_0d = dpt.asarray(2, dtype="i8")
-    res = dpt.repeat(x, reps_0d)
-    assert dpt.all(res == expected_res)
-    # 1D repeats
-    reps_1d = dpt.asarray([2], dtype="i8")
-    res = dpt.repeat(x, reps_1d)
-    assert dpt.all(res == expected_res)
-
-
-def test_repeat_arg_validation():
-    get_queue_or_skip()
-
-    x = dict()
-    with pytest.raises(TypeError):
-        dpt.repeat(x, 2)
-
-    # axis must be 0 for scalar
-    x = dpt.empty(())
-    with pytest.raises(ValueError):
-        dpt.repeat(x, 2, axis=1)
-
-    # repeats must be positive
-    x = dpt.empty(5)
-    with pytest.raises(ValueError):
-        dpt.repeat(x, -2)
-
-    # repeats must be integers
-    with pytest.raises(TypeError):
-        dpt.repeat(x, 2.0)
-
-    # repeats tuple must be the same length as axis
-    with pytest.raises(ValueError):
-        dpt.repeat(x, (1, 2))
-
-    # repeats tuple elements must be positive
-    with pytest.raises(ValueError):
-        dpt.repeat(x, (-1,))
-
-    # repeats must be int or tuple
-    with pytest.raises(TypeError):
-        dpt.repeat(x, dict())
-
-    # repeats array must be 0d or 1d
-    with pytest.raises(ValueError):
-        dpt.repeat(x, dpt.ones((1, 1), dtype="i8"))
-
-    # repeats must be castable to i8
-    with pytest.raises(TypeError):
-        dpt.repeat(x, dpt.asarray(2.0, dtype="f4"))
-
-    # compute follows data
-    q2 = dpctl.SyclQueue()
-    reps = dpt.asarray(1, dtype="i8", sycl_queue=q2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.repeat(x, reps)
-
-    # repeats array must not contain negative elements
-    reps = dpt.asarray(-1, dtype="i8")
-    with pytest.raises(ValueError):
-        dpt.repeat(x, reps)
-    reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8")
-    with pytest.raises(ValueError):
-        dpt.repeat(x, reps)
-
-    # repeats must broadcastable to axis size
-    reps = dpt.arange(10, dtype="i8")
-    with pytest.raises(ValueError):
-        dpt.repeat(x, reps)
-
-
-def test_tile_basic():
-    get_queue_or_skip()
-
-    reps = 2
-    x = dpt.arange(5, dtype="i4")
-    res = dpt.tile(x, reps)
-    assert res.shape == (x.shape[0] * reps,)
-    assert dpt.all(res[: x.size] == res[x.size :])
-
-    reps = (2, 1)
-    expected_sh = (2, x.shape[0])
-    expected_res = dpt.broadcast_to(x, expected_sh)
-    res = dpt.tile(x, reps)
-    assert res.shape == expected_sh
-    assert dpt.all(expected_res == res)
-
-
-def test_tile_size_1():
-    get_queue_or_skip()
-
-    reps = 5
-    # test for 0d array
-    x1 = dpt.asarray(2, dtype="i4")
-    res = dpt.tile(x1, reps)
-    assert dpt.all(res == dpt.full(reps, 2, dtype="i4"))
-
-    # test for 1d array with single element
-    x2 = dpt.asarray([2], dtype="i4")
-    res = dpt.tile(x2, reps)
-    assert dpt.all(res == dpt.full(reps, 2, dtype="i4"))
-
-    reps = ()
-    # test for gh-1627 behavior
-    res = dpt.tile(x1, reps)
-    assert x1.shape == res.shape
-    assert_array_equal(dpt.asnumpy(x1), dpt.asnumpy(res))
-
-    res = dpt.tile(x2, reps)
-    assert x2.shape == res.shape
-    assert_array_equal(dpt.asnumpy(x2), dpt.asnumpy(res))
-
-
-def test_tile_prepends_axes():
-    get_queue_or_skip()
-
-    reps = (2,)
-    x = dpt.ones((5, 10), dtype="i4")
-    expected_res = dpt.ones((5, 20), dtype="i4")
-    res = dpt.tile(x, reps)
-    assert dpt.all(res == expected_res)
-
-    reps = (3, 2, 2)
-    expected_res = dpt.ones((3, 10, 20), dtype="i4")
-    res = dpt.tile(x, reps)
-    assert dpt.all(res == expected_res)
-
-
-def test_tile_empty_outputs():
-    get_queue_or_skip()
-
-    x = dpt.asarray((), dtype="i4")
-    reps = 10
-    res = dpt.tile(x, reps)
-    assert res.size == 0
-    assert res.shape == (0,)
-
-    x = dpt.ones((3, 0, 5), dtype="i4")
-    res = dpt.tile(x, reps)
-    assert res.size == 0
-    assert res.shape == (3, 0, 50)
-
-    reps = (2, 1, 2)
-    res = dpt.tile(x, reps)
-    assert res.size == 0
-    assert res.shape == (6, 0, 10)
-
-    x = dpt.ones((2, 3, 4), dtype="i4")
-    reps = (0, 1, 1)
-    res = dpt.tile(x, reps)
-    assert res.size == 0
-    assert res.shape == (0, 3, 4)
-
-
-def test_tile_strides():
-    get_queue_or_skip()
-
-    reps = (1, 2)
-    x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10))
-    x1 = x[:, ::-2]
-    expected_res = dpt.empty((10, 10), dtype="i4")
-    expected_res[:, : x1.shape[1]], expected_res[:, x1.shape[1] :] = x1, x1
-    res = dpt.tile(x1, reps)
-    assert dpt.all(res == expected_res)
-
-    reps = (2, 1)
-    x1 = x[::-2, :]
-    expected_res = dpt.empty((10, 10), dtype="i4")
-    expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1
-    res = dpt.tile(x1, reps)
-    assert dpt.all(res == expected_res)
-
-
-def test_tile_size_1_axes():
-    get_queue_or_skip()
-
-    reps = (1, 2, 1)
-    x = dpt.ones((2, 1, 3), dtype="i4")
-    res = dpt.tile(x, reps)
-    expected_res = dpt.broadcast_to(x, (2, 2, 3))
-    assert dpt.all(res == expected_res)
-
-
-def test_tile_arg_validation():
-    get_queue_or_skip()
-
-    with pytest.raises(TypeError):
-        dpt.tile(dict(), 2)
-
-    # repetitions must be int or tuple
-    x = dpt.empty(())
-    with pytest.raises(TypeError):
-        dpt.tile(x, dict())
-
-
-def test_repeat_0_size():
-    get_queue_or_skip()
-
-    x = dpt.ones((0, 10, 0), dtype="i4")
-    repetitions = 2
-    res = dpt.repeat(x, repetitions)
-    assert res.shape == (0,)
-    res = dpt.repeat(x, repetitions, axis=2)
-    assert res.shape == x.shape
-    res = dpt.repeat(x, repetitions, axis=1)
-    axis_sz = x.shape[1] * repetitions
-    assert res.shape == (0, 20, 0)
-
-    repetitions = dpt.asarray(2, dtype="i4")
-    res = dpt.repeat(x, repetitions)
-    assert res.shape == (0,)
-    res = dpt.repeat(x, repetitions, axis=2)
-    assert res.shape == x.shape
-    res = dpt.repeat(x, repetitions, axis=1)
-    assert res.shape == (0, 20, 0)
-
-    repetitions = dpt.arange(10, dtype="i4")
-    res = dpt.repeat(x, repetitions, axis=1)
-    axis_sz = dpt.sum(repetitions)
-    assert res.shape == (0, axis_sz, 0)
-
-    repetitions = (2,) * 10
-    res = dpt.repeat(x, repetitions, axis=1)
-    axis_sz = 2 * x.shape[1]
-    assert res.shape == (0, axis_sz, 0)
-
-
-def test_result_type_bug_1874():
-    py_sc = True
-    np_sc = np.asarray([py_sc])[0]
-    dts_bool = [py_sc, np_sc]
-    py_sc = int(1)
-    np_sc = np.asarray([py_sc])[0]
-    dts_ints = [py_sc, np_sc]
-    dts_floats = [float(1), np.float64(1)]
-    dts_complexes = [complex(1), np.complex128(1)]
-
-    # iterate over two categories
-    for dts1, dts2 in itertools.product(
-        [dts_bool, dts_ints, dts_floats, dts_complexes], repeat=2
-    ):
-        res_dts = []
-        # iterate over Python scalar/NumPy scalar choices within categories
-        for dt1, dt2 in itertools.product(dts1, dts2):
-            res_dt = dpt.result_type(dt1, dt2)
-            res_dts.append(res_dt)
-        # check that all results are the same
-        assert res_dts and all(res_dts[0] == el for el in res_dts[1:])
diff --git a/dpctl/tests/test_usm_ndarray_operators.py b/dpctl/tests/test_usm_ndarray_operators.py
deleted file mode 100644
index 9608be3cc3..0000000000
--- a/dpctl/tests/test_usm_ndarray_operators.py
+++ /dev/null
@@ -1,142 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-
-
-class Dummy:
-    @staticmethod
-    def abs(a):
-        return a
-
-    @staticmethod
-    def add(a, b):
-        if isinstance(a, dpt.usm_ndarray):
-            return a
-        else:
-            return b
-
-    @staticmethod
-    def subtract(a, b):
-        if isinstance(a, dpt.usm_ndarray):
-            return a
-        else:
-            return b
-
-    @staticmethod
-    def multiply(a, b):
-        if isinstance(a, dpt.usm_ndarray):
-            return a
-        else:
-            return b
-
-
-@pytest.mark.parametrize("namespace", [dpt, Dummy()])
-def test_fp_ops(namespace):
-    try:
-        X = dpt.ones(1)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X._set_namespace(namespace)
-    assert X.__array_namespace__() is namespace
-    X[0] = -2.5
-    X.__abs__()
-    X.__add__(1.0)
-    X.__radd__(1.0)
-    X.__sub__(1.0)
-    X.__rsub__(1.0)
-    X.__mul__(1.0)
-    X.__rmul__(1.0)
-    X.__truediv__(1.0)
-    X.__rtruediv__(1.0)
-    X.__floordiv__(1.0)
-    X.__rfloordiv__(1.0)
-    X.__pos__()
-    X.__neg__()
-    X.__eq__(-2.5)
-    X.__ne__(-2.5)
-    X.__le__(-2.5)
-    X.__ge__(-2.5)
-    X.__gt__(-2.0)
-    X.__iadd__(X)
-    X.__isub__(X)
-    X.__imul__(X)
-    X.__itruediv__(1.0)
-    X.__ifloordiv__(1.0)
-
-
-@pytest.mark.parametrize("namespace", [dpt, Dummy()])
-def test_int_ops(namespace):
-    try:
-        X = dpt.usm_ndarray(1, "i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X._set_namespace(namespace)
-    assert X.__array_namespace__() is namespace
-    X.__lshift__(2)
-    X.__rshift__(2)
-    X.__rlshift__(2)
-    X.__rrshift__(2)
-    X.__ilshift__(2)
-    X.__irshift__(2)
-    X.__and__(X)
-    X.__rand__(X)
-    X.__iand__(X)
-    X.__or__(X)
-    X.__ror__(X)
-    X.__ior__(X)
-    X.__xor__(X)
-    X.__rxor__(X)
-    X.__ixor__(X)
-    X.__invert__()
-    X.__mod__(5)
-    X.__rmod__(5)
-    X.__imod__(5)
-    X.__pow__(2)
-    X.__rpow__(2)
-    X.__ipow__(2)
-
-
-@pytest.mark.parametrize("namespace", [dpt, Dummy()])
-def test_mat_ops(namespace):
-    try:
-        M = dpt.eye(3, 3)
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    M._set_namespace(namespace)
-    assert M.__array_namespace__() is namespace
-    M.__matmul__(M)
-    M.__imatmul__(M)
-    M.__rmatmul__(M)
-
-
-@pytest.mark.parametrize("namespace", [dpt, Dummy()])
-def test_comp_ops(namespace):
-    try:
-        X = dpt.asarray(1, dtype="u8")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("No SYCL devices available")
-    X._set_namespace(namespace)
-    assert X.__array_namespace__() is namespace
-    assert X.__gt__(-1)
-    assert X.__ge__(-1)
-    assert not X.__lt__(-1)
-    assert not X.__le__(-1)
-    assert not X.__eq__(-1)
-    assert X.__ne__(-1)
diff --git a/dpctl/tests/test_usm_ndarray_print.py b/dpctl/tests/test_usm_ndarray_print.py
deleted file mode 100644
index 677f0a9bf2..0000000000
--- a/dpctl/tests/test_usm_ndarray_print.py
+++ /dev/null
@@ -1,393 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-class TestPrint:
-    def setup_method(self):
-        self._retain_options = dpt.get_print_options()
-
-    def teardown_method(self):
-        dpt.set_print_options(**self._retain_options)
-
-
-class TestArgValidation(TestPrint):
-    @pytest.mark.parametrize(
-        "arg,err",
-        [
-            ({"linewidth": "I"}, TypeError),
-            ({"edgeitems": "I"}, TypeError),
-            ({"threshold": "I"}, TypeError),
-            ({"precision": "I"}, TypeError),
-            ({"floatmode": "I"}, ValueError),
-            ({"edgeitems": "I"}, TypeError),
-            ({"sign": "I"}, ValueError),
-            ({"nanstr": np.nan}, TypeError),
-            ({"infstr": np.nan}, TypeError),
-        ],
-    )
-    def test_print_option_arg_validation(self, arg, err):
-        with pytest.raises(err):
-            dpt.set_print_options(**arg)
-
-    def test_usm_ndarray_repr_arg_validation(self):
-        X = dict()
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_repr(X)
-
-        try:
-            X = dpt.arange(4)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_repr(X, line_width="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_repr(X, precision="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_repr(X, prefix=4)
-
-    def test_usm_ndarray_str_arg_validation(self):
-        X = dict()
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X)
-
-        try:
-            X = dpt.arange(4)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, line_width="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, edge_items="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, threshold="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, precision="I")
-
-        with pytest.raises(ValueError):
-            dpt.usm_ndarray_str(X, floatmode="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, edge_items="I")
-
-        with pytest.raises(ValueError):
-            dpt.usm_ndarray_str(X, sign="I")
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, prefix=4)
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, prefix=4)
-
-        with pytest.raises(TypeError):
-            dpt.usm_ndarray_str(X, suffix=4)
-
-
-class TestSetPrintOptions(TestPrint):
-    def test_set_linewidth(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(linewidth=1)
-        x = dpt.asarray([0, 1], sycl_queue=q)
-        assert str(x) == "[0\n 1]"
-
-    def test_set_precision(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(precision=4)
-        x = dpt.asarray([1.23450], sycl_queue=q)
-        assert str(x) == "[1.2345]"
-
-    def test_threshold_edgeitems(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(threshold=1, edgeitems=1)
-        x = dpt.arange(9, sycl_queue=q)
-        assert str(x) == "[0 ... 8]"
-        dpt.set_print_options(edgeitems=9)
-        assert str(x) == "[0 1 2 3 4 5 6 7 8]"
-
-    def test_floatmodes(self):
-        q = get_queue_or_skip()
-
-        x = dpt.asarray([0.1234, 0.1234678], sycl_queue=q)
-        dpt.set_print_options(floatmode="fixed", precision=4)
-        assert str(x) == "[0.1234 0.1235]"
-
-        dpt.set_print_options(floatmode="unique")
-        assert str(x) == "[0.1234    0.1234678]"
-
-        dpt.set_print_options(floatmode="maxprec")
-        assert str(x) == "[0.1234 0.1235]"
-
-        dpt.set_print_options(floatmode="maxprec", precision=8)
-        assert str(x) == "[0.1234    0.1234678]"
-
-        dpt.set_print_options(floatmode="maxprec_equal", precision=4)
-        assert str(x) == "[0.1234 0.1235]"
-
-        dpt.set_print_options(floatmode="maxprec_equal", precision=8)
-        assert str(x) == "[0.1234000 0.1234678]"
-
-    def test_nan_inf_suppress(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(nanstr="nan1", infstr="inf1")
-        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
-        assert str(x) == "[nan1 inf1]"
-
-    def test_suppress_small(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(suppress=True)
-        x = dpt.asarray(5e-10, sycl_queue=q)
-        assert str(x) == "0."
-
-    def test_sign(self):
-        q = get_queue_or_skip()
-
-        x = dpt.asarray([0.0, 1.0, 2.0], sycl_queue=q)
-        y = dpt.asarray(1.0, sycl_queue=q)
-        z = dpt.asarray([1.0 + 1.0j], sycl_queue=q)
-        assert str(x) == "[0. 1. 2.]"
-        assert str(y) == "1."
-        assert str(z) == "[1.+1.j]"
-
-        dpt.set_print_options(sign="+")
-        assert str(x) == "[+0. +1. +2.]"
-        assert str(y) == "+1."
-        assert str(z) == "[+1.+1.j]"
-
-        dpt.set_print_options(sign=" ")
-        assert str(x) == "[ 0.  1.  2.]"
-        assert str(y) == " 1."
-        assert str(z) == "[ 1.+1.j]"
-
-    def test_numpy(self):
-        dpt.set_print_options(numpy=True)
-        options = dpt.get_print_options()
-        np_options = np.get_printoptions()
-        assert all(np_options[k] == options[k] for k in options.keys())
-
-
-class TestPrintFns(TestPrint):
-    @pytest.mark.parametrize(
-        "dtype,x_str",
-        [
-            ("b1", "[False  True  True  True]"),
-            ("i1", "[0 1 2 3]"),
-            ("u1", "[0 1 2 3]"),
-            ("i2", "[0 1 2 3]"),
-            ("u2", "[0 1 2 3]"),
-            ("i4", "[0 1 2 3]"),
-            ("u4", "[0 1 2 3]"),
-            ("i8", "[0 1 2 3]"),
-            ("u8", "[0 1 2 3]"),
-            ("f2", "[0. 1. 2. 3.]"),
-            ("f4", "[0. 1. 2. 3.]"),
-            ("f8", "[0. 1. 2. 3.]"),
-            ("c8", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"),
-            ("c16", "[0.+0.j 1.+0.j 2.+0.j 3.+0.j]"),
-        ],
-    )
-    def test_print_types(self, dtype, x_str):
-        q = get_queue_or_skip()
-        skip_if_dtype_not_supported(dtype, q)
-
-        x = dpt.asarray([0, 1, 2, 3], dtype=dtype, sycl_queue=q)
-        assert str(x) == x_str
-
-    def test_print_str(self):
-        q = get_queue_or_skip()
-
-        x = dpt.asarray(0, sycl_queue=q)
-        assert str(x) == "0"
-
-        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
-        assert str(x) == "[nan inf]"
-
-        x = dpt.arange(9, sycl_queue=q)
-        assert str(x) == "[0 1 2 3 4 5 6 7 8]"
-
-        y = dpt.reshape(x, (3, 3), copy=True)
-        assert str(y) == "[[0 1 2]\n [3 4 5]\n [6 7 8]]"
-
-    def test_print_str_abbreviated(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(threshold=0, edgeitems=1)
-        x = dpt.arange(9, sycl_queue=q)
-        assert str(x) == "[0 ... 8]"
-
-        x = dpt.reshape(x, (3, 3))
-        assert str(x) == "[[0 ... 2]\n ...\n [6 ... 8]]"
-
-    def test_usm_ndarray_str_separator(self):
-        q = get_queue_or_skip()
-
-        x = dpt.reshape(dpt.arange(4, sycl_queue=q), (2, 2))
-
-        np.testing.assert_equal(
-            dpt.usm_ndarray_str(x, prefix="test", separator="   "),
-            "[[0   1]\n     [2   3]]",
-        )
-
-    def test_print_repr(self):
-        q = get_queue_or_skip()
-
-        x = dpt.asarray(3, dtype="int64", sycl_queue=q)
-        assert repr(x) == "usm_ndarray(3)"
-
-        x = dpt.asarray([np.nan, np.inf], sycl_queue=q)
-        if x.sycl_device.has_aspect_fp64:
-            assert repr(x) == "usm_ndarray([nan, inf])"
-        else:
-            assert repr(x) == "usm_ndarray([nan, inf], dtype=float32)"
-
-        x = dpt.arange(9, sycl_queue=q, dtype="int64")
-        assert repr(x) == "usm_ndarray([0, 1, 2, 3, 4, 5, 6, 7, 8])"
-
-        x = dpt.reshape(x, (3, 3))
-        np.testing.assert_equal(
-            repr(x),
-            "usm_ndarray([[0, 1, 2],"
-            "\n             [3, 4, 5],"
-            "\n             [6, 7, 8]])",
-        )
-
-        x = dpt.arange(4, dtype="i4", sycl_queue=q)
-        assert repr(x) == "usm_ndarray([0, 1, 2, 3], dtype=int32)"
-
-        dpt.set_print_options(linewidth=1)
-        np.testing.assert_equal(
-            repr(x),
-            "usm_ndarray([0,"
-            "\n             1,"
-            "\n             2,"
-            "\n             3],"
-            "\n            dtype=int32)",
-        )
-
-        # zero-size array
-        dpt.set_print_options(linewidth=75)
-        x = dpt.ones((9, 0), dtype="i4", sycl_queue=q)
-        assert repr(x) == "usm_ndarray([], shape=(9, 0), dtype=int32)"
-
-    def test_print_repr_abbreviated(self):
-        q = get_queue_or_skip()
-
-        dpt.set_print_options(threshold=0, edgeitems=1)
-        x = dpt.arange(9, dtype="int64", sycl_queue=q)
-        assert repr(x) == "usm_ndarray([0, ..., 8], shape=(9,))"
-
-        y = dpt.asarray(x, dtype="i4", copy=True)
-        assert repr(y) == "usm_ndarray([0, ..., 8], shape=(9,), dtype=int32)"
-
-        x = dpt.reshape(x, (3, 3))
-        np.testing.assert_equal(
-            repr(x),
-            "usm_ndarray([[0, ..., 2],"
-            "\n             ...,"
-            "\n             [6, ..., 8]], shape=(3, 3))",
-        )
-
-        y = dpt.reshape(y, (3, 3))
-        np.testing.assert_equal(
-            repr(y),
-            "usm_ndarray([[0, ..., 2],"
-            "\n             ...,"
-            "\n             [6, ..., 8]], shape=(3, 3), dtype=int32)",
-        )
-
-        dpt.set_print_options(linewidth=1)
-        np.testing.assert_equal(
-            repr(y),
-            "usm_ndarray([[0,"
-            "\n              ...,"
-            "\n              2],"
-            "\n             ...,"
-            "\n             [6,"
-            "\n              ...,"
-            "\n              8]],"
-            "\n            shape=(3, 3),"
-            "\n            dtype=int32)",
-        )
-
-    @pytest.mark.parametrize(
-        "dtype",
-        [
-            "i1",
-            "u1",
-            "i2",
-            "u2",
-            "i4",
-            "u4",
-            "u8",
-            "f2",
-            "f4",
-            "c8",
-        ],
-    )
-    def test_repr_appended_dtype(self, dtype):
-        q = get_queue_or_skip()
-        skip_if_dtype_not_supported(dtype, q)
-
-        x = dpt.empty(4, dtype=dtype)
-        assert repr(x).split("=")[-1][:-1] == x.dtype.name
-
-    def test_usm_ndarray_repr_prefix(self):
-        q = get_queue_or_skip()
-
-        x = dpt.arange(4, dtype=np.intp, sycl_queue=q)
-        np.testing.assert_equal(
-            dpt.usm_ndarray_repr(x, prefix="test"), "test([0, 1, 2, 3])"
-        )
-        x = dpt.reshape(x, (2, 2))
-        np.testing.assert_equal(
-            dpt.usm_ndarray_repr(x, prefix="test"),
-            "test([[0, 1]," "\n      [2, 3]])",
-        )
-
-
-class TestContextManager:
-    def test_context_manager_basic(self):
-        options = dpt.get_print_options()
-        try:
-            X = dpt.asarray(1.234567)
-        except dpctl.SyclDeviceCreationError:
-            pytest.skip("No SYCL devices available")
-        with dpt.print_options(precision=4):
-            s = str(X)
-        assert s == "1.2346"
-        assert options == dpt.get_print_options()
-
-    def test_context_manager_as(self):
-        with dpt.print_options(precision=4) as x:
-            options = x.copy()
-        assert options["precision"] == 4
diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py
deleted file mode 100644
index 06692e30c2..0000000000
--- a/dpctl/tests/test_usm_ndarray_reductions.py
+++ /dev/null
@@ -1,690 +0,0 @@
-#                       Data Parallel Control (dpctl)
-#
-#  Copyright 2020-2025 Intel Corporation
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-from random import randrange
-
-import numpy as np
-import pytest
-from numpy.testing import assert_allclose
-
-import dpctl.tensor as dpt
-from dpctl.tensor._tensor_impl import default_device_index_type
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-from dpctl.utils import ExecutionPlacementError
-
-_no_complex_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-]
-
-
-_all_dtypes = _no_complex_dtypes + [
-    "c8",
-    "c16",
-]
-
-
-def test_max_min_axis():
-    get_queue_or_skip()
-
-    x = dpt.reshape(
-        dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7)
-    )
-
-    m = dpt.max(x, axis=(1, 2, -1))
-    assert m.shape == (3, 6)
-    assert dpt.all(m == x[:, -1, -1, :, -1])
-
-    m = dpt.min(x, axis=(1, 2, -1))
-    assert m.shape == (3, 6)
-    assert dpt.all(m == x[:, 0, 0, :, 0])
-
-
-def test_max_axis1_axis0():
-    """See gh-1455"""
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5))
-
-    m = dpt.max(x, axis=0)
-    assert dpt.all(m == x[-1, :, :])
-
-    x = dpt.flip(x, axis=2)
-    m = dpt.max(x, axis=2)
-    assert dpt.all(m == x[:, :, 0])
-
-
-def test_reduction_keepdims():
-    get_queue_or_skip()
-
-    n0, n1 = 3, 6
-    x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4")
-    m = dpt.max(x, axis=(1, 2, -1), keepdims=True)
-
-    xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1))
-    p = dpt.argmax(xx, axis=-1, keepdims=True)
-
-    assert m.shape == (n0, 1, 1, n1, 1)
-    assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape))
-    assert dpt.all(p == 0)
-
-
-def test_max_scalar():
-    get_queue_or_skip()
-
-    x = dpt.ones(())
-    m = dpt.max(x)
-
-    assert m.shape == ()
-    assert x == m
-
-
-@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"])
-def test_reduction_kernels(arg_dtype):
-    # i4 - always uses atomics w/ sycl group reduction
-    # f4 - always uses atomics w/ custom group reduction
-    # c8 - always uses temps w/ custom group reduction
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q)
-    x[x.shape[0] // 2, :] = 3
-    x[:, x.shape[1] // 2] = 3
-
-    m = dpt.max(x)
-    assert m == 3
-    m = dpt.max(x, axis=0)
-    assert dpt.all(m == 3)
-    m = dpt.max(x, axis=1)
-    assert dpt.all(m == 3)
-
-    x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q)
-    x[x.shape[0] // 2, :] = 0
-    x[:, x.shape[1] // 2] = 0
-
-    m = dpt.min(x)
-    assert m == 0
-    m = dpt.min(x, axis=0)
-    assert dpt.all(m == 0)
-    m = dpt.min(x, axis=1)
-    assert dpt.all(m == 0)
-
-
-def test_max_min_nan_propagation():
-    get_queue_or_skip()
-
-    # float, finites
-    x = dpt.arange(4, dtype="f4")
-    x[0] = dpt.nan
-    assert dpt.isnan(dpt.max(x))
-    assert dpt.isnan(dpt.min(x))
-
-    # float, infinities
-    x[1:] = dpt.inf
-    assert dpt.isnan(dpt.max(x))
-    x[1:] = -dpt.inf
-    assert dpt.isnan(dpt.min(x))
-
-    # complex
-    x = dpt.arange(4, dtype="c8")
-    x[0] = complex(dpt.nan, 0)
-    assert dpt.isnan(dpt.max(x))
-    assert dpt.isnan(dpt.min(x))
-
-    x[0] = complex(0, dpt.nan)
-    assert dpt.isnan(dpt.max(x))
-    assert dpt.isnan(dpt.min(x))
-
-
-def test_argmax_scalar():
-    get_queue_or_skip()
-
-    x = dpt.ones(())
-    m = dpt.argmax(x)
-
-    assert m.shape == ()
-    assert m == 0
-
-
-@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"])
-def test_search_reduction_kernels(arg_dtype):
-    # i4 - always uses atomics w/ sycl group reduction
-    # f4 - always uses atomics w/ custom group reduction
-    # c8 - always uses temps w/ custom group reduction
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    x_shape = (24, 1024)
-    x_size = np.prod(x_shape)
-    x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q)
-    idx = randrange(x.size)
-    idx_tup = np.unravel_index(idx, x_shape)
-    x[idx] = 2
-
-    m = dpt.argmax(x)
-    assert m == idx
-
-    # test case of strided input mapping to contig
-    # implementation
-    m = dpt.argmax(dpt.flip(x))
-    assert m == x.size - 1 - idx
-
-    # test case of strided implementation
-    y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q)
-    y[::2] = x
-    m = dpt.argmax(y)
-    assert m == 2 * idx
-
-    x = dpt.reshape(x, x_shape)
-
-    x[idx_tup[0], :] = 3
-    m = dpt.argmax(x, axis=0)
-    assert dpt.all(m == idx_tup[0])
-    x[:, idx_tup[1]] = 4
-    m = dpt.argmax(x, axis=1)
-    assert dpt.all(m == idx_tup[1])
-
-    x = x[:, ::-2]
-    idx = randrange(x.shape[1])
-    x[:, idx] = 5
-    m = dpt.argmax(x, axis=1)
-    assert dpt.all(m == idx)
-
-    x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q)
-    idx = randrange(x.size)
-    idx_tup = np.unravel_index(idx, x_shape)
-    x[idx] = 0
-
-    m = dpt.argmin(x)
-    assert m == idx
-
-    x = dpt.reshape(x, x_shape)
-
-    x[idx_tup[0], :] = -1
-    m = dpt.argmin(x, axis=0)
-    assert dpt.all(m == idx_tup[0])
-    x[:, idx_tup[1]] = -2
-    m = dpt.argmin(x, axis=1)
-    assert dpt.all(m == idx_tup[1])
-
-    x = x[:, ::-2]
-    idx = randrange(x.shape[1])
-    x[:, idx] = -3
-    m = dpt.argmin(x, axis=1)
-    assert dpt.all(m == idx)
-
-
-def test_argmax_argmin_nan_propagation():
-    get_queue_or_skip()
-
-    sz = 4
-    idx = randrange(sz)
-    # floats
-    x = dpt.arange(sz, dtype="f4")
-    x[idx] = dpt.nan
-    assert dpt.argmax(x) == idx
-    assert dpt.argmin(x) == idx
-
-    # complex
-    x = dpt.arange(sz, dtype="c8")
-    x[idx] = complex(dpt.nan, 0)
-    assert dpt.argmax(x) == idx
-    assert dpt.argmin(x) == idx
-
-    x[idx] = complex(0, dpt.nan)
-    assert dpt.argmax(x) == idx
-    assert dpt.argmin(x) == idx
-
-
-def test_argmax_argmin_identities():
-    # make sure that identity arrays work as expected
-    get_queue_or_skip()
-
-    x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4")
-    assert dpt.argmax(x) == 0
-    x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4")
-    assert dpt.argmin(x) == 0
-
-
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_argmax_axis0_axis1(order):
-    get_queue_or_skip()
-
-    x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order)
-    assert dpt.argmax(x) == 3
-
-    res = dpt.argmax(x, axis=0)
-    expected = dpt.asarray([1, 1, 1], dtype=res.dtype)
-    assert dpt.all(res == expected)
-
-    res = dpt.argmax(x, axis=1)
-    expected = dpt.asarray([2, 0], dtype=res.dtype)
-    assert dpt.all(res == expected)
-
-
-def test_reduction_arg_validation():
-    get_queue_or_skip()
-
-    x = dict()
-    with pytest.raises(TypeError):
-        dpt.sum(x)
-    with pytest.raises(TypeError):
-        dpt.max(x)
-    with pytest.raises(TypeError):
-        dpt.argmax(x)
-
-    x = dpt.zeros((0,), dtype="i4")
-    with pytest.raises(ValueError):
-        dpt.max(x)
-    with pytest.raises(ValueError):
-        dpt.argmax(x)
-
-
-@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
-def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.logsumexp(m)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype.kind == "f"
-    tol = dpt.finfo(r.dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(r),
-        np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype),
-        rtol=tol,
-        atol=tol,
-    )
-
-
-def test_logsumexp_empty():
-    get_queue_or_skip()
-    x = dpt.empty((0,), dtype="f4")
-    y = dpt.logsumexp(x)
-    assert y.shape == tuple()
-    assert y == -dpt.inf
-
-
-def test_logsumexp_axis():
-    get_queue_or_skip()
-
-    m = dpt.ones((3, 4, 5, 6, 7), dtype="f4")
-    s = dpt.logsumexp(m, axis=(1, 2, -1))
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 6)
-    tol = dpt.finfo(s.dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(s),
-        np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype),
-        rtol=tol,
-        atol=tol,
-    )
-
-
-@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
-def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.logsumexp(m, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-
-
-def test_logsumexp_keepdims():
-    get_queue_or_skip()
-
-    m = dpt.ones((3, 4, 5, 6, 7), dtype="i4")
-    s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True)
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert s.shape == (3, 1, 1, 6, 1)
-
-
-def test_logsumexp_keepdims_zero_size():
-    get_queue_or_skip()
-    n = 10
-    a = dpt.ones((n, 0, n))
-
-    s1 = dpt.logsumexp(a, keepdims=True)
-    assert s1.shape == (1, 1, 1)
-
-    s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True)
-    assert s2.shape == (1, 1, n)
-
-    s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True)
-    assert s3.shape == (n, 1, 1)
-
-    s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True)
-    assert s4.shape == (1, 0, 1)
-
-    a0 = a[0]
-    s5 = dpt.logsumexp(a0, keepdims=True)
-    assert s5.shape == (1, 1)
-
-
-def test_logsumexp_scalar():
-    get_queue_or_skip()
-
-    m = dpt.ones(())
-    s = dpt.logsumexp(m)
-
-    assert isinstance(s, dpt.usm_ndarray)
-    assert m.sycl_queue == s.sycl_queue
-    assert s.shape == ()
-
-
-def test_logsumexp_complex():
-    get_queue_or_skip()
-
-    x = dpt.zeros(1, dtype="c8")
-    with pytest.raises(ValueError):
-        dpt.logsumexp(x)
-
-
-def test_logsumexp_int_axis():
-    get_queue_or_skip()
-
-    x = dpt.zeros((8, 10), dtype="f4")
-    res = dpt.logsumexp(x, axis=0)
-    assert res.ndim == 1
-    assert res.shape[0] == 10
-
-
-def test_logsumexp_invalid_arr():
-    x = dict()
-    with pytest.raises(TypeError):
-        dpt.logsumexp(x)
-
-
-@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
-def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.reduce_hypot(m)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype.kind == "f"
-    tol = dpt.finfo(r.dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(r),
-        np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype),
-        rtol=tol,
-        atol=tol,
-    )
-
-
-def test_hypot_empty():
-    get_queue_or_skip()
-    x = dpt.empty((0,), dtype="f4")
-    y = dpt.reduce_hypot(x)
-    assert y.shape == tuple()
-    assert y == 0
-
-
-@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:])
-@pytest.mark.parametrize("out_dtype", _all_dtypes[1:])
-def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arg_dtype, q)
-    skip_if_dtype_not_supported(out_dtype, q)
-
-    m = dpt.ones(100, dtype=arg_dtype)
-    r = dpt.reduce_hypot(m, dtype=out_dtype)
-
-    assert isinstance(r, dpt.usm_ndarray)
-    assert r.dtype == dpt.dtype(out_dtype)
-
-
-def test_hypot_complex():
-    get_queue_or_skip()
-
-    x = dpt.zeros(1, dtype="c8")
-    with pytest.raises(ValueError):
-        dpt.reduce_hypot(x)
-
-
-def test_tree_reduction_axis1_axis0():
-    """See gh-1455"""
-    get_queue_or_skip()
-
-    x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5))
-
-    m = dpt.logsumexp(x, axis=0)
-    tol = dpt.finfo(m.dtype).resolution
-    assert_allclose(
-        dpt.asnumpy(m),
-        np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype),
-        rtol=tol,
-        atol=tol,
-    )
-
-    x = dpt.flip(x, axis=2)
-    m = dpt.logsumexp(x, axis=2)
-    assert_allclose(
-        dpt.asnumpy(m),
-        np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype),
-        rtol=tol,
-        atol=tol,
-    )
-
-
-def test_numeric_reduction_out_kwarg():
-    get_queue_or_skip()
-
-    n1, n2, n3 = 3, 4, 5
-    x = dpt.ones((n1, n2, n3), dtype="i8")
-    out = dpt.zeros((2 * n1, 3 * n2), dtype="i8")
-    res = dpt.sum(x, axis=-1, out=out[::-2, 1::3])
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == 5)
-
-    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i8")
-    res = dpt.sum(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
-    assert res.shape == (n1, n2, 1)
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == 5)
-
-    res = dpt.sum(x, axis=0, out=x[-1])
-    assert dpt.all(x[-1] == res)
-    assert dpt.all(x[-1] == 3)
-    assert dpt.all(x[0:-1] == 1)
-
-    # test no-op case
-    x = dpt.ones((n1, n2, n3), dtype="i8")
-    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i8")
-    res = dpt.sum(x, axis=(), out=out[::-2, 1::3])
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == x)
-
-    # test with dtype kwarg
-    x = dpt.ones((n1, n2, n3), dtype="i4")
-    out = dpt.zeros((2 * n1, 3 * n2), dtype="f4")
-    res = dpt.sum(x, axis=-1, dtype="f4", out=out[::-2, 1::3])
-    zero_res = dpt.zeros_like(res)
-    assert dpt.allclose(out[::-2, 0::3], zero_res)
-    assert dpt.allclose(out[::-2, 2::3], zero_res)
-    assert dpt.allclose(out[::-2, 1::3], res)
-    assert dpt.allclose(out[::-2, 1::3], dpt.full_like(res, 5, dtype="f4"))
-
-
-def test_comparison_reduction_out_kwarg():
-    get_queue_or_skip()
-
-    n1, n2, n3 = 3, 4, 5
-    x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
-    out = dpt.zeros((2 * n1, 3 * n2), dtype="i4")
-    res = dpt.max(x, axis=-1, out=out[::-2, 1::3])
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == x[:, :, -1])
-
-    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype="i4")
-    res = dpt.max(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
-    assert res.shape == (n1, n2, 1)
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == x[:, :, -1, dpt.newaxis])
-
-    # test no-op case
-    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
-    res = dpt.max(x, axis=(), out=out[::-2, 1::3])
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == x)
-
-    # test overlap
-    res = dpt.max(x, axis=0, out=x[0])
-    assert dpt.all(x[0] == res)
-    assert dpt.all(x[0] == x[-1])
-
-
-def test_search_reduction_out_kwarg():
-    get_queue_or_skip()
-
-    n1, n2, n3 = 3, 4, 5
-    dt = dpt.__array_namespace_info__().default_dtypes()["indexing"]
-
-    x = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype=dt), (n1, n2, n3))
-    out = dpt.zeros((2 * n1, 3 * n2), dtype=dt)
-    res = dpt.argmax(x, axis=-1, out=out[::-2, 1::3])
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == n2)
-
-    out = dpt.zeros((2 * n1, 3 * n2, 1), dtype=dt)
-    res = dpt.argmax(x, axis=-1, keepdims=True, out=out[::-2, 1::3])
-    assert res.shape == (n1, n2, 1)
-    assert dpt.all(out[::-2, 0::3] == 0)
-    assert dpt.all(out[::-2, 2::3] == 0)
-    assert dpt.all(out[::-2, 1::3] == res)
-    assert dpt.all(out[::-2, 1::3] == n3 - 1)
-
-    # test no-op case
-    x = dpt.ones((), dtype=dt)
-    out = dpt.ones(2, dtype=dt)
-    res = dpt.argmax(x, axis=None, out=out[1])
-    assert dpt.all(out[0] == 1)
-    assert dpt.all(out[1] == 0)
-
-    # test overlap
-    x = dpt.reshape(dpt.arange(n1 * n2, dtype=dt), (n1, n2))
-    res = dpt.argmax(x, axis=0, out=x[0])
-    assert dpt.all(x[0] == res)
-    assert dpt.all(x[0] == n1 - 1)
-
-
-def test_reduction_out_kwarg_arg_validation():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    ind_dt = dpt.__array_namespace_info__().default_dtypes()["indexing"]
-
-    x = dpt.ones(10, dtype="f4")
-    out_wrong_queue = dpt.empty((), dtype="f4", sycl_queue=q2)
-    out_wrong_dtype = dpt.empty((), dtype="i4", sycl_queue=q1)
-    out_wrong_shape = dpt.empty(1, dtype="f4", sycl_queue=q1)
-    out_wrong_keepdims = dpt.empty((), dtype="f4", sycl_queue=q1)
-    out_not_writable = dpt.empty((), dtype="f4", sycl_queue=q1)
-    out_not_writable.flags["W"] = False
-
-    with pytest.raises(TypeError):
-        dpt.sum(x, out=dict())
-    with pytest.raises(TypeError):
-        dpt.max(x, out=dict())
-    with pytest.raises(TypeError):
-        dpt.argmax(x, out=dict())
-    with pytest.raises(ExecutionPlacementError):
-        dpt.sum(x, out=out_wrong_queue)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.max(x, out=out_wrong_queue)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.argmax(x, out=dpt.empty_like(out_wrong_queue, dtype=ind_dt))
-    with pytest.raises(ValueError):
-        dpt.sum(x, out=out_wrong_dtype)
-    with pytest.raises(ValueError):
-        dpt.max(x, out=out_wrong_dtype)
-    with pytest.raises(ValueError):
-        dpt.argmax(x, out=dpt.empty_like(out_wrong_dtype, dtype="f4"))
-    with pytest.raises(ValueError):
-        dpt.sum(x, out=out_wrong_shape)
-    with pytest.raises(ValueError):
-        dpt.max(x, out=out_wrong_shape)
-    with pytest.raises(ValueError):
-        dpt.argmax(x, out=dpt.empty_like(out_wrong_shape, dtype=ind_dt))
-    with pytest.raises(ValueError):
-        dpt.sum(x, out=out_not_writable)
-    with pytest.raises(ValueError):
-        dpt.max(x, out=out_not_writable)
-    with pytest.raises(ValueError):
-        search_not_writable = dpt.empty_like(out_not_writable, dtype=ind_dt)
-        search_not_writable.flags["W"] = False
-        dpt.argmax(x, out=search_not_writable)
-    with pytest.raises(ValueError):
-        dpt.sum(x, keepdims=True, out=out_wrong_keepdims)
-    with pytest.raises(ValueError):
-        dpt.max(x, keepdims=True, out=out_wrong_keepdims)
-    with pytest.raises(ValueError):
-        dpt.argmax(
-            x,
-            keepdims=True,
-            out=dpt.empty_like(out_wrong_keepdims, dtype=ind_dt),
-        )
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_count_nonzero(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    expected_dt = default_device_index_type(q.sycl_device)
-
-    x = dpt.ones(10, dtype=dt, sycl_queue=q)
-    res = dpt.count_nonzero(x)
-    assert res == 10
-    assert res.dtype == expected_dt
-
-    x[3:6] = 0
-    res = dpt.count_nonzero(x)
-    assert res == 7
-    assert res.dtype == expected_dt
diff --git a/dpctl/tests/test_usm_ndarray_search_functions.py b/dpctl/tests/test_usm_ndarray_search_functions.py
deleted file mode 100644
index 4f3649dce2..0000000000
--- a/dpctl/tests/test_usm_ndarray_search_functions.py
+++ /dev/null
@@ -1,579 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-
-import dpctl.tensor as dpt
-from dpctl.tensor._search_functions import _where_result_type
-from dpctl.tensor._type_utils import _all_data_types
-from dpctl.utils import ExecutionPlacementError
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "?",
-    "u1",
-    "i1",
-    "u2",
-    "i2",
-    "u4",
-    "i4",
-    "u8",
-    "i8",
-    "e",
-    "f",
-    "d",
-    "F",
-    "D",
-]
-
-
-class mock_device:
-    def __init__(self, fp16, fp64):
-        self.has_aspect_fp16 = fp16
-        self.has_aspect_fp64 = fp64
-
-
-def test_where_basic():
-    get_queue_or_skip()
-
-    cond = dpt.asarray(
-        [
-            [True, False, False],
-            [False, True, False],
-            [False, False, True],
-            [False, False, False],
-            [True, True, True],
-        ]
-    )
-    out = dpt.where(cond, dpt.asarray(1), dpt.asarray(0))
-    out_expected = dpt.asarray(
-        [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0], [1, 1, 1]]
-    )
-    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
-
-    out = dpt.where(cond, dpt.ones(cond.shape), dpt.zeros(cond.shape))
-    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
-
-    out = dpt.where(
-        cond,
-        dpt.ones(cond.shape[0], dtype="i4")[:, dpt.newaxis],
-        dpt.zeros(cond.shape[0], dtype="i4")[:, dpt.newaxis],
-    )
-    assert (dpt.asnumpy(out) == dpt.asnumpy(out_expected)).all()
-
-
-def _dtype_all_close(x1, x2):
-    if np.issubdtype(x2.dtype, np.floating) or np.issubdtype(
-        x2.dtype, np.complexfloating
-    ):
-        x2_dtype = x2.dtype
-        return np.allclose(
-            x1, x2, atol=np.finfo(x2_dtype).eps, rtol=np.finfo(x2_dtype).eps
-        )
-    else:
-        return np.allclose(x1, x2)
-
-
-@pytest.mark.parametrize("dt1", _all_dtypes)
-@pytest.mark.parametrize("dt2", _all_dtypes)
-@pytest.mark.parametrize("fp16", [True, False])
-@pytest.mark.parametrize("fp64", [True, False])
-def test_where_result_types(dt1, dt2, fp16, fp64):
-    dev = mock_device(fp16, fp64)
-
-    dt1 = dpt.dtype(dt1)
-    dt2 = dpt.dtype(dt2)
-    res_t = _where_result_type(dt1, dt2, dev)
-
-    if fp16 and fp64:
-        assert res_t == dpt.result_type(dt1, dt2)
-    else:
-        if res_t:
-            assert res_t.kind == dpt.result_type(dt1, dt2).kind
-        else:
-            # some illegal cases are covered above, but
-            # this guarantees that _where_result_type
-            # produces None only when one of the dtypes
-            # is illegal given fp aspects of device
-            all_dts = _all_data_types(fp16, fp64)
-            assert dt1 not in all_dts or dt2 not in all_dts
-
-
-@pytest.mark.parametrize("dt", _all_dtypes)
-def test_where_mask_dtypes(dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dt, q)
-
-    # mask dtype changes
-    cond = dpt.asarray([0, 1, 3, 0, 10], dtype=dt, sycl_queue=q)
-    x1 = dpt.asarray(0, dtype="f4", sycl_queue=q)
-    x2 = dpt.asarray(1, dtype="f4", sycl_queue=q)
-    res = dpt.where(cond, x1, x2)
-
-    res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-    # contiguous cases
-    x1 = dpt.full(cond.shape, 0, dtype="f4", sycl_queue=q)
-    x2 = dpt.full(cond.shape, 1, dtype="f4", sycl_queue=q)
-    res = dpt.where(cond, x1, x2)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-    # input array dtype changes
-    cond = dpt.asarray([False, True, True, False, True], sycl_queue=q)
-    x1 = dpt.asarray(0, dtype=dt, sycl_queue=q)
-    x2 = dpt.asarray(1, dtype=dt, sycl_queue=q)
-    res = dpt.where(cond, x1, x2)
-
-    res_check = np.asarray([1, 0, 0, 1, 0], dtype=res.dtype)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-    # contiguous cases
-    x1 = dpt.full(cond.shape, 0, dtype=dt, sycl_queue=q)
-    x2 = dpt.full(cond.shape, 1, dtype=dt, sycl_queue=q)
-    res = dpt.where(cond, x1, x2)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-
-def test_where_asymmetric_dtypes():
-    q = get_queue_or_skip()
-
-    cond = dpt.asarray([0, 1, 3, 0, 10], dtype="?", sycl_queue=q)
-    x1 = dpt.asarray(2, dtype="i4", sycl_queue=q)
-    x2 = dpt.asarray(3, dtype="i8", sycl_queue=q)
-
-    res = dpt.where(cond, x1, x2)
-    res_check = np.asarray([3, 2, 2, 3, 2], dtype=res.dtype)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-    # flip order
-
-    res = dpt.where(cond, x2, x1)
-    res_check = np.asarray([2, 3, 3, 2, 3], dtype=res.dtype)
-    assert _dtype_all_close(dpt.asnumpy(res), res_check)
-
-
-def test_where_nan_inf():
-    get_queue_or_skip()
-
-    cond = dpt.asarray([True, False, True, False], dtype="?")
-    x1 = dpt.asarray([np.nan, 2.0, np.inf, 3.0], dtype="f4")
-    x2 = dpt.asarray([2.0, np.nan, 3.0, np.inf], dtype="f4")
-
-    cond_np = dpt.asnumpy(cond)
-    x1_np = dpt.asnumpy(x1)
-    x2_np = dpt.asnumpy(x2)
-
-    res = dpt.where(cond, x1, x2)
-    res_np = np.where(cond_np, x1_np, x2_np)
-
-    assert np.allclose(dpt.asnumpy(res), res_np, equal_nan=True)
-
-    res = dpt.where(x1, cond, x2)
-    res_np = np.where(x1_np, cond_np, x2_np)
-    assert _dtype_all_close(dpt.asnumpy(res), res_np)
-
-
-def test_where_empty():
-    # check that numpy returns same results when
-    # handling empty arrays
-    get_queue_or_skip()
-
-    empty = dpt.empty(0, dtype="i2")
-    m = dpt.asarray(True)
-    x1 = dpt.asarray(1, dtype="i2")
-    x2 = dpt.asarray(2, dtype="i2")
-    res = dpt.where(empty, x1, x2)
-
-    empty_np = np.empty(0, dtype="i2")
-    m_np = dpt.asnumpy(m)
-    x1_np = dpt.asnumpy(x1)
-    x2_np = dpt.asnumpy(x2)
-    res_np = np.where(empty_np, x1_np, x2_np)
-
-    assert_array_equal(dpt.asnumpy(res), res_np)
-
-    res = dpt.where(m, empty, x2)
-    res_np = np.where(m_np, empty_np, x2_np)
-
-    assert_array_equal(dpt.asnumpy(res), res_np)
-
-    # check that broadcasting is performed
-    with pytest.raises(ValueError):
-        dpt.where(empty, x1, dpt.empty((1, 2)))
-
-
-@pytest.mark.parametrize("order", ["C", "F"])
-def test_where_contiguous(order):
-    get_queue_or_skip()
-
-    cond = dpt.asarray(
-        [
-            [[True, False, False], [False, True, True]],
-            [[False, True, False], [True, False, True]],
-            [[False, False, True], [False, False, True]],
-            [[False, False, False], [True, False, True]],
-            [[True, True, True], [True, False, True]],
-        ],
-        order=order,
-    )
-
-    x1 = dpt.full(cond.shape, 2, dtype="i4", order=order)
-    x2 = dpt.full(cond.shape, 3, dtype="i4", order=order)
-    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
-    res = dpt.where(cond, x1, x2)
-
-    assert _dtype_all_close(dpt.asnumpy(res), expected)
-
-
-def test_where_contiguous1D():
-    get_queue_or_skip()
-
-    cond = dpt.asarray([True, False, True, False, False, True])
-
-    x1 = dpt.full(cond.shape, 2, dtype="i4")
-    x2 = dpt.full(cond.shape, 3, dtype="i4")
-    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
-    res = dpt.where(cond, x1, x2)
-    assert_array_equal(dpt.asnumpy(res), expected)
-
-    # test with complex dtype (branch in kernel)
-    x1 = dpt.astype(x1, dpt.complex64)
-    x2 = dpt.astype(x2, dpt.complex64)
-    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
-    res = dpt.where(cond, x1, x2)
-    assert _dtype_all_close(dpt.asnumpy(res), expected)
-
-
-def test_where_gh_1170():
-    get_queue_or_skip()
-
-    cond = dpt.asarray([False, True, True, False], dtype="?")
-    x1 = dpt.ones((3, 4), dtype="i4")
-    x2 = dpt.zeros((3, 4), dtype="i4")
-
-    res = dpt.where(cond, x1, x2)
-    expected = np.broadcast_to(dpt.asnumpy(cond).astype(x1.dtype), x1.shape)
-
-    assert_array_equal(dpt.asnumpy(res), expected)
-
-
-def test_where_strided():
-    get_queue_or_skip()
-
-    s0, s1 = 4, 9
-    cond = dpt.reshape(
-        dpt.asarray(
-            [True, False, False, False, True, True, False, True, False] * s0
-        ),
-        (s0, s1),
-    )[:, ::3]
-
-    x1 = dpt.reshape(
-        dpt.arange(cond.shape[0] * cond.shape[1] * 2, dtype="i4"),
-        (cond.shape[0], cond.shape[1] * 2),
-    )[:, ::2]
-    x2 = dpt.reshape(
-        dpt.arange(cond.shape[0] * cond.shape[1] * 3, dtype="i4"),
-        (cond.shape[0], cond.shape[1] * 3),
-    )[:, ::3]
-    expected = np.where(dpt.asnumpy(cond), dpt.asnumpy(x1), dpt.asnumpy(x2))
-    res = dpt.where(cond, x1, x2)
-
-    assert_array_equal(dpt.asnumpy(res), expected)
-
-    # negative strides
-    res = dpt.where(cond, dpt.flip(x1), x2)
-    expected = np.where(
-        dpt.asnumpy(cond), np.flip(dpt.asnumpy(x1)), dpt.asnumpy(x2)
-    )
-    assert_array_equal(dpt.asnumpy(res), expected)
-
-    res = dpt.where(dpt.flip(cond), x1, x2)
-    expected = np.where(
-        np.flip(dpt.asnumpy(cond)), dpt.asnumpy(x1), dpt.asnumpy(x2)
-    )
-    assert_array_equal(dpt.asnumpy(res), expected)
-
-
-def test_where_invariants():
-    get_queue_or_skip()
-
-    test_sh = (
-        6,
-        8,
-    )
-    mask = dpt.asarray(np.random.choice([True, False], size=test_sh))
-    p = dpt.ones(test_sh, dtype=dpt.int16)
-    m = dpt.full(test_sh, -1, dtype=dpt.int16)
-    inds_list = [
-        (
-            np.s_[:3],
-            np.s_[::2],
-        ),
-        (
-            np.s_[::2],
-            np.s_[::2],
-        ),
-        (
-            np.s_[::-1],
-            np.s_[:],
-        ),
-    ]
-    for ind in inds_list:
-        r1 = dpt.where(mask, p, m)[ind]
-        r2 = dpt.where(mask[ind], p[ind], m[ind])
-        assert (dpt.asnumpy(r1) == dpt.asnumpy(r2)).all()
-
-
-def test_where_arg_validation():
-    get_queue_or_skip()
-
-    check = dict()
-    x1 = dpt.empty((1,), dtype="i4")
-    x2 = dpt.empty((1,), dtype="i4")
-
-    with pytest.raises(TypeError):
-        dpt.where(check, x1, x2)
-    with pytest.raises(ValueError):
-        dpt.where(x1, check, x2)
-    with pytest.raises(ValueError):
-        dpt.where(x1, x2, check)
-
-
-def test_where_compute_follows_data():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-    q3 = get_queue_or_skip()
-
-    x1 = dpt.empty((1,), dtype="i4", sycl_queue=q1)
-    x2 = dpt.empty((1,), dtype="i4", sycl_queue=q2)
-
-    with pytest.raises(ExecutionPlacementError):
-        dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q1), x1, x2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.where(dpt.empty((1,), dtype="i4", sycl_queue=q3), x1, x2)
-    with pytest.raises(ExecutionPlacementError):
-        dpt.where(x1, x1, x2)
-
-
-def test_where_order():
-    get_queue_or_skip()
-
-    test_sh = (
-        20,
-        20,
-    )
-    test_sh2 = tuple(2 * dim for dim in test_sh)
-    n = test_sh[-1]
-
-    for dt1, dt2 in zip(["i4", "i4", "f4"], ["i4", "f4", "i4"]):
-        ar1 = dpt.zeros(test_sh, dtype=dt1, order="C")
-        ar2 = dpt.ones(test_sh, dtype=dt2, order="C")
-        condition = dpt.zeros(test_sh, dtype="?", order="C")
-        res1 = dpt.where(condition, ar1, ar2, order="C")
-        assert res1.flags.c_contiguous
-        res2 = dpt.where(condition, ar1, ar2, order="F")
-        assert res2.flags.f_contiguous
-        res3 = dpt.where(condition, ar1, ar2, order="A")
-        assert res3.flags.c_contiguous
-        res4 = dpt.where(condition, ar1, ar2, order="K")
-        assert res4.flags.c_contiguous
-
-        ar1 = dpt.ones(test_sh, dtype=dt1, order="F")
-        ar2 = dpt.ones(test_sh, dtype=dt2, order="F")
-        condition = dpt.zeros(test_sh, dtype="?", order="F")
-        res1 = dpt.where(condition, ar1, ar2, order="C")
-        assert res1.flags.c_contiguous
-        res2 = dpt.where(condition, ar1, ar2, order="F")
-        assert res2.flags.f_contiguous
-        res3 = dpt.where(condition, ar1, ar2, order="A")
-        assert res2.flags.f_contiguous
-        res4 = dpt.where(condition, ar1, ar2, order="K")
-        assert res4.flags.f_contiguous
-
-        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2]
-        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2]
-        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2]
-        res1 = dpt.where(condition, ar1, ar2, order="K")
-        assert res1.strides == (n, -1)
-        res2 = dpt.where(condition, ar1, ar2, order="C")
-        assert res2.strides == (n, 1)
-
-        ar1 = dpt.ones(test_sh2, dtype=dt1, order="C")[:20, ::-2].mT
-        ar2 = dpt.ones(test_sh2, dtype=dt2, order="C")[:20, ::-2].mT
-        condition = dpt.zeros(test_sh2, dtype="?", order="C")[:20, ::-2].mT
-        res1 = dpt.where(condition, ar1, ar2, order="K")
-        assert res1.strides == (-1, n)
-        res2 = dpt.where(condition, ar1, ar2, order="C")
-        assert res2.strides == (n, 1)
-
-        ar1 = dpt.ones(n, dtype=dt1, order="C")
-        ar2 = dpt.broadcast_to(dpt.ones(n, dtype=dt2, order="C"), test_sh)
-        condition = dpt.zeros(n, dtype="?", order="C")
-        res = dpt.where(condition, ar1, ar2, order="K")
-        assert res.strides == (20, 1)
-
-
-def test_where_unaligned():
-    get_queue_or_skip()
-
-    x = dpt.ones(513, dtype="i4")
-    a = dpt.full(512, 2, dtype="i4")
-    b = dpt.zeros(512, dtype="i4")
-
-    expected = dpt.full(512, 2, dtype="i4")
-    assert dpt.all(dpt.where(x[1:], a, b) == expected)
-
-
-def test_where_out():
-    get_queue_or_skip()
-
-    n1, n2, n3 = 3, 4, 5
-    ar1 = dpt.reshape(dpt.arange(n1 * n2 * n3, dtype="i4"), (n1, n2, n3))
-    ar2 = dpt.full_like(ar1, -5)
-    condition = dpt.tile(
-        dpt.reshape(
-            dpt.asarray([True, False, False, True], dtype="?"), (1, n2, 1)
-        ),
-        (n1, 1, n3),
-    )
-
-    out = dpt.zeros((2 * n1, 3 * n2, n3), dtype="i4")
-    res = dpt.where(condition, ar1, ar2, out=out[::-2, 1::3, :])
-
-    assert dpt.all(res == out[::-2, 1::3, :])
-    assert dpt.all(out[::-2, 0::3, :] == 0)
-    assert dpt.all(out[::-2, 2::3, :] == 0)
-
-    assert dpt.all(res[:, 1:3, :] == -5)
-    assert dpt.all(res[:, 0, :] == ar1[:, 0, :])
-    assert dpt.all(res[:, 3, :] == ar1[:, 3, :])
-
-    condition = dpt.tile(
-        dpt.reshape(dpt.asarray([1, 0], dtype="i4"), (1, 2, 1)),
-        (n1, 2, n3),
-    )
-    res = dpt.where(
-        condition[:, ::-1, :], condition[:, ::-1, :], condition, out=condition
-    )
-    assert dpt.all(res == condition)
-    assert dpt.all(condition == 1)
-
-    condition = dpt.tile(
-        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
-        (n1, 2, n3),
-    )
-    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
-    ar2 = dpt.full_like(ar1, -5)
-    res = dpt.where(condition, ar1, ar2, out=ar2[:, ::-1, :])
-    assert dpt.all(ar2[:, ::-1, :] == res)
-    assert dpt.all(ar2[:, ::2, :] == -5)
-    assert dpt.all(ar2[:, 1::2, :] == 7)
-
-    condition = dpt.tile(
-        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2, 1)),
-        (n1, 2, n3),
-    )
-    ar1 = dpt.full((n1, n2, n3), 7, dtype="i4")
-    ar2 = dpt.full_like(ar1, -5)
-    res = dpt.where(condition, ar1, ar2, out=ar1[:, ::-1, :])
-    assert dpt.all(ar1[:, ::-1, :] == res)
-    assert dpt.all(ar1[:, ::2, :] == -5)
-    assert dpt.all(ar1[:, 1::2, :] == 7)
-
-
-def test_where_out_arg_validation():
-    q1 = get_queue_or_skip()
-    q2 = get_queue_or_skip()
-
-    condition = dpt.ones(5, dtype="i4", sycl_queue=q1)
-    x1 = dpt.ones(5, dtype="i4", sycl_queue=q1)
-    x2 = dpt.ones(5, dtype="i4", sycl_queue=q1)
-
-    out_wrong_queue = dpt.empty_like(condition, sycl_queue=q2)
-    out_wrong_dtype = dpt.empty_like(condition, dtype="f4")
-    out_wrong_shape = dpt.empty(6, dtype="i4", sycl_queue=q1)
-    out_not_writable = dpt.empty_like(condition)
-    out_not_writable.flags["W"] = False
-
-    with pytest.raises(TypeError):
-        dpt.where(condition, x1, x2, out=dict())
-    with pytest.raises(ExecutionPlacementError):
-        dpt.where(condition, x1, x2, out=out_wrong_queue)
-    with pytest.raises(ValueError):
-        dpt.where(condition, x1, x2, out=out_wrong_dtype)
-    with pytest.raises(ValueError):
-        dpt.where(condition, x1, x2, out=out_wrong_shape)
-    with pytest.raises(ValueError):
-        dpt.where(condition, x1, x2, out=out_not_writable)
-
-
-@pytest.mark.parametrize("arr_dt", _all_dtypes)
-def test_where_python_scalar(arr_dt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(arr_dt, q)
-
-    n1, n2 = 10, 10
-    condition = dpt.tile(
-        dpt.reshape(
-            dpt.asarray([True, False], dtype="?", sycl_queue=q), (1, 2)
-        ),
-        (n1, n2 // 2),
-    )
-    x = dpt.zeros((n1, n2), dtype=arr_dt, sycl_queue=q)
-    py_scalars = (
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    )
-    for sc in py_scalars:
-        r = dpt.where(condition, x, sc)
-        assert isinstance(r, dpt.usm_ndarray)
-        r = dpt.where(condition, sc, x)
-        assert isinstance(r, dpt.usm_ndarray)
-
-
-def test_where_two_python_scalars():
-    get_queue_or_skip()
-
-    n1, n2 = 10, 10
-    condition = dpt.tile(
-        dpt.reshape(dpt.asarray([True, False], dtype="?"), (1, 2)),
-        (n1, n2 // 2),
-    )
-
-    py_scalars = [
-        bool(0),
-        int(0),
-        float(0),
-        complex(0),
-        np.float32(0),
-        ctypes.c_int(0),
-    ]
-
-    for sc1, sc2 in itertools.product(py_scalars, repeat=2):
-        r = dpt.where(condition, sc1, sc2)
-        assert isinstance(r, dpt.usm_ndarray)
diff --git a/dpctl/tests/test_usm_ndarray_searchsorted.py b/dpctl/tests/test_usm_ndarray_searchsorted.py
deleted file mode 100644
index dfec24de08..0000000000
--- a/dpctl/tests/test_usm_ndarray_searchsorted.py
+++ /dev/null
@@ -1,377 +0,0 @@
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.utils as dpu
-
-from .helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-def _check(hay_stack, needles, needles_np):
-    assert hay_stack.dtype == needles.dtype
-    assert hay_stack.ndim == 1
-
-    info_ = dpt.__array_namespace_info__()
-    default_dts_dev = info_.default_dtypes(device=hay_stack.device)
-    index_dt = default_dts_dev["indexing"]
-
-    p_left = dpt.searchsorted(hay_stack, needles, side="left")
-    assert p_left.dtype == index_dt
-
-    hs_np = dpt.asnumpy(hay_stack)
-    ref_left = np.searchsorted(hs_np, needles_np, side="left")
-    assert dpt.all(p_left == dpt.asarray(ref_left))
-
-    p_right = dpt.searchsorted(hay_stack, needles, side="right")
-    assert p_right.dtype == index_dt
-
-    ref_right = np.searchsorted(hs_np, needles_np, side="right")
-    assert dpt.all(p_right == dpt.asarray(ref_right))
-
-    sorter = dpt.arange(hay_stack.size)
-    ps_left = dpt.searchsorted(hay_stack, needles, side="left", sorter=sorter)
-    assert ps_left.dtype == index_dt
-    assert dpt.all(ps_left == p_left)
-    ps_right = dpt.searchsorted(hay_stack, needles, side="right", sorter=sorter)
-    assert ps_right.dtype == index_dt
-    assert dpt.all(ps_right == p_right)
-
-
-def test_searchsorted_contig_bool():
-    get_queue_or_skip()
-
-    dt = dpt.bool
-
-    hay_stack = dpt.arange(0, 1, dtype=dt)
-    needles_np = np.random.choice([True, False], size=1024)
-    needles = dpt.asarray(needles_np)
-
-    _check(hay_stack, needles, needles_np)
-    _check(
-        hay_stack,
-        dpt.reshape(needles, (32, 32)),
-        np.reshape(needles_np, (32, 32)),
-    )
-
-
-def test_searchsorted_strided_bool():
-    get_queue_or_skip()
-
-    dt = dpt.bool
-
-    hay_stack = dpt.repeat(dpt.arange(0, 1, dtype=dt), 4)[::4]
-    needles_np = np.random.choice([True, False], size=2 * 1024)
-    needles = dpt.asarray(needles_np)
-    sl = slice(None, None, -2)
-
-    _check(hay_stack, needles[sl], needles_np[sl])
-    _check(
-        hay_stack,
-        dpt.reshape(needles[sl], (32, 32)),
-        np.reshape(needles_np[sl], (32, 32)),
-    )
-
-
-@pytest.mark.parametrize(
-    "idt",
-    [
-        dpt.int8,
-        dpt.uint8,
-        dpt.int16,
-        dpt.uint16,
-        dpt.int32,
-        dpt.uint32,
-        dpt.int64,
-        dpt.uint64,
-    ],
-)
-def test_searchsorted_contig_int(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-    max_v = dpt.iinfo(dt).max
-
-    hay_stack = dpt.arange(0, min(max_v, 255), dtype=dt)
-    needles_np = np.random.randint(0, max_v, dtype=dt, size=1024)
-    needles = dpt.asarray(needles_np)
-
-    _check(hay_stack, needles, needles_np)
-    _check(
-        hay_stack,
-        dpt.reshape(needles, (32, 32)),
-        np.reshape(needles_np, (32, 32)),
-    )
-
-
-@pytest.mark.parametrize(
-    "idt",
-    [
-        dpt.int8,
-        dpt.uint8,
-        dpt.int16,
-        dpt.uint16,
-        dpt.int32,
-        dpt.uint32,
-        dpt.int64,
-        dpt.uint64,
-    ],
-)
-def test_searchsorted_strided_int(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-    max_v = dpt.iinfo(dt).max
-
-    hay_stack = dpt.repeat(dpt.arange(0, min(max_v, 255), dtype=dt), 4)[1::4]
-    needles_np = np.random.randint(0, max_v, dtype=dt, size=2 * 1024)
-    needles = dpt.asarray(needles_np)
-    sl = slice(None, None, -2)
-
-    _check(hay_stack, needles[sl], needles_np[sl])
-    _check(
-        hay_stack,
-        dpt.reshape(needles[sl], (32, 32)),
-        np.reshape(needles_np[sl], (32, 32)),
-    )
-
-
-def _add_extended_fp(array):
-    array[0] = -dpt.inf
-    array[-2] = dpt.inf
-    array[-1] = dpt.nan
-
-
-@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64])
-def test_searchsorted_contig_fp(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-
-    hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True)
-    _add_extended_fp(hay_stack)
-
-    needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt)
-    needles = dpt.asarray(needles_np)
-
-    _check(hay_stack, needles, needles_np)
-    _check(
-        hay_stack,
-        dpt.reshape(needles, (32, 32)),
-        np.reshape(needles_np, (32, 32)),
-    )
-
-
-@pytest.mark.parametrize("idt", [dpt.float16, dpt.float32, dpt.float64])
-def test_searchsorted_strided_fp(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-
-    hay_stack = dpt.repeat(
-        dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4
-    )[1::4]
-    _add_extended_fp(hay_stack)
-
-    needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt)
-    needles = dpt.asarray(needles_np)
-    sl = slice(1, None, 3)
-
-    _check(hay_stack, needles[sl], needles_np[sl])
-    _check(
-        hay_stack,
-        dpt.reshape(needles[sl], (32, 32)),
-        np.reshape(needles_np[sl], (32, 32)),
-    )
-
-
-def _add_extended_cfp(array):
-    dt = array.dtype
-    ev_li = [
-        complex(-dpt.inf, -1),
-        complex(-dpt.inf, -dpt.inf),
-        complex(-dpt.inf, dpt.inf),
-        complex(-dpt.inf, dpt.nan),
-        complex(0, -dpt.inf),
-        complex(0, -1),
-        complex(0, dpt.inf),
-        complex(0, dpt.nan),
-        complex(dpt.inf, -dpt.inf),
-        complex(dpt.inf, -1),
-        complex(dpt.inf, dpt.inf),
-        complex(dpt.inf, dpt.nan),
-        complex(dpt.nan, -dpt.inf),
-        complex(dpt.nan, -1),
-        complex(dpt.nan, dpt.inf),
-        complex(dpt.nan, dpt.nan),
-    ]
-    ev = dpt.asarray(ev_li, dtype=dt, device=array.device)
-    return dpt.sort(dpt.concat((ev, array)))
-
-
-@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128])
-def test_searchsorted_contig_cfp(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-
-    hay_stack = dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True)
-    hay_stack = _add_extended_cfp(hay_stack)
-    needles_np = np.random.uniform(-0.1, 1.1, size=1024).astype(dt)
-    needles = dpt.asarray(needles_np)
-
-    _check(hay_stack, needles, needles_np)
-    _check(
-        hay_stack,
-        dpt.reshape(needles, (32, 32)),
-        np.reshape(needles_np, (32, 32)),
-    )
-
-
-@pytest.mark.parametrize("idt", [dpt.complex64, dpt.complex128])
-def test_searchsorted_strided_cfp(idt):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(idt, q)
-
-    dt = dpt.dtype(idt)
-
-    hay_stack = dpt.repeat(
-        dpt.linspace(0, 1, num=255, dtype=dt, endpoint=True), 4
-    )[1::4]
-    needles_np = np.random.uniform(-0.1, 1.1, size=3 * 1024).astype(dt)
-    needles = dpt.asarray(needles_np)
-    sl = slice(1, None, 3)
-
-    _check(hay_stack, needles[sl], needles_np[sl])
-    _check(
-        hay_stack,
-        dpt.reshape(needles[sl], (32, 32)),
-        np.reshape(needles_np[sl], (32, 32)),
-    )
-
-    hay_stack = _add_extended_cfp(hay_stack)
-    _check(hay_stack, needles[sl], needles_np[sl])
-    _check(
-        hay_stack,
-        dpt.reshape(needles[sl], (32, 32)),
-        np.reshape(needles_np[sl], (32, 32)),
-    )
-
-
-def test_searchsorted_coerce():
-    get_queue_or_skip()
-
-    x1_i4 = dpt.arange(5, dtype="i4")
-    x1_i8 = dpt.arange(5, dtype="i8")
-    x2_i4 = dpt.arange(5, dtype="i4")
-    x2_i8 = dpt.arange(5, dtype="i8")
-
-    p1 = dpt.searchsorted(x1_i4, x2_i8)
-    p2 = dpt.searchsorted(x1_i8, x2_i8)
-    p3 = dpt.searchsorted(x1_i8, x2_i4)
-    assert dpt.all(p1 == p2)
-    assert dpt.all(p2 == p3)
-
-
-def test_searchsorted_validation():
-    with pytest.raises(TypeError):
-        dpt.searchsorted(None, None)
-    try:
-        x1 = dpt.arange(10, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("Default device could not be created")
-    with pytest.raises(TypeError):
-        dpt.searchsorted(x1, None)
-    with pytest.raises(TypeError):
-        dpt.searchsorted(x1, x1, sorter=dict())
-    with pytest.raises(ValueError):
-        dpt.searchsorted(x1, x1, side="unknown")
-
-
-def test_searchsorted_validation2():
-    try:
-        x1 = dpt.arange(10, dtype="i4")
-        sorter = dpt.arange(10, dtype="i4")
-    except dpctl.SyclDeviceCreationError:
-        pytest.skip("Default device could not be created")
-    d = x1.sycl_device
-    q2 = dpctl.SyclQueue(d, property="in_order")
-    x2 = dpt.ones(5, dtype=x1.dtype, sycl_queue=q2)
-
-    with pytest.raises(dpu.ExecutionPlacementError):
-        dpt.searchsorted(x1, x2)
-
-    with pytest.raises(dpu.ExecutionPlacementError):
-        dpt.searchsorted(x1, x2, sorter=sorter)
-
-    sorter = dpt.ones(x1.shape, dtype=dpt.bool)
-    # non-integral sorter.dtype raises
-    with pytest.raises(ValueError):
-        dpt.searchsorted(x1, x1, sorter=sorter)
-
-    # non-matching x1.shape and sorter.shape raises
-    with pytest.raises(ValueError):
-        dpt.searchsorted(x1, x1, sorter=sorter[:-1])
-
-    # x1 must be 1d, or ValueError is raised
-    with pytest.raises(ValueError):
-        dpt.searchsorted(x1[dpt.newaxis, :], x1)
-
-
-def test_pw_linear_interpolation_example():
-    get_queue_or_skip()
-
-    bins = dpt.asarray([0.0, 0.05, 0.2, 0.25, 0.5, 0.8, 0.95, 1])
-    vals = dpt.asarray([0.1, 0.15, 0.3, 0.5, 0.7, 0.53, 0.37, 0.1])
-    assert vals.shape == bins.shape
-    data_np = np.random.uniform(0, 1, size=10000)
-    data = dpt.asarray(data_np)
-
-    p = dpt.searchsorted(bins, data)
-    w = (data - bins[p]) / (bins[p - 1] - bins[p])
-    assert dpt.min(w) >= 0
-    assert dpt.max(w) <= 1
-    interp_vals = vals[p - 1] * w + (1 - w) * vals[p]
-
-    assert interp_vals.shape == data.shape
-    assert dpt.min(interp_vals) >= dpt.zeros(tuple())
-    av = dpt.sum(interp_vals) / data.size
-    exp = dpt.vecdot(vals[1:] + vals[:-1], bins[1:] - bins[:-1]) / 2
-
-    assert dpt.abs(av - exp) < 0.1
-
-
-def test_out_of_bound_sorter_values():
-    get_queue_or_skip()
-
-    x = dpt.asarray([1, 2, 0], dtype="i4")
-    n = x.shape[0]
-
-    # use out-of-bounds indices in sorter
-    sorter = dpt.asarray([2, 0 - n, 1 - n], dtype="i8")
-
-    x2 = dpt.arange(3, dtype=x.dtype)
-    p = dpt.searchsorted(x, x2, sorter=sorter)
-    # verify that they were applied with mode="wrap"
-    assert dpt.all(p == dpt.arange(3, dtype=p.dtype))
-
-
-def test_searchsorted_strided_scalar_needle():
-    get_queue_or_skip()
-
-    a_max = 255
-
-    hay_stack = dpt.flip(
-        dpt.repeat(dpt.arange(a_max - 1, -1, -1, dtype=dpt.int32), 4)
-    )
-    needles_np = np.squeeze(
-        np.random.randint(0, a_max, dtype=dpt.int32, size=1), axis=0
-    )
-    needles = dpt.asarray(needles_np)
-
-    _check(hay_stack, needles, needles_np)
diff --git a/dpctl/tests/test_usm_ndarray_sorting.py b/dpctl/tests/test_usm_ndarray_sorting.py
deleted file mode 100644
index 1bd20913ca..0000000000
--- a/dpctl/tests/test_usm_ndarray_sorting.py
+++ /dev/null
@@ -1,381 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import itertools
-
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_sort_1d(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    inp = dpt.roll(
-        dpt.concat(
-            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
-        ),
-        734,
-    )
-
-    s = dpt.sort(inp, descending=False)
-    assert dpt.all(s[:-1] <= s[1:])
-
-    s1 = dpt.sort(inp, descending=True)
-    assert dpt.all(s1[:-1] >= s1[1:])
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_sort_2d(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    fl = dpt.roll(
-        dpt.concat(
-            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
-        ),
-        734,
-    )
-    inp = dpt.reshape(fl, (20, -1))
-
-    s = dpt.sort(inp, axis=1, descending=False)
-    assert dpt.all(s[:, :-1] <= s[:, 1:])
-
-    s1 = dpt.sort(inp, axis=1, descending=True)
-    assert dpt.all(s1[:, :-1] >= s1[:, 1:])
-
-
-def test_sort_strides():
-    get_queue_or_skip()
-
-    fl = dpt.roll(
-        dpt.concat((dpt.ones(10000, dtype="i4"), dpt.zeros(10000, dtype="i4"))),
-        734,
-    )
-    inp = dpt.reshape(fl, (-1, 20))
-
-    s = dpt.sort(inp, axis=0, descending=False)
-    assert dpt.all(s[:-1, :] <= s[1:, :])
-
-    s1 = dpt.sort(inp, axis=0, descending=True)
-    assert dpt.all(s1[:-1, :] >= s1[1:, :])
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_argsort_1d(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    inp = dpt.roll(
-        dpt.concat(
-            (dpt.ones(10000, dtype=dtype), dpt.zeros(10000, dtype=dtype))
-        ),
-        734,
-    )
-
-    s_idx = dpt.argsort(inp, descending=False)
-    assert dpt.all(inp[s_idx[:-1]] <= inp[s_idx[1:]])
-
-    s1_idx = dpt.argsort(inp, descending=True)
-    assert dpt.all(inp[s1_idx[:-1]] >= inp[s1_idx[1:]])
-
-
-def test_sort_validation():
-    with pytest.raises(TypeError):
-        dpt.sort(dict())
-
-
-def test_sort_validation_kind():
-    get_queue_or_skip()
-
-    x = dpt.ones(128, dtype="u1")
-
-    with pytest.raises(ValueError):
-        dpt.sort(x, kind=Ellipsis)
-
-    with pytest.raises(ValueError):
-        dpt.sort(x, kind="invalid")
-
-
-def test_argsort_validation():
-    with pytest.raises(TypeError):
-        dpt.argsort(dict())
-
-
-def test_argsort_validation_kind():
-    get_queue_or_skip()
-
-    x = dpt.arange(127, stop=0, step=-1, dtype="i1")
-
-    with pytest.raises(ValueError):
-        dpt.argsort(x, kind=Ellipsis)
-
-    with pytest.raises(ValueError):
-        dpt.argsort(x, kind="invalid")
-
-
-_all_kinds = ["stable", "mergesort", "radixsort"]
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_sort_axis0(kind):
-    get_queue_or_skip()
-
-    n, m = 200, 30
-    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
-    x = dpt.reshape(xf, (n, m))
-    s = dpt.sort(x, axis=0, kind=kind)
-
-    assert dpt.all(s[:-1, :] <= s[1:, :])
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_argsort_axis0(kind):
-    get_queue_or_skip()
-
-    n, m = 200, 30
-    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
-    x = dpt.reshape(xf, (n, m))
-    idx = dpt.argsort(x, axis=0, kind=kind)
-
-    s = dpt.take_along_axis(x, idx, axis=0)
-
-    assert dpt.all(s[:-1, :] <= s[1:, :])
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_argsort_axis1(kind):
-    get_queue_or_skip()
-
-    n, m = 200, 30
-    xf = dpt.arange(n * m, 0, step=-1, dtype="i4")
-    x = dpt.reshape(xf, (n, m))
-    idx = dpt.argsort(x, axis=1, kind=kind)
-
-    s = dpt.take_along_axis(x, idx, axis=1)
-
-    assert dpt.all(s[:, :-1] <= s[:, 1:])
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_sort_strided(kind):
-    get_queue_or_skip()
-
-    x_orig = dpt.arange(100, dtype="i4")
-    x_flipped = dpt.flip(x_orig, axis=0)
-    s = dpt.sort(x_flipped, kind=kind)
-
-    assert dpt.all(s == x_orig)
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_argsort_strided(kind):
-    get_queue_or_skip()
-
-    x_orig = dpt.arange(100, dtype="i4")
-    x_flipped = dpt.flip(x_orig, axis=0)
-    idx = dpt.argsort(x_flipped, kind=kind)
-    s = dpt.take_along_axis(x_flipped, idx, axis=0)
-
-    assert dpt.all(s == x_orig)
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_sort_0d_array(kind):
-    get_queue_or_skip()
-
-    x = dpt.asarray(1, dtype="i4")
-    expected = dpt.asarray(1, dtype="i4")
-    assert dpt.sort(x, kind=kind) == expected
-
-
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_argsort_0d_array(kind):
-    get_queue_or_skip()
-
-    x = dpt.asarray(1, dtype="i4")
-    expected = dpt.asarray(0, dtype="i4")
-    assert dpt.argsort(x, kind=kind) == expected
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "f2",
-        "f4",
-        "f8",
-    ],
-)
-@pytest.mark.parametrize("kind", _all_kinds)
-def test_sort_real_fp_nan(dtype, kind):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.asarray(
-        [-0.0, 0.1, dpt.nan, 0.0, -0.1, dpt.nan, 0.2, -0.3], dtype=dtype
-    )
-    s = dpt.sort(x, kind=kind)
-
-    expected = dpt.asarray(
-        [-0.3, -0.1, -0.0, 0.0, 0.1, 0.2, dpt.nan, dpt.nan], dtype=dtype
-    )
-
-    assert dpt.allclose(s, expected, equal_nan=True)
-
-    s = dpt.sort(x, descending=True, kind=kind)
-
-    expected = dpt.asarray(
-        [dpt.nan, dpt.nan, 0.2, 0.1, -0.0, 0.0, -0.1, -0.3], dtype=dtype
-    )
-
-    assert dpt.allclose(s, expected, equal_nan=True)
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "c8",
-        "c16",
-    ],
-)
-def test_sort_complex_fp_nan(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    rvs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
-    ivs = [-0.0, 0.1, 0.0, 0.2, -0.3, dpt.nan]
-
-    cv = []
-    for rv in rvs:
-        for iv in ivs:
-            cv.append(complex(rv, iv))
-
-    inp = dpt.asarray(cv, dtype=dtype)
-    s = dpt.sort(inp)
-
-    expected = np.sort(dpt.asnumpy(inp))
-
-    assert np.allclose(dpt.asnumpy(s), expected, equal_nan=True)
-
-    pairs = []
-    for i, j in itertools.permutations(range(inp.shape[0]), 2):
-        pairs.append([i, j])
-    sub_arrs = inp[dpt.asarray(pairs)]
-    m1 = dpt.asnumpy(dpt.sort(sub_arrs, axis=1))
-    m2 = np.sort(dpt.asnumpy(sub_arrs), axis=1)
-    for k in range(len(pairs)):
-        i, j = pairs[k]
-        r1 = m1[k]
-        r2 = m2[k]
-        assert np.array_equal(
-            r1.view(np.int64), r2.view(np.int64)
-        ), f"Failed for {i} and {j}"
-
-
-def test_radix_sort_size_1_axis():
-    get_queue_or_skip()
-
-    x1 = dpt.ones((), dtype="i1")
-    r1 = dpt.sort(x1, kind="radixsort")
-    assert_array_equal(dpt.asnumpy(r1), dpt.asnumpy(x1))
-
-    x2 = dpt.ones([1], dtype="i1")
-    r2 = dpt.sort(x2, kind="radixsort")
-    assert_array_equal(dpt.asnumpy(r2), dpt.asnumpy(x2))
-
-    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
-    r3 = dpt.sort(x3, kind="radixsort")
-    assert dpt.asnumpy(r3 == x3).all()
-
-    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
-    r4 = dpt.sort(x4, axis=0, kind="radixsort")
-    assert dpt.asnumpy(r4 == x4).all()
-
-
-def test_radix_argsort_size_1_axis():
-    get_queue_or_skip()
-
-    x1 = dpt.ones((), dtype="i1")
-    r1 = dpt.argsort(x1, kind="radixsort")
-    assert r1 == 0
-
-    x2 = dpt.ones([1], dtype="i1")
-    r2 = dpt.argsort(x2, kind="radixsort")
-    assert dpt.asnumpy(r2 == 0).all()
-
-    x3 = dpt.reshape(dpt.arange(10, dtype="i1"), (10, 1))
-    r3 = dpt.argsort(x3, kind="radixsort")
-    assert dpt.asnumpy(r3 == 0).all()
-
-    x4 = dpt.reshape(dpt.arange(10, dtype="i1"), (1, 10))
-    r4 = dpt.argsort(x4, axis=0, kind="radixsort")
-    assert dpt.asnumpy(r4 == 0).all()
diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py
deleted file mode 100644
index 80371870fc..0000000000
--- a/dpctl/tests/test_usm_ndarray_top_k.py
+++ /dev/null
@@ -1,315 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-def _expected_largest_inds(inp, n, shift, k):
-    "Computed expected top_k indices for mode='largest'"
-    assert k < n
-    ones_start_id = shift % (2 * n)
-
-    alloc_dev = inp.device
-
-    if ones_start_id < n:
-        expected_inds = dpt.arange(
-            ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev
-        )
-    else:
-        # wrap-around
-        ones_end_id = (ones_start_id + n) % (2 * n)
-        if ones_end_id >= k:
-            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
-        else:
-            expected_inds = dpt.concat(
-                (
-                    dpt.arange(ones_end_id, dtype="i8", device=alloc_dev),
-                    dpt.arange(
-                        ones_start_id,
-                        ones_start_id + k - ones_end_id,
-                        dtype="i8",
-                        device=alloc_dev,
-                    ),
-                )
-            )
-
-    return expected_inds
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193])
-def test_top_k_1d_largest(dtype, n):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    shift, k = 734, 5
-    o = dpt.ones(n, dtype=dtype)
-    z = dpt.zeros(n, dtype=dtype)
-    oz = dpt.concat((o, z))
-    inp = dpt.roll(oz, shift)
-
-    expected_inds = _expected_largest_inds(oz, n, shift, k)
-
-    s = dpt.top_k(inp, k, mode="largest")
-    assert s.values.shape == (k,)
-    assert s.values.dtype == inp.dtype
-    assert s.indices.shape == (k,)
-    assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values
-    assert dpt.all(s.values == inp[s.indices]), s.indices
-    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
-
-
-def _expected_smallest_inds(inp, n, shift, k):
-    "Computed expected top_k indices for mode='smallest'"
-    assert k < n
-    zeros_start_id = (n + shift) % (2 * n)
-    zeros_end_id = (shift) % (2 * n)
-
-    alloc_dev = inp.device
-
-    if zeros_start_id < zeros_end_id:
-        expected_inds = dpt.arange(
-            zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev
-        )
-    else:
-        if zeros_end_id >= k:
-            expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev)
-        else:
-            expected_inds = dpt.concat(
-                (
-                    dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev),
-                    dpt.arange(
-                        zeros_start_id,
-                        zeros_start_id + k - zeros_end_id,
-                        dtype="i8",
-                        device=alloc_dev,
-                    ),
-                )
-            )
-
-    return expected_inds
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
-def test_top_k_1d_smallest(dtype, n):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    shift, k = 734, 5
-    o = dpt.ones(n, dtype=dtype)
-    z = dpt.zeros(n, dtype=dtype)
-    oz = dpt.concat((o, z))
-    inp = dpt.roll(oz, shift)
-
-    expected_inds = _expected_smallest_inds(oz, n, shift, k)
-
-    s = dpt.top_k(inp, k, mode="smallest")
-    assert s.values.shape == (k,)
-    assert s.values.dtype == inp.dtype
-    assert s.indices.shape == (k,)
-    assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values
-    assert dpt.all(s.values == inp[s.indices]), s.indices
-    assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds)
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        # skip short types to ensure that m*n can be represented
-        # in the type
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
-def test_top_k_2d_largest(dtype, n):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m, k = 8, 3
-    if dtype == "f2" and m * n > 2000:
-        pytest.skip(
-            "f2 can not distinguish between large integers used in this test"
-        )
-
-    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
-
-    r = dpt.top_k(x, k, axis=1)
-
-    assert r.values.shape == (m, k)
-    assert r.indices.shape == (m, k)
-    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
-        :, -k:
-    ]
-    assert expected_inds.shape == (1, k)
-    assert dpt.all(
-        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
-    ), (r.indices, expected_inds)
-    expected_vals = x[:, -k:]
-    assert dpt.all(
-        dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1)
-    )
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        # skip short types to ensure that m*n can be represented
-        # in the type
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193])
-def test_top_k_2d_smallest(dtype, n):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    m, k = 8, 3
-    if dtype == "f2" and m * n > 2000:
-        pytest.skip(
-            "f2 can not distinguish between large integers used in this test"
-        )
-
-    x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n))
-
-    r = dpt.top_k(x, k, axis=1, mode="smallest")
-
-    assert r.values.shape == (m, k)
-    assert r.indices.shape == (m, k)
-    expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[
-        :, :k
-    ]
-    assert dpt.all(
-        dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1)
-    )
-    assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1))
-
-
-def test_top_k_0d():
-    get_queue_or_skip()
-
-    a = dpt.ones(tuple(), dtype="i4")
-    assert a.ndim == 0
-    assert a.size == 1
-
-    r = dpt.top_k(a, 1)
-    assert r.values == a
-    assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype)
-
-
-def test_top_k_noncontig():
-    get_queue_or_skip()
-
-    a = dpt.arange(256, dtype=dpt.int32)[::2]
-    r = dpt.top_k(a, 3)
-
-    assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values
-    assert dpt.all(
-        dpt.sort(r.indices) == dpt.asarray([125, 126, 127])
-    ), r.indices
-
-
-def test_top_k_axis0():
-    get_queue_or_skip()
-
-    m, n, k = 128, 8, 3
-    x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n))
-
-    r = dpt.top_k(x, k, axis=0, mode="smallest")
-    assert r.values.shape == (k, n)
-    assert r.indices.shape == (k, n)
-    expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[
-        :k, :
-    ]
-    assert dpt.all(
-        dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0)
-    )
-    assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0))
-
-
-def test_top_k_validation():
-    get_queue_or_skip()
-    x = dpt.ones(10, dtype=dpt.int64)
-    with pytest.raises(ValueError):
-        # k must be positive
-        dpt.top_k(x, -1)
-    with pytest.raises(TypeError):
-        # argument should be usm_ndarray
-        dpt.top_k(list(), 2)
-    x2 = dpt.reshape(x, (2, 5))
-    with pytest.raises(ValueError):
-        # k must not exceed array dimension
-        # along specified axis
-        dpt.top_k(x2, 100, axis=1)
-    with pytest.raises(ValueError):
-        # for 0d arrays, k must be 1
-        dpt.top_k(x[0], 2)
-    with pytest.raises(ValueError):
-        # mode must be "largest", or "smallest"
-        dpt.top_k(x, 2, mode="invalid")
diff --git a/dpctl/tests/test_usm_ndarray_unique.py b/dpctl/tests/test_usm_ndarray_unique.py
deleted file mode 100644
index 4632061ab6..0000000000
--- a/dpctl/tests/test_usm_ndarray_unique.py
+++ /dev/null
@@ -1,345 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_unique_values(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, roll = 10000, 734
-    inp = dpt.roll(
-        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
-        roll,
-    )
-
-    uv = dpt.unique_values(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
-
-
-def test_unique_values_strided():
-    get_queue_or_skip()
-
-    n, m = 1000, 20
-    inp = dpt.ones((n, m), dtype="i4", order="F")
-    inp[:, ::2] = 0
-
-    uv = dpt.unique_values(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-
-    inp = dpt.reshape(inp, -1)
-    inp = dpt.flip(dpt.reshape(inp, -1))
-
-    uv = dpt.unique_values(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_unique_counts(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, roll = 10000, 734
-    inp = dpt.roll(
-        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
-        roll,
-    )
-
-    uv, uv_counts = dpt.unique_counts(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
-    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
-
-
-def test_unique_counts_strided():
-    get_queue_or_skip()
-
-    n, m = 1000, 20
-    inp = dpt.ones((n, m), dtype="i4", order="F")
-    inp[:, ::2] = 0
-
-    uv, uv_counts = dpt.unique_counts(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
-
-    inp = dpt.flip(dpt.reshape(inp, -1))
-
-    uv, uv_counts = dpt.unique_counts(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_unique_inverse(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, roll = 10000, 734
-    inp = dpt.roll(
-        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
-        roll,
-    )
-
-    uv, inv = dpt.unique_inverse(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-
-
-def test_unique_inverse_strided():
-    get_queue_or_skip()
-
-    n, m = 1000, 20
-    inp = dpt.ones((n, m), dtype="i4", order="F")
-    inp[:, ::2] = 0
-
-    uv, inv = dpt.unique_inverse(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-
-    inp = dpt.flip(dpt.reshape(inp, -1))
-
-    uv, inv = dpt.unique_inverse(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-
-
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        "i1",
-        "u1",
-        "i2",
-        "u2",
-        "i4",
-        "u4",
-        "i8",
-        "u8",
-        "f2",
-        "f4",
-        "f8",
-        "c8",
-        "c16",
-    ],
-)
-def test_unique_all(dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    n, roll = 10000, 734
-    inp = dpt.roll(
-        dpt.concat((dpt.ones(n, dtype=dtype), dpt.zeros(n, dtype=dtype))),
-        roll,
-    )
-
-    uv, ind, inv, uv_counts = dpt.unique_all(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype=dtype))
-    assert dpt.all(uv == inp[ind])
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-    assert dpt.all(uv_counts == dpt.full(2, n, dtype=uv_counts.dtype))
-
-
-def test_unique_all_strided():
-    get_queue_or_skip()
-
-    n, m = 1000, 20
-    inp = dpt.ones((n, m), dtype="i4", order="F")
-    inp[:, ::2] = 0
-
-    uv, ind, inv, uv_counts = dpt.unique_all(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(uv == dpt.reshape(inp, -1)[ind])
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
-
-    inp = dpt.flip(dpt.reshape(inp, -1))
-
-    uv, ind, inv, uv_counts = dpt.unique_all(inp)
-    assert dpt.all(uv == dpt.arange(2, dtype="i4"))
-    assert dpt.all(uv == inp[ind])
-    assert dpt.all(inp == uv[inv])
-    assert inp.shape == inv.shape
-    assert dpt.all(uv_counts == dpt.full(2, n / 2 * m, dtype=uv_counts.dtype))
-
-
-def test_set_functions_empty_input():
-    get_queue_or_skip()
-    x = dpt.ones((10, 0, 1), dtype="i4")
-
-    res = dpt.unique_values(x)
-    assert isinstance(res, dpctl.tensor.usm_ndarray)
-    assert res.size == 0
-    assert res.dtype == x.dtype
-
-    res = dpt.unique_inverse(x)
-    assert type(res).__name__ == "UniqueInverseResult"
-    uv, inv = res
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert uv.size == 0
-    assert isinstance(inv, dpctl.tensor.usm_ndarray)
-    assert inv.size == 0
-
-    res = dpt.unique_counts(x)
-    assert type(res).__name__ == "UniqueCountsResult"
-    uv, uv_counts = res
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert uv.size == 0
-    assert isinstance(uv_counts, dpctl.tensor.usm_ndarray)
-    assert uv_counts.size == 0
-
-    res = dpt.unique_all(x)
-    assert type(res).__name__ == "UniqueAllResult"
-    uv, ind, inv, uv_counts = res
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert uv.size == 0
-    assert isinstance(ind, dpctl.tensor.usm_ndarray)
-    assert ind.size == 0
-    assert isinstance(inv, dpctl.tensor.usm_ndarray)
-    assert inv.size == 0
-    assert isinstance(uv_counts, dpctl.tensor.usm_ndarray)
-    assert uv_counts.size == 0
-
-
-def test_set_function_outputs():
-    get_queue_or_skip()
-    # check standard and early exit paths
-    x1 = dpt.arange(10, dtype="i4")
-    x2 = dpt.ones((10, 10), dtype="i4")
-
-    assert isinstance(dpt.unique_values(x1), dpctl.tensor.usm_ndarray)
-    assert isinstance(dpt.unique_values(x2), dpctl.tensor.usm_ndarray)
-
-    assert type(dpt.unique_inverse(x1)).__name__ == "UniqueInverseResult"
-    assert type(dpt.unique_inverse(x2)).__name__ == "UniqueInverseResult"
-
-    assert type(dpt.unique_counts(x1)).__name__ == "UniqueCountsResult"
-    assert type(dpt.unique_counts(x2)).__name__ == "UniqueCountsResult"
-
-    assert type(dpt.unique_all(x1)).__name__ == "UniqueAllResult"
-    assert type(dpt.unique_all(x2)).__name__ == "UniqueAllResult"
-
-
-def test_set_functions_compute_follows_data():
-    # tests that all intermediate calls and allocations
-    # are compatible with an input with an arbitrary queue
-    get_queue_or_skip()
-    q = dpctl.SyclQueue()
-    x = dpt.arange(10, dtype="i4", sycl_queue=q)
-
-    uv = dpt.unique_values(x)
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert uv.sycl_queue == q
-    uv, uc = dpt.unique_counts(x)
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert isinstance(uc, dpctl.tensor.usm_ndarray)
-    assert uv.sycl_queue == q
-    assert uc.sycl_queue == q
-    uv, inv_ind = dpt.unique_inverse(x)
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert isinstance(inv_ind, dpctl.tensor.usm_ndarray)
-    assert uv.sycl_queue == q
-    assert inv_ind.sycl_queue == q
-    uv, ind, inv_ind, uc = dpt.unique_all(x)
-    assert isinstance(uv, dpctl.tensor.usm_ndarray)
-    assert isinstance(ind, dpctl.tensor.usm_ndarray)
-    assert isinstance(inv_ind, dpctl.tensor.usm_ndarray)
-    assert isinstance(uc, dpctl.tensor.usm_ndarray)
-    assert uv.sycl_queue == q
-    assert ind.sycl_queue == q
-    assert inv_ind.sycl_queue == q
-    assert uc.sycl_queue == q
-
-
-def test_gh_1738():
-    get_queue_or_skip()
-
-    ones = dpt.ones(10, dtype="i8")
-    iota = dpt.arange(10, dtype="i8")
-
-    assert ones.device == iota.device
-
-    dpt_info = dpt.__array_namespace_info__()
-    ind_dt = dpt_info.default_dtypes(device=ones.device)["indexing"]
-
-    dt = dpt.unique_inverse(ones).inverse_indices.dtype
-    assert dt == ind_dt
-    dt = dpt.unique_all(ones).inverse_indices.dtype
-    assert dt == ind_dt
-
-    dt = dpt.unique_inverse(iota).inverse_indices.dtype
-    assert dt == ind_dt
-    dt = dpt.unique_all(iota).inverse_indices.dtype
-    assert dt == ind_dt
diff --git a/dpctl/tests/test_usm_ndarray_utility_functions.py b/dpctl/tests/test_usm_ndarray_utility_functions.py
deleted file mode 100644
index f3afe49923..0000000000
--- a/dpctl/tests/test_usm_ndarray_utility_functions.py
+++ /dev/null
@@ -1,167 +0,0 @@
-from random import randrange
-
-import numpy as np
-import pytest
-from numpy.testing import assert_array_equal, assert_equal
-
-import dpctl.tensor as dpt
-from dpctl.tensor._numpy_helper import AxisError
-from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
-
-_all_dtypes = [
-    "?",
-    "i1",
-    "u1",
-    "i2",
-    "u2",
-    "i4",
-    "u4",
-    "i8",
-    "u8",
-    "f2",
-    "f4",
-    "f8",
-    "c8",
-    "c16",
-]
-
-
-@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_boolean_reduction_dtypes_contig(func, identity, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.full(10, identity, dtype=dtype, sycl_queue=q)
-    res = func(x)
-
-    assert_equal(dpt.asnumpy(res), identity)
-
-    x[randrange(x.size)] = not identity
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), not identity)
-
-    # test branch in kernel for large arrays
-    wg_size = 4 * 32
-    x = dpt.full((wg_size + 1), identity, dtype=dtype, sycl_queue=q)
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), identity)
-
-    x[randrange(x.size)] = not identity
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), not identity)
-
-
-@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
-@pytest.mark.parametrize("dtype", _all_dtypes)
-def test_boolean_reduction_dtypes_strided(func, identity, dtype):
-    q = get_queue_or_skip()
-    skip_if_dtype_not_supported(dtype, q)
-
-    x = dpt.full(20, identity, dtype=dtype, sycl_queue=q)[::-2]
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), identity)
-
-    x[randrange(x.size)] = not identity
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), not identity)
-
-
-@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
-def test_boolean_reduction_axis(func, identity):
-    get_queue_or_skip()
-
-    x = dpt.full((2, 3, 4, 5, 6), identity, dtype="i4")
-    res = func(x, axis=(1, 2, -1))
-
-    assert res.shape == (2, 5)
-    assert_array_equal(dpt.asnumpy(res), np.full(res.shape, identity))
-
-    # make first row of output negation of identity
-    x[0, 0, 0, ...] = not identity
-    res = func(x, axis=(1, 2, -1))
-    assert_array_equal(dpt.asnumpy(res[0]), np.full(res.shape[1], not identity))
-
-
-@pytest.mark.parametrize("func", [dpt.all, dpt.any])
-def test_boolean_reduction_keepdims(func):
-    get_queue_or_skip()
-
-    x = dpt.ones((2, 3, 4, 5, 6), dtype="i4")
-    res = func(x, axis=(1, 2, -1), keepdims=True)
-    assert res.shape == (2, 1, 1, 5, 1)
-    assert_array_equal(dpt.asnumpy(res), np.full(res.shape, True))
-
-    res = func(x, axis=None, keepdims=True)
-    assert res.shape == (1,) * x.ndim
-
-
-@pytest.mark.parametrize("func,identity", [(dpt.all, True), (dpt.any, False)])
-def test_boolean_reduction_empty(func, identity):
-    get_queue_or_skip()
-
-    x = dpt.empty((0,), dtype="i4")
-    res = func(x)
-    assert_equal(dpt.asnumpy(res), identity)
-
-
-# nan, inf, and -inf should evaluate to true
-@pytest.mark.parametrize("func", [dpt.all, dpt.any])
-def test_boolean_reductions_nan_inf(func):
-    q = get_queue_or_skip()
-
-    x = dpt.asarray([dpt.nan, dpt.inf, -dpt.inf], dtype="f4", sycl_queue=q)[
-        :, dpt.newaxis
-    ]
-    res = func(x, axis=1)
-    assert_equal(dpt.asnumpy(res), True)
-
-
-@pytest.mark.parametrize("func", [dpt.all, dpt.any])
-def test_boolean_reduction_scalars(func):
-    get_queue_or_skip()
-
-    x = dpt.ones((), dtype="i4")
-    assert_equal(dpt.asnumpy(func(x)), True)
-
-    x = dpt.zeros((), dtype="i4")
-    assert_equal(dpt.asnumpy(func(x)), False)
-
-
-@pytest.mark.parametrize("func", [dpt.all, dpt.any])
-def test_boolean_reduction_empty_axis(func):
-    get_queue_or_skip()
-
-    x = dpt.ones((5,), dtype="i4")
-    res = func(x, axis=())
-    assert_array_equal(dpt.asnumpy(res), dpt.asnumpy(x).astype(np.bool_))
-
-
-@pytest.mark.parametrize("func", [dpt.all, dpt.any])
-def test_arg_validation_boolean_reductions(func):
-    get_queue_or_skip()
-
-    x = dpt.ones((4, 5), dtype="i4")
-    d = dict()
-
-    with pytest.raises(TypeError):
-        func(d)
-    with pytest.raises(AxisError):
-        func(x, axis=-3)
-
-
-def test_boolean_reductions_3d_gh_1327():
-    get_queue_or_skip()
-
-    size = 24
-    x = dpt.reshape(dpt.arange(-10, size - 10, 1, dtype="i4"), (2, 3, 4))
-    res = dpt.all(x, axis=0)
-    res_np = np.full(res.shape, True, dtype="?")
-    res_np[2, 2] = False
-
-    assert (dpt.asnumpy(res) == res_np).all()
-
-    x = dpt.ones((2, 3, 4, 5), dtype="i4")
-    res = dpt.any(x, axis=0)
-
-    assert (dpt.asnumpy(res) == np.full(res.shape, True, dtype="?")).all()
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index a684983c74..40569da8a4 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -33,11 +33,6 @@ foreach(python_module_name ${_pybind11_targets})
         target_link_options(${python_module_name} PRIVATE --offload-compress)
     endif()
 
-    target_include_directories(${python_module_name}
-        PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include
-        ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
-    )
     target_link_options(${python_module_name} PRIVATE ${_linker_options})
     if(DPCTL_GENERATE_COVERAGE)
         if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS)
diff --git a/setup.py b/setup.py
index d0b636c02c..1b10322ef8 100644
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,6 @@
     packages=[
         "dpctl",
         "dpctl.memory",
-        "dpctl.tensor",
         "dpctl.program",
         "dpctl.utils",
     ],
@@ -42,19 +41,11 @@
             "include/syclinterface/*.h*",
             "include/syclinterface/Config/*.h",
             "include/syclinterface/Support/*.h",
-            "tensor/libtensor/include/kernels/*.h*",
-            "tensor/libtensor/include/kernels/sorting/*.h*",
-            "tensor/libtensor/include/kernels/elementwise_functions/*.h*",
-            "tensor/libtensor/include/kernels/linalg/*.h*",
-            "tensor/libtensor/include/utils/*.h*",
-            "tensor/include/dlpack/*.*",
             "include/dpctl/_sycl*.h",
             "include/dpctl/memory/_memory*.h",
             "include/dpctl/program/_program*.h",
-            "include/dpctl/tensor/_usmarray*.h",
             "*.pxd",
             "memory/*.pxd",
-            "tensor/*.pxd",
             "program/*.pxd",
         ]
     },

From f7bd7e1ba87f3cf96f3bd84ec838aa35e6209ac0 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:21:00 -0800
Subject: [PATCH 02/24] remove benchmarks

---
 benchmarks/README.md                     |  22 ----
 benchmarks/asv.conf.json                 |  53 ---------
 benchmarks/benchmarks/__init__.py        |   0
 benchmarks/benchmarks/benchmark_utils.py |  17 ---
 benchmarks/benchmarks/binary.py          | 133 -----------------------
 benchmarks/benchmarks/ef_bench_add.py    |  31 ------
 6 files changed, 256 deletions(-)
 delete mode 100644 benchmarks/README.md
 delete mode 100644 benchmarks/asv.conf.json
 delete mode 100644 benchmarks/benchmarks/__init__.py
 delete mode 100644 benchmarks/benchmarks/benchmark_utils.py
 delete mode 100644 benchmarks/benchmarks/binary.py
 delete mode 100644 benchmarks/benchmarks/ef_bench_add.py

diff --git a/benchmarks/README.md b/benchmarks/README.md
deleted file mode 100644
index 513ad48ec9..0000000000
--- a/benchmarks/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# dpctl benchmarks
-
-Benchmarking dpctl using Airspeed Velocity
-Read more about ASV [here](https://asv.readthedocs.io/en/stable/index.html)
-
-## Usage
-The benchmarks were made with using an existing environment in-mind before execution. You will see the `asv.conf.json` is minimal without any environmental information supplied.
-The expectation is for users to execute `asv run` with an existing environment.
-
-As such, you should have conda or mamba installed, and create an environment [following these instructions](https://intelpython.github.io/dpctl/latest/beginners_guides/installation.html#dpctl-installation)
-Additionally, install `asv` and `libmambapy` to the environment.
-
-Then, you may activate the environment and instruct `asv run` to use this existing environment for the benchmarks by pointing it to the environment's python binary, like so:
-```
-conda activate dpctl_env
-asv run --environment existing:/full/mamba/path/envs/dpctl_env/bin/python
-```
-
-For `level_zero` devices, you might see `USM Allocation` errors unless you use the `asv run` command with `--launch-method spawn`
-
-## Writing new benchmarks
-Read ASV's guidelines for writing benchmarks [here](https://asv.readthedocs.io/en/stable/writing_benchmarks.html)
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
deleted file mode 100644
index 478dc518f9..0000000000
--- a/benchmarks/asv.conf.json
+++ /dev/null
@@ -1,53 +0,0 @@
-{
-    // The version of the config file format.  Do not change, unless
-    // you know what you are doing.
-    "version": 1,
-
-    // The name of the project being benchmarked
-    "project": "dpctl",
-
-    // The project's homepage
-    "project_url": "https://github.com/IntelPython/dpctl",
-
-    // The URL or local path of the source code repository for the
-    // project being benchmarked
-    "repo": "..",
-
-    // Customizable commands for building the project.
-    // See asv.conf.json documentation.
-    "build_command": [],
-
-    // List of branches to benchmark. If not provided, defaults to "main"
-    // (for git) or "default" (for mercurial).
-    "branches": ["HEAD"],
-
-    // The DVCS being used.  If not set, it will be automatically
-    // determined from "repo" by looking at the protocol in the URL
-    // (if remote), or by looking for special directories, such as
-    // ".git" (if local).
-    "dvcs": "git",
-
-    // The tool to use to create environments.  May be "conda",
-    // "virtualenv", "mamba" (above 3.8)
-    // or other value depending on the plugins in use.
-    // If missing or the empty string, the tool will be automatically
-    // determined by looking for tools on the PATH environment
-    // variable.
-    "environment_type": "conda",
-
-    // The directory (relative to the current directory) that benchmarks are
-    // stored in.  If not provided, defaults to "benchmarks"
-    "benchmark_dir": "benchmarks",
-
-    // The directory (relative to the current directory) to cache the Python
-    // environments in.  If not provided, defaults to "env"
-    "env_dir": ".asv/env",
-
-    // The directory (relative to the current directory) that raw benchmark
-    // results are stored in.  If not provided, defaults to "results".
-    "results_dir": ".asv/results",
-
-    // The directory (relative to the current directory) that the html tree
-    // should be written to.  If not provided, defaults to "html".
-    "html_dir": ".asv/html"
-}
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/benchmarks/benchmarks/benchmark_utils.py b/benchmarks/benchmarks/benchmark_utils.py
deleted file mode 100644
index 7d493d1cd6..0000000000
--- a/benchmarks/benchmarks/benchmark_utils.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from asv_runner.benchmarks.mark import SkipNotImplemented
-
-import dpctl.tensor as dpt
-
-
-def skip_unsupported_dtype(q, dtype):
-    """
-    Skip the benchmark if the device does not support the given data type.
-    """
-    if (
-        (dtype == dpt.float64 or dtype.name == dpt.complex128)
-        and not q.sycl_device.has_aspect_fp64
-    ) or (dtype == dpt.float16 and not q.sycl_device.has_aspect_fp16):
-        raise SkipNotImplemented(
-            f"Skipping benchmark for {dtype.name} on this device"
-            + " as it is not supported."
-        )
diff --git a/benchmarks/benchmarks/binary.py b/benchmarks/benchmarks/binary.py
deleted file mode 100644
index 49d08d09d6..0000000000
--- a/benchmarks/benchmarks/binary.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import dpctl
-import dpctl.tensor as dpt
-
-from . import benchmark_utils as bench_utils
-
-SHARED_QUEUE = dpctl.SyclQueue(property="enable_profiling")
-
-
-class Binary:
-    """Benchmark class for binary operations on SYCL devices."""
-
-    timeout = 300.0
-
-    def setup(self):
-        """Setup the benchmark environment."""
-        self.q = SHARED_QUEUE
-        self.iterations = 1
-        self.n_values = 10**8
-
-    def run_bench(self, q, reps, n_max, dtype1, dtype2, op):
-        """Run the benchmark for a specific function and dtype combination."""
-
-        def get_sizes(n):
-            s = []
-            m = 8192
-            while m < n:
-                s.append(m)
-                m *= 2
-            s.append(n)
-            return s
-
-        x1 = dpt.ones(n_max, dtype=dtype1, sycl_queue=q)
-        x2 = dpt.ones(n_max, dtype=dtype2, sycl_queue=q)
-        r = op(x1, x2)
-
-        max_bytes = x1.nbytes + x2.nbytes + r.nbytes
-        times_res = []
-
-        for n in get_sizes(n_max):
-            x1_n = x1[:n]
-            x2_n = x2[:n]
-            r_n = r[:n]
-            n_bytes = x1_n.nbytes + x2_n.nbytes + r_n.nbytes
-
-            n_iters = int((max_bytes / n_bytes) * reps)
-
-            while True:
-                timer = dpctl.SyclTimer(
-                    device_timer="order_manager", time_scale=1e9
-                )
-                with timer(q):
-                    for _ in range(n_iters):
-                        op(x1_n, x2_n, out=r_n)
-
-                dev_dt = timer.dt.device_dt
-                if dev_dt > 0:
-                    times_res.append((n, dev_dt / n_iters))
-                    break
-
-        return times_res
-
-
-binary_instance = Binary()
-binary_instance.q = SHARED_QUEUE
-binary_instance.iterations = 1
-binary_instance.n_values = 10**8
-
-function_list = [
-    dpt.add,
-    dpt.multiply,
-    dpt.divide,
-    dpt.subtract,
-    dpt.floor_divide,
-    dpt.remainder,
-    dpt.hypot,
-    dpt.logaddexp,
-    dpt.pow,
-    dpt.atan2,
-    dpt.nextafter,
-    dpt.copysign,
-    dpt.less,
-    dpt.less_equal,
-    dpt.greater,
-    dpt.greater_equal,
-    dpt.equal,
-    dpt.not_equal,
-    dpt.minimum,
-    dpt.maximum,
-    dpt.bitwise_and,
-    dpt.bitwise_or,
-    dpt.bitwise_xor,
-    dpt.bitwise_left_shift,
-    dpt.bitwise_right_shift,
-    dpt.logical_and,
-    dpt.logical_or,
-    dpt.logical_xor,
-]
-
-# Generate dtype combinations for each function
-dtypes = {}
-for fn in function_list:
-    dtypes[fn] = [list(map(dpt.dtype, sig.split("->")[0])) for sig in fn.types]
-
-
-# Dynamically create benchmark methods at the module level
-def generate_benchmark_functions():
-    """Dynamically create benchmark functions for each
-    function and dtype combination.
-    """
-    for fn in function_list:
-        fn_name = fn.name_
-        for dtype1, dtype2 in dtypes[fn]:
-            # Create unique function names
-            method_name = f"time_{fn_name}_{dtype1.name}_{dtype2.name}"
-
-            def benchmark_method(self, fn=fn, dtype1=dtype1, dtype2=dtype2):
-                bench_utils.skip_unsupported_dtype(self.q, dtype1)
-                return self.run_bench(
-                    self.q,
-                    self.iterations,
-                    self.n_values,
-                    dtype1,
-                    dtype2,
-                    fn,
-                )
-
-            benchmark_method.__name__ = method_name
-            # Attach the new method to the Binary class
-            setattr(Binary, method_name, benchmark_method)
-
-
-# Generate the benchmark functions
-generate_benchmark_functions()
diff --git a/benchmarks/benchmarks/ef_bench_add.py b/benchmarks/benchmarks/ef_bench_add.py
deleted file mode 100644
index f17f8613d2..0000000000
--- a/benchmarks/benchmarks/ef_bench_add.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import dpctl
-import dpctl.tensor as dpt
-import dpctl.tensor._tensor_elementwise_impl as tei
-import dpctl.utils as dpu
-
-
-class EfBenchAdd:
-
-    def time_ef_bench_add(self):
-        q = dpctl.SyclQueue(property="enable_profiling")
-        n = 2**26
-        reps = 50
-
-        dt = dpt.int8
-        x1 = dpt.ones(n, dtype=dt, sycl_queue=q)
-        x2 = dpt.ones(n, dtype=dt, sycl_queue=q)
-
-        op1, op2 = dpt.add, tei._add
-
-        r = op1(x1, x2)
-
-        timer = dpctl.SyclTimer(device_timer="order_manager", time_scale=1e9)
-
-        m = dpu.SequentialOrderManager[q]
-        with timer(q):
-            for _ in range(reps):
-                deps = m.submitted_events
-                ht_e, c_e = op2(
-                    src1=x1, src2=x2, dst=r, sycl_queue=q, depends=deps
-                )
-                m.add_event_pair(ht_e, c_e)

From bc93311e394ba2ff7b7079ca354b2f4cc3aafec1 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:22:41 -0800
Subject: [PATCH 03/24] remove tensor from dpctl cmake

---
 cmake/dpctl-config.cmake | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/cmake/dpctl-config.cmake b/cmake/dpctl-config.cmake
index ea2b72c903..2fc3197f2f 100644
--- a/cmake/dpctl-config.cmake
+++ b/cmake/dpctl-config.cmake
@@ -8,15 +8,12 @@
 #   True if DPCTL was found.
 # ``Dpctl_INCLUDE_DIR``
 #   The include directory needed to use dpctl.
-# ``Dpctl_TENSOR_INCLUDE_DIR``
-#   The include directory for tensor kernels implementation.
 # ``Dpctl_VERSION``
 #   The version of dpctl found.
 #
 # The module will also explicitly define two cache variables:
 #
 # ``Dpctl_INCLUDE_DIR``
-# ``Dpctl_TENSOR_INCLUDE_DIR``
 #
 
 if(NOT Dpctl_FOUND)
@@ -46,11 +43,6 @@ find_path(Dpctl_INCLUDE_DIR
   )
 get_filename_component(_dpctl_dir ${_dpctl_include_dir} DIRECTORY)
 
-find_path(Dpctl_TENSOR_INCLUDE_DIR
-  kernels utils
-  PATHS "${_dpctl_dir}/tensor/libtensor/include"
-  )
-
 set(Dpctl_INCLUDE_DIRS ${Dpctl_INCLUDE_DIR})
 
 # handle the QUIETLY and REQUIRED arguments and set Dpctl_FOUND to TRUE if
@@ -58,9 +50,8 @@ set(Dpctl_INCLUDE_DIRS ${Dpctl_INCLUDE_DIR})
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(Dpctl
                                   REQUIRED_VARS
-                                    Dpctl_INCLUDE_DIR Dpctl_TENSOR_INCLUDE_DIR
-                                  VERSION_VAR Dpctl_VERSION
+                                    Dpctl_INCLUDE_DIR VERSION_VAR
+                                  Dpctl_VERSION
                                   )
 
 mark_as_advanced(Dpctl_INCLUDE_DIR)
-mark_as_advanced(Dpctl_TENSOR_INCLUDE_DIR)

From d65ccae18aebe689787288f682ff70529fa5bbe1 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:22:57 -0800
Subject: [PATCH 04/24] remove array API workflow

---
 .github/workflows/array-api-skips.txt |  41 -------
 .github/workflows/conda-package.yml   | 148 --------------------------
 2 files changed, 189 deletions(-)
 delete mode 100644 .github/workflows/array-api-skips.txt

diff --git a/.github/workflows/array-api-skips.txt b/.github/workflows/array-api-skips.txt
deleted file mode 100644
index 72cf8563d5..0000000000
--- a/.github/workflows/array-api-skips.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-# array API tests to be skipped
-
-# no linalg module as of now
-array_api_tests/test_has_names.py::test_has_names[linalg-cholesky]
-array_api_tests/test_has_names.py::test_has_names[linalg-cross]
-array_api_tests/test_has_names.py::test_has_names[linalg-det]
-array_api_tests/test_has_names.py::test_has_names[linalg-diagonal]
-array_api_tests/test_has_names.py::test_has_names[linalg-eigh]
-array_api_tests/test_has_names.py::test_has_names[linalg-eigvalsh]
-array_api_tests/test_has_names.py::test_has_names[linalg-inv]
-array_api_tests/test_has_names.py::test_has_names[linalg-matmul]
-array_api_tests/test_has_names.py::test_has_names[linalg-matrix_norm]
-array_api_tests/test_has_names.py::test_has_names[linalg-matrix_power]
-array_api_tests/test_has_names.py::test_has_names[linalg-matrix_rank]
-array_api_tests/test_has_names.py::test_has_names[linalg-matrix_transpose]
-array_api_tests/test_has_names.py::test_has_names[linalg-outer]
-array_api_tests/test_has_names.py::test_has_names[linalg-pinv]
-array_api_tests/test_has_names.py::test_has_names[linalg-qr]
-array_api_tests/test_has_names.py::test_has_names[linalg-slogdet]
-array_api_tests/test_has_names.py::test_has_names[linalg-solve]
-array_api_tests/test_has_names.py::test_has_names[linalg-svd]
-array_api_tests/test_has_names.py::test_has_names[linalg-svdvals]
-array_api_tests/test_has_names.py::test_has_names[linalg-tensordot]
-array_api_tests/test_has_names.py::test_has_names[linalg-trace]
-array_api_tests/test_has_names.py::test_has_names[linalg-vecdot]
-array_api_tests/test_has_names.py::test_has_names[linalg-vector_norm]
-array_api_tests/test_has_names.py::test_has_names[linalg-pinv]
-array_api_tests/test_has_names.py::test_has_names[fft-fft]
-array_api_tests/test_has_names.py::test_has_names[fft-fftn]
-array_api_tests/test_has_names.py::test_has_names[fft-rfft]
-array_api_tests/test_has_names.py::test_has_names[fft-rfftn]
-array_api_tests/test_has_names.py::test_has_names[fft-hfft]
-array_api_tests/test_has_names.py::test_has_names[fft-ifft]
-array_api_tests/test_has_names.py::test_has_names[fft-ifftn]
-array_api_tests/test_has_names.py::test_has_names[fft-irfft]
-array_api_tests/test_has_names.py::test_has_names[fft-irfftn]
-array_api_tests/test_has_names.py::test_has_names[fft-ihfft]
-array_api_tests/test_has_names.py::test_has_names[fft-fftshift]
-array_api_tests/test_has_names.py::test_has_names[fft-ifftshift]
-array_api_tests/test_has_names.py::test_has_names[fft-fftfreq]
-array_api_tests/test_has_names.py::test_has_names[fft-rfftfreq]
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index ad2f635fb3..5478023849 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -666,154 +666,6 @@ jobs:
             python ${script} || exit 1
           done
 
-  array-api-conformity:
-    needs: build_linux
-    runs-on:  ${{ matrix.runner }}
-    timeout-minutes: 90
-    permissions:
-      pull-requests: write
-
-    strategy:
-      matrix:
-        python: ['3.10']
-        experimental: [false]
-        runner: [ubuntu-22.04]
-    continue-on-error: ${{ matrix.experimental }}
-    steps:
-      - name: Construct channels line
-        run: |
-          echo "CHANNELS=-c ${{ env.INTEL_CHANNEL }} -c conda-forge --override-channels" >> $GITHUB_ENV
-      - name: Display channels line
-        run: |
-          echo ${{ env.CHANNELS }}
-      - name: Checkout dpctl repo
-        uses: actions/checkout@v6.0.2
-        with:
-          fetch-depth: 0
-      - name: Cache array API tests
-        id: cache-array-api-tests
-        uses: actions/cache@v5
-        env:
-          ARRAY_CACHE: 3
-        with:
-          path: |
-            /home/runner/work/array-api-tests/
-          key: ${{ runner.os }}-array-api-${{ env.cache-name }}-{{ env.ARRAY_CACHE }}-${{ hashFiles('/home/runner/work/array-api-tests/requirements.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-build-${{ env.cache-name }}-
-            ${{ runner.os }}-build-
-            ${{ runner.os }}-
-      - name: Clone array API tests repo
-        if: steps.cache-array-api-tests.outputs.cache-hit != 'true'
-        shell: bash -l {0}
-        run: |
-          cd /home/runner/work
-          git clone --recurse-submodules https://github.com/data-apis/array-api-tests array-api-tests
-          cd array-api-tests
-      - name: Download artifact
-        uses: actions/download-artifact@v7
-        with:
-          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
-      - name: Add conda to system path
-        run: echo $CONDA/bin >> $GITHUB_PATH
-      - name: Install conda-index
-        # Needed to be able to run conda index
-        run: |
-          conda update -n base --all
-          conda install conda-index -c conda-forge --override-channels
-      - name: Create conda channel
-        run: |
-          mkdir -p $GITHUB_WORKSPACE/channel/linux-64
-          conda index $GITHUB_WORKSPACE/channel || exit 1
-          mv ${PACKAGE_NAME}-*.conda $GITHUB_WORKSPACE/channel/linux-64 || exit 1
-          conda index $GITHUB_WORKSPACE/channel || exit 1
-          # Test channel
-          conda search $PACKAGE_NAME -c $GITHUB_WORKSPACE/channel --override-channels --info --json > $GITHUB_WORKSPACE/ver.json
-          cat ver.json
-      - name: Collect dependencies
-        run: |
-          CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}"
-          export PACKAGE_VERSION=$(python -c "${VER_SCRIPT1} ${VER_SCRIPT2}")
-          conda create -n ${{ env.TEST_ENV_NAME }} $PACKAGE_NAME=${PACKAGE_VERSION} python=${{ matrix.python }} $CHANNELS --only-deps --dry-run > lockfile
-          cat lockfile
-      - name: Set pkgs_dirs
-        run: |
-          echo "pkgs_dirs: [~/.conda/pkgs]" >> ~/.condarc
-      - name: Cache conda packages
-        uses: actions/cache@v5
-        env:
-          CACHE_NUMBER: 3  # Increase to reset cache
-        with:
-          path: ~/.conda/pkgs
-          key:
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-${{hashFiles('lockfile') }}
-          restore-keys: |
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-python-${{ matrix.python }}-
-            ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-
-      - name: Install dpctl
-        run: |
-          CHANNELS="-c $GITHUB_WORKSPACE/channel ${{ env.CHANNELS }}"
-          export PACKAGE_VERSION=$(python -c "${VER_SCRIPT1} ${VER_SCRIPT2}")
-          conda create -n ${{ env.TEST_ENV_NAME }} $PACKAGE_NAME=${PACKAGE_VERSION} pytest python=${{ matrix.python }} $CHANNELS
-          # Test installed packages
-          conda list
-      - name: Install array API test dependencies
-        shell: bash -l {0}
-        run: |
-          . $CONDA/etc/profile.d/conda.sh
-          conda activate ${{ env.TEST_ENV_NAME }}
-          cd /home/runner/work/array-api-tests
-          pip install -r requirements.txt
-      - name: Install jq
-        shell: bash -l {0}
-        run: |
-          sudo apt-get install jq
-      - name: Run array API conformance tests
-        id: run-array-api-tests
-        shell: bash -l {0}
-        env:
-          ARRAY_API_TESTS_MODULE: 'dpctl.tensor'
-          ARRAY_API_TESTS_VERSION: '2024.12'
-          SYCL_CACHE_PERSISTENT: 1
-        run: |
-          FILE=/home/runner/work/.report.json
-          . $CONDA/etc/profile.d/conda.sh
-          conda activate ${{ env.TEST_ENV_NAME }}
-          cd /home/runner/work/array-api-tests
-          ${CONDA_PREFIX}/bin/python -c "import dpctl; dpctl.lsplatform()"
-          ${CONDA_PREFIX}/bin/python -m pytest --json-report --json-report-file=$FILE --disable-deadline --skips-file ${GITHUB_WORKSPACE}/.github/workflows/array-api-skips.txt array_api_tests/ || true
-      - name: Set Github environment variables
-        shell: bash -l {0}
-        run: |
-          export PACKAGE_VERSION=$(python -c "${VER_SCRIPT1} ${VER_SCRIPT2}")
-          FILE=/home/runner/work/.report.json
-          if test -f "$FILE"; then
-            PASSED_TESTS=$(jq '.summary | .passed // 0' $FILE)
-            FAILED_TESTS=$(jq '.summary | .failed // 0' $FILE)
-            SKIPPED_TESTS=$(jq '.summary | .skipped // 0' $FILE)
-            MESSAGE="Array API standard conformance tests for dpctl=$PACKAGE_VERSION ran successfully.
-            Passed: $PASSED_TESTS
-            Failed: $FAILED_TESTS
-            Skipped: $SKIPPED_TESTS"
-            echo "MESSAGE<<EOF" >> $GITHUB_ENV
-            echo "$MESSAGE" >> $GITHUB_ENV
-            echo "EOF" >> $GITHUB_ENV
-          else
-            echo "Array API standard conformance tests failed to run for dpctl=$PACKAGE_VERSION."
-            exit 1
-          fi
-      - name: Output API summary
-        shell: bash -l {0}
-        run: echo "::notice ${{ env.MESSAGE }}"
-      - name: Post result to PR
-        if: ${{ github.event.pull_request && !github.event.pull_request.head.repo.fork }}
-        uses: mshick/add-pr-comment@v2
-        with:
-          message: |
-            ${{ env.MESSAGE }}
-          allow-repeats: true
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-
   cleanup_packages:
     name: Clean up anaconda packages
     needs: [upload_linux, upload_windows]

From ad13be5bfb7fe51221e0765dedd8b73c02e535aa Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:23:13 -0800
Subject: [PATCH 05/24] remove tensor from linting and gitignore

---
 .flake8    | 5 -----
 .gitignore | 1 -
 2 files changed, 6 deletions(-)

diff --git a/.flake8 b/.flake8
index 276ee08645..f780379f09 100644
--- a/.flake8
+++ b/.flake8
@@ -23,14 +23,9 @@ per-file-ignores =
     dpctl/_sycl_queue_manager.pyx: E999, E225
     dpctl/memory/_memory.pyx: E999, E225, E226, E227
     dpctl/program/_program.pyx: E999, E225, E226, E227
-    dpctl/tensor/_usmarray.pyx: E999, E225, E226, E227
-    dpctl/tensor/_dlpack.pyx: E999, E225, E226, E227
-    dpctl/tensor/_flags.pyx: E999, E225, E226, E227
-    dpctl/tensor/numpy_usm_shared.py: F821
     dpctl/tests/_cython_api.pyx: E999, E225, E227, E402
     dpctl/utils/_compute_follows_data.pyx: E999, E225, E227
     dpctl/utils/_onetrace_context.py: E501, W505
-    dpctl/tensor/_array_api.py: E501, W505
     examples/cython/sycl_buffer/syclbuffer/_syclbuffer.pyx: E999, E225, E402
     examples/cython/usm_memory/blackscholes/_blackscholes_usm.pyx: E999, E225, E226, E402
     examples/cython/use_dpctl_sycl/use_dpctl_sycl/_cython_api.pyx: E999, E225, E226, E402
diff --git a/.gitignore b/.gitignore
index 093cbba81f..f8d185c7a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -98,7 +98,6 @@ dpctl/_sycl_event.h
 dpctl/_sycl_queue.h
 dpctl/_sycl_queue_manager.h
 dpctl/memory/_memory.h
-dpctl/tensor/_usmarray.h
 
 # moved cmake scripts
 dpctl/resources/cmake

From d2160fcb41f4908262913c56b648b46163cbc8f4 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 19 Nov 2025 15:24:31 -0800
Subject: [PATCH 06/24] remove tensor from coverage testing

---
 scripts/gen_coverage.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/gen_coverage.py b/scripts/gen_coverage.py
index 100e68df67..a78065421b 100644
--- a/scripts/gen_coverage.py
+++ b/scripts/gen_coverage.py
@@ -213,7 +213,6 @@ def main():
             "--pyargs",
             "dpctl",
             "-vv",
-            "--ignore=dpctl/tensor/libtensor/tests",
             "--no-sycl-interface-test",
         ]
         run(pytest_cmd, env=env, cwd=setup_dir)

From da23c196071d15e4ff2fd3a5a845c5e17db57f51 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 13 Jan 2026 22:37:41 -0800
Subject: [PATCH 07/24] remove tensor-related options from dpctl module call

---
 dpctl/__main__.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/dpctl/__main__.py b/dpctl/__main__.py
index 2b10f7ca1f..0acb09f39f 100644
--- a/dpctl/__main__.py
+++ b/dpctl/__main__.py
@@ -39,18 +39,6 @@ def print_include_flags() -> None:
     print("-I " + get_include_dir())
 
 
-def get_tensor_include_dir() -> str:
-    dpctl_dir = _dpctl_dir()
-    libtensor_dir = os.path.join(dpctl_dir, "tensor", "libtensor", "include")
-    return libtensor_dir
-
-
-def print_tensor_include_flags() -> None:
-    "Prints include flags for dpctl and DPCTLSyclInterface library"
-    libtensor_dir = get_tensor_include_dir()
-    print("-I " + libtensor_dir)
-
-
 def print_cmake_dir() -> None:
     "Prints directory with dpctl-config.cmake"
     dpctl_dir = _dpctl_dir()
@@ -106,16 +94,6 @@ def main() -> None:
         action="store_true",
         help="Path to dpctl include directory.",
     )
-    parser.add_argument(
-        "--tensor-includes",
-        action="store_true",
-        help="Include flags for dpctl libtensor headers.",
-    )
-    parser.add_argument(
-        "--tensor-include-dir",
-        action="store_true",
-        help="Path to dpctl libtensor include directory.",
-    )
     parser.add_argument(
         "--cmakedir",
         action="store_true",
@@ -174,10 +152,6 @@ def main() -> None:
         print_include_flags()
     if args.include_dir:
         print(get_include_dir())
-    if args.tensor_includes:
-        print_tensor_include_flags()
-    if args.tensor_include_dir:
-        print(get_tensor_include_dir())
     if args.cmakedir:
         print_cmake_dir()
     if args.library:

From 9f2c8207297b4cb7588fa5d9069586910b0e260f Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 27 Jan 2026 11:30:24 -0800
Subject: [PATCH 08/24] remove skip_if_dtype_not_supported test utility

---
 dpctl/tests/helper/__init__.py |  2 --
 dpctl/tests/helper/_helper.py  | 27 ---------------------------
 2 files changed, 29 deletions(-)

diff --git a/dpctl/tests/helper/__init__.py b/dpctl/tests/helper/__init__.py
index 5fa83345d3..a2ae1626ab 100644
--- a/dpctl/tests/helper/__init__.py
+++ b/dpctl/tests/helper/__init__.py
@@ -22,7 +22,6 @@
     has_cpu,
     has_gpu,
     has_sycl_platforms,
-    skip_if_dtype_not_supported,
 )
 
 __all__ = [
@@ -31,5 +30,4 @@
     "has_gpu",
     "has_sycl_platforms",
     "get_queue_or_skip",
-    "skip_if_dtype_not_supported",
 ]
diff --git a/dpctl/tests/helper/_helper.py b/dpctl/tests/helper/_helper.py
index a50c4a2809..475669976c 100644
--- a/dpctl/tests/helper/_helper.py
+++ b/dpctl/tests/helper/_helper.py
@@ -49,30 +49,3 @@ def get_queue_or_skip(args=tuple()):
     except dpctl.SyclQueueCreationError:
         pytest.skip(f"Queue could not be created from {args}")
     return q
-
-
-def skip_if_dtype_not_supported(dt, q_or_dev):
-    import dpctl.tensor as dpt
-
-    dt = dpt.dtype(dt)
-    if type(q_or_dev) is dpctl.SyclQueue:
-        dev = q_or_dev.sycl_device
-    elif type(q_or_dev) is dpctl.SyclDevice:
-        dev = q_or_dev
-    else:
-        raise TypeError(
-            "Expected dpctl.SyclQueue or dpctl.SyclDevice, "
-            f"got {type(q_or_dev)}"
-        )
-    dev_has_dp = dev.has_aspect_fp64
-    if dev_has_dp is False and dt in [dpt.float64, dpt.complex128]:
-        pytest.skip(
-            f"{dev.name} does not support double precision floating point types"
-        )
-    dev_has_hp = dev.has_aspect_fp16
-    if dev_has_hp is False and dt in [
-        dpt.float16,
-    ]:
-        pytest.skip(
-            f"{dev.name} does not support half precision floating point type"
-        )

From 70051ac14ca19a7c498a9ff2f05d1e1b61b6af06 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 27 Jan 2026 11:30:53 -0800
Subject: [PATCH 09/24] remove references to dpctl.tensor in docstrings

---
 dpctl/_sycl_timer.py          | 7 ++++---
 dpctl/utils/_order_manager.py | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py
index 9e33cef4ac..976ae71eca 100644
--- a/dpctl/_sycl_timer.py
+++ b/dpctl/_sycl_timer.py
@@ -134,9 +134,10 @@ class SyclTimer:
         ``device_timer`` keyword argument controls the type of tasks submitted.
         With ``"queue_barrier"`` value, queue barrier tasks are used. With
         ``"order_manager"`` value, a single empty body task is inserted
-        and order manager (used by all `dpctl.tensor` operations) is used to
-        order these tasks so that they fence operations performed within
-        timer's context.
+        and the order manager is used to order these tasks so that they fence
+        operations performed within the timer's context. This requires that
+        the order manager is used to order all tasks submitted to the queue
+        within the timer's context.
 
         Timing offloading operations that do not use the order manager with
         the timer that uses ``"order_manager"`` as ``device_timer`` value
diff --git a/dpctl/utils/_order_manager.py b/dpctl/utils/_order_manager.py
index 0873ad99e5..fbe2c47763 100644
--- a/dpctl/utils/_order_manager.py
+++ b/dpctl/utils/_order_manager.py
@@ -65,8 +65,8 @@ def __copy__(self):
 
 
 class SyclQueueToOrderManagerMap:
-    """Utility class to ensure sequential ordering of offloaded
-    tasks issued by dpctl.tensor functions"""
+    """Utility class used to ensure sequential ordering of offloaded tasks
+    when passed to the order manager."""
 
     def __init__(self):
         self._map = ContextVar(

From 003dd158fc6e2a81b4d4de265569f6eb15da73bd Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Tue, 27 Jan 2026 11:31:32 -0800
Subject: [PATCH 10/24] Update dpctl tests to remove dpctl.tensor

---
 dpctl/tests/test_raw_kernel_arg.py           |  27 ++--
 dpctl/tests/test_sycl_event.py               |  27 ++--
 dpctl/tests/test_sycl_kernel_submit.py       | 132 +++++++++++--------
 dpctl/tests/test_sycl_timer.py               | 126 ++++++++++++------
 dpctl/tests/test_work_group_memory.py        |  26 ++--
 dpctl/tests/test_work_group_memory_opencl.py |  23 ++--
 6 files changed, 227 insertions(+), 134 deletions(-)

diff --git a/dpctl/tests/test_raw_kernel_arg.py b/dpctl/tests/test_raw_kernel_arg.py
index 7260cb4468..5dc0840835 100644
--- a/dpctl/tests/test_raw_kernel_arg.py
+++ b/dpctl/tests/test_raw_kernel_arg.py
@@ -19,10 +19,11 @@
 import ctypes
 import os
 
+import numpy as np
 import pytest
 
 import dpctl
-import dpctl.tensor
+import dpctl.memory as dpm
 
 
 def get_spirv_abspath(fn):
@@ -73,27 +74,33 @@ def launch_raw_arg_kernel(raw):
     local_size = 16
     global_size = local_size * 8
 
-    x = dpctl.tensor.ones(global_size, dtype="int32")
-    y = dpctl.tensor.zeros(global_size, dtype="int32")
-    x.sycl_queue.wait()
-    y.sycl_queue.wait()
+    x = np.ones(global_size, dtype="i4")
+    y = np.zeros_like(x)
+
+    x_usm = dpm.MemoryUSMDevice(x.nbytes, queue=q)
+    y_usm = dpm.MemoryUSMDevice(y.nbytes, queue=q)
+
+    ev1 = q.memcpy_async(dest=x_usm, src=x, count=x.nbytes)
 
     try:
-        q.submit(
+        ev2 = q.submit(
             kernel,
             [
-                x.usm_data,
-                y.usm_data,
+                x_usm,
+                y_usm,
                 raw,
             ],
             [global_size],
             [local_size],
+            dEvents=[ev1],
         )
-        q.wait()
     except dpctl._sycl_queue.SyclKernelSubmitError:
         pytest.skip(f"Kernel submission to {q.sycl_device} failed")
 
-    assert dpctl.tensor.all(y == 9)
+    ev3 = q.memcpy_async(dest=y, src=y_usm, count=y.nbytes, dEvents=[ev2])
+    ev3.wait()
+
+    assert np.all(y == 9)
 
 
 def test_submit_raw_kernel_arg_pointer():
diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py
index 2405cbe6be..2985bf3af0 100644
--- a/dpctl/tests/test_sycl_event.py
+++ b/dpctl/tests/test_sycl_event.py
@@ -16,12 +16,12 @@
 
 """Defines unit test cases for the SyclEvent class."""
 
+import numpy as np
 import pytest
 
 import dpctl
 import dpctl.memory as dpctl_mem
 import dpctl.program as dpctl_prog
-import dpctl.tensor as dpt
 from dpctl import event_status_type as esty
 
 from .helper import create_invalid_capsule
@@ -41,13 +41,15 @@ def produce_event(profiling=False):
     addKernel = prog.get_sycl_kernel("add")
 
     n = 1024 * 1024
-    a = dpt.arange(n, dtype="i", sycl_queue=q)
-    args = [a.usm_data]
+    a = np.arange(n, dtype="i")
+    a_usm = dpctl_mem.MemoryUSMDevice(a.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=a_usm, src=a, count=a.nbytes)
+    args = [a_usm]
 
     r = [n]
-    ev = q.submit(addKernel, args, r)
+    ev2 = q.submit(addKernel, args, r, dEvents=[ev1])
 
-    return ev
+    return ev2
 
 
 def test_create_default_event():
@@ -162,16 +164,19 @@ def test_get_wait_list():
     sinKernel = prog.get_sycl_kernel("sin_k")
 
     n = 1024 * 1024
-    a = dpt.arange(n, dtype="f", sycl_queue=q)
-    args = [a.usm_data]
+    a = np.arange(n, dtype="f")
+    a_usm = dpctl_mem.MemoryUSMDevice(a.nbytes, queue=q)
+    ev_1 = q.memcpy_async(dest=a_usm, src=a, count=a.nbytes)
+
+    args = [a_usm]
 
     r = [n]
-    ev_1 = q.submit(addKernel, args, r)
-    ev_2 = q.submit(sqrtKernel, args, r, dEvents=[ev_1])
-    ev_3 = q.submit(sinKernel, args, r, dEvents=[ev_2])
+    ev_2 = q.submit(addKernel, args, r, dEvents=[ev_1])
+    ev_3 = q.submit(sqrtKernel, args, r, dEvents=[ev_2])
+    ev_4 = q.submit(sinKernel, args, r, dEvents=[ev_3])
 
     try:
-        wait_list = ev_3.get_wait_list()
+        wait_list = ev_4.get_wait_list()
     except ValueError:
         pytest.fail("Failed to get a list of waiting events from SyclEvent")
     # FIXME: Due to an issue in underlying runtime the list returns is always
diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py
index 3b5c08349b..cbad24c8ab 100644
--- a/dpctl/tests/test_sycl_kernel_submit.py
+++ b/dpctl/tests/test_sycl_kernel_submit.py
@@ -23,22 +23,21 @@
 import pytest
 
 import dpctl
-import dpctl.memory as dpctl_mem
+import dpctl.memory as dpm
 import dpctl.program as dpctl_prog
-import dpctl.tensor as dpt
 from dpctl._sycl_queue import kernel_arg_type
 
 
 @pytest.mark.parametrize(
     "ctype_str,dtype,ctypes_ctor",
     [
-        ("short", dpt.dtype("i2"), ctypes.c_short),
-        ("int", dpt.dtype("i4"), ctypes.c_int),
-        ("unsigned int", dpt.dtype("u4"), ctypes.c_uint),
-        ("long", dpt.dtype(np.longlong), ctypes.c_longlong),
-        ("unsigned long", dpt.dtype(np.ulonglong), ctypes.c_ulonglong),
-        ("float", dpt.dtype("f4"), ctypes.c_float),
-        ("double", dpt.dtype("f8"), ctypes.c_double),
+        ("short", np.dtype("i2"), ctypes.c_short),
+        ("int", np.dtype("i4"), ctypes.c_int),
+        ("unsigned int", np.dtype("u4"), ctypes.c_uint),
+        ("long", np.dtype(np.longlong), ctypes.c_longlong),
+        ("unsigned long", np.dtype(np.ulonglong), ctypes.c_ulonglong),
+        ("float", np.dtype("f4"), ctypes.c_float),
+        ("double", np.dtype("f8"), ctypes.c_double),
     ],
 )
 def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
@@ -46,7 +45,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
         q = dpctl.SyclQueue("opencl", property="enable_profiling")
     except dpctl.SyclQueueCreationError:
         pytest.skip("OpenCL queue could not be created")
-    if dtype == dpt.dtype("f8") and q.sycl_device.has_aspect_fp64 is False:
+    if dtype == np.dtype("f8") and q.sycl_device.has_aspect_fp64 is False:
         pytest.skip(
             "Device does not support double precision floating point type"
         )
@@ -66,20 +65,26 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
     n_elems = 1024 * 512
     lws = 128
     if dtype.kind in "ui":
-        n_elems = min(n_elems, dpt.iinfo(dtype).max)
+        n_elems = min(n_elems, np.iinfo(dtype).max)
         n_elems = (n_elems // lws) * lws
-    a = dpt.arange(n_elems, dtype=dtype, sycl_queue=q)
-    b = dpt.arange(n_elems, stop=0, step=-1, dtype=dtype, sycl_queue=q)
-    c = dpt.zeros(n_elems, dtype=dtype, sycl_queue=q)
+    a = np.arange(n_elems, dtype=dtype)
+    b = np.arange(n_elems, stop=0, step=-1, dtype=dtype)
+    c = np.zeros(n_elems, dtype=dtype)
+
+    a_usm = dpm.MemoryUSMDevice(a.nbytes, queue=q)
+    b_usm = dpm.MemoryUSMDevice(b.nbytes, queue=q)
+    c_usm = dpm.MemoryUSMDevice(c.nbytes, queue=q)
+
+    ev1 = q.memcpy_async(dest=a_usm, src=a, count=a.nbytes)
+    ev2 = q.memcpy_async(dest=b_usm, src=b, count=b.nbytes)
+
+    dpctl.SyclEvent.wait_for([ev1, ev2])
 
     d = 2
-    args = [a.usm_data, b.usm_data, c.usm_data, ctypes_ctor(d)]
+    args = [a_usm, b_usm, c_usm, ctypes_ctor(d)]
 
     assert n_elems % lws == 0
 
-    b_np = dpt.asnumpy(b)
-    a_np = dpt.asnumpy(a)
-
     for r in (
         [
             n_elems,
@@ -87,14 +92,15 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
         [2, n_elems],
         [2, 2, n_elems],
     ):
-        c[:] = 0
+        c_usm.memset()
         timer = dpctl.SyclTimer()
         with timer(q):
             q.submit(axpyKernel, args, r).wait()
-            ref_c = a_np * np.array(d, dtype=dtype) + b_np
+            ref_c = a * np.array(d, dtype=dtype) + b
         host_dt, device_dt = timer.dt
         assert type(host_dt) is float and type(device_dt) is float
-        assert np.allclose(dpt.asnumpy(c), ref_c), "Failed for {}".format(r)
+        q.memcpy(c, c_usm, c.nbytes)
+        assert np.allclose(c, ref_c), "Failed for {}".format(r)
 
     for gr, lr in (
         (
@@ -106,16 +112,15 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
         ([2, n_elems], [2, lws // 2]),
         ([2, 2, n_elems], [2, 2, lws // 4]),
     ):
-        c[:] = 0
+        c_usm.memset()
         timer = dpctl.SyclTimer()
         with timer(q):
             q.submit(axpyKernel, args, gr, lr, [dpctl.SyclEvent()]).wait()
-            ref_c = a_np * np.array(d, dtype=dtype) + b_np
+            ref_c = a * np.array(d, dtype=dtype) + b
         host_dt, device_dt = timer.dt
         assert type(host_dt) is float and type(device_dt) is float
-        assert np.allclose(dpt.asnumpy(c), ref_c), "Failed for {}, {}".formatg(
-            r, lr
-        )
+        q.memcpy(c, c_usm, c.nbytes)
+        assert np.allclose(c, ref_c), "Failed for {}, {}".format(gr, lr)
 
 
 def test_submit_async():
@@ -124,23 +129,27 @@ def test_submit_async():
     except dpctl.SyclQueueCreationError:
         pytest.skip("OpenCL queue could not be created")
     oclSrc = (
-        "kernel void kern1(global unsigned int *res, unsigned int mod) {"
+        "kernel void kern1("
+        "   global unsigned int *res_base, ulong res_off, unsigned int mod) {"
         "   size_t unused_sum = 0;"
         "   size_t i = 0; "
         "   for (i = 0; i < 4000; i++) { "
         "       unused_sum += i;"
         "   } "
+        "   global unsigned int *res = res_base + (size_t)res_off;"
         "   size_t index = get_global_id(0);"
         "   int ri = (index % mod);"
         "   res[index] = (ri * ri) % mod;"
         "}"
         " "
-        "kernel void kern2(global unsigned int *res, unsigned int mod) {"
+        "kernel void kern2("
+        "   global unsigned int *res_base, ulong res_off, unsigned int mod) {"
         "   size_t unused_sum = 0;"
         "   size_t i = 0; "
         "   for (i = 0; i < 4000; i++) { "
         "       unused_sum += i;"
         "   } "
+        "   global unsigned int *res = res_base + (size_t)res_off;"
         "   size_t index = get_global_id(0);"
         "   int ri = (index % mod);"
         "   int ri2 = (ri * ri) % mod;"
@@ -148,9 +157,13 @@ def test_submit_async():
         "}"
         " "
         "kernel void kern3("
-        "   global unsigned int *res, global unsigned int *arg1, "
-        "   global unsigned int *arg2)"
+        "   global unsigned int *res_base, ulong res_off,"
+        "   global unsigned int *arg1_base, ulong arg1_off,"
+        "   global unsigned int *arg2_base, ulong arg2_off)"
         "{"
+        "   global unsigned int *res = res_base + (size_t)res_off;"
+        "   global unsigned int *arg1 = arg1_base + (size_t)arg1_off;"
+        "   global unsigned int *arg2 = arg2_base + (size_t)arg2_off;"
         "   size_t index = get_global_id(0);"
         "   size_t i = 0; "
         "   size_t unused_sum = 0;"
@@ -177,10 +190,10 @@ def test_submit_async():
     n = f * 1024
     n_alloc = 4 * n
 
-    X = dpt.empty((3, n_alloc), dtype="u4", usm_type="device", sycl_queue=q)
-    first_row = dpctl_mem.as_usm_memory(X[0])
-    second_row = dpctl_mem.as_usm_memory(X[1])
-    third_row = dpctl_mem.as_usm_memory(X[2])
+    x = np.empty((3, n_alloc), dtype="u4")
+    x_usm = dpm.MemoryUSMDevice(x.nbytes, queue=q)
+
+    e1 = q.memcpy_async(dest=x_usm, src=x, count=x.nbytes)
 
     p1, p2 = 17, 27
 
@@ -189,26 +202,39 @@ def test_submit_async():
         e1 = q.submit_async(
             kern1Kernel,
             [
-                first_row,
+                x_usm,
+                ctypes.c_ulonglong(0),
                 ctypes.c_uint(p1),
             ],
             [
                 n,
             ],
+            None,
+            [e1],
         )
         e2 = q.submit_async(
             kern2Kernel,
             [
-                second_row,
+                x_usm,
+                ctypes.c_ulonglong(n_alloc),
                 ctypes.c_uint(p2),
             ],
             [
                 n,
             ],
+            None,
+            [e1],
         )
         e3 = q.submit_async(
             kern3Kernel,
-            [third_row, first_row, second_row],
+            [
+                x_usm,
+                ctypes.c_ulonglong(2 * n_alloc),
+                x_usm,
+                ctypes.c_ulonglong(0),
+                x_usm,
+                ctypes.c_ulonglong(n_alloc),
+            ],
             [
                 n,
             ],
@@ -218,9 +244,7 @@ def test_submit_async():
         e3_st = e3.execution_status
         e2_st = e2.execution_status
         e1_st = e1.execution_status
-        ht_e = q._submit_keep_args_alive(
-            [first_row, second_row, third_row], [e1, e2, e3]
-        )
+        ht_e = q._submit_keep_args_alive([x_usm], [e1, e2, e3])
         are_complete = [
             e == status_complete
             for e in (
@@ -240,14 +264,13 @@ def test_submit_async():
                 break
 
     assert async_detected, "No evidence of async submission detected, unlucky?"
-    Xnp = dpt.asnumpy(X)
-    Xref = np.empty((3, n), dtype="u4")
+    q.memcpy(dest=x, src=x_usm, count=x.nbytes)
+    x_ref = np.empty((3, n), dtype="u4")
     for i in range(n):
-        Xref[0, i] = (i * i) % p1
-        Xref[1, i] = (i * i * i) % p2
-        Xref[2, i] = min(Xref[0, i], Xref[1, i])
-
-    assert np.array_equal(Xnp[:, :n], Xref[:, :n])
+        x_ref[0, i] = (i * i) % p1
+        x_ref[1, i] = (i * i * i) % p2
+        x_ref[2, i] = min(x_ref[0, i], x_ref[1, i])
+    assert np.array_equal(x[:, :n], x_ref[:, :n])
 
 
 def _check_kernel_arg_type_instance(kati):
@@ -303,19 +326,20 @@ def test_submit_local_accessor_arg():
     krn = prog.get_sycl_kernel("_ZTS14SyclKernel_SLMIlE")
     lws = 32
     gws = lws * 10
-    x = dpt.ones(gws, dtype="i8")
-    x.sycl_queue.wait()
+    x = np.ones(gws, dtype="i8")
+    res = np.empty_like(x)
+    x_usm = dpm.MemoryUSMDevice(x.nbytes, queue=q)
+    q.memcpy(dest=x_usm, src=x, count=x.nbytes)
     try:
         e = q.submit(
             krn,
-            [x.usm_data, dpctl.LocalAccessor("i8", (lws,))],
+            [x_usm, dpctl.LocalAccessor("i8", (lws,))],
             [gws],
             [lws],
         )
         e.wait()
     except dpctl._sycl_queue.SyclKernelSubmitError:
         pytest.skip(f"Kernel submission failed for device {q.sycl_device}")
-    expected = dpt.arange(1, x.size + 1, dtype=x.dtype, device=x.device) * (
-        2 * lws
-    )
-    assert dpt.all(x == expected)
+    q.memcpy(dest=res, src=x_usm, count=x.nbytes)
+    expected = np.arange(1, x.size + 1, dtype=x.dtype) * (2 * lws)
+    assert np.all(res == expected)
diff --git a/dpctl/tests/test_sycl_timer.py b/dpctl/tests/test_sycl_timer.py
index 89c6816fda..680de73b2c 100644
--- a/dpctl/tests/test_sycl_timer.py
+++ b/dpctl/tests/test_sycl_timer.py
@@ -16,10 +16,11 @@
 
 import time
 
+import numpy as np
 import pytest
 
 import dpctl
-import dpctl.tensor as dpt
+from dpctl.utils import SequentialOrderManager
 
 
 @pytest.fixture
@@ -33,26 +34,74 @@ def profiling_queue():
     return q
 
 
-@pytest.mark.parametrize(
-    "device_timer", [None, "queue_barrier", "order_manager"]
-)
+@pytest.mark.parametrize("device_timer", [None, "queue_barrier"])
 def test_sycl_timer_queue_barrier(profiling_queue, device_timer):
-    dev = dpt.Device.create_device(profiling_queue)
-
     timer = dpctl.SyclTimer(
         host_timer=time.perf_counter, device_timer=device_timer, time_scale=1e3
     )
+    x = np.linspace(0, 1, num=10**6)
+    res = np.empty_like(x)
+
+    with timer(profiling_queue):
+        # round-trip through USM device memory into new NumPy array
+        x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=profiling_queue)
+        e1 = profiling_queue.memcpy_async(
+            dest=x_usm,
+            src=x,
+            count=x.nbytes,
+        )
+        e2 = profiling_queue.memcpy_async(
+            dest=res,
+            src=x_usm,
+            count=res.nbytes,
+            dEvents=[e1],
+        )
+
+    e2.wait()
+    host_dt, device_dt = timer.dt
+
+    assert np.all(res == x)
+    assert host_dt > 0
+    assert device_dt > 0
+
+
+def test_sycl_timer_order_manager(profiling_queue):
+    q = profiling_queue
+    timer = dpctl.SyclTimer(
+        host_timer=time.perf_counter,
+        device_timer="order_manager",
+        time_scale=1e3,
+    )
+
+    om = SequentialOrderManager[q]
 
-    with timer(dev.sycl_queue):
-        x = dpt.linspace(0, 1, num=10**6, device=dev)
-        y = 3.0 - dpt.square(x - 0.5)
-        z = dpt.sort(y)
-        res1 = z[-1]
-        res2 = dpt.max(y)
+    x = np.linspace(0, 1, num=10**6)
+    res = np.empty_like(x)
 
+    with timer(q):
+        x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=q)
+        e1 = q.memcpy_async(
+            dest=x_usm,
+            src=x,
+            count=x.nbytes,
+            dEvents=om.submitted_events,
+        )
+        ht1 = q._submit_keep_args_alive((x_usm, x), [e1])
+        om.add_event_pair(ht1, e1)
+        e2 = q.memcpy_async(
+            dest=res,
+            src=x_usm,
+            count=res.nbytes,
+            dEvents=om.submitted_events,
+        )
+        ht2 = q._submit_keep_args_alive((res, x_usm), [e2])
+        om.add_event_pair(ht2, e2)
+
+    e2.wait()
+    ht2.wait()
     host_dt, device_dt = timer.dt
 
-    assert dpt.all(res1 == res2)
+    assert np.all(res == x)
     assert host_dt > 0
     assert device_dt > 0
 
@@ -66,36 +115,39 @@ def test_sycl_timer_accumulation(profiling_queue):
         time_scale=1e3,
     )
 
-    # initial condition
-    x = dpt.linspace(0, 1, num=10**6, sycl_queue=q)
+    om = SequentialOrderManager[q]
 
-    aitkens_data = [
-        x,
-    ]
+    x = np.linspace(0, 1, num=10**6)
+    res = np.empty_like(x)
+    x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=q)
 
-    # 16 iterations of Aitken's accelerated Newton's method
-    # x <- x - f(x)/f'(x) for f(x) = x - cos(x)
-    for _ in range(16):
-        # only time Newton step
+    # repeat round-trip several times to exercise timer accumulation
+    for _ in range(8):
         with timer(q):
-            s = dpt.sin(x)
-            x = (dpt.cos(x) + x * s) / (1 + s)
-        aitkens_data.append(x)
-        aitkens_data = aitkens_data[-3:]
-        if len(aitkens_data) == 3:
-            # apply Aitkens acceleration
-            d1 = aitkens_data[-1] - aitkens_data[-2]
-            d2 = aitkens_data[-2] - aitkens_data[-3]
-            if not dpt.any(d1 == d2):
-                x = aitkens_data[-1] - dpt.square(d1) / (d1 - d2)
-
-    # Total time for 16 iterations
+            depends = om.submitted_events
+            e1 = q.memcpy_async(
+                dest=x_usm,
+                src=x,
+                count=x.nbytes,
+                dEvents=depends,
+            )
+            ht1 = q._submit_keep_args_alive((x_usm, x), [e1])
+            om.add_event_pair(ht1, e1)
+            e2 = q.memcpy_async(
+                dest=res,
+                src=x_usm,
+                count=res.nbytes,
+                dEvents=[e1],
+            )
+            ht2 = q._submit_keep_args_alive((res, x_usm), [e2])
+            om.add_event_pair(ht2, e2)
+    e2.wait()
+    ht2.wait()
+    assert np.all(res == x)
+
     dev_dt = timer.dt.device_dt
     assert dev_dt > 0
 
-    # check convergence
-    assert dpt.max(x) - dpt.min(x) < 1e-5
-
 
 def test_sycl_timer_validation():
     with pytest.raises(ValueError):
diff --git a/dpctl/tests/test_work_group_memory.py b/dpctl/tests/test_work_group_memory.py
index edf390e2b6..17b689ee0a 100644
--- a/dpctl/tests/test_work_group_memory.py
+++ b/dpctl/tests/test_work_group_memory.py
@@ -18,10 +18,10 @@
 
 import os
 
+import numpy as np
 import pytest
 
 import dpctl
-import dpctl.tensor
 
 
 def get_spirv_abspath(fn):
@@ -67,24 +67,30 @@ def test_submit_work_group_memory():
     local_size = 16
     global_size = local_size * 8
 
-    x = dpctl.tensor.ones(global_size, dtype="int32")
-    y = dpctl.tensor.zeros(global_size, dtype="int32")
-    x.sycl_queue.wait()
-    y.sycl_queue.wait()
+    x = np.ones(global_size, dtype="int32")
+    y = np.zeros(global_size, dtype="int32")
+
+    x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=q)
+    y_usm = dpctl.memory.MemoryUSMDevice(y.nbytes, queue=q)
+
+    ev1 = q.memcpy_async(dest=x_usm, src=x, count=x.nbytes)
 
     try:
-        q.submit(
+        ev2 = q.submit(
             kernel,
             [
-                x.usm_data,
-                y.usm_data,
+                x_usm,
+                y_usm,
                 dpctl.WorkGroupMemory("i4", local_size),
             ],
             [global_size],
             [local_size],
+            dEvents=[ev1],
         )
-        q.wait()
     except dpctl._sycl_queue.SyclKernelSubmitError:
         pytest.skip(f"Kernel submission to {q.sycl_device} failed")
 
-    assert dpctl.tensor.all(x == y)
+    ev3 = q.memcpy_async(dest=y, src=y_usm, count=y.nbytes, dEvents=[ev2])
+    ev3.wait()
+
+    assert np.all(x == y)
diff --git a/dpctl/tests/test_work_group_memory_opencl.py b/dpctl/tests/test_work_group_memory_opencl.py
index df90f2be01..b206ed7cab 100644
--- a/dpctl/tests/test_work_group_memory_opencl.py
+++ b/dpctl/tests/test_work_group_memory_opencl.py
@@ -20,7 +20,6 @@
 import pytest
 
 import dpctl
-import dpctl.tensor
 
 ocl_kernel_src = """
 __kernel void local_mem_kernel(__global float *input, __global float *output,
@@ -51,30 +50,30 @@ def test_submit_work_group_memory_opencl():
     local_size = 16
     global_size = local_size * 8
 
-    x_dev = dpctl.memory.MemoryUSMDevice(global_size * 4, queue=q)
-    y_dev = dpctl.memory.MemoryUSMDevice(global_size * 4, queue=q)
-
     x = np.ones(global_size, dtype="float32")
     y = np.zeros(global_size, dtype="float32")
-    q.memcpy(x_dev, x, x_dev.nbytes)
-    q.memcpy(y_dev, y, y_dev.nbytes)
+
+    x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=q)
+    y_usm = dpctl.memory.MemoryUSMDevice(y.nbytes, queue=q)
+
+    ev1 = q.memcpy_async(dest=x_usm, src=x, count=x.nbytes)
 
     try:
-        q.submit(
+        ev2 = q.submit(
             kernel,
             [
-                x_dev,
-                y_dev,
+                x_usm,
+                y_usm,
                 dpctl.WorkGroupMemory(local_size * x.itemsize),
             ],
             [global_size],
             [local_size],
+            dEvents=[ev1],
         )
-        q.wait()
     except dpctl._sycl_queue.SyclKernelSubmitError:
-        pytest.fail("Foo")
         pytest.skip(f"Kernel submission to {q.sycl_device} failed")
 
-    q.memcpy(y, y_dev, y_dev.nbytes)
+    ev3 = q.memcpy_async(dest=y, src=y_usm, count=y.nbytes, dEvents=[ev2])
+    ev3.wait()
 
     assert np.all(x == y)

From ed2d19e676cef3fab2f03ddc19ed5baefa08c637 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 28 Jan 2026 19:09:22 -0800
Subject: [PATCH 11/24] remove _compute_follows_data.pyx as it is no longer
 applicable to the project sans tensor

---
 dpctl/utils/_compute_follows_data.pyx | 179 --------------------------
 1 file changed, 179 deletions(-)
 delete mode 100644 dpctl/utils/_compute_follows_data.pyx

diff --git a/dpctl/utils/_compute_follows_data.pyx b/dpctl/utils/_compute_follows_data.pyx
deleted file mode 100644
index ce3823ffd5..0000000000
--- a/dpctl/utils/_compute_follows_data.pyx
+++ /dev/null
@@ -1,179 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# distutils: language = c++
-# cython: language_level=3
-# cython: linetrace=True
-
-"""This file implements Python buffer protocol using Sycl USM shared and host
-allocators. The USM device allocator is also exposed through this module for
-use in other Python modules.
-"""
-
-
-import dpctl
-
-from .._sycl_queue cimport SyclQueue
-
-__all__ = [
-    "get_execution_queue", "get_coerced_usm_type", "ExecutionPlacementError"
-]
-
-
-class ExecutionPlacementError(Exception):
-    """Exception raised when execution placement target can not
-    be unambiguously determined from input arrays.
-
-    Make sure that input arrays are associated with the same
-    :class:`dpctl.SyclQueue`,
-    or migrate data to the same :class:`dpctl.SyclQueue` using
-    :meth:`dpctl.tensor.usm_ndarray.to_device` method.
-    """
-    pass
-
-
-cdef bint queue_equiv(SyclQueue q1, SyclQueue q2):
-    """Queues are equivalent if ``q1 == q2``, that is they are copies
-    of the same underlying SYCL object and hence are the same."""
-    return q1.__eq__(q2)
-
-
-def get_execution_queue(qs, /):
-    """
-    Get execution queue from queues associated with input arrays.
-
-    Args:
-        qs (List[:class:`dpctl.SyclQueue`], Tuple[:class:`dpctl.SyclQueue`]):
-            a list or a tuple of :class:`dpctl.SyclQueue` objects
-            corresponding to arrays that are being combined.
-
-    Returns:
-        SyclQueue:
-            execution queue under compute follows data paradigm,
-            or ``None`` if queues are not equal.
-    """
-    if not isinstance(qs, (list, tuple)):
-        raise TypeError(
-            "Expected a list or a tuple, got {}".format(type(qs))
-        )
-    if len(qs) == 0:
-        return None
-    elif len(qs) == 1:
-        return qs[0] if isinstance(qs[0], dpctl.SyclQueue) else None
-    for q1, q2 in zip(qs[:-1], qs[1:]):
-        if not isinstance(q1, dpctl.SyclQueue):
-            return None
-        elif not isinstance(q2, dpctl.SyclQueue):
-            return None
-        elif not queue_equiv(<SyclQueue> q1, <SyclQueue> q2):
-            return None
-    return qs[0]
-
-
-def get_coerced_usm_type(usm_types, /):
-    """
-    Get USM type of the output array for a function combining
-    arrays of given USM types using compute-follows-data execution
-    model.
-
-    Args:
-        usm_types (List[str], Tuple[str]):
-            a list or a tuple of strings of ``.usm_types`` attributes
-            for input arrays
-
-    Returns:
-         str
-            type of USM allocation for the output arrays (s).
-            ``None`` if any of the input strings are not recognized.
-    """
-    if not isinstance(usm_types, (list, tuple)):
-        raise TypeError(
-            "Expected a list or a tuple, got {}".format(type(usm_types))
-        )
-    if len(usm_types) == 0:
-        return None
-    _k = ["device", "shared", "host"]
-    _m = {k: i for i, k in enumerate(_k)}
-    res = len(_k)
-    for t in usm_types:
-        if not isinstance(t, str):
-            return None
-        if t not in _m:
-            return None
-        res = min(res, _m[t])
-    return _k[res]
-
-
-def _validate_usm_type_allow_none(usm_type):
-    "Validates usm_type argument"
-    if usm_type is not None:
-        if isinstance(usm_type, str):
-            if usm_type not in ["device", "shared", "host"]:
-                raise ValueError(
-                    f"Unrecognized value of usm_type={usm_type}, "
-                    "expected 'device', 'shared', 'host', or None."
-                )
-        else:
-            raise TypeError(
-                f"Expected usm_type to be a str or None, got {type(usm_type)}"
-            )
-
-
-def _validate_usm_type_disallow_none(usm_type):
-    "Validates usm_type argument"
-    if isinstance(usm_type, str):
-        if usm_type not in ["device", "shared", "host"]:
-            raise ValueError(
-                f"Unrecognized value of usm_type={usm_type}, "
-                "expected 'device', 'shared', or 'host'."
-            )
-    else:
-        raise TypeError(
-            f"Expected usm_type to be a str, got {type(usm_type)}"
-        )
-
-
-def validate_usm_type(usm_type, /, *, allow_none=True):
-    """ validate_usm_type(usm_type, allow_none=True)
-
-    Raises an exception if `usm_type` is invalid.
-
-    Args:
-        usm_type:
-            Specification for USM allocation type. Valid specifications
-            are:
-
-            * ``"device"``
-            * ``"shared"``
-            * ``"host"``
-
-            If ``allow_none`` keyword argument is set, a value of
-            ``None`` is also permitted.
-        allow_none (bool, optional):
-            Whether ``usm_type`` value of ``None`` is considered valid.
-            Default: `True`.
-
-    Raises:
-        ValueError:
-            if ``usm_type`` is not a recognized string.
-        TypeError:
-            if ``usm_type`` is not a string, and ``usm_type`` is
-            not ``None`` provided ``allow_none`` is ``True``.
-    """
-    if allow_none:
-        _validate_usm_type_allow_none(usm_type)
-    else:
-        _validate_usm_type_disallow_none(usm_type)

From 7a6bcf1e14bd9adeb4892c56b0a3be639c80ad3a Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 28 Jan 2026 19:10:31 -0800
Subject: [PATCH 12/24] remove cython/usm_memory example

---
 examples/cython/usm_memory/.gitignore         |   3 -
 examples/cython/usm_memory/CMakeLists.txt     |  56 -----
 examples/cython/usm_memory/README.md          |  44 ----
 .../usm_memory/blackscholes/__init__.py       |  29 ---
 .../blackscholes/_blackscholes_usm.pyx        | 209 ------------------
 examples/cython/usm_memory/scripts/bench.py   |  93 --------
 examples/cython/usm_memory/setup.py           |  32 ---
 .../usm_memory/src/sycl_blackscholes.hpp      | 176 ---------------
 .../usm_memory/tests/test_black_scholes.py    |  94 --------
 9 files changed, 736 deletions(-)
 delete mode 100644 examples/cython/usm_memory/.gitignore
 delete mode 100644 examples/cython/usm_memory/CMakeLists.txt
 delete mode 100644 examples/cython/usm_memory/README.md
 delete mode 100644 examples/cython/usm_memory/blackscholes/__init__.py
 delete mode 100644 examples/cython/usm_memory/blackscholes/_blackscholes_usm.pyx
 delete mode 100644 examples/cython/usm_memory/scripts/bench.py
 delete mode 100644 examples/cython/usm_memory/setup.py
 delete mode 100644 examples/cython/usm_memory/src/sycl_blackscholes.hpp
 delete mode 100644 examples/cython/usm_memory/tests/test_black_scholes.py

diff --git a/examples/cython/usm_memory/.gitignore b/examples/cython/usm_memory/.gitignore
deleted file mode 100644
index d423caa640..0000000000
--- a/examples/cython/usm_memory/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-blackscholes.cpp
-*~
-*.cpython*.so
diff --git a/examples/cython/usm_memory/CMakeLists.txt b/examples/cython/usm_memory/CMakeLists.txt
deleted file mode 100644
index 0a422d26d1..0000000000
--- a/examples/cython/usm_memory/CMakeLists.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-cmake_minimum_required(VERSION 3.22...3.27 FATAL_ERROR)
-
-project(example_cython_blackscholes_usm VERSION 0.1 LANGUAGES CXX
-  DESCRIPTION "Example of Cython extension calling SYCL routines")
-set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
-
-find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
-
-
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}")
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED True)
-
-# Define CMAKE_INSTALL_xxx: LIBDIR, INCLUDEDIR
-include(GNUInstallDirs)
-
-find_package(Python REQUIRED COMPONENTS Development.Module NumPy)
-find_package(Dpctl REQUIRED)
-
-# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present)
-# -w is to set working directory (and correctly set __pyx_f[] array of filenames)
-set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"")
-find_package(Cython REQUIRED)
-
-find_package(TBB REQUIRED)
-
-set(MKL_ARCH "intel64")
-set(MKL_LINK "dynamic")
-set(MKL_THREADING "tbb_thread")
-set(MKL_INTERFACE "ilp64")
-find_package(MKL REQUIRED)
-
-set(py_module_name _blackscholes_usm)
-
-set(_cy_source blackscholes/_blackscholes_usm.pyx)
-add_cython_target(${py_module_name} ${_cy_source} CXX OUTPUT_VAR _generated_cy_src)
-Python_add_library(${py_module_name} MODULE WITH_SOABI ${_generated_cy_src})
-add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_generated_cy_src})
-target_compile_definitions(${py_module_name} PRIVATE -DMKL_ILP64)
-target_include_directories(${py_module_name} PUBLIC src ${Dpctl_INCLUDE_DIRS})
-target_link_libraries(${py_module_name} PRIVATE MKL::MKL_SYCL Python::NumPy)
-
-install(TARGETS ${py_module_name} DESTINATION blackscholes)
-
-foreach(_src_fn ${_sources})
-    get_source_file_property(_compile_options ${_src_fn} COMPILE_OPTIONS)
-    set(_combined_options ${_compile_options} "-O3")
-    set_source_files_properties(${_src_fn}
-        PROPERTIES
-        COMPILE_OPTIONS "${_combined_options}"
-    )
-endforeach()
-target_link_options(${py_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
-
-set(ignoreMe "${SKBUILD}")
diff --git a/examples/cython/usm_memory/README.md b/examples/cython/usm_memory/README.md
deleted file mode 100644
index bf359ffc2b..0000000000
--- a/examples/cython/usm_memory/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-# Working with USM Memory
-
-This example demonstrates building of an extension that works with
-`dpctl.tensor.usm_ndarray` container.
-
-It implements two Python functions: `blackscholes.populate_params` and
-`blackscholes.black_scholes_price`. The first one uses MKL's device RNG
-implementation to populate option parameters from uniform distribution
-in user-specified ranges, and the other one takes the array with option
-parameters and produces array with call and put European vanilla option
-prices.
-
-## Building
-
-> **NOTE:** Make sure oneAPI is activated, $ONEAPI_ROOT must be set.
-
-To build the example, run:
-```
-$ python setup.py build_ext --inplace
-```
-
-## Testing
-
-```
-$ pytest tests/
-```
-
-## Running benchmark
-
-```
-$ python scripts/bench.py
-```
-
-It gives the example output:
-
-```
-(dev_dpctl) opavlyk@opavlyk-mobl:~/repos/dpctl/examples/cython/usm_memory$ python scripts/bench.py
-Pricing 30,000,000 vanilla European options using Black-Scholes-Merton formula
-
-Using      : 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz
-Wall times : [0.07042762002674863, 0.047108696977375075, 0.04325491201598197, 0.045397296984447166, 0.0433025429956615] for dtype=float32
-Using      : Intel(R) Graphics [0x9a49]
-Wall times : [0.1194021370029077, 0.0720841379952617, 0.0647223969863262, 0.06645121600013226, 0.06911522900918499] for dtype=float32
-```
diff --git a/examples/cython/usm_memory/blackscholes/__init__.py b/examples/cython/usm_memory/blackscholes/__init__.py
deleted file mode 100644
index 7fc2069978..0000000000
--- a/examples/cython/usm_memory/blackscholes/__init__.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ._blackscholes_usm import black_scholes_price, populate_params
-
-__doc__ = """
-This is a toy example module illustrating use of SYCL-based code
-to operate on NumPy arrays addressing memory allocated by standard
-Python memory allocator.
-"""
-__license__ = "Apache 2.0"
-
-__all__ = [
-    "black_scholes_price",
-    "populate_params",
-]
diff --git a/examples/cython/usm_memory/blackscholes/_blackscholes_usm.pyx b/examples/cython/usm_memory/blackscholes/_blackscholes_usm.pyx
deleted file mode 100644
index a88175982c..0000000000
--- a/examples/cython/usm_memory/blackscholes/_blackscholes_usm.pyx
+++ /dev/null
@@ -1,209 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# cython: language_level=3
-# distutils: language=c++
-
-cimport dpctl as c_dpctl
-cimport dpctl.tensor as c_dpt
-from dpctl.sycl cimport queue as dpcpp_queue
-from dpctl.sycl cimport unwrap_queue
-
-import dpctl.tensor as dpt
-
-
-cdef extern from "sycl_blackscholes.hpp":
-    cdef void cpp_blackscholes[T](
-        dpcpp_queue, size_t n_opts, T* option_params, T* callput
-    ) except +
-    cdef void cpp_populate_params[T](
-        dpcpp_queue,
-        size_t n_opts,
-        T* option_params,
-        T pl,
-        T ph,
-        T sl,
-        T sh,
-        T tl,
-        T th,
-        T rl,
-        T rh,
-        T vl,
-        T vh,
-        int seed
-    ) except +
-
-
-def black_scholes_price(c_dpt.usm_ndarray option_params_arr):
-    """black_scholes_price(params)
-
-    Applies Black-Scholes-Merton formula to compute call and put European
-    option prices.
-
-    Args:
-        option_params_arr: usm_ndarray
-            Floating point array with shape (n_opts, 5) containing
-            (price, strike, maturity, rate, volatility) per each option.
-    Returns:
-        usm_ndarray
-            Floating point array with shape (n_opts, 2) containing
-            (call_price, put_price) per each option.
-    """
-    cdef size_t n_opts = 0
-    cdef size_t n_params = 0
-    cdef c_dpctl.SyclQueue q
-    cdef dpcpp_queue* exec_q_ptr = NULL
-    cdef c_dpt.usm_ndarray call_put_prices
-    cdef double* dp1 = NULL
-    cdef double* dp2 = NULL
-    cdef float* fp1 = NULL
-    cdef float* fp2 = NULL
-    cdef int flags_ = 0
-    cdef int typenum_ = 0
-
-    if option_params_arr.get_ndim() != 2:
-        raise ValueError("Option parameter array must be 2-dimensional")
-
-    n_opts = option_params_arr.get_shape()[0]
-    n_params = option_params_arr.get_shape()[1]
-
-    if (n_params != 5):
-        raise ValueError((
-            "Array of option parameters has unexpected number of "
-            "columns {} != 5. Each row must specify (current_price, "
-            "strike_price, maturity, interest_rate, volatility)."
-            ).format(n_params)
-        )
-
-    flags_ = option_params_arr.get_flags()
-    if (not (flags_ & c_dpt.USM_ARRAY_C_CONTIGUOUS)):
-        raise ValueError("Only C-contiguous arrays are supported")
-
-    q = option_params_arr.get_sycl_queue()
-    exec_q_ptr = unwrap_queue(q.get_queue_ref())
-    typenum_ = option_params_arr.get_typenum()
-
-    if (typenum_ == c_dpt.UAR_DOUBLE):
-        call_put_prices = dpt.empty((n_opts, 2), dtype="d", sycl_queue=q)
-        dp1 = <double *>option_params_arr.get_data()
-        dp2 = <double *>call_put_prices.get_data()
-        # ensure content of dp1 and dp2 is no longer worked on
-        exec_q_ptr[0].wait()
-        cpp_blackscholes[double](exec_q_ptr[0], n_opts, dp1, dp2)
-    elif (typenum_ == c_dpt.UAR_FLOAT):
-        call_put_prices = dpt.empty((n_opts, 2), dtype="f", sycl_queue=q)
-        fp1 = <float *>option_params_arr.get_data()
-        fp2 = <float *>call_put_prices.get_data()
-        # ensure content of fp1 and fp2 is no longer worked on
-        exec_q_ptr[0].wait()
-        cpp_blackscholes[float](exec_q_ptr[0], n_opts, fp1, fp2)
-    else:
-        raise ValueError("Unsupported data-type")
-
-    return call_put_prices
-
-
-def populate_params(
-        c_dpt.usm_ndarray option_params_arr,
-        pl,
-        ph,
-        sl,
-        sh,
-        tl,
-        th,
-        rl,
-        rh,
-        vl,
-        vh,
-        int seed
-):
-    """ populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
-
-    Args:
-        params: usm_ndarray
-            Array of shape (n_opts, 5) to populate with price, strike, time to
-            maturity, interest rate, volatility rate per option using uniform
-            distribution with provided distribution parameters.
-        pl: float
-            Lower bound for distribution of option price parameter
-        ph: float
-            Upper bound for distribution of option price parameter
-        sl: float
-            Lower bound for distribution of option strike parameter
-        sh: float
-            Upper bound for distribution of option strike parameter
-        tl: float
-            Lower bound for distribution of option time to maturity parameter
-        th: float
-            Upper bound for distribution of option time to maturity parameter
-        rl: float
-            Lower bound for distribution of option interest rate parameter
-        rh: float
-            Upper bound for distribution of option interest rate parameter
-        vl: float
-            Lower bound for distribution of option volatility parameter
-        vh: float
-            Upper bound for distribution of option volatility parameter
-        seed: int
-            Pseudo-random number generator parameter
-    """
-    cdef size_t n_opts = 0
-    cdef size_t n_params = 0
-    cdef dpcpp_queue* exec_q_ptr = NULL
-    cdef double* dp = NULL
-    cdef float* fp = NULL
-    cdef int typenum_ = 0
-    cdef int flags_ = 0
-
-    if option_params_arr.get_ndim() != 2:
-        raise ValueError("Option parameter array must be 2-dimensional")
-
-    n_opts = option_params_arr.get_shape()[0]
-    n_params = option_params_arr.get_shape()[1]
-
-    if (n_params != 5):
-        raise ValueError(
-            "Array of option parameters has unexpected number of "
-            "columns {} != 5. Each row must specify (current_price, "
-            "strike_price, maturity, interest_rate, volatility).".format(
-                n_params
-            )
-        )
-
-    flags_ = option_params_arr.get_flags()
-    if (not (flags_ & c_dpt.USM_ARRAY_C_CONTIGUOUS)):
-        raise ValueError("Only C-contiguous arrays are supported")
-
-    exec_q_ptr = unwrap_queue(option_params_arr.get_queue_ref())
-
-    typenum_ = option_params_arr.get_typenum()
-
-    if (typenum_ == c_dpt.UAR_DOUBLE):
-        dp = <double *>option_params_arr.get_data()
-        exec_q_ptr[0].wait()
-        cpp_populate_params[double](
-            exec_q_ptr[0], n_opts, dp, pl, ph,
-            sl, sh, tl, th, rl, rh, vl, vh, seed
-        )
-    elif (typenum_ == c_dpt.UAR_FLOAT):
-        fp = <float *>option_params_arr.get_data()
-        exec_q_ptr[0].wait()
-        cpp_populate_params[float](
-            exec_q_ptr[0], n_opts, fp, pl, ph,
-            sl, sh, tl, th, rl, rh, vl, vh, seed
-        )
-    else:
-        raise ValueError("Unsupported data-type")
diff --git a/examples/cython/usm_memory/scripts/bench.py b/examples/cython/usm_memory/scripts/bench.py
deleted file mode 100644
index 100c11be4b..0000000000
--- a/examples/cython/usm_memory/scripts/bench.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# coding: utf-8
-
-import timeit
-
-import blackscholes as bs
-
-import dpctl
-import dpctl.tensor as dpt
-
-
-def gen_option_params(
-    n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None
-):
-    params = dpt.empty((n_opts, 5), dtype=dtype, sycl_queue=queue)
-    seed = 1234
-    bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
-    return params
-
-
-n_opts = 3 * 10**7
-
-# compute on CPU sycl device
-
-queues = []
-for filter_str in ["cpu", "gpu"]:
-    try:
-        q = dpctl.SyclQueue(filter_str)
-        queues.append(q)
-    except dpctl.SyclQueueCreationError:
-        continue
-
-if not queues:
-    print("No queues could be created, nothing to do.")
-    exit(0)
-
-opt_params_list = []
-for q in queues:
-    opt_params = gen_option_params(
-        n_opts,
-        20.0,
-        30.0,
-        22.0,
-        29.0,
-        18.0,
-        24.0,
-        0.01,
-        0.05,
-        0.01,
-        0.05,
-        "f",
-        queue=q,
-    )
-    opt_params_list.append(opt_params)
-
-times_dict = dict()
-dtype_dict = dict()
-
-for q, params in zip(queues, opt_params_list):
-    times_list = []
-    for _ in range(5):
-        t0 = timeit.default_timer()
-        X1 = bs.black_scholes_price(params)
-        t1 = timeit.default_timer()
-        times_list.append(t1 - t0)
-    times_dict[q.name] = times_list
-    dtype_dict[q.name] = params.dtype
-
-print(
-    f"Pricing {n_opts:,} vanilla European options using "
-    "Black-Scholes-Merton formula"
-)
-print("")
-for dev_name, wall_times in times_dict.items():
-    print("Using      : {}".format(dev_name))
-    print(
-        "Wall times : {} for dtype={}".format(wall_times, dtype_dict[dev_name])
-    )
diff --git a/examples/cython/usm_memory/setup.py b/examples/cython/usm_memory/setup.py
deleted file mode 100644
index 9c07a8c807..0000000000
--- a/examples/cython/usm_memory/setup.py
+++ /dev/null
@@ -1,32 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from skbuild import setup
-
-setup(
-    name="blackscholes_usm",
-    version="0.0.0",
-    description="An example of Cython extension calling SYCL routines",
-    long_description="""
-    Example of using SYCL to work on usm allocations.
-
-    See README.md for more details.
-    """,
-    license="Apache 2.0",
-    author="Intel Corporation",
-    url="https://github.com/IntelPython/dpctl",
-    packages=["blackscholes"],
-)
diff --git a/examples/cython/usm_memory/src/sycl_blackscholes.hpp b/examples/cython/usm_memory/src/sycl_blackscholes.hpp
deleted file mode 100644
index e3d8d759a9..0000000000
--- a/examples/cython/usm_memory/src/sycl_blackscholes.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-//=- sycl_blackscholes.cpp - Example of SYCL code to be called from Cython  =//
-//
-//                      Data Parallel Control (dpctl)
-//
-// Copyright 2020-2025 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements SYCL code to price European vanilla options using
-/// Black-Scholes formula, as well as code to generate option parameters using
-/// SYCL device random number generation library from Intel(R) Math Kernel
-/// Library.
-///
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <oneapi/mkl.hpp>
-#include <oneapi/mkl/rng/device.hpp>
-#include <sycl/sycl.hpp>
-
-template <typename T> class black_scholes_kernel;
-
-constexpr int n_params = 5;
-constexpr int n_params_next_pow2 = 8;
-
-constexpr int n_prices = 2;
-constexpr int PRICE = 0;
-constexpr int STRIKE = 1;
-constexpr int MATURITY = 2;
-constexpr int RATE = 3;
-constexpr int VOLATILITY = 4;
-constexpr int CALL = 0;
-constexpr int PUT = 1;
-
-template <typename T>
-void cpp_blackscholes(sycl::queue &q, size_t n_opts, T *params, T *callput)
-{
-    using data_t = T;
-
-    auto e = q.submit([&](sycl::handler &cgh) {
-        data_t zero = data_t(0), one = data_t(1), two = data_t(2);
-        data_t quarter = one / data_t(4);
-        data_t half = one / two;
-
-        cgh.parallel_for<class black_scholes_kernel<T>>(
-            sycl::range<1>(n_opts), [=](sycl::id<1> idx) {
-                const size_t i = n_params * idx[0];
-                const data_t opt_price = params[i + PRICE];
-                const data_t opt_strike = params[i + STRIKE];
-                const data_t opt_maturity = params[i + MATURITY];
-                const data_t opt_rate = params[i + RATE];
-                const data_t opt_volatility = params[i + VOLATILITY];
-                data_t a, b, c, y, z, e, d1, d1c, d2, d2c, w1, w2;
-                data_t mr = -opt_rate,
-                       sig_sig_two = two * opt_volatility * opt_volatility;
-
-                a = sycl::log(opt_price / opt_strike);
-                b = opt_maturity * mr;
-                z = opt_maturity * sig_sig_two;
-
-                c = quarter * z;
-                e = sycl::exp(b);
-                y = sycl::rsqrt(z);
-
-                a = b - a;
-                w1 = (a - c) * y;
-                w2 = (a + c) * y;
-
-                if (w1 < zero) {
-                    d1 = sycl::erfc(w1) * half;
-                    d1c = one - d1;
-                }
-                else {
-                    d1c = sycl::erfc(-w1) * half;
-                    d1 = one - d1c;
-                }
-                if (w2 < zero) {
-                    d2 = sycl::erfc(w2) * half;
-                    d2c = one - d2;
-                }
-                else {
-                    d2c = sycl::erfc(-w2) * half;
-                    d2 = one - d2c;
-                }
-
-                e *= opt_strike;
-                data_t call_price = opt_price * d1 - e * d2;
-                data_t put_price = e * d2c - opt_price * d1c;
-
-                const size_t callput_i = n_prices * idx[0];
-                callput[callput_i + CALL] = call_price;
-                callput[callput_i + PUT] = put_price;
-            });
-    });
-
-    e.wait_and_throw();
-
-    return;
-}
-
-template <typename T>
-void cpp_populate_params(sycl::queue q,
-                         size_t n_opts,
-                         T *params,
-                         T pl,
-                         T ph,
-                         T sl,
-                         T sh,
-                         T tl,
-                         T th,
-                         T rl,
-                         T rh,
-                         T vl,
-                         T vh,
-                         int seed)
-{
-
-    sycl::event e = q.submit([&](sycl::handler &cgh) {
-        cgh.parallel_for(sycl::range<1>(n_opts), [=](sycl::item<1> idx) {
-            size_t i = n_params * idx.get_id(0);
-            size_t j = n_params_next_pow2 * idx.get_id(0);
-
-            // create engine to sample 5 parameters per workers
-            oneapi::mkl::rng::device::philox4x32x10<n_params_next_pow2> engine(
-                seed, j);
-            oneapi::mkl::rng::device::uniform<T> distr;
-
-            sycl::vec<T, n_params_next_pow2> res =
-                oneapi::mkl::rng::device::generate(distr, engine);
-
-            {
-                const int pos = PRICE;
-                auto u = res[pos];
-                params[i + pos] = pl * u + ph * (T(1) - u);
-            }
-            {
-                const int pos = STRIKE;
-                auto u = res[pos];
-                params[i + pos] = sl * u + sh * (T(1) - u);
-            }
-            {
-                const int pos = MATURITY;
-                auto u = res[pos];
-                params[i + pos] = tl * u + th * (T(1) - u);
-            }
-            {
-                const int pos = RATE;
-                auto u = res[pos];
-                params[i + pos] = rl * u + rh * (T(1) - u);
-            }
-            {
-                const int pos = VOLATILITY;
-                auto u = res[pos];
-                params[i + pos] = vl * u + vh * (T(1) - u);
-            }
-        });
-    });
-
-    e.wait_and_throw();
-
-    return;
-}
diff --git a/examples/cython/usm_memory/tests/test_black_scholes.py b/examples/cython/usm_memory/tests/test_black_scholes.py
deleted file mode 100644
index 6c3539c117..0000000000
--- a/examples/cython/usm_memory/tests/test_black_scholes.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-
-import blackscholes
-import numpy as np
-import pytest
-
-import dpctl
-import dpctl.tensor as dpt
-
-
-def ref_python_black_scholes(price, strike, t, rate, vol):
-    mr = -rate
-    sig_sig_two = vol * vol * 2
-
-    P = price
-    S = strike
-    T = t
-
-    a = math.log(P / S)
-    b = T * mr
-
-    z = T * sig_sig_two
-    c = 0.25 * z
-    y = 1 / math.sqrt(z)
-
-    Se = math.exp(b) * S
-
-    w1 = (a - b + c) * y
-    w2 = (a - b - c) * y
-
-    if w1 > 0:
-        d1 = 0.5 * math.erfc(-w1)
-        d1c = 1.0 - d1
-    else:
-        d1c = 0.5 * math.erfc(w1)
-        d1 = 1.0 - d1c
-    if w2 > 0:
-        d2 = 0.5 * math.erfc(-w2)
-        d2c = 1.0 - d2
-    else:
-        d2c = 0.5 * math.erfc(w2)
-        d2 = 1.0 - d2c
-
-    call = P * d1 - Se * d2
-    put = Se * d2c - P * d1c
-    return (call, put)
-
-
-@pytest.mark.parametrize("dtype", [dpt.float32, dpt.float64])
-def test_black_scholes_merton(dtype):
-    try:
-        q = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Unable to create queue")
-    if dtype == dpt.float64 and not q.sycl_device.has_aspect_fp64:
-        pytest.skip(f"Hardware {q.sycl_device.name} does not support {dtype}")
-    opts = dpt.empty((3, 5), dtype=dtype)
-    # copy from Host NumPy to USM buffer
-    opts[:, :] = dpt.asarray(
-        [
-            [81.2, 81.8, 29, 0.01, 0.02],
-            [24.24, 22.1, 10, 0.02, 0.08],
-            [100, 100, 30, 0.01, 0.12],
-        ],
-        dtype=dtype,
-    )
-    X = blackscholes.black_scholes_price(opts)
-
-    # compute prices in Python
-    X_ref = np.array(
-        [ref_python_black_scholes(*opt) for opt in dpt.asnumpy(opts)],
-        dtype=dtype,
-    )
-
-    tol = 64 * dpt.finfo(dtype).eps
-    assert np.allclose(dpt.asnumpy(X), X_ref, atol=tol, rtol=tol), np.abs(
-        dpt.asnumpy(X) - X_ref
-    ).max()

From 216203618cfdd71d3b0e9bb456913099e5572ff0 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 28 Jan 2026 19:12:12 -0800
Subject: [PATCH 13/24] remove solver from onemkl example

---
 examples/pybind11/onemkl_gemv/solve.py        | 256 ------------------
 .../onemkl_gemv/sycl_timing_solver.py         | 128 ---------
 2 files changed, 384 deletions(-)
 delete mode 100644 examples/pybind11/onemkl_gemv/solve.py
 delete mode 100644 examples/pybind11/onemkl_gemv/sycl_timing_solver.py

diff --git a/examples/pybind11/onemkl_gemv/solve.py b/examples/pybind11/onemkl_gemv/solve.py
deleted file mode 100644
index 29c53fc7a5..0000000000
--- a/examples/pybind11/onemkl_gemv/solve.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import sycl_gemm
-
-import dpctl
-import dpctl.tensor as dpt
-
-
-def chebyshev(A, b, x0, nIters, lMax, lMin, depends=[]):
-    """Chebyshev iterative solver using SYCL routines"""
-    d = (lMax + lMin) / 2
-    c = (lMax - lMin) / 2
-
-    x = dpt.copy(x0)
-    exec_queue = A.sycl_queue
-    assert exec_queue == x.sycl_queue
-    Ax = dpt.empty_like(A[:, 0])
-    r = dpt.empty_like(Ax)
-    p = dpt.empty_like(Ax)
-
-    e_x = dpctl.SyclEvent()
-    # Ax = A @ x
-    _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=depends)
-    # r = b - Ax
-    _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
-    r_ev = e_sub
-    for i in range(nIters):
-        z = r
-        z_ev = r_ev
-        if i == 0:
-            p[:] = z
-            alpha = 1 / d
-            _, e_axpby = dpctl.SyclEvent(), dpctl.SyclEvent()
-        elif i == 1:
-            beta = 0.5 * (c * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            # p = z + beta * p
-            _, e_axpby = sycl_gemm.axpby_inplace(
-                exec_queue, 1, z, beta, p, depends=[z_ev]
-            )
-        else:
-            beta = (c / 2 * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            # p = z + beta * p
-            _, e_axpby = sycl_gemm.axpby_inplace(
-                exec_queue, 1, z, beta, p, depends=[z_ev]
-            )
-        # x = x + alpha * p
-        _, e_x = sycl_gemm.axpby_inplace(
-            exec_queue, alpha, p, 1, x, depends=[e_axpby, e_x]
-        )
-        # Ax = A @ x
-        _, e_dot = sycl_gemm.gemv(exec_queue, A, x, Ax, depends=[e_x])
-        # r = b - Ax
-        _, e_sub = sycl_gemm.sub(exec_queue, b, Ax, r, depends=[e_dot])
-        # residual = dot(r, r)
-        residual = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e_sub]
-        )
-        if residual <= 1e-29:
-            print(f"chebyshev: converged in {i} iters")
-            break
-    exec_queue.wait()  # wait for all host tasks to complete
-    return x
-
-
-def check_with_numpy(A, b):
-    """Direct solver using numpy"""
-    import numpy as np
-
-    return np.linalg.solve(Anp, bnp)
-
-
-def chebyshev_numpy(A, b, x0, nIters, lMax, lMin):
-    """Chebyshev iterative solver using numpy"""
-    d = (lMax + lMin) / 2
-    c = (lMax - lMin) / 2
-
-    x = x0
-
-    Ax = np.dot(A, x)
-    r = b - Ax
-    for i in range(nIters):
-        z = r
-        if i == 0:
-            p = z
-            alpha = 1 / d
-        elif i == 1:
-            beta = 0.5 * (c * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            p = z + beta * p
-        else:
-            beta = (c / 2 * alpha) ** 2
-            alpha = 1 / (d - beta / alpha)
-            p = z + beta * p
-        x = x + alpha * p
-        Ax = np.dot(A, x)
-        r = b - Ax
-        residual = np.dot(r, r)
-        if residual <= 1e-29:
-            print(f"chebyshev_numpy: converged in {i} iters")
-            break
-    return x
-
-
-def cg_solve(A, b):
-    """
-    Conjugate gradient solver for A @ x == b.
-
-    Returns tuple: (x, converged)
-
-    converged is False if solver has not converged, or the iteration number
-    """
-    exec_queue = A.sycl_queue
-    exec_queue.wait()
-
-    x = dpt.zeros_like(b)
-    Ap = dpt.empty_like(x)
-
-    all_host_tasks = []
-    r = dpt.copy(b)
-    p = dpt.copy(b)
-
-    rsold = sycl_gemm.norm_squared_blocking(exec_queue, r)
-    if rsold < 1e-20:
-        return (b, 0)
-    converged = False
-    max_iters = b.shape[0]
-
-    e_p = dpctl.SyclEvent()
-    e_x = dpctl.SyclEvent()
-    for i in range(max_iters):
-        # Ap = A @ p
-        he_gemv, e_gemv = sycl_gemm.gemv(exec_queue, A, p, Ap, depends=[e_p])
-        all_host_tasks.append(he_gemv)
-        # alpha = rsold / dot(p, Ap)
-        alpha = rsold / sycl_gemm.dot_blocking(
-            exec_queue, p, Ap, depends=[e_p, e_gemv]
-        )
-        # x = x + alpha * p
-        he1_x_update, e1_x_update = sycl_gemm.axpby_inplace(
-            exec_queue, alpha, p, 1, x, depends=[e_x]
-        )
-        all_host_tasks.append(he1_x_update)
-
-        # r = r - alpha * Ap
-        he2_r_update, e2_r_update = sycl_gemm.axpby_inplace(
-            exec_queue, -alpha, Ap, 1, r
-        )
-        all_host_tasks.append(he2_r_update)
-
-        # rsnew = dot(r, r)
-        rsnew = sycl_gemm.norm_squared_blocking(
-            exec_queue, r, depends=[e2_r_update]
-        )
-        if rsnew < 1e-20:
-            e1_x_update.wait()
-            converged = i
-            break
-        beta = rsnew / rsold
-
-        # p = r + beta * p
-        he3_p_update, e3_p_update = sycl_gemm.axpby_inplace(
-            exec_queue, 1, r, beta, p, depends=[e2_r_update]
-        )
-
-        rsold = rsnew
-        all_host_tasks.append(he3_p_update)
-        e_p = e3_p_update
-        e_x = e1_x_update
-
-    dpctl.SyclEvent.wait_for(all_host_tasks)
-    return x, converged
-
-
-def cg_solve_numpy(A, b):
-    x = np.zeros_like(b)
-    r = b - np.dot(A, x)
-    p = r
-    rsold = np.dot(r, r)
-    converged = False
-    max_iters = b.shape[0]
-
-    for i in range(max_iters):
-        Ap = np.dot(A, p)
-        alpha = rsold / np.dot(p, Ap)
-        x = x + alpha * p
-        r = r - alpha * Ap
-        rsnew = np.dot(r, r)
-
-        if rsnew < 1e-20:
-            converged = i
-            break
-
-        beta = rsnew / rsold
-        p = r + beta * p
-        rsold = rsnew
-
-    return (x, converged)
-
-
-if __name__ == "__main__":
-    n = 32
-    Anp = (
-        2 * np.eye(n, n, k=0, dtype="d")
-        + np.eye(n, n, k=1, dtype="d")
-        + np.eye(n, n, k=-1, dtype="d")
-    )
-    bnp = np.geomspace(0.5, 2, n, dtype="d")
-    # bounds on eigenvalues of cartan matrix are, needed only
-    # for the Chebyshev solver
-    lambda_max = 4
-    lambda_min = 4 * np.square(np.sin(np.pi / (2 * (n + 2))))
-
-    q = dpctl.SyclQueue(property="enable_profiling")
-    q.print_device_info()
-    A = dpt.asarray(Anp, dtype="d", usm_type="device", sycl_queue=q)
-    dev = A.device
-    b = dpt.asarray(bnp, dtype="d", usm_type="device", device=dev)
-    x0 = b
-    t = dpctl.SyclTimer()
-    with t(dev.sycl_queue):
-        x, conv = cg_solve(A, b)
-    print("SYCL solver, 1st run: ", (conv, t.dt))
-    with t(dev.sycl_queue):
-        x, conv = cg_solve(A, b)
-    print("SYCL solver, 2nd run: ", (conv, t.dt))
-
-    x_ref = check_with_numpy(Anp, bnp)  # solve usign LU solver
-
-    with t(dev.sycl_queue):
-        x_np, conv = cg_solve_numpy(dpt.asnumpy(A), dpt.asnumpy(b))
-    print("NumPy's powered CG solver: ", (conv, t.dt))
-    print(
-        "SYCL cg-solver solution close to reference: ",
-        np.allclose(dpt.asnumpy(x), x_ref),
-    )
-    print(
-        "NumPy's cg-solver solution close to reference: ",
-        np.allclose(x_np, x_ref),
-    )
diff --git a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py b/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
deleted file mode 100644
index febf9d06da..0000000000
--- a/examples/pybind11/onemkl_gemv/sycl_timing_solver.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-
-import numpy as np
-
-# coding: utf-8
-import solve
-import sycl_gemm
-
-import dpctl
-import dpctl.tensor as dpt
-
-argv = sys.argv
-
-n = 1000
-rank = 11
-
-if len(argv) > 1:
-    n = int(argv[1])
-if len(argv) > 2:
-    rank = int(argv[2])
-
-
-print(
-    f"Solving {n} by {n} diagonal linear "
-    f"system with rank {rank} perturbation."
-)
-
-Anp = np.eye(n, n) + (lambda x: x.T @ x)(np.random.randn(rank, n))
-bnp = np.random.rand(n)
-
-q = dpctl.SyclQueue(property=["enable_profiling"])
-q.print_device_info()
-if q.is_in_order:
-    print("Using in-order queue")
-else:
-    print("Using not in-order queue")
-
-api_dev = dpctl.tensor.Device.create_device(q)
-A = dpt.asarray(Anp, "d", device=api_dev)
-b = dpt.asarray(bnp, "d", device=api_dev)
-
-assert A.sycl_queue == b.sycl_queue
-
-# allocate buffers for computation of residual
-r = dpt.empty_like(b)
-delta = dpt.empty_like(b)
-
-timer = dpctl.SyclTimer(time_scale=1e3)
-
-
-def time_python_solver(num_iters=6):
-    """
-    Time solver implemented in Python with use of asynchronous
-    SYCL kernel submission.
-    """
-    global x
-    iters = []
-    for i in range(num_iters):
-        with timer(api_dev.sycl_queue):
-            x, conv_in = solve.cg_solve(A, b)
-
-        print(i, "(host_dt, device_dt)=", timer.dt)
-        iters.append(conv_in)
-        assert x.usm_type == A.usm_type
-        assert x.usm_type == b.usm_type
-        assert x.sycl_queue == A.sycl_queue
-        assert x.sycl_queue == b.sycl_queue
-
-    return iters
-
-
-def time_cpp_solver(num_iters=6):
-    """
-    Time solver implemented in C++ but callable from Python.
-    C++ implementation uses the same algorithm and submits same
-    kernels asynchronously, but bypasses Python binding overhead
-    incurred when algorithm is driver from Python.
-    """
-    global x_cpp
-    x_cpp = dpt.empty_like(b)
-    iters = []
-    for i in range(num_iters):
-        with timer(api_dev.sycl_queue):
-            conv_in = sycl_gemm.cpp_cg_solve(q, A, b, x_cpp)
-
-        print(i, "(host_dt, device_dt)=", timer.dt)
-        iters.append(conv_in)
-
-    return iters
-
-
-def compute_residual(x):
-    """
-    Computes quality of the solution, `norm_squared(A@x - b)`.
-    """
-    assert isinstance(x, dpt.usm_ndarray)
-    q = A.sycl_queue
-    hev, ev = sycl_gemm.gemv(q, A, x, r)
-    hev2, ev2 = sycl_gemm.sub(q, r, b, delta, [ev])
-    rs = sycl_gemm.norm_squared_blocking(q, delta)
-    dpctl.SyclEvent.wait_for([hev, hev2])
-    return rs
-
-
-print("Converged in: ", time_python_solver())
-print(f"Python solution residual norm squared: {compute_residual(x)}")
-
-assert q == api_dev.sycl_queue
-print("")
-
-print("Converged in: ", time_cpp_solver())
-print(f"cpp_cg_solve solution residual norm squared: {compute_residual(x_cpp)}")

From 1e5d3ea0bf15dc24e81eeb00d10c15c0a3003835 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 28 Jan 2026 19:12:29 -0800
Subject: [PATCH 14/24] remove python/sycl_timer example

---
 examples/python/sycl_timer.py | 81 -----------------------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 examples/python/sycl_timer.py

diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py
deleted file mode 100644
index e4bf521e70..0000000000
--- a/examples/python/sycl_timer.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#                      Data Parallel Control (dpctl)
-#
-# Copyright 2020-2025 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import numpy as np
-
-import dpctl
-import dpctl.tensor as dpt
-from dpctl import SyclTimer
-
-
-def matmul(m1, m2):
-    """Naive matrix multiplication implementation"""
-    assert m1.ndim == 2
-    assert m2.ndim == 2
-    assert m1.shape[1] == m2.shape[0]
-    m1 = m1[:, dpt.newaxis, :]
-    m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :]
-    # form m_prod[i, j, k] = m1[i,k] * m2[k, j]
-    m_prods = m1 * m2
-    # sum over k
-    return dpt.sum(m_prods, axis=-1)
-
-
-n = 500
-
-try:
-    q = dpctl.SyclQueue(property="enable_profiling")
-except dpctl.SyclQueueCreationError:
-    print(
-        "Skipping the example, as dpctl.SyclQueue targeting "
-        "default device could not be created"
-    )
-    exit(0)
-
-a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q)
-a = dpt.reshape(a_flat, (n, n))
-
-b_rand = np.random.random(n * n).astype(np.float32)
-b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q)
-b = dpt.reshape(b_flat, (n, n))
-
-wall_times = []
-device_times = []
-
-print(
-    f"Computing naive matrix multiplication of two {n} by {n} matrices "
-    f"on {q.sycl_device.name}, repeating 5 times."
-)
-print()
-for _ in range(5):
-    timer = SyclTimer(time_scale=1)
-    with timer(q):
-        a_matmul_b = matmul(a, b)
-    host_time, device_time = timer.dt
-    wall_times.append(host_time)
-    device_times.append(device_time)
-
-c = dpt.asnumpy(a_matmul_b)
-cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b))
-
-print("Wall time: ", wall_times, "\nDevice time: ", device_times)
-print()
-print(
-    "Accuracy test: passed."
-    if np.allclose(c, cc)
-    else (f"Accuracy test: FAILED. \n   Discrepancy = {np.max(np.abs(c-cc))}")
-)

From ad83e957089a6dffd71f6044be3de3fcd268ad98 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 2 Feb 2026 13:13:27 -0800
Subject: [PATCH 15/24] Update gemv example to remove calls to dpctl.tensor

---
 .../onemkl_gemv/sycl_gemm/_onemkl.cpp         | 405 +++++++-----------
 .../pybind11/onemkl_gemv/tests/test_gemm.py   | 139 ++++--
 2 files changed, 275 insertions(+), 269 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
index 2dddd415ce..33c30ee9cf 100644
--- a/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
+++ b/examples/pybind11/onemkl_gemv/sycl_gemm/_onemkl.cpp
@@ -22,7 +22,7 @@
 /// \file
 /// This file implements Pybind11-generated extension exposing functions that
 /// take dpctl Python objects, such as dpctl.SyclQueue, dpctl.SyclDevice, and
-/// dpctl.tensor.usm_ndarray as arguments.
+/// dpctl.memory.MemoryUSMDevice/Shared/Host as arguments.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -30,7 +30,9 @@
 #include <sycl/sycl.hpp>
 #include <oneapi/mkl.hpp>
 #include "cg_solver.hpp"
+#include <cstdint>
 #include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 #include <pybind11/complex.h>
 #include "dpctl4pybind11.hpp"
@@ -40,27 +42,45 @@ namespace py = pybind11;
 
 using dpctl::utils::keep_args_alive;
 
+namespace
+{
+
+void validate_usm_nbytes(const dpctl::memory::usm_memory &mem,
+                         std::size_t required_nbytes,
+                         const char *arg_name)
+{
+    const std::size_t nbytes = mem.get_nbytes();
+    if (nbytes < required_nbytes) {
+        throw py::value_error(std::string(arg_name) +
+                              " does not have enough bytes for the requested "
+                              "shape/dtype");
+    }
+}
+
+void validate_positive_sizes(std::int64_t n, std::int64_t m)
+{
+    if (n < 0 || m < 0) {
+        throw py::value_error("Dimensions must be non-negative");
+    }
+}
+
+} // end anonymous namespace
+
 std::pair<sycl::event, sycl::event>
 py_gemv(sycl::queue &q,
-        dpctl::tensor::usm_ndarray matrix,
-        dpctl::tensor::usm_ndarray vector,
-        dpctl::tensor::usm_ndarray result,
+        dpctl::memory::usm_memory matrix,
+        dpctl::memory::usm_memory vector,
+        dpctl::memory::usm_memory result,
+        std::int64_t n,
+        std::int64_t m,
+        py::dtype dtype,
+        std::int64_t lda,
         const std::vector<sycl::event> &depends = {})
 {
-    if (matrix.get_ndim() != 2 || vector.get_ndim() != 1 ||
-        result.get_ndim() != 1)
-    {
-        throw std::runtime_error(
-            "Inconsistent dimensions, expecting matrix and a vector");
-    }
+    validate_positive_sizes(n, m);
 
-    py::ssize_t n = matrix.get_shape(0); // get 0-th element of the shape
-    py::ssize_t m = matrix.get_shape(1);
-
-    py::ssize_t v_dim = vector.get_shape(0);
-    py::ssize_t r_dim = result.get_shape(0);
-    if (v_dim != m || r_dim != n) {
-        throw std::runtime_error("Inconsistent shapes.");
+    if (lda < m) {
+        throw py::value_error("lda must be >= number of columns (m)");
     }
 
     if (!dpctl::utils::queues_are_compatible(
@@ -70,74 +90,60 @@ py_gemv(sycl::queue &q,
             "USM allocations are not compatible with the execution queue.");
     }
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
-
-    if (!((matrix.is_c_contiguous()) &&
-          (vector.is_c_contiguous() || vector.is_f_contiguous()) &&
-          (result.is_c_contiguous() || result.is_f_contiguous())))
-    {
-        throw std::runtime_error("Arrays must be contiguous.");
-    }
+    const py::ssize_t itemsize = dtype.itemsize();
 
-    int mat_typenum = matrix.get_typenum();
-    int v_typenum = vector.get_typenum();
-    int r_typenum = result.get_typenum();
+    validate_usm_nbytes(matrix,
+                        static_cast<std::size_t>(n) *
+                            static_cast<std::size_t>(lda) * itemsize,
+                        "Amatrix");
+    validate_usm_nbytes(vector, static_cast<std::size_t>(m) * itemsize, "xvec");
+    validate_usm_nbytes(result, static_cast<std::size_t>(n) * itemsize,
+                        "resvec");
 
-    if ((mat_typenum != v_typenum) || (r_typenum != v_typenum) ||
-        !((v_typenum == api.UAR_DOUBLE_) || (v_typenum == api.UAR_FLOAT_) ||
-          (v_typenum == api.UAR_CDOUBLE_) || (v_typenum == api.UAR_CFLOAT_)))
-    {
-        std::cout << "Found: [" << mat_typenum << ", " << v_typenum << ", "
-                  << r_typenum << "]" << std::endl;
-        std::cout << "Expected: [" << UAR_DOUBLE << ", " << UAR_FLOAT << ", "
-                  << UAR_CDOUBLE << ", " << UAR_CFLOAT << "]" << std::endl;
-        throw std::runtime_error(
-            "Only real and complex floating point arrays are supported.");
-    }
-
-    char *mat_typeless_ptr = matrix.get_data();
-    char *v_typeless_ptr = vector.get_data();
-    char *r_typeless_ptr = result.get_data();
+    char *mat_typeless_ptr = matrix.get_pointer();
+    char *v_typeless_ptr = vector.get_pointer();
+    char *r_typeless_ptr = result.get_pointer();
 
     sycl::event res_ev;
-    if (v_typenum == api.UAR_DOUBLE_) {
+    const char dtype_char = dtype.char_();
+    if (dtype_char == 'd') {
         using T = double;
         sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
             q, oneapi::mkl::transpose::nontrans, n, m, T(1),
-            reinterpret_cast<T *>(mat_typeless_ptr), m,
+            reinterpret_cast<T *>(mat_typeless_ptr), lda,
             reinterpret_cast<T *>(v_typeless_ptr), 1, T(0),
             reinterpret_cast<T *>(r_typeless_ptr), 1, depends);
         res_ev = gemv_ev;
     }
-    else if (v_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
             q, oneapi::mkl::transpose::nontrans, n, m, T(1),
-            reinterpret_cast<T *>(mat_typeless_ptr), m,
+            reinterpret_cast<T *>(mat_typeless_ptr), lda,
             reinterpret_cast<T *>(v_typeless_ptr), 1, T(0),
             reinterpret_cast<T *>(r_typeless_ptr), 1, depends);
         res_ev = gemv_ev;
     }
-    else if (v_typenum == api.UAR_CDOUBLE_) {
+    else if (dtype_char == 'D') {
         using T = std::complex<double>;
         sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
             q, oneapi::mkl::transpose::nontrans, n, m, T(1),
-            reinterpret_cast<T *>(mat_typeless_ptr), m,
+            reinterpret_cast<T *>(mat_typeless_ptr), lda,
             reinterpret_cast<T *>(v_typeless_ptr), 1, T(0),
             reinterpret_cast<T *>(r_typeless_ptr), 1, depends);
         res_ev = gemv_ev;
     }
-    else if (v_typenum == api.UAR_CFLOAT_) {
+    else if (dtype_char == 'F') {
         using T = std::complex<float>;
         sycl::event gemv_ev = oneapi::mkl::blas::row_major::gemv(
             q, oneapi::mkl::transpose::nontrans, n, m, T(1),
-            reinterpret_cast<T *>(mat_typeless_ptr), m,
+            reinterpret_cast<T *>(mat_typeless_ptr), lda,
             reinterpret_cast<T *>(v_typeless_ptr), 1, T(0),
             reinterpret_cast<T *>(r_typeless_ptr), 1, depends);
         res_ev = gemv_ev;
     }
     else {
-        throw std::runtime_error("Type dispatch ran into trouble.");
+        throw std::runtime_error("Unsupported data type for gemv.");
     }
 
     sycl::event ht_event =
@@ -165,22 +171,15 @@ sycl::event sub_impl(sycl::queue q,
 
 // out_r = in_v1 - in_v2
 std::pair<sycl::event, sycl::event>
-py_sub(sycl::queue q,
-       dpctl::tensor::usm_ndarray in_v1,
-       dpctl::tensor::usm_ndarray in_v2,
-       dpctl::tensor::usm_ndarray out_r,
+py_sub(sycl::queue &q,
+       dpctl::memory::usm_memory in_v1,
+       dpctl::memory::usm_memory in_v2,
+       dpctl::memory::usm_memory out_r,
+       std::int64_t n,
+       py::dtype dtype,
        const std::vector<sycl::event> &depends = {})
 {
-    if (in_v1.get_ndim() != 1 || in_v2.get_ndim() != 1 || out_r.get_ndim() != 1)
-    {
-        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
-    }
-
-    py::ssize_t n = in_v1.get_shape(0); // get length of the vector
-
-    if (n != in_v2.get_shape(0) || n != out_r.get_shape(0)) {
-        throw std::runtime_error("Vectors must have the same length");
-    }
+    validate_positive_sizes(n, 0);
 
     if (!dpctl::utils::queues_are_compatible(
             q, {in_v1.get_queue(), in_v2.get_queue(), out_r.get_queue()}))
@@ -189,56 +188,40 @@ py_sub(sycl::queue q,
             "USM allocation is not bound to the context in execution queue");
     }
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
-
-    if (!((in_v1.is_c_contiguous() || in_v1.is_f_contiguous()) &&
-          (in_v2.is_c_contiguous() || in_v2.is_f_contiguous()) &&
-          (out_r.is_c_contiguous() || out_r.is_f_contiguous())))
-    {
-        throw std::runtime_error("Vectors must be contiguous.");
-    }
-
-    int in_v1_typenum = in_v1.get_typenum();
-    int in_v2_typenum = in_v2.get_typenum();
-    int out_r_typenum = out_r.get_typenum();
+    const py::ssize_t itemsize = dtype.itemsize();
 
-    if ((in_v2_typenum != in_v1_typenum) || (out_r_typenum != in_v1_typenum) ||
-        !((in_v1_typenum == api.UAR_DOUBLE_) ||
-          (in_v1_typenum == api.UAR_FLOAT_) ||
-          (in_v1_typenum == api.UAR_CDOUBLE_) ||
-          (in_v1_typenum == api.UAR_CFLOAT_)))
-    {
-        throw std::runtime_error(
-            "Only real and complex floating point arrays are supported.");
-    }
+    validate_usm_nbytes(in_v1, static_cast<std::size_t>(n) * itemsize, "in1");
+    validate_usm_nbytes(in_v2, static_cast<std::size_t>(n) * itemsize, "in2");
+    validate_usm_nbytes(out_r, static_cast<std::size_t>(n) * itemsize, "out");
 
-    const char *in_v1_typeless_ptr = in_v1.get_data();
-    const char *in_v2_typeless_ptr = in_v2.get_data();
-    char *out_r_typeless_ptr = out_r.get_data();
+    const char *in_v1_typeless_ptr = in_v1.get_pointer();
+    const char *in_v2_typeless_ptr = in_v2.get_pointer();
+    char *out_r_typeless_ptr = out_r.get_pointer();
 
     sycl::event res_ev;
-    if (out_r_typenum == api.UAR_DOUBLE_) {
+    const char dtype_char = dtype.char_();
+    if (dtype_char == 'd') {
         using T = double;
         res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
                              out_r_typeless_ptr, depends);
     }
-    else if (out_r_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
                              out_r_typeless_ptr, depends);
     }
-    else if (out_r_typenum == api.UAR_CDOUBLE_) {
+    else if (dtype_char == 'D') {
         using T = std::complex<double>;
         res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
                              out_r_typeless_ptr, depends);
     }
-    else if (out_r_typenum == api.UAR_CFLOAT_) {
+    else if (dtype_char == 'F') {
         using T = std::complex<float>;
         res_ev = sub_impl<T>(q, n, in_v1_typeless_ptr, in_v2_typeless_ptr,
                              out_r_typeless_ptr, depends);
     }
     else {
-        throw std::runtime_error("Type dispatch ran into trouble.");
+        throw std::runtime_error("Unsupported data type for sub.");
     }
 
     sycl::event ht_event = keep_args_alive(q, {in_v1, in_v2, out_r}, {res_ev});
@@ -274,72 +257,53 @@ sycl::event axpby_inplace_impl(sycl::queue q,
 std::pair<sycl::event, sycl::event>
 py_axpby_inplace(sycl::queue q,
                  py::object a,
-                 dpctl::tensor::usm_ndarray x,
+                 dpctl::memory::usm_memory x,
                  py::object b,
-                 dpctl::tensor::usm_ndarray y,
+                 dpctl::memory::usm_memory y,
+                 std::int64_t n,
+                 py::dtype dtype,
                  const std::vector<sycl::event> &depends = {})
 {
-
-    if (x.get_ndim() != 1 || y.get_ndim() != 1) {
-        throw std::runtime_error("Inconsistent dimensions, expecting vectors");
-    }
-
-    py::ssize_t n = x.get_shape(0); // get length of the vector
-
-    if (n != y.get_shape(0)) {
-        throw std::runtime_error("Vectors must have the same length");
-    }
+    validate_positive_sizes(n, 0);
 
     if (!dpctl::utils::queues_are_compatible(q, {x.get_queue(), y.get_queue()}))
     {
         throw std::runtime_error(
             "USM allocation is not bound to the context in execution queue");
     }
-    auto const &api = dpctl::detail::dpctl_capi::get();
-
-    if (!((x.is_c_contiguous() || x.is_f_contiguous()) &&
-          (y.is_c_contiguous() || y.is_f_contiguous())))
-    {
-        throw std::runtime_error("Vectors must be contiguous.");
-    }
 
-    int x_typenum = x.get_typenum();
-    int y_typenum = y.get_typenum();
+    const py::ssize_t itemsize = dtype.itemsize();
 
-    if ((x_typenum != y_typenum) ||
-        !((x_typenum == api.UAR_DOUBLE_) || (x_typenum == api.UAR_FLOAT_) ||
-          (x_typenum == api.UAR_CDOUBLE_) || (x_typenum == api.UAR_CFLOAT_)))
-    {
-        throw std::runtime_error(
-            "Only real and complex floating point arrays are supported.");
-    }
+    validate_usm_nbytes(x, static_cast<std::size_t>(n) * itemsize, "x");
+    validate_usm_nbytes(y, static_cast<std::size_t>(n) * itemsize, "y");
 
-    const char *x_typeless_ptr = x.get_data();
-    char *y_typeless_ptr = y.get_data();
+    const char *x_typeless_ptr = x.get_pointer();
+    char *y_typeless_ptr = y.get_pointer();
 
     sycl::event res_ev;
-    if (x_typenum == api.UAR_DOUBLE_) {
+    const char dtype_char = dtype.char_();
+    if (dtype_char == 'd') {
         using T = double;
         res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
-    else if (x_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
-    else if (x_typenum == api.UAR_CDOUBLE_) {
+    else if (dtype_char == 'D') {
         using T = std::complex<double>;
         res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
-    else if (x_typenum == api.UAR_CFLOAT_) {
+    else if (dtype_char == 'F') {
         using T = std::complex<float>;
         res_ev = axpby_inplace_impl<T>(q, n, a, x_typeless_ptr, b,
                                        y_typeless_ptr, depends);
     }
     else {
-        throw std::runtime_error("Type dispatch ran into trouble.");
+        throw std::runtime_error("Unsupported data type for axpby_inplace.");
     }
 
     sycl::event ht_event = keep_args_alive(q, {x, y}, {res_ev});
@@ -374,89 +338,64 @@ T complex_norm_squared_blocking_impl(
                                                             depends);
 }
 
-py::object py_norm_squared_blocking(sycl::queue q,
-                                    dpctl::tensor::usm_ndarray r,
+py::object py_norm_squared_blocking(sycl::queue &q,
+                                    dpctl::memory::usm_memory r,
+                                    std::int64_t n,
+                                    py::dtype dtype,
                                     const std::vector<sycl::event> depends = {})
 {
-    if (r.get_ndim() != 1) {
-        throw std::runtime_error("Expecting a vector");
-    }
-
-    py::ssize_t n = r.get_shape(0); // get length of the vector
-
-    int r_flags = r.get_flags();
-
-    if (!(r.is_c_contiguous() || r.is_f_contiguous())) {
-        throw std::runtime_error("Vector must be contiguous.");
-    }
+    validate_positive_sizes(n, 0);
 
     if (!dpctl::utils::queues_are_compatible(q, {r.get_queue()})) {
         throw std::runtime_error(
             "USM allocation is not bound to the context in execution queue");
     }
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
+    const py::ssize_t itemsize = dtype.itemsize();
+    validate_usm_nbytes(r, static_cast<std::size_t>(n) * itemsize, "r");
 
-    int r_typenum = r.get_typenum();
-    if ((r_typenum != api.UAR_DOUBLE_) && (r_typenum != api.UAR_FLOAT_) &&
-        (r_typenum != api.UAR_CDOUBLE_) && (r_typenum != api.UAR_CFLOAT_))
-    {
-        throw std::runtime_error(
-            "Only real and complex floating point arrays are supported.");
-    }
-
-    const char *r_typeless_ptr = r.get_data();
+    const char *r_typeless_ptr = r.get_pointer();
     py::object res;
+    const char dtype_char = dtype.char_();
 
-    if (r_typenum == api.UAR_DOUBLE_) {
+    if (dtype_char == 'd') {
         using T = double;
         T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
         res = py::float_(n_sq);
     }
-    else if (r_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         T n_sq = norm_squared_blocking_impl<T>(q, n, r_typeless_ptr, depends);
         res = py::float_(n_sq);
     }
-    else if (r_typenum == api.UAR_CDOUBLE_) {
+    else if (dtype_char == 'D') {
         using T = std::complex<double>;
         double n_sq = complex_norm_squared_blocking_impl<double>(
             q, n, r_typeless_ptr, depends);
         res = py::float_(n_sq);
     }
-    else if (r_typenum == api.UAR_CFLOAT_) {
+    else if (dtype_char == 'F') {
         using T = std::complex<float>;
         float n_sq = complex_norm_squared_blocking_impl<float>(
             q, n, r_typeless_ptr, depends);
         res = py::float_(n_sq);
     }
     else {
-        throw std::runtime_error("Type dispatch ran into trouble.");
+        throw std::runtime_error(
+            "Unsupported data type for norm_squared_blocking.");
     }
 
     return res;
 }
 
 py::object py_dot_blocking(sycl::queue q,
-                           dpctl::tensor::usm_ndarray v1,
-                           dpctl::tensor::usm_ndarray v2,
+                           dpctl::memory::usm_memory v1,
+                           dpctl::memory::usm_memory v2,
+                           std::int64_t n,
+                           py::dtype dtype,
                            const std::vector<sycl::event> &depends = {})
 {
-    if (v1.get_ndim() != 1 || v2.get_ndim() != 1) {
-        throw std::runtime_error("Expecting two vectors");
-    }
-
-    py::ssize_t n = v1.get_shape(0); // get length of the vector
-
-    if (n != v2.get_shape(0)) {
-        throw std::runtime_error("Length of vectors are not the same");
-    }
-
-    if (!(v1.is_c_contiguous() || v1.is_f_contiguous()) ||
-        !(v2.is_c_contiguous() || v2.is_f_contiguous()))
-    {
-        throw std::runtime_error("Vectors must be contiguous.");
-    }
+    validate_positive_sizes(n, 0);
 
     if (!dpctl::utils::queues_are_compatible(q,
                                              {v1.get_queue(), v2.get_queue()}))
@@ -465,25 +404,17 @@ py::object py_dot_blocking(sycl::queue q,
             "USM allocation is not bound to the context in execution queue");
     }
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
+    const py::ssize_t itemsize = dtype.itemsize();
 
-    int v1_typenum = v1.get_typenum();
-    int v2_typenum = v2.get_typenum();
+    validate_usm_nbytes(v1, static_cast<std::size_t>(n) * itemsize, "v1");
+    validate_usm_nbytes(v2, static_cast<std::size_t>(n) * itemsize, "v2");
 
-    if ((v1_typenum != v2_typenum) ||
-        ((v1_typenum != api.UAR_DOUBLE_) && (v1_typenum != api.UAR_FLOAT_) &&
-         (v1_typenum != api.UAR_CDOUBLE_) && (v1_typenum != api.UAR_CFLOAT_)))
-    {
-        throw py::value_error(
-            "Data types of vectors must be the same. "
-            "Only real and complex floating types are supported.");
-    }
-
-    const char *v1_typeless_ptr = v1.get_data();
-    const char *v2_typeless_ptr = v2.get_data();
+    const char *v1_typeless_ptr = v1.get_pointer();
+    const char *v2_typeless_ptr = v2.get_pointer();
     py::object res;
+    const char dtype_char = dtype.char_();
 
-    if (v1_typenum == api.UAR_DOUBLE_) {
+    if (dtype_char == 'd') {
         using T = double;
         T *res_usm = sycl::malloc_device<T>(1, q);
         sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
@@ -494,7 +425,7 @@ py::object py_dot_blocking(sycl::queue q,
         sycl::free(res_usm, q);
         res = py::float_(res_v);
     }
-    else if (v1_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         T *res_usm = sycl::malloc_device<T>(1, q);
         sycl::event dot_ev = oneapi::mkl::blas::row_major::dot(
@@ -505,7 +436,7 @@ py::object py_dot_blocking(sycl::queue q,
         sycl::free(res_usm, q);
         res = py::float_(res_v);
     }
-    else if (v1_typenum == api.UAR_CDOUBLE_) {
+    else if (dtype_char == 'D') {
         using T = std::complex<double>;
         T *res_usm = sycl::malloc_device<T>(1, q);
         sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
@@ -516,7 +447,7 @@ py::object py_dot_blocking(sycl::queue q,
         sycl::free(res_usm, q);
         res = py::cast(res_v);
     }
-    else if (v1_typenum == api.UAR_CFLOAT_) {
+    else if (dtype_char == 'F') {
         using T = std::complex<float>;
         T *res_usm = sycl::malloc_device<T>(1, q);
         sycl::event dotc_ev = oneapi::mkl::blas::row_major::dotc(
@@ -528,48 +459,22 @@ py::object py_dot_blocking(sycl::queue q,
         res = py::cast(res_v);
     }
     else {
-        throw std::runtime_error("Type dispatch ran into trouble.");
+        throw std::runtime_error("Unsupported data type for dot_blocking.");
     }
 
     return res;
 }
 
 int py_cg_solve(sycl::queue exec_q,
-                dpctl::tensor::usm_ndarray Amat,
-                dpctl::tensor::usm_ndarray bvec,
-                dpctl::tensor::usm_ndarray xvec,
+                dpctl::memory::usm_memory Amat,
+                dpctl::memory::usm_memory bvec,
+                dpctl::memory::usm_memory xvec,
+                std::int64_t n,
+                py::dtype dtype,
                 double rs_tol,
                 const std::vector<sycl::event> &depends = {})
 {
-    if (Amat.get_ndim() != 2 || bvec.get_ndim() != 1 || xvec.get_ndim() != 1) {
-        throw py::value_error("Expecting a matrix and two vectors");
-    }
-
-    py::ssize_t n0 = Amat.get_shape(0);
-    py::ssize_t n1 = Amat.get_shape(1);
-
-    if (n0 != n1) {
-        throw py::value_error("Matrix must be square.");
-    }
-
-    if (n0 != bvec.get_shape(0) || n0 != xvec.get_shape(0)) {
-        throw py::value_error(
-            "Dimensions of the matrix and vectors are not consistent.");
-    }
-
-    bool all_contig = (Amat.is_c_contiguous()) && (bvec.is_c_contiguous()) &&
-                      (xvec.is_c_contiguous());
-    if (!all_contig) {
-        throw py::value_error("All inputs must be C-contiguous");
-    }
-
-    int A_typenum = Amat.get_typenum();
-    int b_typenum = bvec.get_typenum();
-    int x_typenum = xvec.get_typenum();
-
-    if (A_typenum != b_typenum || A_typenum != x_typenum) {
-        throw py::value_error("All arrays must have the same type");
-    }
+    validate_positive_sizes(n, 0);
 
     if (!dpctl::utils::queues_are_compatible(
             exec_q, {Amat.get_queue(), bvec.get_queue(), xvec.get_queue()}))
@@ -578,33 +483,41 @@ int py_cg_solve(sycl::queue exec_q,
             "USM allocation queues are not the same as the execution queue");
     }
 
-    const char *A_ch = Amat.get_data();
-    const char *b_ch = bvec.get_data();
-    char *x_ch = xvec.get_data();
+    const auto itemsize = dtype.itemsize();
+
+    validate_usm_nbytes(Amat,
+                        static_cast<std::size_t>(n) *
+                            static_cast<std::size_t>(n) * itemsize,
+                        "Amat");
+    validate_usm_nbytes(bvec, static_cast<std::size_t>(n) * itemsize, "bvec");
+    validate_usm_nbytes(xvec, static_cast<std::size_t>(n) * itemsize, "xvec");
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
+    const char *A_ch = Amat.get_pointer();
+    const char *b_ch = bvec.get_pointer();
+    char *x_ch = xvec.get_pointer();
+    const char dtype_char = dtype.char_();
 
-    if (A_typenum == api.UAR_DOUBLE_) {
+    if (dtype_char == 'd') {
         using T = double;
         int iters = cg_solver::cg_solve<T>(
-            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            exec_q, n, reinterpret_cast<const T *>(A_ch),
             reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
             depends, static_cast<T>(rs_tol));
 
         return iters;
     }
-    else if (A_typenum == api.UAR_FLOAT_) {
+    else if (dtype_char == 'f') {
         using T = float;
         int iters = cg_solver::cg_solve<T>(
-            exec_q, n0, reinterpret_cast<const T *>(A_ch),
+            exec_q, n, reinterpret_cast<const T *>(A_ch),
             reinterpret_cast<const T *>(b_ch), reinterpret_cast<T *>(x_ch),
             depends, static_cast<T>(rs_tol));
 
         return iters;
     }
     else {
-        throw std::runtime_error(
-            "Unsupported data type. Use single or double precision.");
+        throw std::runtime_error("Unsupported data type for cg_solve. Use "
+                                 "single or double precision.");
     }
 }
 
@@ -612,21 +525,27 @@ PYBIND11_MODULE(_onemkl, m)
 {
     m.def("gemv", &py_gemv, "Uses oneMKL to compute dot(matrix, vector)",
           py::arg("exec_queue"), py::arg("Amatrix"), py::arg("xvec"),
-          py::arg("resvec"), py::arg("depends") = py::list());
-    m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
-          py::arg("in1"), py::arg("in2"), py::arg("out"),
+          py::arg("resvec"), py::arg("nrows"), py::arg("ncols"),
+          py::arg("dtype"), py::arg("lda") = -1,
           py::arg("depends") = py::list());
+    m.def("sub", &py_sub, "Subtraction: out = v1 - v2", py::arg("exec_queue"),
+          py::arg("in1"), py::arg("in2"), py::arg("out"), py::arg("nelems"),
+          py::arg("dtype"), py::arg("depends") = py::list());
     m.def("axpby_inplace", &py_axpby_inplace, "y = a * x + b * y",
           py::arg("exec_queue"), py::arg("a"), py::arg("x"), py::arg("b"),
-          py::arg("y"), py::arg("depends") = py::list());
+          py::arg("y"), py::arg("nelems"), py::arg("dtype"),
+          py::arg("depends") = py::list());
     m.def("norm_squared_blocking", &py_norm_squared_blocking, "norm(r)**2",
-          py::arg("exec_queue"), py::arg("r"), py::arg("depends") = py::list());
+          py::arg("exec_queue"), py::arg("r"), py::arg("nelems"),
+          py::arg("dtype"), py::arg("depends") = py::list());
     m.def("dot_blocking", &py_dot_blocking, "<v1, v2>", py::arg("exec_queue"),
-          py::arg("v1"), py::arg("v2"), py::arg("depends") = py::list());
+          py::arg("v1"), py::arg("v2"), py::arg("nelems"), py::arg("dtype"),
+          py::arg("depends") = py::list());
 
     m.def("cpp_cg_solve", &py_cg_solve,
           "Dispatch to call C++ implementation of cg_solve",
           py::arg("exec_queue"), py::arg("Amat"), py::arg("bvec"),
-          py::arg("xvec"), py::arg("rs_squared_tolerance") = py::float_(1e-20),
+          py::arg("xvec"), py::arg("n"), py::arg("dtype"),
+          py::arg("rs_squared_tolerance") = py::float_(1e-20),
           py::arg("depends") = py::list());
 }
diff --git a/examples/pybind11/onemkl_gemv/tests/test_gemm.py b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
index 592cda6914..bd8c598950 100644
--- a/examples/pybind11/onemkl_gemv/tests/test_gemm.py
+++ b/examples/pybind11/onemkl_gemv/tests/test_gemm.py
@@ -25,7 +25,15 @@
 )
 
 import dpctl
-import dpctl.tensor as dpt
+import dpctl.memory as dpm
+
+
+def _real_dtype_for_device(q: dpctl.SyclQueue) -> np.dtype:
+    """
+    Return the float64 dtype if the device supports fp64, else float32.
+    """
+    _fp64 = q.sycl_device.has_aspect_fp64
+    return np.dtype(np.float64 if _fp64 else np.float32)
 
 
 def test_gemv():
@@ -33,13 +41,37 @@ def test_gemv():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    Mnp, vnp = np.random.randn(5, 3), np.random.randn(3)
-    M = dpt.asarray(Mnp, sycl_queue=q)
-    v = dpt.asarray(vnp, sycl_queue=q)
-    r = dpt.empty((5,), dtype=v.dtype, sycl_queue=q)
-    hev, ev = gemv(q, M, v, r, [])
+
+    dtype = _real_dtype_for_device(q)
+
+    Mnp = np.random.randn(5, 3).astype(dtype, copy=False)
+    vnp = np.random.randn(3).astype(dtype, copy=False)
+
+    M = dpm.MemoryUSMDevice(Mnp.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=M, src=Mnp, count=Mnp.nbytes)
+
+    v = dpm.MemoryUSMDevice(vnp.nbytes, queue=q)
+    ev2 = q.memcpy_async(dest=v, src=vnp, count=vnp.nbytes)
+
+    rnp = np.empty((5,), dtype=dtype)
+    r = dpm.MemoryUSMDevice(rnp.nbytes, queue=q)
+
+    hev, ev3 = gemv(
+        q,
+        M,
+        v,
+        r,
+        5,
+        3,
+        np.dtype(Mnp.dtype),
+        3,
+        [ev1, ev2],
+    )
+
+    ev4 = q.memcpy_async(dest=rnp, src=r, count=rnp.nbytes, dEvents=[ev3])
+    ev4.wait()
     hev.wait()
-    rnp = dpt.asnumpy(r)
+
     assert np.allclose(rnp, Mnp @ vnp)
 
 
@@ -48,13 +80,27 @@ def test_sub():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    anp, bnp = np.random.randn(5), np.random.randn(5)
-    a = dpt.asarray(anp, sycl_queue=q)
-    b = dpt.asarray(bnp, sycl_queue=q)
-    r = dpt.empty((5,), dtype=b.dtype, sycl_queue=q)
-    hev, ev = sub(q, a, b, r, [])
+
+    dtype = _real_dtype_for_device(q)
+
+    anp = np.random.randn(5).astype(dtype, copy=False)
+    bnp = np.random.randn(5).astype(dtype, copy=False)
+
+    a = dpm.MemoryUSMDevice(anp.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=a, src=anp, count=anp.nbytes)
+
+    b = dpm.MemoryUSMDevice(bnp.nbytes, queue=q)
+    ev2 = q.memcpy_async(dest=b, src=bnp, count=bnp.nbytes)
+
+    rnp = np.empty((5,), dtype=dtype)
+    r = dpm.MemoryUSMDevice(rnp.nbytes, queue=q)
+
+    hev, ev3 = sub(q, a, b, r, 5, np.dtype(anp.dtype), [ev1, ev2])
+
+    ev4 = q.memcpy_async(dest=rnp, src=r, count=rnp.nbytes, dEvents=[ev3])
+    ev4.wait()
     hev.wait()
-    rnp = dpt.asnumpy(r)
+
     assert np.allclose(rnp + bnp, anp)
 
 
@@ -63,13 +109,38 @@ def test_axpby():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    xnp, pnp = np.random.randn(5), np.random.randn(5)
-    x = dpt.asarray(xnp, sycl_queue=q)
-    p = dpt.asarray(pnp, sycl_queue=q)
-    hev, ev = axpby_inplace(q, 0.5, x, -0.7, p, [])
+
+    dtype = _real_dtype_for_device(q)
+
+    xnp = np.random.randn(5).astype(dtype, copy=False)
+    pnp = np.random.randn(5).astype(dtype, copy=False)
+
+    x = dpm.MemoryUSMDevice(xnp.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=x, src=xnp, count=xnp.nbytes)
+
+    p = dpm.MemoryUSMDevice(pnp.nbytes, queue=q)
+    ev2 = q.memcpy_async(dest=p, src=pnp, count=pnp.nbytes)
+
+    alpha = 0.5
+    beta = -0.7
+
+    hev, ev3 = axpby_inplace(
+        q,
+        alpha,
+        x,
+        beta,
+        p,
+        5,
+        np.dtype(xnp.dtype),
+        [ev1, ev2],
+    )
+
+    rnp = np.empty((5,), dtype=dtype)
+    ev4 = q.memcpy_async(dest=rnp, src=p, count=rnp.nbytes, dEvents=[ev3])
+    ev4.wait()
     hev.wait()
-    rnp = dpt.asnumpy(p)
-    assert np.allclose(rnp, 0.5 * xnp - 0.7 * pnp)
+
+    assert np.allclose(rnp, alpha * xnp + beta * pnp)
 
 
 def test_dot():
@@ -77,10 +148,20 @@ def test_dot():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    anp, bnp = np.random.randn(5), np.random.randn(5)
-    a = dpt.asarray(anp, sycl_queue=q)
-    b = dpt.asarray(bnp, sycl_queue=q)
-    dot_res = dot_blocking(q, a, b)
+
+    dtype = _real_dtype_for_device(q)
+
+    anp = np.random.randn(5).astype(dtype, copy=False)
+    bnp = np.random.randn(5).astype(dtype, copy=False)
+
+    a = dpm.MemoryUSMDevice(anp.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=a, src=anp, count=anp.nbytes)
+
+    b = dpm.MemoryUSMDevice(bnp.nbytes, queue=q)
+    ev2 = q.memcpy_async(dest=b, src=bnp, count=bnp.nbytes)
+
+    dot_res = dot_blocking(q, a, b, 5, np.dtype(anp.dtype), [ev1, ev2])
+
     assert np.allclose(dot_res, np.dot(anp, bnp))
 
 
@@ -89,7 +170,13 @@ def test_norm_squared():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Queue could not be created")
-    anp = np.random.randn(5)
-    a = dpt.asarray(anp, sycl_queue=q)
-    dot_res = norm_squared_blocking(q, a)
+
+    dtype = _real_dtype_for_device(q)
+
+    anp = np.random.randn(5).astype(dtype, copy=False)
+    a = dpm.MemoryUSMDevice(anp.nbytes, queue=q)
+    ev1 = q.memcpy_async(dest=a, src=anp, count=anp.nbytes)
+
+    dot_res = norm_squared_blocking(q, a, 5, np.dtype(anp.dtype), [ev1])
+
     assert np.allclose(dot_res, np.dot(anp, anp))

From 24eee77c2550c07e916ad294db8786bab41b9649 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 2 Feb 2026 13:15:57 -0800
Subject: [PATCH 16/24] Update dpctl sycl kernel example

---
 .../pybind11/use_dpctl_sycl_kernel/example.py | 19 +++++++---
 .../tests/test_user_kernel.py                 | 24 ++++++++-----
 .../use_kernel/_example.cpp                   | 36 ++++++++-----------
 3 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/examples/pybind11/use_dpctl_sycl_kernel/example.py b/examples/pybind11/use_dpctl_sycl_kernel/example.py
index 6d534a2c8c..f84124cfed 100644
--- a/examples/pybind11/use_dpctl_sycl_kernel/example.py
+++ b/examples/pybind11/use_dpctl_sycl_kernel/example.py
@@ -16,11 +16,12 @@
 
 # coding: utf-8
 
+import numpy as np
 import use_kernel as eg
 
 import dpctl
+import dpctl.memory as dpmem
 import dpctl.program as dppr
-import dpctl.tensor as dpt
 
 # create execution queue, targeting default selected device
 q = dpctl.SyclQueue()
@@ -38,10 +39,18 @@
 assert krn.num_args == 2
 
 # Construct the argument, and allocate memory for the result
-x = dpt.arange(0, stop=13, step=1, dtype="i4", sycl_queue=q)
-y = dpt.empty_like(x)
+x = np.arange(0, stop=13, step=1, dtype="i4")
+y = np.empty_like(x)
+x_dev = dpmem.MemoryUSMDevice(x.nbytes, queue=q)
+y_dev = dpmem.MemoryUSMDevice(y.nbytes, queue=q)
 
-eg.submit_custom_kernel(q, krn, src=x, dst=y)
+# Copy input data to the device
+q.memcpy(dest=x_dev, src=x, count=x.nbytes)
+
+eg.submit_custom_kernel(q, krn, src=x_dev, dst=y_dev)
+
+# Copy result data back to host
+q.memcpy(dest=y, src=y_dev, count=y.nbytes)
 
 # output the result
-print(dpt.asnumpy(y))
+print(y)
diff --git a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py
index 7434151dbe..e541861b84 100644
--- a/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py
+++ b/examples/pybind11/use_dpctl_sycl_kernel/tests/test_user_kernel.py
@@ -23,8 +23,8 @@
 import use_kernel as uk
 
 import dpctl
-import dpctl.program as dpm
-import dpctl.tensor as dpt
+import dpctl.memory as dpmem
+import dpctl.program as dppr
 
 
 def _get_spv_path():
@@ -45,7 +45,7 @@ def test_kernel_can_be_found():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Could not create default queue")
-    pr = dpm.create_program_from_spirv(q, il, "")
+    pr = dppr.create_program_from_spirv(q, il, "")
     assert pr.has_sycl_kernel("double_it")
 
 
@@ -57,14 +57,20 @@ def test_kernel_submit_through_extension():
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
         pytest.skip("Could not create default queue")
-    pr = dpm.create_program_from_spirv(q, il, "")
+    pr = dppr.create_program_from_spirv(q, il, "")
     krn = pr.get_sycl_kernel("double_it")
     assert krn.num_args == 2
 
-    x = dpt.arange(0, stop=13, step=1, dtype="i4", sycl_queue=q)
-    y = dpt.zeros_like(x)
+    x = np.arange(0, stop=13, step=1, dtype="i4")
+    y = np.empty_like(x)
 
-    q.wait()
-    uk.submit_custom_kernel(q, krn, x, y, [])
+    x_usm = dpmem.MemoryUSMDevice(x.nbytes, queue=q)
+    y_usm = dpmem.MemoryUSMDevice(y.nbytes, queue=q)
 
-    assert np.array_equal(dpt.asnumpy(y), np.arange(0, 26, step=2, dtype="i4"))
+    ev = q.memcpy_async(dest=x_usm, src=x, count=x_usm.nbytes)
+
+    uk.submit_custom_kernel(q, krn, x_usm, y_usm, [ev])
+
+    q.memcpy(dest=y, src=y_usm, count=y.nbytes)
+
+    assert np.array_equal(y, np.arange(0, 26, step=2, dtype="i4"))
diff --git a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp
index f7ec9bd353..44bbec6deb 100644
--- a/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp
+++ b/examples/pybind11/use_dpctl_sycl_kernel/use_kernel/_example.cpp
@@ -37,30 +37,25 @@ namespace py = pybind11;
 
 void submit_custom_kernel(sycl::queue &q,
                           sycl::kernel &krn,
-                          dpctl::tensor::usm_ndarray x,
-                          dpctl::tensor::usm_ndarray y,
+                          dpctl::memory::usm_memory x,
+                          dpctl::memory::usm_memory y,
                           const std::vector<sycl::event> &depends = {})
 {
-    if (x.get_ndim() != 1 || !x.is_c_contiguous() || y.get_ndim() != 1 ||
-        !y.is_c_contiguous())
-    {
-        throw py::value_error(
-            "src and dst arguments must be 1D and contiguous.");
-    }
+    const std::size_t nbytes_x = x.get_nbytes();
+    const std::size_t nbytes_y = y.get_nbytes();
 
-    auto const &api = dpctl::detail::dpctl_capi::get();
-    if (x.get_typenum() != api.UAR_INT32_ || y.get_typenum() != api.UAR_INT32_)
-    {
-        throw py::value_error(
-            "src and dst arguments must have int32 element data types.");
+    if (nbytes_x != nbytes_y) {
+        throw py::value_error("src and dst arguments must have equal nbytes.");
+    }
+    if (nbytes_x % sizeof(std::int32_t) != 0) {
+        throw py::value_error("src and dst must be interpretable as int32 "
+                              "(nbytes must be a multiple of 4).");
     }
 
-    size_t n_x = x.get_size();
-    size_t n_y = y.get_size();
+    auto *x_data = reinterpret_cast<std::int32_t *>(x.get_pointer());
+    auto *y_data = reinterpret_cast<std::int32_t *>(y.get_pointer());
 
-    if (n_x != n_y) {
-        throw py::value_error("src and dst arguments must have equal size.");
-    }
+    const std::size_t n_elems = nbytes_x / sizeof(std::int32_t);
 
     if (!dpctl::utils::queues_are_compatible(q, {x.get_queue(), y.get_queue()}))
     {
@@ -68,14 +63,11 @@ void submit_custom_kernel(sycl::queue &q,
             "Execution queue is not compatible with allocation queues");
     }
 
-    void *x_data = x.get_data<void>();
-    void *y_data = y.get_data<void>();
-
     sycl::event e = q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
         cgh.set_arg(0, x_data);
         cgh.set_arg(1, y_data);
-        cgh.parallel_for(sycl::range<1>(n_x), krn);
+        cgh.parallel_for(sycl::range<1>(n_elems), krn);
     });
 
     e.wait();

From 2b96b66a5573ce6d7b77b4276d1515dd506d5b99 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 2 Feb 2026 13:16:26 -0800
Subject: [PATCH 17/24] Update tests for external USM allocation example

---
 .../external_usm_allocation/tests/test_direct.py      | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/pybind11/external_usm_allocation/tests/test_direct.py b/examples/pybind11/external_usm_allocation/tests/test_direct.py
index cb4b7a094c..b14d57533b 100644
--- a/examples/pybind11/external_usm_allocation/tests/test_direct.py
+++ b/examples/pybind11/external_usm_allocation/tests/test_direct.py
@@ -17,10 +17,10 @@
 # coding: utf-8
 
 import external_usm_allocation as eua
+import numpy as np
 
 import dpctl
 import dpctl.memory as dpm
-import dpctl.tensor as dpt
 
 
 def test_direct():
@@ -30,8 +30,11 @@ def test_direct():
     mbuf = eua.make_zeroed_device_memory(nb, q)
 
     assert isinstance(mbuf, dpm.MemoryUSMDevice)
-    assert mbuf.nbytes == 2 * 30
+    assert mbuf.nbytes == nb
     assert mbuf.sycl_queue == q
 
-    x = dpt.usm_ndarray(30, dtype="i2", buffer=mbuf)
-    assert dpt.all(x == dpt.zeros(30, dtype="i2", sycl_queue=q))
+    x = np.empty(30, dtype="i2")
+    assert x.nbytes == nb
+
+    q.memcpy(dest=x, src=mbuf, count=nb)
+    assert np.all(x == 0)

From 05cfb2c4c54573b9d47f54c9b700b7aabfd30484 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 2 Feb 2026 13:27:49 -0800
Subject: [PATCH 18/24] remove test for tensor includes

---
 dpctl/tests/test_service.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/dpctl/tests/test_service.py b/dpctl/tests/test_service.py
index da60eac605..0127f45a2f 100644
--- a/dpctl/tests/test_service.py
+++ b/dpctl/tests/test_service.py
@@ -219,24 +219,6 @@ def test_main_library():
     assert "DPCTLSyclInterface" in output
 
 
-def test_tensor_includes():
-    res = subprocess.run(
-        [sys.executable, "-m", "dpctl", "--tensor-includes"],
-        capture_output=True,
-    )
-    assert res.returncode == 0
-    assert res.stdout
-    flags = res.stdout.decode("utf-8")
-    res = subprocess.run(
-        [sys.executable, "-m", "dpctl", "--tensor-include-dir"],
-        capture_output=True,
-    )
-    assert res.returncode == 0
-    assert res.stdout
-    dir = res.stdout.decode("utf-8")
-    assert flags == "-I " + dir
-
-
 def test_main_library_dir():
     res = subprocess.run(
         [sys.executable, "-m", "dpctl", "--library-dir"], capture_output=True

From 0133a2b07a731d6e633aae9528f80c4e53371818 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Mon, 2 Feb 2026 13:28:21 -0800
Subject: [PATCH 19/24] Remove reference to dpctl.tensor from SyclTimer
 docstring

---
 dpctl/_sycl_timer.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py
index 976ae71eca..56e6a0c10e 100644
--- a/dpctl/_sycl_timer.py
+++ b/dpctl/_sycl_timer.py
@@ -260,12 +260,11 @@ def dt(self):
 
                 device = tensor.Device.create_device(q)
                 timer = dpctl.SyclTimer()
+                x = np.linspace(-4, 4, num=10**6, dtype="float32")
 
                 with timer(q):
-                    x = tensor.linspace(-4, 4, num=10**6, dtype="float32")
-                    e = tensor.exp(-0.5 * tensor.square(x))
-                    s = tensor.sin(2.3 * x + 0.11)
-                    f = e * s
+                    x_usm = dpctl.memory.MemoryUSMDevice(x.nbytes, queue=q)
+                    q.memcpy(dest=x_usm, src=x, count=x.nbytes)
 
                 host_dt, device_dt = timer.dt
 

From 9abc509e6a361eb50068a99c2655556fc11be592 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 4 Feb 2026 21:42:34 -0800
Subject: [PATCH 20/24] remove compute_follows_data imports from dpctl.utils
 __init__

---
 dpctl/tests/test_utils.py | 90 ---------------------------------------
 dpctl/utils/__init__.py   | 10 -----
 2 files changed, 100 deletions(-)

diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index 1481848947..2114346dbc 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -22,96 +22,6 @@
 import dpctl.utils
 
 
-def test_get_execution_queue_input_validation():
-    with pytest.raises(TypeError):
-        dpctl.utils.get_execution_queue(dict())
-
-
-def test_get_execution_queue():
-    try:
-        q = dpctl.SyclQueue()
-        q2 = dpctl.SyclQueue()
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be create for default device")
-
-    exec_q = dpctl.utils.get_execution_queue(())
-    assert exec_q is None
-
-    exec_q = dpctl.utils.get_execution_queue([q])
-    assert exec_q is q
-
-    exec_q = dpctl.utils.get_execution_queue([q, q, q, q])
-    assert exec_q is q
-
-    exec_q = dpctl.utils.get_execution_queue((q, q, None, q))
-    assert exec_q is None
-
-    exec_q = dpctl.utils.get_execution_queue(
-        (
-            q,
-            q2,
-            q,
-        )
-    )
-    assert exec_q is None
-    q_c = dpctl.SyclQueue(q._get_capsule())
-    assert q == q_c
-    exec_q = dpctl.utils.get_execution_queue(
-        (
-            q,
-            q_c,
-            q,
-        )
-    )
-    assert exec_q == q
-
-
-def test_get_execution_queue_nonequiv():
-    try:
-        q = dpctl.SyclQueue("cpu")
-        d1, d2 = q.sycl_device.create_sub_devices(partition=[1, 1])
-        ctx = dpctl.SyclContext([q.sycl_device, d1, d2])
-        q1 = dpctl.SyclQueue(ctx, d1)
-        q2 = dpctl.SyclQueue(ctx, d2)
-    except dpctl.SyclQueueCreationError:
-        pytest.skip("Queue could not be create for default device")
-
-    exec_q = dpctl.utils.get_execution_queue((q, q1, q2))
-    assert exec_q is None
-
-
-def test_get_coerced_usm_type():
-    _t = ["device", "shared", "host"]
-
-    for i1 in range(len(_t)):
-        for i2 in range(len(_t)):
-            assert (
-                dpctl.utils.get_coerced_usm_type([_t[i1], _t[i2]])
-                == _t[min(i1, i2)]
-            )
-
-    assert dpctl.utils.get_coerced_usm_type([]) is None
-    with pytest.raises(TypeError):
-        dpctl.utils.get_coerced_usm_type(dict())
-
-
-def validate_usm_type_arg():
-    _t = ["device", "shared", "host"]
-
-    for i in range(len(_t)):
-        dpctl.utils.validate_usm_type(_t[i])
-        dpctl.utils.validate_usm_type(_t[i], allow_none=False)
-    dpctl.utils.validate_usm_type(None, allow_none=True)
-    with pytest.raises(TypeError):
-        dpctl.utils.validate_usm_type(dict(), allow_none=True)
-    with pytest.raises(TypeError):
-        dpctl.utils.validate_usm_type(dict(), allow_none=False)
-    with pytest.raises(ValueError):
-        dpctl.utils.validate_usm_type("inv", allow_none=True)
-    with pytest.raises(ValueError):
-        dpctl.utils.validate_usm_type("inv", allow_none=False)
-
-
 @pytest.mark.filterwarnings("ignore:.*:RuntimeWarning")
 def test_onetrace_enabled():
     import os
diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py
index 413750db5d..9530849bf2 100644
--- a/dpctl/utils/__init__.py
+++ b/dpctl/utils/__init__.py
@@ -18,22 +18,12 @@
 A collection of utility functions.
 """
 
-from ._compute_follows_data import (
-    ExecutionPlacementError,
-    get_coerced_usm_type,
-    get_execution_queue,
-    validate_usm_type,
-)
 from ._intel_device_info import intel_device_info
 from ._onetrace_context import onetrace_enabled
 from ._order_manager import SequentialOrderManager
 
 __all__ = [
-    "get_execution_queue",
-    "get_coerced_usm_type",
-    "validate_usm_type",
     "onetrace_enabled",
     "intel_device_info",
-    "ExecutionPlacementError",
     "SequentialOrderManager",
 ]

From dcac1f6ab63c1e0d2ab67d58622e6996c82479f6 Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 4 Feb 2026 23:18:35 -0800
Subject: [PATCH 21/24] rewrite C extension test

---
 dpctl/tests/_c_ext.c           | 33 +++++++++++++++++----------------
 dpctl/tests/test_headers.py    |  7 +++----
 dpctl/tests/test_sycl_queue.py |  2 +-
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/dpctl/tests/_c_ext.c b/dpctl/tests/_c_ext.c
index 3b3cd930d4..1733ae30a5 100644
--- a/dpctl/tests/_c_ext.c
+++ b/dpctl/tests/_c_ext.c
@@ -29,7 +29,7 @@
 #include "dpctl_capi.h"
 // clang-format on
 
-PyObject *py_is_usm_ndarray(PyObject *self_unused, PyObject *args)
+PyObject *py_is_sycl_queue(PyObject *self_unused, PyObject *args)
 {
     PyObject *arg = NULL;
     PyObject *res = NULL;
@@ -43,7 +43,7 @@ PyObject *py_is_usm_ndarray(PyObject *self_unused, PyObject *args)
         return NULL;
     }
 
-    check = PyObject_TypeCheck(arg, &PyUSMArrayType);
+    check = PyObject_TypeCheck(arg, &PySyclQueueType);
     if (check == -1) {
         PyErr_SetString(PyExc_RuntimeError, "Type check failed");
         return NULL;
@@ -55,35 +55,36 @@ PyObject *py_is_usm_ndarray(PyObject *self_unused, PyObject *args)
     return res;
 }
 
-PyObject *py_usm_ndarray_ndim(PyObject *self_unused, PyObject *args)
+PyObject *py_check_queue_ref(PyObject *self_unused, PyObject *args)
 {
     PyObject *arg = NULL;
-    struct PyUSMArrayObject *array = NULL;
     PyObject *res = NULL;
     int status = -1;
-    int ndim = -1;
+    struct PySyclQueueObject *q_obj = NULL;
+    DPCTLSyclQueueRef qref = NULL;
 
     (void)(self_unused); // avoid unused arguments warning
-    status = PyArg_ParseTuple(args, "O!", &PyUSMArrayType, &arg);
+    status = PyArg_ParseTuple(args, "O!", &PySyclQueueType, &arg);
     if (!status) {
-        PyErr_SetString(
-            PyExc_TypeError,
-            "Expecting single argument of type dpctl.tensor.usm_ndarray");
+        PyErr_SetString(PyExc_TypeError,
+                        "Expecting single argument of type dpctl.SyclQueue");
         return NULL;
     }
 
-    array = (struct PyUSMArrayObject *)arg;
-    ndim = UsmNDArray_GetNDim(array);
+    q_obj = (struct PySyclQueueObject *)arg; // type guaranteed by "O!"
+    qref = SyclQueue_GetQueueRef(q_obj); // exercise the dpctl C-API getter
+
+    res = (qref != NULL) ? Py_True : Py_False;
+    Py_INCREF(res);
 
-    res = PyLong_FromLong(ndim);
     return res;
 }
 
 static PyMethodDef CExtMethods[] = {
-    {"is_usm_ndarray", py_is_usm_ndarray, METH_VARARGS,
-     "Checks if input object is an usm_ndarray instance"},
-    {"usm_ndarray_ndim", py_usm_ndarray_ndim, METH_VARARGS,
-     "Get ndim property of an usm_ndarray instance"},
+    {"is_sycl_queue", py_is_sycl_queue, METH_VARARGS,
+     "Checks if input object is a dpctl.SyclQueue instance"},
+    {"check_queue_ref", py_check_queue_ref, METH_VARARGS,
+     "Checks that queue ref obtained via C-API is not NULL"},
     {NULL, NULL, 0, NULL} /* Sentinel */
 };
 
diff --git a/dpctl/tests/test_headers.py b/dpctl/tests/test_headers.py
index 3f1c245565..7040e538f6 100644
--- a/dpctl/tests/test_headers.py
+++ b/dpctl/tests/test_headers.py
@@ -1,7 +1,6 @@
 import pytest
 
 import dpctl
-import dpctl.tensor as dpt
 
 
 @pytest.fixture(scope="session")
@@ -44,9 +43,9 @@ def dpctl_c_extension(tmp_path_factory):
 
 def test_c_headers(dpctl_c_extension):
     try:
-        x = dpt.empty(10)
+        q = dpctl.SyclQueue()
     except (dpctl.SyclDeviceCreationError, dpctl.SyclQueueCreationError):
         pytest.skip()
 
-    assert dpctl_c_extension.is_usm_ndarray(x)
-    assert dpctl_c_extension.usm_ndarray_ndim(x) == x.ndim
+    assert dpctl_c_extension.is_sycl_queue(q)
+    assert dpctl_c_extension.check_queue_ref(q)
diff --git a/dpctl/tests/test_sycl_queue.py b/dpctl/tests/test_sycl_queue.py
index 4340143d8b..6f5692fc56 100644
--- a/dpctl/tests/test_sycl_queue.py
+++ b/dpctl/tests/test_sycl_queue.py
@@ -244,7 +244,7 @@ def test_cpython_api_SyclQueue_GetQueueRef():
     try:
         q = dpctl.SyclQueue()
     except dpctl.SyclQueueCreationError:
-        pytest.skip("Can not defaul-construct SyclQueue")
+        pytest.skip("Can not default-construct SyclQueue")
     mod = sys.modules[q.__class__.__module__]
     # get capsule storign SyclQueue_GetQueueRef function ptr
     q_ref_fn_cap = mod.__pyx_capi__["SyclQueue_GetQueueRef"]

From be75a7eafdf28d36f96bb78d36c234c4d2acb5b4 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Thu, 10 Apr 2025 10:39:23 +0100
Subject: [PATCH 22/24] Support compilation from SYCL source code

Enable SYCL source compilation, but only for DPC++ versions that actually
support the compilation, based on the __SYCL_COMPILER_VERSION reported.

Uses the correct naming for the property based on DPC++ version,
detected through C++ type traits to check which property actually refers
to a fully defined type.

This commit also works around a bug in DPC++ version 2025.1. The constructor
with no parameter of class `include_files` was only declared, but never
defined. Calling it when creating a SYCL source kernel bundle therefore
leads to references to undefined symbols with DPC++ version 2025.1. This
change works around this issue by calling an alternative constructor,
which is defined in the release.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 dpctl/_backend.pxd                            |  50 +++-
 dpctl/_sycl_device.pxd                        |   1 +
 dpctl/_sycl_device.pyx                        |  31 +++
 dpctl/program/__init__.py                     |   2 +
 dpctl/program/_program.pxd                    |   7 +-
 dpctl/program/_program.pyx                    | 148 ++++++++++-
 dpctl/tests/test_sycl_program.py              | 154 ++++++++++-
 .../dpctl_sycl_device_interface.h             |  36 +++
 .../dpctl_sycl_kernel_bundle_interface.h      | 169 ++++++++++++
 .../source/dpctl_sycl_device_interface.cpp    |  25 ++
 .../dpctl_sycl_kernel_bundle_interface.cpp    | 247 ++++++++++++++++++
 .../test_sycl_kernel_bundle_interface.cpp     | 132 ++++++++++
 12 files changed, 993 insertions(+), 9 deletions(-)

diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd
index 93d9b5ef97..069f9a33fc 100644
--- a/dpctl/_backend.pxd
+++ b/dpctl/_backend.pxd
@@ -287,9 +287,12 @@ cdef extern from "syclinterface/dpctl_sycl_device_interface.h":
                                         _peer_access PT)
     cdef void DPCTLDevice_EnablePeerAccess(const DPCTLSyclDeviceRef DRef,
                                            const DPCTLSyclDeviceRef PDRef)
-
     cdef void DPCTLDevice_DisablePeerAccess(const DPCTLSyclDeviceRef DRef,
                                             const DPCTLSyclDeviceRef PDRef)
+    cdef bool DPCTLDevice_CanCompileSPIRV(const DPCTLSyclDeviceRef DRef)
+    cdef bool DPCTLDevice_CanCompileOpenCL(const DPCTLSyclDeviceRef DRef)
+    cdef bool DPCTLDevice_CanCompileSYCL(const DPCTLSyclDeviceRef DRef)
+
 
 cdef extern from "syclinterface/dpctl_sycl_device_manager.h":
     cdef DPCTLDeviceVectorRef DPCTLDeviceVector_CreateFromArray(
@@ -452,6 +455,51 @@ cdef extern from "syclinterface/dpctl_sycl_kernel_bundle_interface.h":
     cdef DPCTLSyclKernelBundleRef DPCTLKernelBundle_Copy(
         const DPCTLSyclKernelBundleRef KBRef)
 
+    cdef struct DPCTLBuildOptionList
+    cdef struct DPCTLKernelNameList
+    cdef struct DPCTLVirtualHeaderList
+    cdef struct DPCTLKernelBuildLog
+    ctypedef DPCTLBuildOptionList* DPCTLBuildOptionListRef
+    ctypedef DPCTLKernelNameList* DPCTLKernelNameListRef
+    ctypedef DPCTLVirtualHeaderList* DPCTLVirtualHeaderListRef
+    ctypedef DPCTLKernelBuildLog* DPCTLKernelBuildLogRef
+
+    cdef DPCTLBuildOptionListRef DPCTLBuildOptionList_Create()
+    cdef void DPCTLBuildOptionList_Delete(DPCTLBuildOptionListRef Ref)
+    cdef void DPCTLBuildOptionList_Append(DPCTLBuildOptionListRef Ref,
+                                          const char *Option)
+
+    cdef DPCTLKernelNameListRef DPCTLKernelNameList_Create()
+    cdef void DPCTLKernelNameList_Delete(DPCTLKernelNameListRef Ref)
+    cdef void DPCTLKernelNameList_Append(DPCTLKernelNameListRef Ref,
+                                         const char *Option)
+
+    cdef DPCTLVirtualHeaderListRef DPCTLVirtualHeaderList_Create()
+    cdef void DPCTLVirtualHeaderList_Delete(DPCTLVirtualHeaderListRef Ref)
+    cdef void DPCTLVirtualHeaderList_Append(DPCTLVirtualHeaderListRef Ref,
+                                            const char *Name,
+                                            const char *Content)
+
+    cdef DPCTLKernelBuildLogRef DPCTLKernelBuildLog_Create()
+    cdef void DPCTLKernelBuildLog_Delete(DPCTLKernelBuildLogRef Ref)
+    cdef const char *DPCTLKernelBuildLog_Get(DPCTLKernelBuildLogRef)
+
+    cdef DPCTLSyclKernelBundleRef DPCTLKernelBundle_CreateFromSYCLSource(
+        const DPCTLSyclContextRef Ctx,
+        const DPCTLSyclDeviceRef Dev,
+        const char *Source,
+        DPCTLVirtualHeaderListRef Headers,
+        DPCTLKernelNameListRef Names,
+        DPCTLBuildOptionListRef BuildOptions,
+        DPCTLKernelBuildLogRef BuildLog)
+
+    cdef DPCTLSyclKernelRef DPCTLKernelBundle_GetSyclKernel(
+                                                DPCTLSyclKernelBundleRef KBRef,
+                                                const char *KernelName)
+
+    cdef bool DPCTLKernelBundle_HasSyclKernel(DPCTLSyclKernelBundleRef KBRef,
+                                              const char *KernelName)
+
 
 cdef extern from "syclinterface/dpctl_sycl_queue_interface.h":
     ctypedef struct _md_local_accessor "MDLocalAccessor":
diff --git a/dpctl/_sycl_device.pxd b/dpctl/_sycl_device.pxd
index 190d981cd0..d9378f0897 100644
--- a/dpctl/_sycl_device.pxd
+++ b/dpctl/_sycl_device.pxd
@@ -61,3 +61,4 @@ cdef public api class SyclDevice(_SyclDevice) [
     cdef int get_overall_ordinal(self)
     cdef int get_backend_ordinal(self)
     cdef int get_backend_and_device_type_ordinal(self)
+    cpdef bint can_compile(self, str language)
diff --git a/dpctl/_sycl_device.pyx b/dpctl/_sycl_device.pyx
index 419ed2b9fb..5a7b6b25f4 100644
--- a/dpctl/_sycl_device.pyx
+++ b/dpctl/_sycl_device.pyx
@@ -26,6 +26,9 @@ from ._backend cimport (  # noqa: E211
     DPCTLDefaultSelector_Create,
     DPCTLDevice_AreEq,
     DPCTLDevice_CanAccessPeer,
+    DPCTLDevice_CanCompileOpenCL,
+    DPCTLDevice_CanCompileSPIRV,
+    DPCTLDevice_CanCompileSYCL,
     DPCTLDevice_Copy,
     DPCTLDevice_CreateFromSelector,
     DPCTLDevice_CreateSubDevicesByAffinity,
@@ -2367,6 +2370,34 @@ cdef class SyclDevice(_SyclDevice):
             raise ValueError("device could not be found")
         return dev_id
 
+    cpdef bint can_compile(self, str language):
+        """
+        Tell whether an executable kernel_bundle can be built for this
+        device from code written in the given source language.
+
+        Parameters:
+            language (str)
+                Source language: "spirv" (alias "spv") for SPIR-V binary
+                files, "opencl" (alias "ocl") for OpenCL C device code, or
+                "sycl" for SYCL device code.
+
+        Returns:
+            bool:
+                True when compilation is supported, False otherwise.
+
+        Raises:
+            ValueError:
+                When ``language`` is none of the values listed above.
+        """
+        if language in ("spirv", "spv"):
+            return DPCTLDevice_CanCompileSPIRV(self._device_ref)
+        elif language in ("opencl", "ocl"):
+            return DPCTLDevice_CanCompileOpenCL(self._device_ref)
+        elif language == "sycl":
+            return DPCTLDevice_CanCompileSYCL(self._device_ref)
+
+        raise ValueError(f"Unknown source language {language}")
+
 
 cdef api DPCTLSyclDeviceRef SyclDevice_GetDeviceRef(SyclDevice dev):
     """
diff --git a/dpctl/program/__init__.py b/dpctl/program/__init__.py
index 4904d29a7c..52e4ab6477 100644
--- a/dpctl/program/__init__.py
+++ b/dpctl/program/__init__.py
@@ -27,11 +27,13 @@
     SyclProgramCompilationError,
     create_program_from_source,
     create_program_from_spirv,
+    create_program_from_sycl_source,
 )
 
 __all__ = [
     "create_program_from_source",
     "create_program_from_spirv",
+    "create_program_from_sycl_source",
     "SyclKernel",
     "SyclProgram",
     "SyclProgramCompilationError",
diff --git a/dpctl/program/_program.pxd b/dpctl/program/_program.pxd
index dc4208a29b..880843c27f 100644
--- a/dpctl/program/_program.pxd
+++ b/dpctl/program/_program.pxd
@@ -49,9 +49,11 @@ cdef api class SyclProgram [object PySyclProgramObject, type PySyclProgramType]:
     binary file.
     """
     cdef DPCTLSyclKernelBundleRef _program_ref
+    cdef bint _is_sycl_source
 
     @staticmethod
-    cdef  SyclProgram _create (DPCTLSyclKernelBundleRef pref)
+    cdef  SyclProgram _create (DPCTLSyclKernelBundleRef pref,
+                               bint _is_sycl_source)
     cdef  DPCTLSyclKernelBundleRef get_program_ref (self)
     cpdef SyclKernel get_sycl_kernel(self, str kernel_name)
 
@@ -59,3 +61,6 @@ cdef api class SyclProgram [object PySyclProgramObject, type PySyclProgramType]:
 cpdef create_program_from_source (SyclQueue q, unicode source, unicode copts=*)
 cpdef create_program_from_spirv (SyclQueue q, const unsigned char[:] IL,
                                  unicode copts=*)
+cpdef create_program_from_sycl_source(SyclQueue q, unicode source,
+                                      list headers=*, list registered_names=*,
+                                      list copts=*)
diff --git a/dpctl/program/_program.pyx b/dpctl/program/_program.pyx
index 3859314505..64433947ac 100644
--- a/dpctl/program/_program.pyx
+++ b/dpctl/program/_program.pyx
@@ -28,6 +28,10 @@ a OpenCL source string or a SPIR-V binary file.
 from libc.stdint cimport uint32_t
 
 from dpctl._backend cimport (  # noqa: E211, E402;
+    DPCTLBuildOptionList_Append,
+    DPCTLBuildOptionList_Create,
+    DPCTLBuildOptionList_Delete,
+    DPCTLBuildOptionListRef,
     DPCTLKernel_Copy,
     DPCTLKernel_Delete,
     DPCTLKernel_GetCompileNumSubGroups,
@@ -38,16 +42,31 @@ from dpctl._backend cimport (  # noqa: E211, E402;
     DPCTLKernel_GetPreferredWorkGroupSizeMultiple,
     DPCTLKernel_GetPrivateMemSize,
     DPCTLKernel_GetWorkGroupSize,
+    DPCTLKernelBuildLog_Create,
+    DPCTLKernelBuildLog_Delete,
+    DPCTLKernelBuildLog_Get,
+    DPCTLKernelBuildLogRef,
     DPCTLKernelBundle_Copy,
     DPCTLKernelBundle_CreateFromOCLSource,
     DPCTLKernelBundle_CreateFromSpirv,
+    DPCTLKernelBundle_CreateFromSYCLSource,
     DPCTLKernelBundle_Delete,
     DPCTLKernelBundle_GetKernel,
+    DPCTLKernelBundle_GetSyclKernel,
     DPCTLKernelBundle_HasKernel,
+    DPCTLKernelBundle_HasSyclKernel,
+    DPCTLKernelNameList_Append,
+    DPCTLKernelNameList_Create,
+    DPCTLKernelNameList_Delete,
+    DPCTLKernelNameListRef,
     DPCTLSyclContextRef,
     DPCTLSyclDeviceRef,
     DPCTLSyclKernelBundleRef,
     DPCTLSyclKernelRef,
+    DPCTLVirtualHeaderList_Append,
+    DPCTLVirtualHeaderList_Create,
+    DPCTLVirtualHeaderList_Delete,
+    DPCTLVirtualHeaderListRef,
 )
 
 __all__ = [
@@ -196,9 +215,11 @@ cdef class SyclProgram:
     """
 
     @staticmethod
-    cdef SyclProgram _create(DPCTLSyclKernelBundleRef KBRef):
+    cdef SyclProgram _create(DPCTLSyclKernelBundleRef KBRef,
+                             bint is_sycl_source):
         cdef SyclProgram ret = SyclProgram.__new__(SyclProgram)
         ret._program_ref = KBRef
+        ret._is_sycl_source = is_sycl_source
         return ret
 
     def __dealloc__(self):
@@ -209,6 +230,10 @@ cdef class SyclProgram:
 
     cpdef SyclKernel get_sycl_kernel(self, str kernel_name):
         name = kernel_name.encode("utf8")
+        if self._is_sycl_source:
+            return SyclKernel._create(
+                    DPCTLKernelBundle_GetSyclKernel(self._program_ref, name),
+                    kernel_name)
         return SyclKernel._create(
             DPCTLKernelBundle_GetKernel(self._program_ref, name),
             kernel_name
@@ -216,6 +241,8 @@ cdef class SyclProgram:
 
     def has_sycl_kernel(self, str kernel_name):
         name = kernel_name.encode("utf8")
+        if self._is_sycl_source:
+            return DPCTLKernelBundle_HasSyclKernel(self._program_ref, name)
         return DPCTLKernelBundle_HasKernel(self._program_ref, name)
 
     def addressof_ref(self):
@@ -271,7 +298,7 @@ cpdef create_program_from_source(SyclQueue q, str src, str copts=""):
     if KBref is NULL:
         raise SyclProgramCompilationError()
 
-    return SyclProgram._create(KBref)
+    return SyclProgram._create(KBref, False)
 
 
 cpdef create_program_from_spirv(SyclQueue q, const unsigned char[:] IL,
@@ -317,7 +344,120 @@ cpdef create_program_from_spirv(SyclQueue q, const unsigned char[:] IL,
     if KBref is NULL:
         raise SyclProgramCompilationError()
 
-    return SyclProgram._create(KBref)
+    return SyclProgram._create(KBref, False)
+
+
+cpdef create_program_from_sycl_source(SyclQueue q, unicode source,
+                                      list headers=None,
+                                      list registered_names=None,
+                                      list copts=None):
+    """
+        Creates an executable SYCL kernel_bundle from SYCL source code.
+
+        This uses the DPC++ ``kernel_compiler`` extension to create a
+        ``sycl::kernel_bundle<sycl::bundle_state::executable>`` object from
+        SYCL source code.
+
+        Parameters:
+            q (:class:`dpctl.SyclQueue`)
+                The :class:`dpctl.SyclQueue` for which the
+                :class:`.SyclProgram` is going to be built.
+            source (unicode)
+                SYCL source code string.
+            headers (list, optional)
+                Optional list of virtual headers, where each entry in the list
+                needs to be a tuple of header name and header content. See the
+                documentation of the ``include_files`` property in the DPC++
+                ``kernel_compiler`` extension for more information.
+                Default: ``None``, treated as an empty list.
+            registered_names (list, optional)
+                Optional list of kernel names to register. See the
+                documentation of the ``registered_names`` property in the DPC++
+                ``kernel_compiler`` extension for more information.
+                Default: ``None``, treated as an empty list.
+            copts (list, optional)
+                Optional list of compilation flags that will be used
+                when compiling the program. Default: ``None`` (no flags).
+
+        Returns:
+            program (:class:`.SyclProgram`)
+                A :class:`.SyclProgram` object wrapping the
+                ``sycl::kernel_bundle<sycl::bundle_state::executable>``
+                returned by the C API.
+
+        Raises:
+            SyclProgramCompilationError
+                If a list entry is not a string, or if the kernel bundle could
+                not be created (the message then contains the build log).
+    """
+    cdef DPCTLSyclKernelBundleRef KBref
+    cdef DPCTLSyclContextRef CRef = q.get_sycl_context().get_context_ref()
+    cdef DPCTLSyclDeviceRef DRef = q.get_sycl_device().get_device_ref()
+    cdef bytes bSrc = source.encode("utf8")
+    cdef const char *Src = <const char*>bSrc
+    cdef DPCTLBuildOptionListRef BuildOpts = DPCTLBuildOptionList_Create()
+    cdef bytes bOpt
+    cdef const char* sOpt
+    cdef bytes bName
+    cdef const char* sName
+    cdef bytes bContent
+    cdef const char* sContent
+    cdef const char* buildLogContent
+    for opt in (copts or []):  # None (the default) means no options
+        if not isinstance(opt, unicode):
+            DPCTLBuildOptionList_Delete(BuildOpts)
+            raise SyclProgramCompilationError()
+        bOpt = opt.encode("utf8")
+        sOpt = <const char*>bOpt
+        DPCTLBuildOptionList_Append(BuildOpts, sOpt)
+
+    cdef DPCTLKernelNameListRef KernelNames = DPCTLKernelNameList_Create()
+    for name in (registered_names or []):  # None means no names
+        if not isinstance(name, unicode):
+            DPCTLBuildOptionList_Delete(BuildOpts)
+            DPCTLKernelNameList_Delete(KernelNames)
+            raise SyclProgramCompilationError()
+        bName = name.encode("utf8")
+        sName = <const char*>bName
+        DPCTLKernelNameList_Append(KernelNames, sName)
+
+    cdef DPCTLVirtualHeaderListRef VirtualHeaders
+    VirtualHeaders = DPCTLVirtualHeaderList_Create()
+
+    for name, content in (headers or []):  # None means no headers
+        if not isinstance(name, unicode) or not isinstance(content, unicode):
+            DPCTLBuildOptionList_Delete(BuildOpts)
+            DPCTLKernelNameList_Delete(KernelNames)
+            DPCTLVirtualHeaderList_Delete(VirtualHeaders)
+            raise SyclProgramCompilationError()
+        bName = name.encode("utf8")
+        sName = <const char*>bName
+        bContent = content.encode("utf8")
+        sContent = <const char*>bContent
+        DPCTLVirtualHeaderList_Append(VirtualHeaders, sName, sContent)
+
+    cdef DPCTLKernelBuildLogRef BuildLog
+    BuildLog = DPCTLKernelBuildLog_Create()
+
+    KBref = DPCTLKernelBundle_CreateFromSYCLSource(CRef, DRef, Src,
+                                                   VirtualHeaders, KernelNames,
+                                                   BuildOpts, BuildLog)
+
+    if KBref is NULL:
+        buildLogContent = DPCTLKernelBuildLog_Get(BuildLog)
+        buildLogStr = str(buildLogContent, "utf-8")
+        DPCTLBuildOptionList_Delete(BuildOpts)
+        DPCTLKernelNameList_Delete(KernelNames)
+        DPCTLVirtualHeaderList_Delete(VirtualHeaders)
+        DPCTLKernelBuildLog_Delete(BuildLog)
+        raise SyclProgramCompilationError(buildLogStr)
+
+    DPCTLBuildOptionList_Delete(BuildOpts)
+    DPCTLKernelNameList_Delete(KernelNames)
+    DPCTLVirtualHeaderList_Delete(VirtualHeaders)
+    DPCTLKernelBuildLog_Delete(BuildLog)
+
+    return SyclProgram._create(KBref, True)
 
 
 cdef api DPCTLSyclKernelBundleRef SyclProgram_GetKernelBundleRef(
@@ -336,4 +476,4 @@ cdef api SyclProgram SyclProgram_Make(DPCTLSyclKernelBundleRef KBRef):
     reference.
     """
     cdef DPCTLSyclKernelBundleRef copied_KBRef = DPCTLKernelBundle_Copy(KBRef)
-    return SyclProgram._create(copied_KBRef)
+    return SyclProgram._create(copied_KBRef, False)
diff --git a/dpctl/tests/test_sycl_program.py b/dpctl/tests/test_sycl_program.py
index 4b7102c264..6ce9b33279 100644
--- a/dpctl/tests/test_sycl_program.py
+++ b/dpctl/tests/test_sycl_program.py
@@ -80,8 +80,7 @@ def _check_cpython_api_SyclProgram_Make(sycl_prog):
     make_prog_fn = callable_maker(make_prog_fn_ptr)
 
     p2 = make_prog_fn(sycl_prog.addressof_ref())
-    assert p2.has_sycl_kernel("add")
-    assert p2.has_sycl_kernel("axpy")
+    return p2
 
 
 def _check_cpython_api_SyclKernel_GetKernelRef(krn):
@@ -186,7 +185,9 @@ def _check_multi_kernel_program(prog):
         assert type(cmsgsz) is int
 
     _check_cpython_api_SyclProgram_GetKernelBundleRef(prog)
-    _check_cpython_api_SyclProgram_Make(prog)
+    p2 = _check_cpython_api_SyclProgram_Make(prog)
+    assert p2.has_sycl_kernel("add")
+    assert p2.has_sycl_kernel("axpy")
 
 
 def test_create_program_from_source_ocl():
@@ -262,3 +263,150 @@ def test_create_program_from_invalid_src_ocl():
     }"
     with pytest.raises(dpctl_prog.SyclProgramCompilationError):
         dpctl_prog.create_program_from_source(q, invalid_oclSrc)
+
+
+def test_create_program_from_sycl_source():
+    try:
+        q = dpctl.SyclQueue("opencl")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("No OpenCL queue is available")
+
+    if not q.get_sycl_device().can_compile("sycl"):
+        pytest.skip("SYCL source compilation not supported")
+
+    sycl_source = """
+    #include <sycl/sycl.hpp>
+    #include "math_ops.hpp"
+    #include "math_template_ops.hpp"
+
+    namespace syclext = sycl::ext::oneapi::experimental;
+
+    extern "C" SYCL_EXTERNAL
+    SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclext::nd_range_kernel<1>))
+    void vector_add(int* in1, int* in2, int* out){
+        sycl::nd_item<1> item =
+                        sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+        size_t globalID = item.get_global_linear_id();
+        out[globalID] = math_op(in1[globalID],in2[globalID]);
+    }
+
+    template<typename T>
+    SYCL_EXTERNAL
+    SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclext::nd_range_kernel<1>))
+    void vector_add_template(T* in1, T* in2, T* out){
+        sycl::nd_item<1> item =
+                        sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+        size_t globalID = item.get_global_linear_id();
+        out[globalID] = math_op_template(in1[globalID], in2[globalID]);
+    }
+    """
+
+    header_content = """
+    int math_op(int a, int b){
+        return a + b;
+    }
+    """
+
+    header2_content = """
+    template<typename T>
+    T math_op_template(T a, T b){
+        return a + b;
+    }
+    """
+
+    prog = dpctl.program.create_program_from_sycl_source(
+        q,
+        sycl_source,
+        headers=[
+            ("math_ops.hpp", header_content),
+            ("math_template_ops.hpp", header2_content),
+        ],
+        registered_names=["vector_add_template<int>"],
+        copts=["-fno-fast-math"],
+    )
+
+    assert type(prog) is dpctl_prog.SyclProgram
+
+    assert type(prog.addressof_ref()) is int
+    assert prog.has_sycl_kernel("vector_add")
+    regularKernel = prog.get_sycl_kernel("vector_add")
+
+    # DPC++ version 2025.1 supports compilation of SYCL template kernels, but
+    # does not yet support referencing them with the unmangled name.
+    hasTemplateName = prog.has_sycl_kernel("vector_add_template<int>")
+    hasMangledName = prog.has_sycl_kernel(
+        "_Z33__sycl_kernel_vector_add_templateIiEvPT_S1_S1_"
+    )
+    assert hasTemplateName or hasMangledName
+
+    if hasTemplateName:
+        templateKernel = prog.get_sycl_kernel("vector_add_template<int>")
+    else:
+        templateKernel = prog.get_sycl_kernel(
+            "_Z33__sycl_kernel_vector_add_templateIiEvPT_S1_S1_"
+        )
+
+    assert "vector_add" == regularKernel.get_function_name()
+    assert type(regularKernel.addressof_ref()) is int
+    assert type(templateKernel.addressof_ref()) is int
+
+    for krn in [regularKernel, templateKernel]:
+        _check_cpython_api_SyclKernel_GetKernelRef(krn)
+        _check_cpython_api_SyclKernel_Make(krn)
+
+        assert 3 == krn.get_num_args()
+        na = krn.num_args
+        assert na == krn.get_num_args()
+        wgsz = krn.work_group_size
+        assert type(wgsz) is int
+        pwgszm = krn.preferred_work_group_size_multiple
+        assert type(pwgszm) is int
+        pmsz = krn.private_mem_size
+        assert type(pmsz) is int
+        vmnsg = krn.max_num_sub_groups
+        assert type(vmnsg) is int
+        v = krn.max_sub_group_size
+        assert type(v) is int
+        cmnsg = krn.compile_num_sub_groups
+        assert type(cmnsg) is int
+        cmsgsz = krn.compile_sub_group_size
+        assert type(cmsgsz) is int
+
+    _check_cpython_api_SyclProgram_GetKernelBundleRef(prog)
+
+
+def test_create_program_from_invalid_src_sycl():
+    try:
+        q = dpctl.SyclQueue("opencl")
+    except dpctl.SyclQueueCreationError:
+        pytest.skip("No OpenCL queue is available")
+
+    if not q.get_sycl_device().can_compile("sycl"):
+        pytest.skip("SYCL source compilation not supported")
+
+    sycl_source = """
+    #include <sycl/sycl.hpp>
+
+    namespace syclext = sycl::ext::oneapi::experimental;
+
+    extern "C" SYCL_EXTERNAL
+    SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclext::nd_range_kernel<1>))
+    void vector_add(int* in1, int* in2, int* out){
+        sycl::nd_item<1> item =
+                        sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+        size_t globalID = item.get_global_linear_id()
+        out[globalID] = in1[globalID] + in2[globalID];
+    }
+    """
+    with pytest.raises(dpctl_prog.SyclProgramCompilationError) as exc_info:
+        dpctl.program.create_program_from_sycl_source(
+            q,
+            sycl_source,
+            headers=[],
+            registered_names=[],
+            copts=[],
+        )
+    # The source above omits a ';' on purpose; the compiler diagnostic is
+    # expected to be forwarded through the exception message.
+    print(str(exc_info.value))
+    assert "error: expected ';' at end of declaration" in str(exc_info.value)
diff --git a/libsyclinterface/include/syclinterface/dpctl_sycl_device_interface.h b/libsyclinterface/include/syclinterface/dpctl_sycl_device_interface.h
index 72b0261e1f..95e30ac03f 100644
--- a/libsyclinterface/include/syclinterface/dpctl_sycl_device_interface.h
+++ b/libsyclinterface/include/syclinterface/dpctl_sycl_device_interface.h
@@ -828,4 +828,40 @@ DPCTL_API
 void DPCTLDevice_DisablePeerAccess(__dpctl_keep const DPCTLSyclDeviceRef DRef,
                                    __dpctl_keep const DPCTLSyclDeviceRef PDRef);
 
+/*!
+ * @brief Checks whether it is possible to create executable kernel bundles
+ * from SPIR-V binaries on this device.
+ *
+ * @param DRef  Opaque pointer to a ``sycl::device``.
+ * @return True if creation is supported on the device referenced by the
+ * #DPCTLSyclDeviceRef object, false otherwise.
+ * @ingroup DeviceInterface
+ */
+DPCTL_API
+bool DPCTLDevice_CanCompileSPIRV(__dpctl_keep const DPCTLSyclDeviceRef DRef);
+
+/*!
+ * @brief Checks whether it is possible to create executable kernel bundles
+ * from OpenCL source code on this device.
+ *
+ * @param DRef  Opaque pointer to a ``sycl::device``.
+ * @return True if creation is supported on the device referenced by the
+ * #DPCTLSyclDeviceRef object, false otherwise.
+ * @ingroup DeviceInterface
+ */
+DPCTL_API
+bool DPCTLDevice_CanCompileOpenCL(__dpctl_keep const DPCTLSyclDeviceRef DRef);
+
+/*!
+ * @brief Checks whether it is possible to create executable kernel bundles
+ * from SYCL source code on this device.
+ *
+ * @param DRef  Opaque pointer to a ``sycl::device``.
+ * @return True if creation is supported on the device referenced by the
+ * #DPCTLSyclDeviceRef object, false otherwise.
+ * @ingroup DeviceInterface
+ */
+DPCTL_API
+bool DPCTLDevice_CanCompileSYCL(__dpctl_keep const DPCTLSyclDeviceRef DRef);
+
 DPCTL_C_EXTERN_C_END
diff --git a/libsyclinterface/include/syclinterface/dpctl_sycl_kernel_bundle_interface.h b/libsyclinterface/include/syclinterface/dpctl_sycl_kernel_bundle_interface.h
index 529bc3cca1..1ddfbe95f8 100644
--- a/libsyclinterface/include/syclinterface/dpctl_sycl_kernel_bundle_interface.h
+++ b/libsyclinterface/include/syclinterface/dpctl_sycl_kernel_bundle_interface.h
@@ -129,4 +129,173 @@ DPCTL_API
 __dpctl_give DPCTLSyclKernelBundleRef
 DPCTLKernelBundle_Copy(__dpctl_keep const DPCTLSyclKernelBundleRef KBRef);
 
+typedef struct DPCTLBuildOptionList *DPCTLBuildOptionListRef;
+typedef struct DPCTLKernelNameList *DPCTLKernelNameListRef;
+typedef struct DPCTLVirtualHeaderList *DPCTLVirtualHeaderListRef;
+typedef struct DPCTLKernelBuildLog *DPCTLKernelBuildLogRef;
+
+/*!
+ * @brief Create an empty list of build options.
+ *
+ * @return Opaque pointer to the build option list.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+__dpctl_give DPCTLBuildOptionListRef DPCTLBuildOptionList_Create();
+
+/*!
+ * @brief Frees the DPCTLBuildOptionListRef pointer.
+ *
+ * @param   Ref           Opaque pointer to a list of build options
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API void
+DPCTLBuildOptionList_Delete(__dpctl_take DPCTLBuildOptionListRef Ref);
+
+/*!
+ * @brief Append a build option to the list of build options
+ *
+ * @param Ref Opaque pointer to the list of build options
+ * @param Option Option to append
+ */
+DPCTL_API
+void DPCTLBuildOptionList_Append(__dpctl_keep DPCTLBuildOptionListRef Ref,
+                                 __dpctl_keep const char *Option);
+
+/*!
+ * @brief Create an empty list of kernel names to register.
+ *
+ * @return Opaque pointer to the list of kernel names to register.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+__dpctl_give DPCTLKernelNameListRef DPCTLKernelNameList_Create();
+
+/*!
+ * @brief Frees the DPCTLKernelNameListRef pointer.
+ *
+ * @param   Ref           Opaque pointer to a list of kernels to register
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API void
+DPCTLKernelNameList_Delete(__dpctl_take DPCTLKernelNameListRef Ref);
+
+/*!
+ * @brief Append a kernel name to the list of kernel names to register
+ *
+ * @param Ref Opaque pointer to the list of kernel names
+ * @param Option Kernel name to append
+ */
+DPCTL_API
+void DPCTLKernelNameList_Append(__dpctl_keep DPCTLKernelNameListRef Ref,
+                                __dpctl_keep const char *Option);
+/*!
+ * @brief Create an empty list of virtual header files.
+ *
+ * @return Opaque pointer to the virtual header file list.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+__dpctl_give DPCTLVirtualHeaderListRef DPCTLVirtualHeaderList_Create();
+
+/*!
+ * @brief Frees the DPCTLVirtualHeaderListRef pointer.
+ *
+ * @param   Ref           Opaque pointer to a list of virtual headers
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API void
+DPCTLVirtualHeaderList_Delete(__dpctl_take DPCTLVirtualHeaderListRef Ref);
+
+/*!
+ * @brief Append a virtual header file to the list of virtual header files
+ *
+ * @param Ref Opaque pointer to the list of header files
+ * @param Name Name of the virtual header file
+ * @param Content Content of the virtual header
+ */
+DPCTL_API
+void DPCTLVirtualHeaderList_Append(__dpctl_keep DPCTLVirtualHeaderListRef Ref,
+                                   __dpctl_keep const char *Name,
+                                   __dpctl_keep const char *Content);
+
+/*!
+ * @brief Create an empty kernel build log.
+ *
+ * @return Opaque pointer to the kernel build log.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API __dpctl_give DPCTLKernelBuildLogRef DPCTLKernelBuildLog_Create();
+
+/*!
+ * @brief Frees the DPCTLKernelBuildLogRef pointer.
+ *
+ * @param   Ref           Opaque pointer to a kernel build log.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+void DPCTLKernelBuildLog_Delete(__dpctl_take DPCTLKernelBuildLogRef Ref);
+
+/*!
+ * @brief Get the content of the build log.
+ *
+ * @param Ref   Opaque pointer to the kernel build log.
+ * @return      Content of the build log
+ * @ingroup     KernelBundleInterface
+ */
+DPCTL_API const char *
+DPCTLKernelBuildLog_Get(__dpctl_keep DPCTLKernelBuildLogRef);
+
+/*!
+ * @brief Create a SYCL kernel bundle from a SYCL kernel source string.
+ *
+ * @param    Ctx            An opaque pointer to a sycl::context
+ * @param    Dev            An opaque pointer to a sycl::device
+ * @param    Source         SYCL source string
+ * @param    Headers        List of virtual headers
+ * @param    Names          List of kernel names to register
+ * @param    BuildOptions   List of extra compiler flags (refer Sycl spec.)
+ * @param    BuildLog       Receives the exception message if the build throws
+ * @return   A new SyclKernelBundleRef pointer on success, else NULL.
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+__dpctl_give DPCTLSyclKernelBundleRef DPCTLKernelBundle_CreateFromSYCLSource(
+    __dpctl_keep const DPCTLSyclContextRef Ctx,
+    __dpctl_keep const DPCTLSyclDeviceRef Dev,
+    __dpctl_keep const char *Source,
+    __dpctl_keep DPCTLVirtualHeaderListRef Headers,
+    __dpctl_keep DPCTLKernelNameListRef Names,
+    __dpctl_keep DPCTLBuildOptionListRef BuildOptions,
+    __dpctl_keep DPCTLKernelBuildLogRef BuildLog);
+
+/*!
+ * @brief Returns the SyclKernel with given name from the program compiled from
+ * SYCL source code, if not found then return NULL.
+ *
+ * @param    KBRef          Opaque pointer to a sycl::kernel_bundle
+ * @param    KernelName     Name of kernel
+ * @return   A SyclKernel reference if the kernel exists, else NULL
+ * @ingroup KernelBundleInterface
+ */
+DPCTL_API
+__dpctl_give DPCTLSyclKernelRef
+DPCTLKernelBundle_GetSyclKernel(__dpctl_keep DPCTLSyclKernelBundleRef KBRef,
+                                __dpctl_keep const char *KernelName);
+
+/*!
+ * @brief Return True if a SyclKernel with given name exists in the program
+ * compiled from SYCL source code, if not found then returns False.
+ *
+ * @param    KBRef          Opaque pointer to a sycl::kernel_bundle
+ * @param    KernelName     Name of kernel
+ * @return   True if the kernel exists, else False
+ * @ingroup KernelBundleInterface
+ */
+
+DPCTL_API
+bool DPCTLKernelBundle_HasSyclKernel(__dpctl_keep DPCTLSyclKernelBundleRef
+                                         KBRef,
+                                     __dpctl_keep const char *KernelName);
+
 DPCTL_C_EXTERN_C_END
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index 1378f6f818..4bf5c6293b 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -982,3 +982,28 @@ void DPCTLDevice_DisablePeerAccess(__dpctl_keep const DPCTLSyclDeviceRef DRef,
     }
     return;
 }
+
+bool DPCTLDevice_CanCompileSPIRV(__dpctl_keep const DPCTLSyclDeviceRef DRef)
+{
+    // A null device reference supports no input format: answer false.
+    auto Dev = unwrap<device>(DRef);
+    return Dev && (Dev->get_backend() == backend::opencl ||
+                   Dev->get_backend() == backend::ext_oneapi_level_zero);
+}
+
+bool DPCTLDevice_CanCompileOpenCL(__dpctl_keep const DPCTLSyclDeviceRef DRef)
+{
+    auto Dev = unwrap<device>(DRef); // guard below: DRef may be null
+    return Dev && Dev->get_platform().get_backend() == backend::opencl;
+}
+
+bool DPCTLDevice_CanCompileSYCL(__dpctl_keep const DPCTLSyclDeviceRef DRef)
+{
+#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER
+    auto Dev = unwrap<device>(DRef); // guard below: DRef may be null
+    return Dev && Dev->ext_oneapi_can_compile(
+                      ext::oneapi::experimental::source_language::sycl);
+#else
+    return false;
+#endif
+}
diff --git a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
index 78c714ecbb..94a0e3099e 100644
--- a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
@@ -761,3 +761,250 @@ DPCTLKernelBundle_Copy(__dpctl_keep const DPCTLSyclKernelBundleRef KBRef)
         return nullptr;
     }
 }
+
+using build_option_list_t = std::vector<std::string>;
+
+__dpctl_give DPCTLBuildOptionListRef DPCTLBuildOptionList_Create()
+{
+    auto BuildOptionList =
+        std::unique_ptr<build_option_list_t>(new build_option_list_t());
+    auto *RetVal =
+        reinterpret_cast<DPCTLBuildOptionListRef>(BuildOptionList.get());
+    BuildOptionList.release();
+    return RetVal;
+}
+
+void DPCTLBuildOptionList_Delete(__dpctl_take DPCTLBuildOptionListRef Ref)
+{
+    delete reinterpret_cast<build_option_list_t *>(Ref);
+}
+
+void DPCTLBuildOptionList_Append(__dpctl_keep DPCTLBuildOptionListRef Ref,
+                                 __dpctl_keep const char *Option)
+{
+    reinterpret_cast<build_option_list_t *>(Ref)->emplace_back(Option);
+}
+
+using kernel_name_list_t = std::vector<std::string>;
+
+__dpctl_give DPCTLKernelNameListRef DPCTLKernelNameList_Create()
+{
+    auto KernelNameList =
+        std::unique_ptr<kernel_name_list_t>(new kernel_name_list_t());
+    auto *RetVal =
+        reinterpret_cast<DPCTLKernelNameListRef>(KernelNameList.get());
+    KernelNameList.release();
+    return RetVal;
+}
+
+void DPCTLKernelNameList_Delete(__dpctl_take DPCTLKernelNameListRef Ref)
+{
+    delete reinterpret_cast<kernel_name_list_t *>(Ref);
+}
+
+void DPCTLKernelNameList_Append(__dpctl_keep DPCTLKernelNameListRef Ref,
+                                __dpctl_keep const char *Option)
+{
+    reinterpret_cast<kernel_name_list_t *>(Ref)->emplace_back(Option);
+}
+
+using virtual_header_list_t = std::vector<std::pair<std::string, std::string>>;
+
+__dpctl_give DPCTLVirtualHeaderListRef DPCTLVirtualHeaderList_Create()
+{
+    auto HeaderList =
+        std::unique_ptr<virtual_header_list_t>(new virtual_header_list_t());
+    auto *RetVal =
+        reinterpret_cast<DPCTLVirtualHeaderListRef>(HeaderList.get());
+    HeaderList.release();
+    return RetVal;
+}
+
+void DPCTLVirtualHeaderList_Delete(__dpctl_take DPCTLVirtualHeaderListRef Ref)
+{
+    delete reinterpret_cast<virtual_header_list_t *>(Ref);
+}
+
+void DPCTLVirtualHeaderList_Append(__dpctl_keep DPCTLVirtualHeaderListRef Ref,
+                                   __dpctl_keep const char *Name,
+                                   __dpctl_keep const char *Content)
+{
+    auto *HeaderList = reinterpret_cast<virtual_header_list_t *>(Ref);
+    HeaderList->emplace_back(Name, Content); // build the pair in place
+}
+
+using kernel_build_log_t = std::string;
+
+__dpctl_give DPCTLKernelBuildLogRef DPCTLKernelBuildLog_Create()
+{
+    auto BuildLog =
+        std::unique_ptr<kernel_build_log_t>(new kernel_build_log_t(""));
+    auto *RetVal = reinterpret_cast<DPCTLKernelBuildLogRef>(BuildLog.get());
+    BuildLog.release();
+    return RetVal;
+}
+
+void DPCTLKernelBuildLog_Delete(__dpctl_take DPCTLKernelBuildLogRef Ref)
+{
+    delete reinterpret_cast<kernel_build_log_t *>(Ref);
+}
+
+const char *DPCTLKernelBuildLog_Get(__dpctl_keep DPCTLKernelBuildLogRef Ref)
+{
+    return reinterpret_cast<kernel_build_log_t *>(Ref)->data();
+}
+
+namespace syclex = sycl::ext::oneapi::experimental;
+
+#if defined(SYCL_EXT_ONEAPI_KERNEL_COMPILER) &&                                \
+    defined(__SYCL_COMPILER_VERSION) && !defined(SUPPORTS_SYCL_COMPILATION)
+// SYCL source code compilation is supported from 2025.1 onwards.
+#if __SYCL_COMPILER_VERSION >= 20250317u
+#define SUPPORTS_SYCL_COMPILATION 1
+#else
+#define SUPPORTS_SYCL_COMPILATION 0
+#endif
+#endif
+
+#if (SUPPORTS_SYCL_COMPILATION > 0)
+// The property for registering names was renamed between DPC++ versions 2025.1
+// and 2025.2. The original name was `registered_kernel_names`, the new name is
+// `registered_names`. To select the correct name without being overly reliant
+// on the SYCL compiler version definition, we forward declare both names and
+// then select the new name if it is defined (i.e., not only declared).
+namespace sycl::ext::oneapi::experimental
+{
+struct registered_names;
+struct registered_kernel_names;
+} // namespace sycl::ext::oneapi::experimental
+
+template <typename NewT, typename FallbackT, typename = void>
+struct new_type_if_defined
+{
+    using type = FallbackT;
+};
+
+template <typename NewT, typename FallbackT>
+struct new_type_if_defined<NewT, FallbackT, std::void_t<decltype(sizeof(NewT))>>
+{
+    using type = NewT;
+};
+
+using registered_names_property_t =
+    new_type_if_defined<syclex::registered_names,
+                        syclex::registered_kernel_names>::type;
+#endif
+
+__dpctl_give DPCTLSyclKernelBundleRef DPCTLKernelBundle_CreateFromSYCLSource(
+    __dpctl_keep const DPCTLSyclContextRef Ctx,
+    __dpctl_keep const DPCTLSyclDeviceRef Dev,
+    __dpctl_keep const char *Source,
+    __dpctl_keep DPCTLVirtualHeaderListRef Headers,
+    __dpctl_keep DPCTLKernelNameListRef Names,
+    __dpctl_keep DPCTLBuildOptionListRef BuildOptions,
+    __dpctl_keep DPCTLKernelBuildLogRef BuildLog)
+{
+#if (SUPPORTS_SYCL_COMPILATION > 0)
+    context *SyclCtx = unwrap<context>(Ctx);
+    device *SyclDev = unwrap<device>(Dev);
+    if (!SyclDev->ext_oneapi_can_compile(syclex::source_language::sycl)) {
+        return nullptr;
+    }
+    try {
+        auto *IncludeFileList =
+            reinterpret_cast<virtual_header_list_t *>(Headers);
+        std::unique_ptr<kernel_bundle<bundle_state::ext_oneapi_source>>
+            SrcBundle;
+        std::string Src(Source);
+        // The following logic is to work around a bug in DPC++ version 2025.1.
+        // This version declares a constructor with no parameters for the
+        // `include_files` property, but does not implement it. Therefore, the
+        // only way to create `include_files` is with the name and content of
+        // the first virtual header, if any.
+        if (!IncludeFileList->empty()) {
+            auto IncludeFileIt = IncludeFileList->begin();
+            syclex::include_files IncludeFiles{IncludeFileIt->first,
+                                               IncludeFileIt->second};
+            for (std::advance(IncludeFileIt, 1);
+                 IncludeFileIt != IncludeFileList->end(); ++IncludeFileIt)
+            {
+                IncludeFiles.add(IncludeFileIt->first, IncludeFileIt->second);
+            }
+            SrcBundle = std::make_unique<
+                kernel_bundle<bundle_state::ext_oneapi_source>>(
+                syclex::create_kernel_bundle_from_source(
+                    *SyclCtx, syclex::source_language::sycl, Src,
+                    syclex::properties{IncludeFiles}));
+        }
+        else {
+            SrcBundle = std::make_unique<
+                kernel_bundle<bundle_state::ext_oneapi_source>>(
+                syclex::create_kernel_bundle_from_source(
+                    *SyclCtx, syclex::source_language::sycl, Src));
+        }
+
+        registered_names_property_t RegisteredNames;
+        for (const std::string &Name :
+             *reinterpret_cast<kernel_name_list_t *>(Names))
+        {
+            RegisteredNames.add(Name);
+        }
+
+        syclex::build_options Opts{
+            *reinterpret_cast<build_option_list_t *>(BuildOptions)};
+
+        std::vector<sycl::device> Devices({*SyclDev});
+
+        auto ExeBundle = syclex::build(
+            *SrcBundle, Devices, syclex::properties{RegisteredNames, Opts});
+        auto ResultBundle =
+            std::make_unique<sycl::kernel_bundle<bundle_state::executable>>(
+                ExeBundle);
+        return wrap<kernel_bundle<bundle_state::executable>>(
+            ResultBundle.release());
+    } catch (const std::exception &e) {
+        auto *RawBuildLog = reinterpret_cast<kernel_build_log_t *>(BuildLog);
+        *RawBuildLog = e.what();
+        return nullptr;
+    }
+#else
+    return nullptr;
+#endif
+}
+
+__dpctl_give DPCTLSyclKernelRef
+DPCTLKernelBundle_GetSyclKernel(__dpctl_keep DPCTLSyclKernelBundleRef KBRef,
+                                __dpctl_keep const char *KernelName)
+{
+#if (SUPPORTS_SYCL_COMPILATION > 0)
+    try {
+        auto KernelBundle =
+            unwrap<sycl::kernel_bundle<bundle_state::executable>>(KBRef);
+        auto Kernel = KernelBundle->ext_oneapi_get_kernel(KernelName);
+        return wrap<sycl::kernel>(new sycl::kernel(Kernel));
+    } catch (const std::exception &e) {
+        error_handler(e, __FILE__, __func__, __LINE__);
+        return nullptr;
+    }
+#else
+    return nullptr;
+#endif
+}
+
+bool DPCTLKernelBundle_HasSyclKernel(__dpctl_keep DPCTLSyclKernelBundleRef
+                                         KBRef,
+                                     __dpctl_keep const char *KernelName)
+{
+#if (SUPPORTS_SYCL_COMPILATION > 0)
+    try {
+        auto KernelBundle =
+            unwrap<sycl::kernel_bundle<bundle_state::executable>>(KBRef);
+        return KernelBundle->ext_oneapi_has_kernel(KernelName);
+    } catch (const std::exception &e) {
+        error_handler(e, __FILE__, __func__, __LINE__);
+        return false;
+    }
+#else
+    return false;
+#endif
+}
diff --git a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
index d136c700b6..740385283b 100644
--- a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
@@ -273,6 +273,132 @@ TEST_P(TestOCLKernelBundleFromSource, CheckGetKernelOCLSource)
     DPCTLKernel_Delete(AxpyKernel);
 }
 
+struct TestSYCLKernelBundleFromSource
+    : public ::testing::TestWithParam<const char *>
+{
+    const char *sycl_source = R"===(
+    #include <sycl/sycl.hpp>
+    #include "math_ops.hpp"
+    #include "math_template_ops.hpp"
+
+    namespace syclext = sycl::ext::oneapi::experimental;
+
+    extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclext::nd_range_kernel<1>))
+    void vector_add(int* in1, int* in2, int* out){
+        sycl::nd_item<1> item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+        size_t globalID = item.get_global_linear_id();
+        out[globalID] = math_op(in1[globalID],in2[globalID]);
+    }
+
+    template<typename T>
+    SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclext::nd_range_kernel<1>))
+    void vector_add_template(T* in1, T* in2, T* out){
+        sycl::nd_item<1> item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+        size_t globalID = item.get_global_linear_id();
+        out[globalID] = math_op_template(in1[globalID], in2[globalID]);
+    }
+    )===";
+
+    const char *header1_content = R"===(
+    int math_op(int a, int b){
+        return a + b;
+    }
+    )===";
+
+    const char *header2_content = R"===(
+    template<typename T>
+    T math_op_template(T a, T b){
+        return a + b;
+    }
+    )===";
+
+    const char *CompileOpt = "-fno-fast-math";
+    const char *KernelName = "vector_add_template<int>";
+    const char *Header1Name = "math_ops.hpp";
+    const char *Header2Name = "math_template_ops.hpp";
+    DPCTLSyclDeviceRef DRef = nullptr;
+    DPCTLSyclContextRef CRef = nullptr;
+    DPCTLSyclKernelBundleRef KBRef = nullptr;
+
+    TestSYCLKernelBundleFromSource()
+    {
+        auto DS = DPCTLFilterSelector_Create(GetParam());
+        DRef = DPCTLDevice_CreateFromSelector(DS);
+        DPCTLDeviceSelector_Delete(DS);
+        CRef = DPCTLDeviceMgr_GetCachedContext(DRef);
+
+        if (DRef) {
+            DPCTLBuildOptionListRef BORef = DPCTLBuildOptionList_Create();
+            DPCTLBuildOptionList_Append(BORef, CompileOpt);
+            DPCTLKernelNameListRef KNRef = DPCTLKernelNameList_Create();
+            DPCTLKernelNameList_Append(KNRef, KernelName);
+            DPCTLVirtualHeaderListRef VHRef = DPCTLVirtualHeaderList_Create();
+            DPCTLVirtualHeaderList_Append(VHRef, Header1Name, header1_content);
+            DPCTLVirtualHeaderList_Append(VHRef, Header2Name, header2_content);
+            DPCTLKernelBuildLogRef KBLRef = DPCTLKernelBuildLog_Create();
+            KBRef = DPCTLKernelBundle_CreateFromSYCLSource(
+                CRef, DRef, sycl_source, VHRef, KNRef, BORef, KBLRef);
+            DPCTLVirtualHeaderList_Delete(VHRef);
+            DPCTLKernelNameList_Delete(KNRef);
+            DPCTLBuildOptionList_Delete(BORef);
+            DPCTLKernelBuildLog_Delete(KBLRef);
+        }
+    }
+
+    void SetUp()
+    {
+        if (!DRef) {
+            auto message = "Skipping as no device of type " +
+                           std::string(GetParam()) + ".";
+            GTEST_SKIP_(message.c_str());
+        }
+        if (!DPCTLDevice_CanCompileSYCL(DRef)) {
+            const char *message = "Skipping as SYCL compilation not supported";
+            GTEST_SKIP_(message);
+        }
+    }
+
+    ~TestSYCLKernelBundleFromSource()
+    {
+        if (DRef)
+            DPCTLDevice_Delete(DRef);
+        if (CRef)
+            DPCTLContext_Delete(CRef);
+        if (KBRef)
+            DPCTLKernelBundle_Delete(KBRef);
+    }
+};
+
+TEST_P(TestSYCLKernelBundleFromSource, CheckCreateFromSYCLSource)
+{
+    ASSERT_TRUE(KBRef != nullptr);
+    ASSERT_TRUE(DPCTLKernelBundle_HasSyclKernel(KBRef, "vector_add"));
+    // DPC++ version 2025.1 supports compilation of SYCL template kernels,
+    // but does not yet support referencing them with the unmangled name.
+    ASSERT_TRUE(
+        DPCTLKernelBundle_HasSyclKernel(KBRef, "vector_add_template<int>") ||
+        DPCTLKernelBundle_HasSyclKernel(
+            KBRef, "_Z33__sycl_kernel_vector_add_templateIiEvPT_S1_S1_"));
+}
+
+TEST_P(TestSYCLKernelBundleFromSource, CheckGetKernelSYCLSource)
+{
+    auto AddKernel = DPCTLKernelBundle_GetSyclKernel(KBRef, "vector_add");
+    auto AxpyKernel =
+        DPCTLKernelBundle_GetSyclKernel(KBRef, "vector_add_template<int>");
+    if (AxpyKernel == nullptr) {
+        // DPC++ version 2025.1 supports compilation of SYCL template kernels,
+        // but does not yet support referencing them with the unmangled name.
+        AxpyKernel = DPCTLKernelBundle_GetSyclKernel(
+            KBRef, "_Z33__sycl_kernel_vector_add_templateIiEvPT_S1_S1_");
+    }
+
+    ASSERT_TRUE(AddKernel != nullptr);
+    ASSERT_TRUE(AxpyKernel != nullptr);
+    DPCTLKernel_Delete(AddKernel);
+    DPCTLKernel_Delete(AxpyKernel);
+}
+
 INSTANTIATE_TEST_SUITE_P(KernelBundleCreationFromSpirv,
                          TestDPCTLSyclKernelBundleInterface,
                          ::testing::Values("opencl",
@@ -289,6 +415,12 @@ INSTANTIATE_TEST_SUITE_P(KernelBundleCreationFromSource,
                          TestOCLKernelBundleFromSource,
                          ::testing::Values("opencl:gpu", "opencl:cpu"));
 
+INSTANTIATE_TEST_SUITE_P(KernelBundleCreationFromSYCL,
+                         TestSYCLKernelBundleFromSource,
+                         ::testing::Values("opencl:gpu",
+                                           "opencl:cpu",
+                                           "level_zero:gpu"));
+
 struct TestKernelBundleUnsupportedBackend : public ::testing::Test
 {
     DPCTLSyclDeviceRef DRef = nullptr;

From 6797e4089ca4b122b6e9c6620969b6643a057ac9 Mon Sep 17 00:00:00 2001
From: Lukas Sommer <lukas.sommer@codeplay.com>
Date: Wed, 16 Jul 2025 15:10:19 +0100
Subject: [PATCH 23/24] Skip test if oneAPI Base Toolkit isn't available

As the libraries aren't included in .bc format in version 2025.2 and
earlier, we need to skip the test if no oneAPI Base Toolkit is
installed.

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
---
 dpctl/tests/test_sycl_program.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/dpctl/tests/test_sycl_program.py b/dpctl/tests/test_sycl_program.py
index 6ce9b33279..cc54c4c741 100644
--- a/dpctl/tests/test_sycl_program.py
+++ b/dpctl/tests/test_sycl_program.py
@@ -17,6 +17,7 @@
 """Defines unit test cases for the SyclProgram and SyclKernel classes"""
 
 import os
+import shutil
 
 import pytest
 
@@ -271,6 +272,12 @@ def test_create_program_from_sycl_source():
     except dpctl.SyclQueueCreationError:
         pytest.skip("No OpenCL queue is available")
 
+    if not shutil.which("icpx"):
+        # In version 2025.2 and before, the packages do not contain the
+        # libraries in the .bc format necessary for RTC. Therefore,
+        # installation of the base toolkit is required.
+        pytest.skip("oneAPI Base Toolkit not installed")
+
     if not q.get_sycl_device().can_compile("sycl"):
         pytest.skip("SYCL source compilation not supported")
 
@@ -381,6 +388,12 @@ def test_create_program_from_invalid_src_sycl():
     except dpctl.SyclQueueCreationError:
         pytest.skip("No OpenCL queue is available")
 
+    if not shutil.which("icpx"):
+        # In version 2025.2 and before, the packages do not contain the
+        # libraries in the .bc format necessary for RTC. Therefore,
+        # installation of the base toolkit is required.
+        pytest.skip("oneAPI Base Toolkit not installed")
+
     if not q.get_sycl_device().can_compile("sycl"):
         pytest.skip("SYCL source compilation not supported")
 

From 75969ce8860157881e6042001124e8eb37c5f80b Mon Sep 17 00:00:00 2001
From: Nikita Grigorian <nikita.grigorian@intel.com>
Date: Wed, 3 Dec 2025 12:31:36 -0800
Subject: [PATCH 24/24] experiment: do not skip SYCL source tests when basekit
 not installed

---
 dpctl/tests/test_sycl_program.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/dpctl/tests/test_sycl_program.py b/dpctl/tests/test_sycl_program.py
index cc54c4c741..6ce9b33279 100644
--- a/dpctl/tests/test_sycl_program.py
+++ b/dpctl/tests/test_sycl_program.py
@@ -17,7 +17,6 @@
 """Defines unit test cases for the SyclProgram and SyclKernel classes"""
 
 import os
-import shutil
 
 import pytest
 
@@ -272,12 +271,6 @@ def test_create_program_from_sycl_source():
     except dpctl.SyclQueueCreationError:
         pytest.skip("No OpenCL queue is available")
 
-    if not shutil.which("icpx"):
-        # In version 2025.2 and before, the packages do not contain the
-        # libraries in the .bc format necessary for RTC. Therefore,
-        # installation of the base toolkit is required.
-        pytest.skip("oneAPI Base Toolkit not installed")
-
     if not q.get_sycl_device().can_compile("sycl"):
         pytest.skip("SYCL source compilation not supported")
 
@@ -388,12 +381,6 @@ def test_create_program_from_invalid_src_sycl():
     except dpctl.SyclQueueCreationError:
         pytest.skip("No OpenCL queue is available")
 
-    if not shutil.which("icpx"):
-        # In version 2025.2 and before, the packages do not contain the
-        # libraries in the .bc format necessary for RTC. Therefore,
-        # installation of the base toolkit is required.
-        pytest.skip("oneAPI Base Toolkit not installed")
-
     if not q.get_sycl_device().can_compile("sycl"):
         pytest.skip("SYCL source compilation not supported")