From 85fcf7b295187145cb7a49835d0e692e15db6938 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 16 Feb 2026 15:59:38 +0100 Subject: [PATCH 1/6] Add bitwise-shift batch constant api --- include/xsimd/types/xsimd_api.hpp | 48 ++++++++++++++++++++++++++-- test/test_xsimd_api.cpp | 52 ++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index aa64df4da..abdb385ae 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -353,6 +353,41 @@ namespace xsimd return kernel::bitwise_cast(x, batch {}, A {}); } + namespace detail + { + // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift`` + // directly (or in a parent register function). + // The ``batch_constant`` overload is a rare but useful optimization. + // Running the detection here is less error prone than to add a fallback to all + // architectures. + + template + struct has_bitwise_lshift_batch_const : std::false_type + { + }; + + template + struct has_bitwise_lshift_batch_const(std::declval(), std::declval(), A {}))>> + : std::true_type + { + }; + + template + XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::true_type) noexcept + { + // Optimized ``batch_constant`` implementation + return kernel::bitwise_lshift(x, shift, A {}); + } + + template + XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::false_type) noexcept + { + // Fallback to regular run-time implementation + return kernel::bitwise_lshift(x, shift.as_batch(), A {}); + } + } + /** * @ingroup batch_bitwise * @@ -367,17 +402,24 @@ namespace xsimd detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } + template + XSIMD_INLINE batch bitwise_lshift(batch const& x) noexcept + { + detail::static_check_supported_config(); + return kernel::bitwise_lshift(x, A {}); + } template 
XSIMD_INLINE batch bitwise_lshift(batch const& x, batch const& shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } - template - XSIMD_INLINE batch bitwise_lshift(batch const& x) noexcept + template + XSIMD_INLINE batch bitwise_lshift(batch const& x, batch_constant shift) noexcept { detail::static_check_supported_config(); - return kernel::bitwise_lshift(x, A {}); + using has_batch_const_impl = detail::has_bitwise_lshift_batch_const; + return detail::bitwise_lshift_batch_const(x, shift, has_batch_const_impl {}); } /** diff --git a/test/test_xsimd_api.cpp b/test/test_xsimd_api.cpp index 8c58543ad..a68155e1e 100644 --- a/test/test_xsimd_api.cpp +++ b/test/test_xsimd_api.cpp @@ -351,7 +351,7 @@ struct xsimd_api_integral_types_functions { using value_type = typename scalar_type::type; - void test_bitwise_lshift() + void test_bitwise_lshift_single() { constexpr int shift = 3; value_type val0(12); @@ -364,6 +364,33 @@ struct xsimd_api_integral_types_functions CHECK_EQ(extract(cr), r); } + template + void test_bitwise_lshift_multiple(typename std::enable_if::value, int>::type = 0) + { +#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr auto Max = static_cast(std::numeric_limits::digits); + constexpr auto max_batch = xsimd::make_batch_constant(); + constexpr auto shifts = xsimd::make_iota_batch_constant() % max_batch; + + auto shifted = xsimd::bitwise_lshift(T(1), shifts.as_batch()); + for (std::size_t i = 0; i < shifts.size; ++i) + { + CHECK_EQ(shifted.get(i), 1 << shifts.get(i)); + } + + auto shifted_cst = xsimd::bitwise_lshift(T(1), shifts); + for (std::size_t i = 0; i < shifts.size; ++i) + { + CHECK_EQ(shifted_cst.get(i), 1 << shifts.get(i)); + } +#endif + } + + template + void test_bitwise_lshift_multiple(typename std::enable_if::value, int>::type = 0) + { + } + void test_bitwise_rshift() { constexpr int shift = 3; @@ -426,9 +453,14 @@ TEST_CASE_TEMPLATE("[xsimd api | integral types functions]", B, 
INTEGRAL_TYPES) { xsimd_api_integral_types_functions Test; - SUBCASE("bitwise_lshift") + SUBCASE("test_bitwise_lshift_single") + { + Test.test_bitwise_lshift_single(); + } + + SUBCASE("bitwise_lshift_multiple") { - Test.test_bitwise_lshift(); + Test.test_bitwise_lshift_multiple(); } SUBCASE("bitwise_rshift") @@ -502,17 +534,9 @@ struct xsimd_api_float_types_functions } void test_atan2() { - { - value_type val0(0); - value_type val1(1); - CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); - } - - { - value_type val0(1); - value_type val1(0); - CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); - } + value_type val0(0); + value_type val1(1); + CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); } void test_atanh() { From 991b08796f1df360ccbc0abb91bd9b9a52fe4ddf Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 16 Feb 2026 17:02:21 +0100 Subject: [PATCH 2/6] Add x86 optimizations --- include/xsimd/arch/utils/shifts.hpp | 72 +++++++++++++++++++++++++++++ include/xsimd/arch/xsimd_avx2.hpp | 25 ++++++++++ include/xsimd/arch/xsimd_sse2.hpp | 17 +++++++ include/xsimd/arch/xsimd_sse4_1.hpp | 9 ++++ 4 files changed, 123 insertions(+) create mode 100644 include/xsimd/arch/utils/shifts.hpp diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp new file mode 100644 index 000000000..40a99b3a6 --- /dev/null +++ b/include/xsimd/arch/utils/shifts.hpp @@ -0,0 +1,72 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_UTILS_SHIFTS_HPP +#define XSIMD_UTILS_SHIFTS_HPP + +#include "../../config/xsimd_inline.hpp" +#include "../../types/xsimd_batch.hpp" +#include "../../types/xsimd_batch_constant.hpp" + +namespace xsimd +{ + namespace kernel + { + namespace utils + { + template + struct select_stride + { + static constexpr I values_array[] = { Vs... }; + + template + static constexpr K get(K i, K) + { + return static_cast(values_array[length * i + offset]); + } + }; + + template + constexpr I lsb_mask(I bit_index) + { + return static_cast((I { 1 } << bit_index) - I { 1 }); + } + + template + XSIMD_INLINE batch bitwise_lshift_as_twice_larger( + batch const& self, batch_constant) noexcept + { + static_assert(sizeof(T2) == 2 * sizeof(T), "One size must be twice the other"); + + const auto self2 = bitwise_cast(self); + + // Lower byte: shift as twice the size and mask bits flowing to higher byte. + constexpr auto shifts_lo = make_batch_constant, A>(); + constexpr auto mask_lo = lsb_mask(8 * sizeof(T)); + const auto shifted_lo = bitwise_lshift(self2, shifts_lo); + constexpr auto batch_mask_lo = make_batch_constant(); + const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch()); + + // Higher byte: mask bits that would flow from lower byte and shift as twice the size. 
+ constexpr auto shifts_hi = make_batch_constant, A>(); + constexpr auto mask_hi = mask_lo << (8 * sizeof(T)); + constexpr auto batch_mask_hi = make_batch_constant(); + const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch()); + const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi); + + return bitwise_cast(bitwise_or(masked_lo, shifted_hi)); + } + } + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index bf6d9e7de..8632c0ed9 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -17,6 +17,7 @@ #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" +#include "./utils/shifts.hpp" #include @@ -332,6 +333,30 @@ namespace xsimd } } + // bitwise_lshift multiple (constant) specific implementations. + // Missing implementations are dispacthed to the `batch` overload in xsimd_api. + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant, requires_arch) noexcept + { + using uint_t = typename std::make_unsigned::type; + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant, requires_arch) noexcept + { + using uint_t = typename std::make_unsigned::type; + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + // bitwise_or template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index cccba8144..01e122221 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -18,6 +18,7 @@ #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_sse2_register.hpp" +#include "./utils/shifts.hpp" namespace xsimd { @@ -326,6 
+327,22 @@ namespace xsimd return bitwise_lshift(self, common {}); } + // bitwise_lshift multiple (constant) + template + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mults = batch_constant(1u << Vs)...>(); + return _mm_mullo_epi16(self, mults.as_batch()); + } + + template + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch) noexcept + { + return utils::bitwise_lshift_as_twice_larger(self, shifts); + } + // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 030fb29db..e6102bb6f 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -41,6 +41,15 @@ namespace xsimd return _mm_ceil_pd(self); } + // bitwise_lshift multiple (constant) + template + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mults = batch_constant(1u << Vs)...>(); + return _mm_mullo_epi32(self, mults.as_batch()); + } + // fast_cast namespace detail { From 116829c3e19f73cf91ef282749f737b1dadd6f8e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 16 Feb 2026 18:59:20 +0100 Subject: [PATCH 3/6] Fix merge --- test/test_xsimd_api.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test/test_xsimd_api.cpp b/test/test_xsimd_api.cpp index a68155e1e..ac3e6e447 100644 --- a/test/test_xsimd_api.cpp +++ b/test/test_xsimd_api.cpp @@ -534,9 +534,17 @@ struct xsimd_api_float_types_functions } void test_atan2() { - value_type val0(0); - value_type val1(1); - CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); + { + value_type val0(0); + value_type val1(1); + CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); + } + + { + value_type val0(1); + value_type val1(0); + 
CHECK_EQ(extract(xsimd::atan2(T(val0), T(val1))), std::atan2(val0, val1)); + } } void test_atanh() { From 923059ec7a120423bed42daa74d861144b8dac92 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 17 Feb 2026 10:27:36 +0100 Subject: [PATCH 4/6] Add single shift optimization --- include/xsimd/arch/utils/shifts.hpp | 17 +++++++++++++++++ include/xsimd/arch/xsimd_avx2.hpp | 14 ++++++++++++-- include/xsimd/arch/xsimd_sse2.hpp | 13 +++++++++++-- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp index 40a99b3a6..ac93ed336 100644 --- a/include/xsimd/arch/utils/shifts.hpp +++ b/include/xsimd/arch/utils/shifts.hpp @@ -38,9 +38,26 @@ namespace xsimd template constexpr I lsb_mask(I bit_index) { + if (bit_index == 8 * sizeof(I)) + { + return ~I { 0 }; + } return static_cast((I { 1 } << bit_index) - I { 1 }); } + template + constexpr bool all_equals(batch_constant c) + { + static_assert(sizeof...(Vs) > 0, "There must be at least one value"); + + bool out = true; + for (std::size_t k = 0; k < sizeof...(Vs); ++k) + { + out &= c.get(k) == c.get(0); + } + return out; + } + template XSIMD_INLINE batch bitwise_lshift_as_twice_larger( batch const& self, batch_constant) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 8632c0ed9..54485b5ef 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -337,9 +337,14 @@ namespace xsimd // Missing implementations are dispacthed to the `batch` overload in xsimd_api. 
template = 0> XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant, requires_arch) noexcept + batch const& self, batch_constant shifts, requires_arch req) noexcept { using uint_t = typename std::make_unsigned::type; + + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } return bitwise_cast( utils::bitwise_lshift_as_twice_larger( bitwise_cast(self), @@ -348,9 +353,14 @@ namespace xsimd template = 0> XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant, requires_arch) noexcept + batch const& self, batch_constant shifts, requires_arch req) noexcept { using uint_t = typename std::make_unsigned::type; + + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } return bitwise_cast( utils::bitwise_lshift_as_twice_larger( bitwise_cast(self), diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index 01e122221..c862174b0 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -328,18 +328,27 @@ namespace xsimd } // bitwise_lshift multiple (constant) + // Missing implementations are dispacthed to the `batch` overload in xsimd_api. 
template XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant, requires_arch) noexcept + batch const& self, batch_constant shifts, requires_arch req) noexcept { + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } constexpr auto mults = batch_constant(1u << Vs)...>(); return _mm_mullo_epi16(self, mults.as_batch()); } template XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant shifts, requires_arch) noexcept + batch const& self, batch_constant shifts, requires_arch req) noexcept { + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } return utils::bitwise_lshift_as_twice_larger(self, shifts); } From 8cc985d092c01b4e4ae005458055f3298164347f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 17 Feb 2026 14:41:01 +0100 Subject: [PATCH 5/6] Strengthen tests --- test/test_xsimd_api.cpp | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/test/test_xsimd_api.cpp b/test/test_xsimd_api.cpp index ac3e6e447..87386eb5e 100644 --- a/test/test_xsimd_api.cpp +++ b/test/test_xsimd_api.cpp @@ -364,30 +364,32 @@ struct xsimd_api_integral_types_functions CHECK_EQ(extract(cr), r); } + /* Test when T is a batch_constant only, not a scalar. 
*/ template - void test_bitwise_lshift_multiple(typename std::enable_if::value, int>::type = 0) + void test_bitwise_lshift_multiple(T const& vals, typename std::enable_if::value, int>::type = 0) { #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE constexpr auto Max = static_cast(std::numeric_limits::digits); constexpr auto max_batch = xsimd::make_batch_constant(); constexpr auto shifts = xsimd::make_iota_batch_constant() % max_batch; - auto shifted = xsimd::bitwise_lshift(T(1), shifts.as_batch()); - for (std::size_t i = 0; i < shifts.size; ++i) { - CHECK_EQ(shifted.get(i), 1 << shifts.get(i)); - } - - auto shifted_cst = xsimd::bitwise_lshift(T(1), shifts); - for (std::size_t i = 0; i < shifts.size; ++i) - { - CHECK_EQ(shifted_cst.get(i), 1 << shifts.get(i)); + auto shifted = xsimd::bitwise_lshift(vals, shifts.as_batch()); + auto shifted_cst = xsimd::bitwise_lshift(vals, shifts); + + for (std::size_t i = 0; i < shifts.size; ++i) + { + const auto expected = static_cast(vals.get(i) << shifts.get(i)); + CHECK_EQ(shifted.get(i), expected); + CHECK_EQ(shifted_cst.get(i), expected); + } } #endif } + /* Test multiple does not make sense when T is scalar. 
*/ template - void test_bitwise_lshift_multiple(typename std::enable_if::value, int>::type = 0) + void test_bitwise_lshift_multiple(T const&, typename std::enable_if::value, int>::type = 0) { } @@ -451,7 +453,9 @@ struct xsimd_api_integral_types_functions TEST_CASE_TEMPLATE("[xsimd api | integral types functions]", B, INTEGRAL_TYPES) { - xsimd_api_integral_types_functions Test; + using test_type = xsimd_api_integral_types_functions; + + test_type Test; SUBCASE("test_bitwise_lshift_single") { @@ -460,7 +464,9 @@ TEST_CASE_TEMPLATE("[xsimd api | integral types functions]", B, INTEGRAL_TYPES) SUBCASE("bitwise_lshift_multiple") { - Test.test_bitwise_lshift_multiple(); + Test.test_bitwise_lshift_multiple({ 1 }); + Test.test_bitwise_lshift_multiple({ 3 }); + Test.test_bitwise_lshift_multiple({ 127 }); } SUBCASE("bitwise_rshift") From cde846ff47e23c85c6b76def5e20adb0cd4d347b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 17 Feb 2026 14:54:15 +0100 Subject: [PATCH 6/6] Enable SSE2 fallback for signed integers --- include/xsimd/arch/xsimd_sse2.hpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c862174b0..fdd6611e1 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -329,27 +329,32 @@ namespace xsimd // bitwise_lshift multiple (constant) // Missing implementations are dispacthed to the `batch` overload in xsimd_api. 
- template - XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant shifts, requires_arch req) noexcept + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept { XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) { return bitwise_lshift(self, req); } - constexpr auto mults = batch_constant(1u << Vs)...>(); + constexpr auto mults = batch_constant(1u << Vs)...>(); return _mm_mullo_epi16(self, mults.as_batch()); } - template - XSIMD_INLINE batch bitwise_lshift( - batch const& self, batch_constant shifts, requires_arch req) noexcept + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept { + using uint_t = typename std::make_unsigned::type; + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) { return bitwise_lshift(self, req); } - return utils::bitwise_lshift_as_twice_larger(self, shifts); + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); } // bitwise_not