diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp new file mode 100644 index 000000000..ac93ed336 --- /dev/null +++ b/include/xsimd/arch/utils/shifts.hpp @@ -0,0 +1,89 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_UTILS_SHIFTS_HPP +#define XSIMD_UTILS_SHIFTS_HPP + +#include "../../config/xsimd_inline.hpp" +#include "../../types/xsimd_batch.hpp" +#include "../../types/xsimd_batch_constant.hpp" + +namespace xsimd +{ + namespace kernel + { + namespace utils + { + template + struct select_stride + { + static constexpr I values_array[] = { Vs... }; + + template + static constexpr K get(K i, K) + { + return static_cast(values_array[length * i + offset]); + } + }; + + template + constexpr I lsb_mask(I bit_index) + { + if (bit_index == 8 * sizeof(I)) + { + return ~I { 0 }; + } + return static_cast((I { 1 } << bit_index) - I { 1 }); + } + + template + constexpr bool all_equals(batch_constant c) + { + static_assert(sizeof...(Vs) > 0, "There must be at least one value"); + + bool out = true; + for (std::size_t k = 0; k < sizeof...(Vs); ++k) + { + out &= c.get(k) == c.get(0); + } + return out; + } + + template + XSIMD_INLINE batch bitwise_lshift_as_twice_larger( + batch const& self, batch_constant) noexcept + { + static_assert(sizeof(T2) == 2 * sizeof(T), "One size must be twice the other"); + + const auto self2 = bitwise_cast(self); + + // Lower byte: shift as twice the size and mask bits flowing to higher byte. 
+ constexpr auto shifts_lo = make_batch_constant, A>(); + constexpr auto mask_lo = lsb_mask(8 * sizeof(T)); + const auto shifted_lo = bitwise_lshift(self2, shifts_lo); + constexpr auto batch_mask_lo = make_batch_constant(); + const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch()); + + // Higher byte: mask bits that would flow from lower byte and shift as twice the size. + constexpr auto shifts_hi = make_batch_constant, A>(); + constexpr auto mask_hi = mask_lo << (8 * sizeof(T)); + constexpr auto batch_mask_hi = make_batch_constant(); + const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch()); + const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi); + + return bitwise_cast(bitwise_or(masked_lo, shifted_hi)); + } + } + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index bf6d9e7de..54485b5ef 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -17,6 +17,7 @@ #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" +#include "./utils/shifts.hpp" #include @@ -332,6 +333,40 @@ namespace xsimd } } + // bitwise_lshift multiple (constant) specific implementations. + // Missing implementations are dispatched to the `batch` overload in xsimd_api. 
+ template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept + { + using uint_t = typename std::make_unsigned::type; + + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept + { + using uint_t = typename std::make_unsigned::type; + + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + // bitwise_or template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index cccba8144..fdd6611e1 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -18,6 +18,7 @@ #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_sse2_register.hpp" +#include "./utils/shifts.hpp" namespace xsimd { @@ -326,6 +327,36 @@ namespace xsimd return bitwise_lshift(self, common {}); } + // bitwise_lshift multiple (constant) + // Missing implementations are dispatched to the `batch` overload in xsimd_api. 
+ template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept + { + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } + constexpr auto mults = batch_constant(1u << Vs)...>(); + return _mm_mullo_epi16(self, mults.as_batch()); + } + + template = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept + { + using uint_t = typename std::make_unsigned::type; + + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 030fb29db..e6102bb6f 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -41,6 +41,15 @@ namespace xsimd return _mm_ceil_pd(self); } + // bitwise_lshift multiple (constant) + template + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mults = batch_constant(1u << Vs)...>(); + return _mm_mullo_epi32(self, mults.as_batch()); + } + // fast_cast namespace detail { diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index aa64df4da..abdb385ae 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -353,6 +353,41 @@ namespace xsimd return kernel::bitwise_cast(x, batch {}, A {}); } + namespace detail + { + // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift`` + // directly (or in a parent register function). + // The ``batch_constant`` overload is a rare but useful optimization. 
+ // Running the detection here is less error prone than to add a fallback to all + // architectures. + + template + struct has_bitwise_lshift_batch_const : std::false_type + { + }; + + template + struct has_bitwise_lshift_batch_const(std::declval(), std::declval(), A {}))>> + : std::true_type + { + }; + + template + XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::true_type) noexcept + { + // Optimized ``batch_constant`` implementation + return kernel::bitwise_lshift(x, shift, A {}); + } + + template + XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::false_type) noexcept + { + // Fallback to regular run-time implementation + return kernel::bitwise_lshift(x, shift.as_batch(), A {}); + } + } + /** * @ingroup batch_bitwise * @@ -367,17 +402,24 @@ namespace xsimd detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } + template + XSIMD_INLINE batch bitwise_lshift(batch const& x) noexcept + { + detail::static_check_supported_config(); + return kernel::bitwise_lshift(x, A {}); + } template XSIMD_INLINE batch bitwise_lshift(batch const& x, batch const& shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } - template - XSIMD_INLINE batch bitwise_lshift(batch const& x) noexcept + template + XSIMD_INLINE batch bitwise_lshift(batch const& x, batch_constant shift) noexcept { detail::static_check_supported_config(); - return kernel::bitwise_lshift(x, A {}); + using has_batch_const_impl = detail::has_bitwise_lshift_batch_const; + return detail::bitwise_lshift_batch_const(x, shift, has_batch_const_impl {}); } /** diff --git a/test/test_xsimd_api.cpp b/test/test_xsimd_api.cpp index 8c58543ad..87386eb5e 100644 --- a/test/test_xsimd_api.cpp +++ b/test/test_xsimd_api.cpp @@ -351,7 +351,7 @@ struct xsimd_api_integral_types_functions { using value_type = typename scalar_type::type; - void test_bitwise_lshift() + void 
test_bitwise_lshift_single() { constexpr int shift = 3; value_type val0(12); @@ -364,6 +364,35 @@ struct xsimd_api_integral_types_functions CHECK_EQ(extract(cr), r); } + /* Test when T is a batch_constant only, not a scalar. */ + template + void test_bitwise_lshift_multiple(T const& vals, typename std::enable_if::value, int>::type = 0) + { +#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE + constexpr auto Max = static_cast(std::numeric_limits::digits); + constexpr auto max_batch = xsimd::make_batch_constant(); + constexpr auto shifts = xsimd::make_iota_batch_constant() % max_batch; + + { + auto shifted = xsimd::bitwise_lshift(vals, shifts.as_batch()); + auto shifted_cst = xsimd::bitwise_lshift(vals, shifts); + + for (std::size_t i = 0; i < shifts.size; ++i) + { + const auto expected = static_cast(vals.get(i) << shifts.get(i)); + CHECK_EQ(shifted.get(i), expected); + CHECK_EQ(shifted_cst.get(i), expected); + } + } +#endif + } + + /* Test multiple does not make sense when T is scalar. */ + template + void test_bitwise_lshift_multiple(T const&, typename std::enable_if::value, int>::type = 0) + { + } + void test_bitwise_rshift() { constexpr int shift = 3; @@ -424,11 +453,20 @@ struct xsimd_api_integral_types_functions TEST_CASE_TEMPLATE("[xsimd api | integral types functions]", B, INTEGRAL_TYPES) { - xsimd_api_integral_types_functions Test; + using test_type = xsimd_api_integral_types_functions; + + test_type Test; + + SUBCASE("test_bitwise_lshift_single") + { + Test.test_bitwise_lshift_single(); + } - SUBCASE("bitwise_lshift") + SUBCASE("bitwise_lshift_multiple") { - Test.test_bitwise_lshift(); + Test.test_bitwise_lshift_multiple({ 1 }); + Test.test_bitwise_lshift_multiple({ 3 }); + Test.test_bitwise_lshift_multiple({ 127 }); } SUBCASE("bitwise_rshift")