diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp index 27b5ef24f..541503ea3 100644 --- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp @@ -251,13 +251,12 @@ namespace xsimd { using B = batch; const B mask(uint64_t(0xffffffffULL)); - B xl = x & mask; + // mul_epu32 uses only the low 32 bits, so low operands need no mask. B xh = x >> 32; - B yl = y & mask; B yh = y >> 32; - B ll = mul_epu32(xl, yl); - B lh = mul_epu32(xl, yh); - B hl = mul_epu32(xh, yl); + B ll = mul_epu32(x, y); + B lh = mul_epu32(x, yh); + B hl = mul_epu32(xh, y); B hh = mul_epu32(xh, yh); B mid = (ll >> 32) + (lh & mask) + (hl & mask); return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); @@ -276,6 +275,27 @@ namespace xsimd auto sb = ::xsimd::bitwise_cast(y >> 63); return ::xsimd::bitwise_cast(uhi - (uy & sa) - (ux & sb)); } + + // Fused mul_hilo: both halves from one set of partials. Returns { hi, lo }. + template + XSIMD_INLINE std::pair, batch> + mulhilo_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + using B = batch; + const B mask(uint64_t(0xffffffffULL)); + B xh = x >> 32; + B yh = y >> 32; + B ll = mul_epu32(x, y); + B lh = mul_epu32(x, yh); + B hl = mul_epu32(xh, y); + B hh = mul_epu32(xh, yh); + B mid = (ll >> 32) + (lh & mask) + (hl & mask); + B hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32); + B lo = (ll & mask) | (mid << 32); + return { hi, lo }; + } } template ::value>*/> @@ -294,6 +314,21 @@ namespace xsimd return std::pair, batch> { mul_hi(self, other, A {}), self * other }; } + // Signed 64-bit mul_hilo: unsigned path + sign fixup on hi (lo is sign-invariant). + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + auto ux = ::xsimd::bitwise_cast(self); + auto uy = ::xsimd::bitwise_cast(other); + auto hilo = mul_hilo(ux, uy, A {}); + auto sa = ::xsimd::bitwise_cast(self >> 63); + auto sb = ::xsimd::bitwise_cast(other >> 63); + auto hi = hilo.first - (uy & sa) - (ux & sb); + return { ::xsimd::bitwise_cast(hi), + ::xsimd::bitwise_cast(hilo.second) }; + } + // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 5cb47f908..a536219ee 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -1003,6 +1003,16 @@ namespace xsimd { return batch(_mm256_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index cc057eacf..e508f3d0e 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1835,6 +1835,16 @@ namespace xsimd { return batch(_mm512_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 8c4818176..a9c1fcd42 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -122,6 +122,11 @@ namespace xsimd XSIMD_INLINE batch mulhi_i64_core(batch const& x, batch const& y, WMul mul_epu32) noexcept; + template + XSIMD_INLINE std::pair, batch> + mulhilo_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; } } } diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index bc2b0f3de..bcca84c44 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -394,6 +394,16 @@ namespace xsimd { return batch(_mm_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept