From 15603d4560a210018202ae7541c5b71e611e1eeb Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 11 Jun 2026 10:56:52 -0400 Subject: [PATCH] feat: fused mul_hilo for 64-bit batches (shared 32x32->64 partials) mul_hilo previously fell through to the generic common path { mul_hi(x, y), x * y }, deriving the high half (mulhi_u64_core: 4 vpmuludq) and the low half (operator*: 3 vpmuludq) from separately-computed 32-bit partials -- 7 vpmuludq per pair, none CSE-able because the two halves split the operands differently (&mask/>>32 vs vpshufd). Add detail::mulhilo_u64_core, which derives BOTH halves from one set of four 32x32->64 partials (ll, lh, hl, hh): 4 vpmuludq per pair. By construction hi == mulhi_u64_core and lo == operator*, so the returned pair is bit-identical to the unfused result. Native kernels for SSE4.1, AVX2 and AVX-512F each pass their _mm*_mul_epu32 widening functor, mirroring the existing mul_hi structure. SSE2 keeps the common fallback (it has no fused mul_hi either). Signed int64 reuses the unsigned core through a single common overload (bitwise_cast + sign fixup on hi; lo is sign-invariant), so no per-arch signed overloads are added. Verified bit-identical to __int128 for uint64/int64 across SSE4.1 and AVX2 on g++ and clang (including edge cases); avx2 asm shows 4 vpmuludq for the fused mul_hilo vs 7 for the unfused { mul_hi, mul_lo }. --- .../arch/common/xsimd_common_arithmetic.hpp | 45 ++++++++++++++++--- include/xsimd/arch/xsimd_avx2.hpp | 10 +++++ include/xsimd/arch/xsimd_avx512f.hpp | 10 +++++ include/xsimd/arch/xsimd_common_fwd.hpp | 5 +++ include/xsimd/arch/xsimd_sse4_1.hpp | 10 +++++ 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp index 27b5ef24f..541503ea3 100644 --- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp @@ -251,13 +251,12 @@ namespace xsimd { using B = batch; const B mask(uint64_t(0xffffffffULL)); - B xl = x & mask; + // mul_epu32 uses only the low 32 bits, so low operands need no mask. B xh = x >> 32; - B yl = y & mask; B yh = y >> 32; - B ll = mul_epu32(xl, yl); - B lh = mul_epu32(xl, yh); - B hl = mul_epu32(xh, yl); + B ll = mul_epu32(x, y); + B lh = mul_epu32(x, yh); + B hl = mul_epu32(xh, y); B hh = mul_epu32(xh, yh); B mid = (ll >> 32) + (lh & mask) + (hl & mask); return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); @@ -276,6 +275,27 @@ namespace xsimd auto sb = ::xsimd::bitwise_cast(y >> 63); return ::xsimd::bitwise_cast(uhi - (uy & sa) - (ux & sb)); } + + // Fused mul_hilo: both halves from one set of partials. Returns { hi, lo }. + template + XSIMD_INLINE std::pair, batch> + mulhilo_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + using B = batch; + const B mask(uint64_t(0xffffffffULL)); + B xh = x >> 32; + B yh = y >> 32; + B ll = mul_epu32(x, y); + B lh = mul_epu32(x, yh); + B hl = mul_epu32(xh, y); + B hh = mul_epu32(xh, yh); + B mid = (ll >> 32) + (lh & mask) + (hl & mask); + B hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32); + B lo = (ll & mask) | (mid << 32); + return { hi, lo }; + } } template ::value>*/> @@ -294,6 +314,21 @@ namespace xsimd return std::pair, batch> { mul_hi(self, other, A {}), self * other }; } + // Signed 64-bit mul_hilo: unsigned path + sign fixup on hi (lo is sign-invariant). + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + auto ux = ::xsimd::bitwise_cast(self); + auto uy = ::xsimd::bitwise_cast(other); + auto hilo = mul_hilo(ux, uy, A {}); + auto sa = ::xsimd::bitwise_cast(self >> 63); + auto sb = ::xsimd::bitwise_cast(other >> 63); + auto hi = hilo.first - (uy & sa) - (ux & sb); + return { ::xsimd::bitwise_cast(hi), + ::xsimd::bitwise_cast(hilo.second) }; + } + // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 5cb47f908..a536219ee 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -1003,6 +1003,16 @@ namespace xsimd { return batch(_mm256_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index cc057eacf..e508f3d0e 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1835,6 +1835,16 @@ namespace xsimd { return batch(_mm512_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 8c4818176..a9c1fcd42 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -122,6 +122,11 @@ namespace xsimd XSIMD_INLINE batch mulhi_i64_core(batch const& x, batch const& y, WMul mul_epu32) noexcept; + template + XSIMD_INLINE std::pair, batch> + mulhilo_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; } } } diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index bc2b0f3de..bcca84c44 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -394,6 +394,16 @@ namespace xsimd { return batch(_mm_mul_epu32(a, b)); }); } + // mul_hilo + template + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhilo_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept