From 15603d4560a210018202ae7541c5b71e611e1eeb Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Thu, 11 Jun 2026 10:56:52 -0400
Subject: [PATCH] feat: fused mul_hilo for 64-bit batches (shared 32x32->64
 partials)

mul_hilo<uint64_t> previously fell through to the generic common path
{ mul_hi(x, y), x * y }, deriving the high half (mulhi_u64_core: 4 vpmuludq)
and the low half (operator*: 3 vpmuludq) from separately-computed 32-bit
partials -- 7 vpmuludq per pair, none CSE-able because the two halves split
the operands differently (&mask/>>32 vs vpshufd).

Add detail::mulhilo_u64_core, which derives BOTH halves from one set of four
32x32->64 partials (ll, lh, hl, hh): 4 vpmuludq per pair. By construction
hi == mulhi_u64_core and lo == operator*, so the returned pair is
bit-identical to the unfused result.

Native kernels for SSE4.1, AVX2 and AVX-512F each pass their _mm*_mul_epu32
widening functor, mirroring the existing mul_hi<uint64_t> structure. SSE2
keeps the common fallback (it has no fused mul_hi<uint64_t> either). Signed
int64 reuses the unsigned core through a single common overload (bitwise_cast
+ sign fixup on hi; lo is sign-invariant), so no per-arch signed overloads
are added.

Verified bit-identical to __int128 for uint64/int64 across SSE4.1 and AVX2
on g++ and clang (including edge cases); avx2 asm shows 4 vpmuludq for the
fused mul_hilo vs 7 for the unfused { mul_hi, mul_lo }.
---
 .../arch/common/xsimd_common_arithmetic.hpp   | 45 ++++++++++++++++---
 include/xsimd/arch/xsimd_avx2.hpp             | 10 +++++
 include/xsimd/arch/xsimd_avx512f.hpp          | 10 +++++
 include/xsimd/arch/xsimd_common_fwd.hpp       |  5 +++
 include/xsimd/arch/xsimd_sse4_1.hpp           | 10 +++++
 5 files changed, 75 insertions(+), 5 deletions(-)
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index 27b5ef24f..541503ea3 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -251,13 +251,12 @@ namespace xsimd
             {
                 using B = batch<uint64_t, A>;
                 const B mask(uint64_t(0xffffffffULL));
-                B xl = x & mask;
+                // mul_epu32 uses only the low 32 bits, so low operands need no mask.
                 B xh = x >> 32;
-                B yl = y & mask;
                 B yh = y >> 32;
-                B ll = mul_epu32(xl, yl);
-                B lh = mul_epu32(xl, yh);
-                B hl = mul_epu32(xh, yl);
+                B ll = mul_epu32(x, y);
+                B lh = mul_epu32(x, yh);
+                B hl = mul_epu32(xh, y);
                 B hh = mul_epu32(xh, yh);
                 B mid = (ll >> 32) + (lh & mask) + (hl & mask);
                 return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
@@ -276,6 +275,27 @@ namespace xsimd
                 auto sb = ::xsimd::bitwise_cast<uint64_t>(y >> 63);
                 return ::xsimd::bitwise_cast<int64_t>(uhi - (uy & sa) - (ux & sb));
             }
+
+            // Fused mul_hilo: both halves from one set of partials. Returns { hi, lo }.
+            template <class A, class WMul>
+            XSIMD_INLINE std::pair<batch<uint64_t, A>, batch<uint64_t, A>>
+            mulhilo_u64_core(batch<uint64_t, A> const& x,
+                             batch<uint64_t, A> const& y,
+                             WMul mul_epu32) noexcept
+            {
+                using B = batch<uint64_t, A>;
+                const B mask(uint64_t(0xffffffffULL));
+                B xh = x >> 32;
+                B yh = y >> 32;
+                B ll = mul_epu32(x, y);
+                B lh = mul_epu32(x, yh);
+                B hl = mul_epu32(xh, y);
+                B hh = mul_epu32(xh, yh);
+                B mid = (ll >> 32) + (lh & mask) + (hl & mask);
+                B hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
+                B lo = (ll & mask) | (mid << 32);
+                return { hi, lo };
+            }
         }
 
         template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
@@ -294,6 +314,21 @@ namespace xsimd
             return std::pair<batch<T, A>, batch<T, A>> { mul_hi<A>(self, other, A {}), self * other };
         }
 
+        // Signed 64-bit mul_hilo: unsigned path + sign fixup on hi (lo is sign-invariant).
+        template <class A>
+        XSIMD_INLINE std::pair<batch<int64_t, A>, batch<int64_t, A>>
+        mul_hilo(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<common>) noexcept
+        {
+            auto ux = ::xsimd::bitwise_cast<uint64_t>(self);
+            auto uy = ::xsimd::bitwise_cast<uint64_t>(other);
+            auto hilo = mul_hilo<A>(ux, uy, A {});
+            auto sa = ::xsimd::bitwise_cast<uint64_t>(self >> 63);
+            auto sb = ::xsimd::bitwise_cast<uint64_t>(other >> 63);
+            auto hi = hilo.first - (uy & sa) - (ux & sb);
+            return { ::xsimd::bitwise_cast<int64_t>(hi),
+                     ::xsimd::bitwise_cast<int64_t>(hilo.second) };
+        }
+
         // rotl
         template <class A, class T, class STy>
         XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 5cb47f908..a536219ee 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1003,6 +1003,16 @@ namespace xsimd
                                              { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
         }
 
+        // mul_hilo
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint64_t, A>, batch<uint64_t, A>>
+        mul_hilo(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return detail::mulhilo_u64_core<A>(self, other,
+                                               [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                               { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
+        }
+
         // reduce_add
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index cc057eacf..e508f3d0e 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1835,6 +1835,16 @@ namespace xsimd
                                              { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
         }
 
+        // mul_hilo
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint64_t, A>, batch<uint64_t, A>>
+        mul_hilo(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            return detail::mulhilo_u64_core<A>(self, other,
+                                               [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                               { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
+        }
+
         // nearbyint
         template <class A>
         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 8c4818176..a9c1fcd42 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -122,6 +122,11 @@ namespace xsimd
             XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
                                                           batch<int64_t, A> const& y,
                                                           WMul mul_epu32) noexcept;
+            template <class A, class WMul>
+            XSIMD_INLINE std::pair<batch<uint64_t, A>, batch<uint64_t, A>>
+            mulhilo_u64_core(batch<uint64_t, A> const& x,
+                             batch<uint64_t, A> const& y,
+                             WMul mul_epu32) noexcept;
         }
     }
 }
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index bc2b0f3de..bcca84c44 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -394,6 +394,16 @@ namespace xsimd
                                              { return batch<uint64_t, A>(_mm_mul_epu32(a, b)); });
         }
 
+        // mul_hilo
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint64_t, A>, batch<uint64_t, A>>
+        mul_hilo(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            return detail::mulhilo_u64_core<A>(self, other,
+                                               [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                               { return batch<uint64_t, A>(_mm_mul_epu32(a, b)); });
+        }
+
         // nearbyint
         template <class A>
         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept