From eb1c1ba15e1798ec43e2d4bc391909381e99ce1b Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Fri, 17 Apr 2026 16:05:27 -0400
Subject: [PATCH 1/3] fix: declare register_type conversion operator on batch

Some toolchains (notably certain GCC builds) define shift and mul immediate
intrinsics as macros that apply a textual C-style cast to their operand.
That cast does not traverse the multi-level alias inheritance of
simd_register (e.g. avx512bw -> avx512dq -> avx512cd -> avx512f), so a
batch fails to convert to its native register type in those contexts.

Declare the conversion operator on batch itself so the native type is
always one member-lookup away.
---
 include/xsimd/types/xsimd_batch.hpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index b584a2d81..366161285 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -129,6 +129,18 @@ namespace xsimd
         XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept;
         XSIMD_INLINE batch(register_type reg) noexcept;
 
+        /* Redeclare the conversion operator at the most-derived level. Some
+         * compilers fail to invoke the conversion inherited from
+         * types::simd_register when a batch is fed to an intrinsic defined as
+         * a macro (e.g. certain GCC shift/mullo imm intrinsics), because the
+         * textual C-style cast inside the macro does not traverse the alias
+         * inheritance chain. Declaring the operator here makes it visible on
+         * the batch type directly. */
+        XSIMD_INLINE operator register_type() const noexcept
+        {
+            return this->data;
+        }
+
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept;

From 244d74b297f72439d07fe8f7d2d22b7bb5e83390 Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Fri, 17 Apr 2026 10:13:06 -0400
Subject: [PATCH 2/3] feat: add mulhi, mullo, and mulhilo for integer batches

Adds three integer-multiplication primitives exposed via the public API:

- mullo(x, y): low half of the lane-wise product (equivalent to x * y)
- mulhi(x, y): high half of the lane-wise product
- mulhilo(x, y): returns {mulhi, mullo} as a pair

Native kernels are provided for:

- NEON (vmull_* + vshrn for 8/16/32-bit; software path for 64-bit)
- SVE (svmulh_x)
- RVV (rvvmulh)
- SSE2 (mulhi_epi16 / mulhi_epu16)
- SSE4.1 (mul_epi32/mul_epu32 + blend for 32-bit; shared 64-bit core)
- AVX2 (mulhi_epi16/epu16, mul_epi32/mul_epu32 + blend; shared 64-bit core)
- AVX-512F (shared 64-bit core)
- AVX-512BW (mulhi_epi16/epu16)

The 64-bit x86 cores share a single implementation in
common/xsimd_common_arithmetic.hpp: mulhi_u64_core and mulhi_i64_core
express the ll/lh/hl/hh decomposition with xsimd batch operators
(&, >>, +, -, bitwise_cast) plus an arch-specific widening-mul functor
(_mm*_mul_epu32). This eliminates three copies of the same 64x64 -> hi
software path and unifies the signed-fixup to a single
arithmetic-shift-by-63 pattern (maps to vpsraq on AVX-512, emulated on
SSE4.1/AVX2 via bitwise_rshift).

The generic fallback in common dispatches per-type through mulhi_helper,
using a wider native integer for <=32-bit types and software
split-and-multiply (or __int128 when available) for 64-bit.
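For reference, a standalone scalar sketch of the split the 64-bit path uses
(the same ll/lh/hl/hh decomposition and sign fixup as detail::mulhi_u64 and
mulhi_i64_core below; illustration only, not part of the diff):

    #include <cstdint>

    // High 64 bits of an unsigned 64x64 product from four 32x32 partial products.
    static inline uint64_t mulhi_u64_ref(uint64_t x, uint64_t y)
    {
        uint64_t xl = x & 0xffffffffULL, xh = x >> 32;
        uint64_t yl = y & 0xffffffffULL, yh = y >> 32;
        uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh;
        uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
        return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
    }

    // Signed high half: unsigned core plus two mask-and-subtract corrections;
    // x >> 63 is all-ones exactly when x is negative (arithmetic shift assumed,
    // as in the SIMD cores).
    static inline int64_t mulhi_i64_ref(int64_t x, int64_t y)
    {
        uint64_t ux = (uint64_t)x, uy = (uint64_t)y;
        uint64_t hi = mulhi_u64_ref(ux, uy);
        hi -= uy & (uint64_t)(x >> 63);
        hi -= ux & (uint64_t)(y >> 63);
        return (int64_t)hi;
    }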
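Example usage of the new entry points (values are illustrative; assumes a
SIMD-enabled build with this series applied):

    #include <xsimd/xsimd.hpp>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        xsimd::batch<uint32_t> x(0xffffffffu); // broadcast constructor
        xsimd::batch<uint32_t> y(0x80000000u);
        auto lo   = xsimd::mullo(x, y);        // same lanes as x * y
        auto hi   = xsimd::mulhi(x, y);        // high 32 bits of each 64-bit product
        auto both = xsimd::mulhilo(x, y);      // std::pair{hi, lo}
        // 0xffffffff * 0x80000000 = 0x7fffffff80000000 -> hi=7fffffff, lo=80000000
        std::printf("hi=%08lx lo=%08lx\n",
                    (unsigned long)hi.get(0),
                    (unsigned long)both.second.get(0));
        return 0;
    }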
--- docs/source/api/arithmetic_index.rst | 6 + .../arch/common/xsimd_common_arithmetic.hpp | 112 +++++++++++ include/xsimd/arch/xsimd_avx2.hpp | 44 +++++ include/xsimd/arch/xsimd_avx512bw.hpp | 12 ++ include/xsimd/arch/xsimd_avx512f.hpp | 35 ++++ include/xsimd/arch/xsimd_common_fwd.hpp | 14 ++ include/xsimd/arch/xsimd_neon.hpp | 48 +++++ include/xsimd/arch/xsimd_rvv.hpp | 10 + include/xsimd/arch/xsimd_sse2.hpp | 12 ++ include/xsimd/arch/xsimd_sse4_1.hpp | 36 ++++ include/xsimd/arch/xsimd_sve.hpp | 7 + include/xsimd/types/xsimd_api.hpp | 70 +++++++ test/test_batch.cpp | 187 ++++++++++++++++++ 13 files changed, 593 insertions(+) diff --git a/docs/source/api/arithmetic_index.rst b/docs/source/api/arithmetic_index.rst index 429600cb3..811666c4f 100644 --- a/docs/source/api/arithmetic_index.rst +++ b/docs/source/api/arithmetic_index.rst @@ -40,6 +40,12 @@ Binary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mul` | per slot multiply | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mullo` | low N bits of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mulhi` | high N bits of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mulhilo` | pair {hi, lo} of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ | :cpp:func:`div` | per slot division | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mod` | per slot modulo | diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp index ff2fb4118..c0282bfd6 100644 --- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp @@ -177,6 +177,118 @@ namespace xsimd self, other); } + // mulhi + namespace detail + { + template + struct mulhi_helper + { + // default: use a wider native integer type + using wider = typename std::conditional< + std::is_signed::value, + typename std::conditional::type>::type, + typename std::conditional::type>::type>::type; + + static XSIMD_INLINE T compute(T x, T y) noexcept + { + constexpr int shift = 8 * sizeof(T); + return static_cast((static_cast(x) * static_cast(y)) >> shift); + } + }; + + // 64-bit unsigned software mulhi via 32-bit splits + XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept + { +#if defined(__SIZEOF_INT128__) + return static_cast((static_cast(x) * static_cast(y)) >> 64); +#else + uint64_t xl = x & 0xffffffffULL; + uint64_t xh = x >> 32; + uint64_t yl = y & 0xffffffffULL; + uint64_t yh = y >> 32; + uint64_t ll = xl * yl; + uint64_t lh = xl * yh; + uint64_t hl = xh * yl; + uint64_t hh = xh * yh; + uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL); + return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); +#endif + } + + XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept + { +#if defined(__SIZEOF_INT128__) + return static_cast((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64); +#else + uint64_t uhi = mulhi_u64(static_cast(x), static_cast(y)); + if (x < 0) + uhi -= static_cast(y); + if (y < 0) + uhi -= static_cast(x); + return static_cast(uhi); +#endif + } + + template <> + struct mulhi_helper + { + 
static XSIMD_INLINE uint64_t compute(uint64_t x, uint64_t y) noexcept { return mulhi_u64(x, y); } + }; + + template <> + struct mulhi_helper + { + static XSIMD_INLINE int64_t compute(int64_t x, int64_t y) noexcept { return mulhi_i64(x, y); } + }; + + // Compute the high 64 bits of each lane-wise 64x64 unsigned product, + // given a "widening mul" functor WMul that takes two batch + // and returns batch containing the 64-bit product of the + // low 32 bits of each 64-bit lane (i.e. _mm*_mul_epu32 wrapped). + template + XSIMD_INLINE batch mulhi_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + using B = batch; + const B mask(uint64_t(0xffffffffULL)); + B xl = x & mask; + B xh = x >> 32; + B yl = y & mask; + B yh = y >> 32; + B ll = mul_epu32(xl, yl); + B lh = mul_epu32(xl, yh); + B hl = mul_epu32(xh, yl); + B hh = mul_epu32(xh, yh); + B mid = (ll >> 32) + (lh & mask) + (hl & mask); + return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); + } + + // Signed variant: unsigned core + sign fixup via arithmetic shift-by-63. + template + XSIMD_INLINE batch mulhi_i64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + auto ux = ::xsimd::bitwise_cast(x); + auto uy = ::xsimd::bitwise_cast(y); + auto uhi = mulhi_u64_core(ux, uy, mul_epu32); + auto sa = ::xsimd::bitwise_cast(x >> 63); + auto sb = ::xsimd::bitwise_cast(y >> 63); + return ::xsimd::bitwise_cast(uhi - (uy & sa) - (ux & sb)); + } + } + + template ::value>*/> + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return detail::mulhi_helper::compute(x, y); }, + self, other); + } + // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ebffd910b..588e0fecf 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -928,6 +928,50 @@ namespace xsimd } } + // mulhi + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mulhi_epi16(self, other); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mulhi_epu16(self, other); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m256i even = _mm256_mul_epi32(self, other); + __m256i odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), + _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1))); + __m256i even_hi = _mm256_srli_epi64(even, 32); + return _mm256_blend_epi16(even_hi, odd, 0xCC); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m256i even = _mm256_mul_epu32(self, other); + __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32)); + __m256i even_hi = _mm256_srli_epi64(even, 32); + return _mm256_blend_epi16(even_hi, odd, 0xCC); + } + + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_i64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + // reduce_add template 
::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 28e2e98d6..104ed71a7 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -470,6 +470,18 @@ namespace xsimd } } + // mulhi + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mulhi_epi16(self, other); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mulhi_epu16(self, other); + } + // neq template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 79b1e0398..2fa6dd709 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1772,6 +1772,41 @@ namespace xsimd } } + // mulhi + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i even = _mm512_mul_epi32(self, other); + __m512i odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))), + _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1)))); + __m512i even_hi = _mm512_srli_epi64(even, 32); + // merge: even_hi has hi in low-32 of each 64, odd has hi in high-32 of each 64 + return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i even = _mm512_mul_epu32(self, other); + __m512i odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32)); + __m512i even_hi = _mm512_srli_epi64(even, 32); + return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); + } + + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_i64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index e78864f6e..21a096f74 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -58,6 +58,8 @@ namespace xsimd template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept; + template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept; @@ -120,6 +122,18 @@ namespace xsimd XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept; + + // Shared 64-bit mulhi cores, defined in xsimd_common_arithmetic.hpp. 
+ // Forward-declared here so arch-specific kernels (SSE4.1, AVX2, + // AVX-512) can name them with an explicit template argument. + template + XSIMD_INLINE batch mulhi_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; + template + XSIMD_INLINE batch mulhi_i64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; } } } diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 05200cd07..51b1e9848 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -1037,6 +1037,54 @@ namespace xsimd return wrap::x_vmulq>(register_type(lhs), register_type(rhs)); } + /********* + * mulhi * + *********/ + + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + int16x8_t lo = vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs)); + int16x8_t hi = vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs)); + return vcombine_s8(vshrn_n_s16(lo, 8), vshrn_n_s16(hi, 8)); + } + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + uint16x8_t lo = vmull_u8(vget_low_u8(lhs), vget_low_u8(rhs)); + uint16x8_t hi = vmull_u8(vget_high_u8(lhs), vget_high_u8(rhs)); + return vcombine_u8(vshrn_n_u16(lo, 8), vshrn_n_u16(hi, 8)); + } + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + int32x4_t lo = vmull_s16(vget_low_s16(lhs), vget_low_s16(rhs)); + int32x4_t hi = vmull_s16(vget_high_s16(lhs), vget_high_s16(rhs)); + return vcombine_s16(vshrn_n_s32(lo, 16), vshrn_n_s32(hi, 16)); + } + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + uint32x4_t lo = vmull_u16(vget_low_u16(lhs), vget_low_u16(rhs)); + uint32x4_t hi = vmull_u16(vget_high_u16(lhs), vget_high_u16(rhs)); + return vcombine_u16(vshrn_n_u32(lo, 16), vshrn_n_u32(hi, 16)); + } + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + int64x2_t lo = vmull_s32(vget_low_s32(lhs), vget_low_s32(rhs)); + int64x2_t hi = vmull_s32(vget_high_s32(lhs), vget_high_s32(rhs)); + return vcombine_s32(vshrn_n_s64(lo, 32), vshrn_n_s64(hi, 32)); + } + template + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + uint64x2_t lo = vmull_u32(vget_low_u32(lhs), vget_low_u32(rhs)); + uint64x2_t hi = vmull_u32(vget_high_u32(lhs), vget_high_u32(rhs)); + return vcombine_u32(vshrn_n_u64(lo, 32), vshrn_n_u64(hi, 32)); + } + // 64-bit intentionally falls through to the common scalar fallback + /******* * div * *******/ diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 3ae649fdb..ae7347f3a 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -567,6 +567,9 @@ namespace xsimd (__riscv_vmul), (__riscv_vmul), (__riscv_vfmul), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvmulh, + (__riscv_vmulh), + (__riscv_vmulhu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvdiv, (__riscv_vdiv), (__riscv_vdivu), @@ -659,6 +662,13 @@ namespace xsimd return detail_rvv::rvvmul(lhs, rhs); } + // mulhi + template ::value, int>::type = 0> + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail_rvv::rvvmulh(lhs, rhs); + } + // div template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c190b753c..fd593e7bd 100644 --- 
a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1438,6 +1438,18 @@ namespace xsimd return _mm_mullo_epi16(self, other); } + // mulhi + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mulhi_epi16(self, other); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mulhi_epu16(self, other); + } + // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 4d544c11c..cee10091d 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -359,6 +359,42 @@ namespace xsimd } } + // mulhi + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m128i even = _mm_mul_epi32(self, other); // 64-bit products in lanes 0,2 + __m128i odd = _mm_mul_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), + _mm_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1))); + // hi halves in the low 32 of each 64 lane of (even>>32), and in the high 32 of odd + __m128i even_hi = _mm_srli_epi64(even, 32); + // blend: 32-bit lanes {even_hi[0], odd_hi[1], even_hi[2], odd_hi[3]} + return _mm_blend_epi16(even_hi, odd, 0xCC); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + __m128i even = _mm_mul_epu32(self, other); + __m128i odd = _mm_mul_epu32(_mm_srli_epi64(self, 32), _mm_srli_epi64(other, 32)); + __m128i even_hi = _mm_srli_epi64(even, 32); + return _mm_blend_epi16(even_hi, odd, 0xCC); + } + + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm_mul_epu32(a, b)); }); + } + template + XSIMD_INLINE batch mulhi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_i64_core(self, other, + [](batch a, batch b) + { return batch(_mm_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index c15404dd9..7776f1ef6 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -293,6 +293,13 @@ namespace xsimd return svmul_x(detail_sve::ptrue(), lhs, rhs); } + // mulhi + template ::value, int>::type = 0> + XSIMD_INLINE batch mulhi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return svmulh_x(detail_sve::ptrue(), lhs, rhs); + } + // div template = 4, int> = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6a7206116..09fe1a2a4 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "../arch/xsimd_isa.hpp" #include "../types/xsimd_batch.hpp" @@ -1744,6 +1745,75 @@ namespace xsimd return x * y; } + /** + * @ingroup batch_arithmetic + * + * Computes the low N bits of the 2N-bit product of integer batches \c x and \c y. + * Equivalent to ``mul(x, y)`` — the low half of the full product is identical for + * both signed and unsigned interpretations. 
+ * + * The function behaves as if it computes the per-slot product of \c x and \c y using + * double the input width as an intermediate representation (e.g. 64 bits for 32-bit + * inputs), then returns the lower half of the result (the lower 32 bits in this + * example). + * @tparam T integer element type of the batch. + * @param x batch involved in the product. + * @param y batch involved in the product. + * @return the low N bits of the product, lane-wise. + */ + template ::value, void>::type> + XSIMD_INLINE batch mullo(batch const& x, batch const& y) noexcept + { + detail::static_check_supported_config(); + return x * y; + } + + /** + * @ingroup batch_arithmetic + * + * Computes the high N bits of the 2N-bit product of integer batches \c x and \c y. + * Signed and unsigned variants differ; the element type of \c x / \c y decides the + * interpretation. + * + * The function behaves as if it computes the per-slot product of \c x and \c y using + * double the input width as an intermediate representation (e.g. 64 bits for 32-bit + * inputs), then returns the upper half of the result (the upper 32 bits in this + * example). + * @tparam T integer element type of the batch. + * @param x batch involved in the product. + * @param y batch involved in the product. + * @return the high N bits of the product, lane-wise. + */ + template ::value, void>::type> + XSIMD_INLINE batch mulhi(batch const& x, batch const& y) noexcept + { + detail::static_check_supported_config(); + return kernel::mulhi(x, y, A {}); + } + + /** + * @ingroup batch_arithmetic + * + * Computes the full 2N-bit product of integer batches \c x and \c y, returning both + * the high and low N-bit halves as ``std::pair{hi, lo}``. + * + * The function behaves as if it computes the per-slot product of \c x and \c y using + * double the input width as an intermediate representation (e.g. 64 bits for 32-bit + * inputs), then returns both halves of the result (the upper and lower 32 bits in + * this example). + * @tparam T integer element type of the batch. + * @param x batch involved in the product. + * @param y batch involved in the product. + * @return pair of batches ``{hi, lo}``. 
+ */ + template ::value, void>::type> + XSIMD_INLINE std::pair, batch> + mulhilo(batch const& x, batch const& y) noexcept + { + detail::static_check_supported_config(); + return std::pair, batch> { mulhi(x, y), x * y }; + } + /** * @ingroup batch_rounding * diff --git a/test/test_batch.cpp b/test/test_batch.cpp index 403bf00df..65bfdd76d 100644 --- a/test/test_batch.cpp +++ b/test/test_batch.cpp @@ -21,6 +21,49 @@ using namespace std::placeholders; +namespace detail_test_mulhilo +{ + template + typename std::enable_if::value && (sizeof(T) <= 4), T>::type + mulhi_reference(T x, T y) noexcept + { + using W = typename std::conditional::value, int64_t, uint64_t>::type; + return static_cast((static_cast(x) * static_cast(y)) >> (8 * sizeof(T))); + } + +#if defined(__SIZEOF_INT128__) + template + typename std::enable_if::value && (sizeof(T) == 8), T>::type + mulhi_reference(T x, T y) noexcept + { + using W = typename std::conditional::value, __int128, unsigned __int128>::type; + return static_cast((static_cast(x) * static_cast(y)) >> 64); + } +#else + template + typename std::enable_if::value && (sizeof(T) == 8), T>::type + mulhi_reference(T x, T y) noexcept + { + uint64_t ux = static_cast(x); + uint64_t uy = static_cast(y); + uint64_t xl = ux & 0xffffffffULL, xh = ux >> 32; + uint64_t yl = uy & 0xffffffffULL, yh = uy >> 32; + uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh; + uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL); + uint64_t hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32); + if (std::is_signed::value) + { + if (x < 0) + hi -= uy; + if (y < 0) + hi -= ux; + } + return static_cast(hi); + } +#endif +} +using detail_test_mulhilo::mulhi_reference; + template struct batch_test { @@ -313,6 +356,145 @@ struct batch_test } } + template + void test_mulhilo_impl(std::true_type /*integral*/) const + { + using UT = typename std::make_unsigned::type; + + auto run_case = [](array_type const& a, array_type const& b, const char* tag) + { + batch_type ba = batch_type::load_unaligned(a.data()); + batch_type bb = batch_type::load_unaligned(b.data()); + + array_type lo_expected; + array_type hi_expected; + for (std::size_t i = 0; i < size; ++i) + { + lo_expected[i] = static_cast(static_cast(a[i]) * static_cast(b[i])); + hi_expected[i] = mulhi_reference(a[i], b[i]); + } + + batch_type lo_res = xsimd::mullo(ba, bb); + INFO("mullo(batch, batch) [" << tag << "]"); + CHECK_BATCH_EQ(lo_res, lo_expected); + + batch_type hi_res = xsimd::mulhi(ba, bb); + INFO("mulhi(batch, batch) [" << tag << "]"); + CHECK_BATCH_EQ(hi_res, hi_expected); + + auto p = xsimd::mulhilo(ba, bb); + INFO("mulhilo.first == mulhi [" << tag << "]"); + CHECK_BATCH_EQ(p.first, hi_res); + INFO("mulhilo.second == mullo [" << tag << "]"); + CHECK_BATCH_EQ(p.second, lo_res); + }; + + // baseline: small operands from init_operands + run_case(lhs, rhs, "small"); + + // edge operands that actually exercise the high half + constexpr value_type vmin = std::numeric_limits::min(); + constexpr value_type vmax = std::numeric_limits::max(); + constexpr bool is_signed = std::is_signed::value; + + // Pattern A: extremes paired with extremes (covers vmax*vmax, vmin*vmin, + // vmin*vmax, vmin*-1 — the classic signed-overflow corners). + { + array_type a, b; + for (std::size_t i = 0; i < size; ++i) + { + switch (i % 8) + { + case 0: + a[i] = vmax; + b[i] = vmax; + break; + case 1: + a[i] = vmin; + b[i] = vmin; + break; + case 2: + a[i] = vmin; + b[i] = vmax; + break; + case 3: + a[i] = vmax; + b[i] = static_cast(is_signed ? 
-1 : vmax); + break; + case 4: + a[i] = static_cast(is_signed ? -1 : vmax); + b[i] = static_cast(is_signed ? -1 : vmax); + break; + case 5: + a[i] = vmin; + b[i] = static_cast(is_signed ? -1 : 1); + break; + case 6: + a[i] = static_cast(vmax / 2 + 1); + b[i] = static_cast(vmax / 2 + 1); + break; + case 7: + a[i] = static_cast(1); + b[i] = vmax; + break; + } + } + run_case(a, b, "extremes"); + } + + // Pattern B: high-half-non-zero, mixed signs (each lane unique so we + // catch lane-wise bugs in 32/64-bit emulated mulhi paths). + { + array_type a, b; + constexpr std::size_t bits = 8 * sizeof(value_type); + constexpr std::size_t half = bits / 2; + const UT half_mask = (static_cast(1) << half) - 1; + for (std::size_t i = 0; i < size; ++i) + { + // Spread bits across both halves so the product overflows the + // low half. Use deterministic but lane-varying patterns. + UT ua = static_cast((static_cast(0xA53C97E1ULL) ^ (static_cast(i) * 0x9E37ULL)) + | (static_cast(i + 1) << half)); + UT ub = static_cast((static_cast(0x6BD1F4A7ULL) ^ (static_cast(i) * 0xC2B5ULL)) + | (static_cast((i * 3) + 1) << half)); + // Make sure both halves are non-zero so the product spans bits. + if ((ua & half_mask) == 0) + ua |= static_cast(1); + if ((ub & half_mask) == 0) + ub |= static_cast(1); + a[i] = static_cast(ua); + b[i] = static_cast(ub); + } + run_case(a, b, "wide-bits"); + } + + // Pattern C: signed correction terms — only meaningful for signed + // types but harmless for unsigned (we still check correctness). + { + array_type a, b; + for (std::size_t i = 0; i < size; ++i) + { + // Alternate negative * positive and negative * negative for + // signed; for unsigned this just samples large magnitudes. + value_type x = static_cast(vmax - static_cast(i)); + value_type y = static_cast(is_signed + ? (i % 2 == 0 ? -static_cast(i + 1) + : static_cast(i + 1)) + : static_cast(vmax - (i * 7))); + a[i] = x; + b[i] = y; + } + run_case(a, b, "signed-correction"); + } + } + + void test_mulhilo_impl(std::false_type /*not integral*/) const { } + + void test_mulhilo() const + { + test_mulhilo_impl(typename std::is_integral::type {}); + } + void test_saturated_arithmetic() const { // batch + batch @@ -1016,6 +1198,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES) Test.test_incr_decr(); } + SUBCASE("mulhilo") + { + Test.test_mulhilo(); + } + SUBCASE("saturated_arithmetic") { Test.test_saturated_arithmetic(); From 3d319b1a261afbe1e1452a1e98c858923f0e2169 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 1 May 2026 16:44:44 -0400 Subject: [PATCH 3/3] ci: use newer qemu-user-static for RVV cross tests docker/setup-qemu-action@v3.0.0 pulls tonistiigi/binfmt:latest, which pins an older qemu (~6.x/7.x) whose RVV implementation miscompiles the vmulh / vmulhu variants emitted by gcc-14 at vlen=128: the test binary runs forever under qemu while making no forward progress, hitting the 6h GHA job timeout. Local repro with qemu 11 / Ubuntu 24.04's qemu-user-static (8.2.x) passes the same binary in ~30s. Switch to the apt-shipped qemu-user-static so the cross-rvv job picks up a maintained qemu without depending on the pinned docker image. 
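A plausible local-repro recipe (binary name, sysroot path, and exact CPU
flags are illustrative, not taken from this series):

    # Ubuntu 24.04 ships qemu-user-static 8.2.x
    sudo apt-get install --no-install-recommends qemu-user-static
    qemu-riscv64-static --version

    # Run the cross-built test binary at vlen=128 to match the CI setup;
    # -L points qemu at the riscv64 sysroot used for cross-linking.
    qemu-riscv64-static -cpu rv64,v=true,vlen=128,elen=64 \
        -L /usr/riscv64-linux-gnu ./test/test_xsimd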
---
 .github/workflows/cross-rvv.yml | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cross-rvv.yml b/.github/workflows/cross-rvv.yml
index cbe19c7c4..6a51e5db8 100644
--- a/.github/workflows/cross-rvv.yml
+++ b/.github/workflows/cross-rvv.yml
@@ -35,9 +35,14 @@ jobs:
           sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++
           rm llvm.sh
       - name: Setup QEMU
-        uses: docker/setup-qemu-action@v3.0.0
-        with:
-          platforms: riscv64
+        # Use the qemu-user-static package shipped by Ubuntu 24.04 (qemu 8.2+)
+        # rather than docker/setup-qemu-action, which pins an older qemu via
+        # tonistiigi/binfmt and miscompiles RVV vmulh* at vlen=128 (the test
+        # binary hangs under qemu, with no forward progress, until the GHA
+        # 6h timeout).
+        run: |
+          sudo apt-get -y -qq update
+          sudo apt-get -y -qq --no-install-suggests --no-install-recommends install qemu-user-static
       - name: Setup Ninja
         run: |
           sudo apt-get -y -qq install ninja-build