From b57a766745c7a1d69d4de81c8066a4be48458342 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 30 Apr 2026 18:51:37 -0400 Subject: [PATCH 1/3] feat: add runtime batch_bool mask overloads for load_masked/store_masked Add runtime-mask overloads of xsimd::load_masked and xsimd::store_masked across AVX2, AVX-512, SSE, SVE, RVV, and NEON. The generic common-path fallback is collapsed to a whole-vector select, and the unaligned page-cross fast path is dropped since the underlying intrinsics suppress faults on masked-off lanes regardless of alignment. Also: forward SVE compile-time masked load/store through the runtime path so the per-lane predicate is correct on SVE wider than 128 bits (the previous svdupq_b* path replicates a 128-bit chunk pattern across the vector). --- docs/source/api/data_transfer.rst | 17 ++- .../xsimd/arch/common/xsimd_common_memory.hpp | 39 ++++++ include/xsimd/arch/xsimd_avx.hpp | 33 +++++ include/xsimd/arch/xsimd_avx2.hpp | 33 ++++- include/xsimd/arch/xsimd_common_fwd.hpp | 4 + include/xsimd/arch/xsimd_rvv.hpp | 24 ++++ include/xsimd/arch/xsimd_sve.hpp | 43 +++++- include/xsimd/types/xsimd_api.hpp | 124 ++++++++++++++++++ include/xsimd/types/xsimd_batch.hpp | 53 +++++++- include/xsimd/types/xsimd_utils.hpp | 10 ++ test/test_load_store.cpp | 85 ++++++++++++ 11 files changed, 452 insertions(+), 13 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 815f56293..db63fbc39 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,16 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type, is + resolved at compile time and is always efficient. The runtime + :cpp:class:`batch_bool` overload, by contrast, falls back to a per-lane + scalar loop on architectures without a native masked load/store + instruction — SSE2 through SSE4.2, NEON/NEON64, VSX, S390x, and WASM. + AVX, AVX2, AVX-512, SVE and RVV use native masked instructions and pay no + such penalty. Prefer the compile-time mask whenever the selection is known + at compile time, and avoid runtime-mask loads/stores in hot inner loops on + the affected architectures. 
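A minimal usage sketch (illustration only, not part of the diff): a loop remainder handled with the runtime-mask overloads this patch introduces. It assumes a float batch on the default architecture and that the remainder count fits in the 64-bit lane mask; batch_bool::from_mask and the batch_bool overloads of xsimd::load / xsimd::store are the APIs added above, everything else is stock xsimd.

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // y[i] = a * x[i] + y[i] for an arbitrary n, with no scalar tail loop.
    void saxpy(float* y, const float* x, float a, std::size_t n)
    {
        using batch = xsimd::batch<float>;
        using mask_t = xsimd::batch_bool<float>;
        std::size_t i = 0;
        for (; i + batch::size <= n; i += batch::size)
        {
            auto yv = batch::load_unaligned(y + i);
            auto xv = batch::load_unaligned(x + i);
            (a * xv + yv).store_unaligned(y + i);
        }
        if (i < n)
        {
            // Active lanes are the low (n - i) lanes; inactive lanes are
            // neither read nor written, so a partial buffer is safe.
            const mask_t m = mask_t::from_mask((uint64_t(1) << (n - i)) - 1);
            auto yv = xsimd::load(y + i, m, xsimd::unaligned_mode());
            auto xv = xsimd::load(x + i, m, xsimd::unaligned_mode());
            xsimd::store(y + i, a * xv + yv, m, xsimd::unaligned_mode());
        }
    }

Per the footnote above, on SSE2 through SSE4.2 and NEON this remainder lowers to the scalar fallback, while AVX2/AVX-512/SVE/RVV turn it into a single predicated load/store pair.
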
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index c8038334a..d3834583a 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" @@ -374,6 +375,25 @@ namespace xsimd return batch::load(buffer.data(), aligned_mode {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Per-lane validity contract: only active lanes of ``mem`` are + // required to be addressable. An unconditional whole-vector load + // would touch inactive lanes and trip ASan/Valgrind on partial + // buffers, so stay scalar. Arches with hardware predicated loads + // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single + // intrinsic that suppresses inactive-lane reads in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer {}; + const uint64_t bits = mask.mask(); + for (std::size_t i = 0; i < size; ++i) + if ((bits >> i) & uint64_t(1)) + buffer[i] = mem[i]; + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept @@ -388,6 +408,25 @@ namespace xsimd } } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Per-lane validity contract (matches native masked-store APIs): + // only active lanes of ``mem`` are touched. A load+select+store + // RMW would both read and write inactive bytes, breaking that + // contract — stay scalar. Arches with hardware predicated stores + // override this with a single intrinsic that suppresses inactive + // lanes in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + const uint64_t bits = mask.mask(); + for (std::size_t i = 0; i < size; ++i) + if ((bits >> i) & uint64_t(1)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 305041f11..429637784 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,6 +1015,23 @@ namespace xsimd } } + // Runtime-mask load for float/double on AVX. Both aligned_mode and + // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // store_masked namespace detail { @@ -1031,6 +1048,22 @@ namespace xsimd } } + // Runtime-mask store for float/double on AVX. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src); + } + template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ebffd910b..d3e9330be 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -119,7 +119,6 @@ namespace xsimd } // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) namespace detail { XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept @@ -138,14 +137,12 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -175,6 +172,20 @@ namespace xsimd return bitwise_cast(r); } + // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers + // fall back to the scalar common path: AVX2 has no native maskload for + // those widths, and a load-then-blend would break fault-suppression at + // page boundaries (the main reason callers ask for a masked load). + // Both aligned_mode and unaligned_mode route to the same intrinsic — + // masked-off lanes do not fault regardless of alignment. 
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload(reinterpret_cast(mem), __m256i(mask)); + } + // store_masked namespace detail { @@ -196,14 +207,12 @@ namespace xsimd { constexpr size_t lanes_per_half = batch::size / 2; - // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } - // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); @@ -230,6 +239,20 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm256_maskstore_epi32(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + else + { + _mm256_maskstore_epi64(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + } + // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index e78864f6e..1474aeeb8 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -79,8 +79,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 3ae649fdb..df64d3c1b 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. 
+ template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index c15404dd9..e5729e65e 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. 
+ template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6a7206116..85bb72899 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1550,6 +1550,36 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Lanes whose mask bit is \c false do not fault, + * so partial loads across a page boundary are safe. \c stream_mode is not + * supported. + * + * \warning Runtime-mask loads carry a significant performance penalty on + * architectures without a native masked load instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked load out of hot inner loops. + * @param ptr the memory buffer to read. Must be aligned. + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1570,6 +1600,37 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Lanes whose mask bit is \c false do not fault, + * so partial loads across a page boundary are safe. \c stream_mode is not + * supported. + * + * \warning Runtime-mask loads carry a significant performance penalty on + * architectures without a native masked load instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked load out of hot inner loops. + * @param ptr the memory buffer to read. The buffer does not need to be + * aligned. 
+ * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2663,6 +2724,37 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion is + * performed: \c mem must point to \c T. Lanes whose mask bit is \c false do + * not fault, so partial stores across a page boundary are safe. \c + * stream_mode is not supported. + * + * \warning Runtime-mask stores carry a significant performance penalty on + * architectures without a native masked store instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked store out of hot inner loops. + * @param mem the memory buffer to write to. Must be aligned. + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2683,6 +2775,38 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion is + * performed: \c mem must point to \c T. Lanes whose mask bit is \c false do + * not fault, so partial stores across a page boundary are safe. \c + * stream_mode is not supported. + * + * \warning Runtime-mask stores carry a significant performance penalty on + * architectures without a native masked store instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked store out of hot inner loops. + * @param mem the memory buffer to write to. The buffer does not need to be + * aligned. 
+ * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index b584a2d81..8826e35c7 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -144,9 +144,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -156,9 +159,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). */ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -722,6 +728,26 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + static_assert(std::is_same::value || std::is_same::value, + "supported load mode"); + constexpr uint64_t full_mask = details::full_mask(size); + const auto bits = mask.mask(); + if (bits == 0) + { + return broadcast(0); + } + if (bits == full_mask) + { + return load(mem, mode); + } + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -745,6 +771,29 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + static_assert(std::is_same::value || std::is_same::value, + "supported store mode"); + constexpr uint64_t full_mask = details::full_mask(size); + const auto bits = mask.mask(); + if (bits == 0) + { + return; + } + if (bits == full_mask) + { + store(mem, mode); + return; + } + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index 6af62c1a0..940ee084f 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,6 +457,16 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; + + namespace details + { + // Returns a bitmask with the lowest \c size bits set. Used by masked + // load/store fast paths to detect "all lanes active". + inline constexpr uint64_t full_mask(std::size_t size) noexcept + { + return size >= 64 ? 
~uint64_t(0) : ((uint64_t(1) << size) - 1); + } + } } #endif diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index a5266eeb3..411b69472 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -105,6 +105,20 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -377,6 +391,16 @@ struct load_store_test run_load_mask_pattern(v, name, b, expected, " masked odd elements"); run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked none"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first element"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first half"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked last half"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first N"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked last N"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked even elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked odd elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked pseudo random"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked all elements"); } template @@ -404,6 +428,26 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } + template + void run_load_runtime_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) + { + const auto mask = make_runtime_mask(); + array_type expected_masked { 0 }; + + for (std::size_t i = 0; i < size; ++i) + { + const bool active = Generator::get(i, size); + expected_masked[i] = active ? expected[i] : value_type(); + } + + b = xsimd::load(v.data(), mask, xsimd::aligned_mode()); + INFO(name, label + " aligned"); + CHECK_BATCH_EQ(b, expected_masked); + b = xsimd::load(v.data(), mask, xsimd::unaligned_mode()); + INFO(name, label + " unaligned"); + CHECK_BATCH_EQ(b, expected_masked); + } + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { @@ -422,6 +466,24 @@ struct load_store_test CHECK_VECTOR_EQ(res, expected_masked); } + template + void run_store_runtime_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) + { + const auto mask = make_runtime_mask(); + for (std::size_t i = 0; i < size; ++i) + { + expected_masked[i] = Generator::get(i, size) ? 
v[i] : value_type(); + } + std::fill(res.begin(), res.end(), value_type()); + xsimd::store(res.data(), b, mask, xsimd::aligned_mode()); + INFO(name, label + " aligned"); + CHECK_VECTOR_EQ(res, expected_masked); + std::fill(res.begin(), res.end(), value_type()); + xsimd::store(res.data(), b, mask, xsimd::unaligned_mode()); + INFO(name, label + " unaligned"); + CHECK_VECTOR_EQ(res, expected_masked); + } + template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { @@ -434,6 +496,15 @@ struct load_store_test run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first element"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first half"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked last half"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first N"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked last N"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked even elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked odd elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked pseudo random"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked all elements"); } template @@ -453,6 +524,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -470,6 +542,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); } From e22734657bbba4015b985264b8cb620635044c89 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 30 Apr 2026 18:34:44 -0400 Subject: [PATCH 2/3] feat: add load_head / load_tail / store_head / store_tail APIs Sugar over runtime-mask load/store for loop head/tail remainders. Take ``n`` directly instead of a constructed batch_bool; only ``mem[0, n)`` is touched. ``head`` uses mask ``(1 << n) - 1``; ``tail`` uses ``((1 << n) - 1) << (size - n)`` with a base-pointer offset (via uintptr_t to dodge -Warray-bounds), so every arch with native predicated load/store inherits its intrinsic for free. 
Tested on sse2/sse41/avx2/avx512f/emulated256 native and neon64/rvv under qemu. --- docs/source/api/data_transfer.rst | 14 ++ .../xsimd/arch/common/xsimd_common_memory.hpp | 68 ++++++-- include/xsimd/arch/xsimd_common_fwd.hpp | 10 ++ include/xsimd/types/xsimd_api.hpp | 89 ++++++++++ include/xsimd/types/xsimd_batch.hpp | 112 ++++++++++++ test/test_load_store.cpp | 165 ++++++++++++++++++ 6 files changed, 446 insertions(+), 12 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index db63fbc39..74aa4b512 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -20,6 +20,10 @@ From memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_as` | load values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`load_head` | load the first ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`load_tail` | load the last ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ From a scalar: @@ -40,6 +44,10 @@ To memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_as` | store values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`store_head` | store the first ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`store_tail` | store the last ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ In place: @@ -97,3 +105,9 @@ The following empty types are used for tag dispatching: such penalty. Prefer the compile-time mask whenever the selection is known at compile time, and avoid runtime-mask loads/stores in hot inner loops on the affected architectures. + +.. [#h] ``load_head`` / ``store_head`` / ``load_tail`` / ``store_tail`` + take a runtime element count ``n`` instead of a constructed mask; + they are sugar for the runtime-mask ``load`` / ``store`` with a + contiguous-prefix or contiguous-suffix mask, and inherit its + contract and per-arch codegen. diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index d3834583a..92a1155b5 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -379,12 +379,8 @@ namespace xsimd XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - // Per-lane validity contract: only active lanes of ``mem`` are - // required to be addressable. An unconditional whole-vector load - // would touch inactive lanes and trip ASan/Valgrind on partial - // buffers, so stay scalar. Arches with hardware predicated loads - // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single - // intrinsic that suppresses inactive-lane reads in hardware. + // Per-lane validity contract: only active lanes are read. + // Arches with hardware predicated loads override this. 
constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer {}; const uint64_t bits = mask.mask(); @@ -412,12 +408,8 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - // Per-lane validity contract (matches native masked-store APIs): - // only active lanes of ``mem`` are touched. A load+select+store - // RMW would both read and write inactive bytes, breaking that - // contract — stay scalar. Arches with hardware predicated stores - // override this with a single intrinsic that suppresses inactive - // lanes in hardware. + // Per-lane validity contract: only active lanes are written. + // Arches with hardware predicated stores override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); @@ -427,6 +419,58 @@ namespace xsimd mem[i] = src_buf[i]; } + // Head/tail forward to the runtime-mask path. ``tail`` offsets + // the base pointer back by ``(size - n)`` so the active high-``n`` + // lanes land at ``[mem, mem + n)``; the offset goes through + // ``uintptr_t`` to dodge ``-Warray-bounds`` on small buffers. + namespace detail + { + template + XSIMD_INLINE T const* offset_back(T const* p, std::size_t k) noexcept + { + return reinterpret_cast(reinterpret_cast(p) - k * sizeof(T)); + } + template + XSIMD_INLINE T* offset_back(T* p, std::size_t k) noexcept + { + return reinterpret_cast(reinterpret_cast(p) - k * sizeof(T)); + } + } + + template + XSIMD_INLINE batch + load_head(T const* mem, std::size_t n, Mode, requires_arch) noexcept + { + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n)); + return load_masked(mem, mask, convert {}, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void + store_head(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept + { + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n)); + store_masked(mem, src, mask, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch + load_tail(T const* mem, std::size_t n, Mode, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n) << (size - n)); + return load_masked(detail::offset_back(mem, size - n), mask, convert {}, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void + store_tail(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n) << (size - n)); + store_masked(detail::offset_back(mem, size - n), src, mask, unaligned_mode {}, A {}); + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 1474aeeb8..42106267c 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -102,6 +102,16 @@ namespace xsimd template XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + // Head/tail: contiguous prefix/suffix variants of the masked load/store. 
+ template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, Mode, requires_arch) noexcept; + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, Mode, requires_arch) noexcept; + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept; + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept; + // Forward declarations for pack-level helpers namespace detail { diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 85bb72899..e3849f981 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1631,6 +1631,51 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Loads the prefix \c mem[0, n) into the low \c n lanes; remaining + * lanes are zero. Sugar for a runtime-mask load with mask + * (1 << n) - 1; same contract — only \c mem[0, n) is read. + * \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load_head(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load_head(mem, n, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Loads \c mem[0, n) into the high \c n lanes (lanes + * [size - n, size)); remaining low lanes are zero. Same + * contract as \ref load_head. \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load_tail(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load_tail(mem, n, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2807,6 +2852,50 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Stores the low \c n lanes of \c val to \c mem[0, n). Sugar for a + * runtime-mask store with mask (1 << n) - 1; same contract — + * only \c mem[0, n) is written. \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& val, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store_head(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& val, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store_head(mem, n, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Stores the high \c n lanes (lanes [size - n, size)) of + * \c val to \c mem[0, n). Same contract as \ref store_head. \c n is + * clamped to \c batch::size. 
+ */ + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& val, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store_tail(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& val, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store_tail(mem, n, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 8826e35c7..b99d66c70 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -151,6 +151,12 @@ namespace xsimd template XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; + // Head/tail: contiguous prefix/suffix variants of the runtime-mask store. + XSIMD_INLINE void store_head(T* mem, std::size_t n, aligned_mode) const noexcept; + XSIMD_INLINE void store_head(T* mem, std::size_t n, unaligned_mode) const noexcept; + XSIMD_INLINE void store_tail(T* mem, std::size_t n, aligned_mode) const noexcept; + XSIMD_INLINE void store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept; + template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; template @@ -168,6 +174,12 @@ namespace xsimd template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; + // Head/tail: contiguous prefix/suffix variants of the runtime-mask load. + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept; + template XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch const& index) noexcept; template @@ -794,6 +806,106 @@ namespace xsimd kernel::store_masked(mem, *this, mask, mode, A {}); } + template + XSIMD_INLINE batch batch::load_head(T const* mem, std::size_t n, aligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_aligned(mem); + return kernel::load_head(mem, n, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_head(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_unaligned(mem); + return kernel::load_head(mem, n, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_tail(T const* mem, std::size_t n, aligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_aligned(mem); + return kernel::load_tail(mem, n, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_unaligned(mem); + return kernel::load_tail(mem, n, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_head(T* mem, std::size_t n, aligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + 
store_aligned(mem); + return; + } + kernel::store_head(mem, n, *this, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_head(T* mem, std::size_t n, unaligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_unaligned(mem); + return; + } + kernel::store_head(mem, n, *this, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_tail(T* mem, std::size_t n, aligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_aligned(mem); + return; + } + kernel::store_tail(mem, n, *this, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_unaligned(mem); + return; + } + kernel::store_tail(mem, n, *this, unaligned_mode {}, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 411b69472..5837d4061 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -238,6 +238,169 @@ struct load_store_test #endif } + template + void run_load_head_tail(const V&, const std::string&, std::false_type) + { + } + + template + void run_load_head_tail(const V& v, const std::string& name) + { + run_load_head_tail(v, name, std::is_same {}); + } + + template + void run_store_head_tail(const V&, const std::string&, std::false_type) + { + } + + template + void run_store_head_tail(const V& v, const std::string& name) + { + run_store_head_tail(v, name, std::is_same {}); + } + + template + void run_load_head_tail(const V& v, const std::string& name, std::true_type) + { + const array_type ref = [&] + { + array_type a; + std::copy(v.cbegin(), v.cend(), a.begin()); + return a; + }(); + + for (std::size_t n = 0; n <= size; ++n) + { + array_type expected_head {}; + for (std::size_t i = 0; i < n; ++i) + expected_head[i] = ref[i]; + array_type expected_tail {}; + for (std::size_t i = 0; i < n; ++i) + expected_tail[size - n + i] = ref[i]; + + const value_type* mem = v.data(); + using arch_t = typename batch_type::arch_type; + batch_type b = xsimd::load_head(mem, n, xsimd::aligned_mode()); + INFO(name, " load_head aligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = xsimd::load_head(mem, n, xsimd::unaligned_mode()); + INFO(name, " load_head unaligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = batch_type::load_head(mem, n, xsimd::aligned_mode()); + INFO(name, " batch::load_head aligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = xsimd::load_tail(mem, n, xsimd::aligned_mode()); + INFO(name, " load_tail aligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + + b = xsimd::load_tail(mem, n, xsimd::unaligned_mode()); + INFO(name, " load_tail unaligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + + b = batch_type::load_tail(mem, n, xsimd::aligned_mode()); + INFO(name, " batch::load_tail aligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + } + } + + template + void run_store_head_tail(const V& v, const std::string& name, std::true_type) + { + static constexpr value_type sentinel = static_cast(91); + batch_type b = batch_type::load_aligned(v.data()); + V scratch(size); + + for (std::size_t n = 0; n <= size; ++n) + { + // store_head: low n lanes -> mem[0, n) + std::fill(scratch.begin(), scratch.end(), sentinel); + 
xsimd::store_head(scratch.data(), n, b, xsimd::aligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_head aligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_head aligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_head(scratch.data(), n, b, xsimd::unaligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_head unaligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_head unaligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + // store_tail: high n lanes -> mem[0, n) + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_tail(scratch.data(), n, b, xsimd::aligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_tail aligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[size - n + i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_tail aligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_tail(scratch.data(), n, b, xsimd::unaligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_tail unaligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[size - n + i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_tail unaligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + } + } + + void test_head_tail() + { + run_load_head_tail(i8_vec, "head/tail int8_t"); + run_load_head_tail(ui8_vec, "head/tail uint8_t"); + run_load_head_tail(i16_vec, "head/tail int16_t"); + run_load_head_tail(ui16_vec, "head/tail uint16_t"); + run_load_head_tail(i32_vec, "head/tail int32_t"); + run_load_head_tail(ui32_vec, "head/tail uint32_t"); + run_load_head_tail(i64_vec, "head/tail int64_t"); + run_load_head_tail(ui64_vec, "head/tail uint64_t"); + run_load_head_tail(f_vec, "head/tail float"); +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 + run_load_head_tail(d_vec, "head/tail double"); +#endif + + run_store_head_tail(i8_vec, "head/tail int8_t"); + run_store_head_tail(ui8_vec, "head/tail uint8_t"); + run_store_head_tail(i16_vec, "head/tail int16_t"); + run_store_head_tail(ui16_vec, "head/tail uint16_t"); + run_store_head_tail(i32_vec, "head/tail int32_t"); + run_store_head_tail(ui32_vec, "head/tail uint32_t"); + run_store_head_tail(i64_vec, "head/tail int64_t"); + run_store_head_tail(ui64_vec, "head/tail uint64_t"); + run_store_head_tail(f_vec, "head/tail float"); +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 + run_store_head_tail(d_vec, "head/tail double"); +#endif + } + void test_masked() { using arch = typename B::arch_type; @@ -688,6 +851,8 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES) SUBCASE("scatter") { Test.test_scatter(); } SUBCASE("masked") { Test.test_masked(); } + + SUBCASE("head_tail") { Test.test_head_tail(); } } #endif From d5f21c701993596fb18132680bbbf10f3bea363e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 1 May 2026 15:51:26 -0400 Subject: [PATCH 3/3] feat: add runtime batch_bool mask overloads for avx_128 / avx2_128 Mirror the AVX/AVX2 runtime-mask load_masked / store_masked overloads on the new 128-bit SSE-register variants of those ISAs: - avx_128: float / double via _mm_maskload_ps/pd, _mm_maskstore_ps/pd - avx2_128: 32/64-bit integers via _mm_maskload_epi32/64, 
_mm_maskstore_epi32/64 8/16-bit integers continue to fall through to the scalar common path (no native maskload/store intrinsic at those widths). Both alignment modes route to the same intrinsic since masked-off lanes do not fault. --- include/xsimd/arch/xsimd_avx2_128.hpp | 37 +++++++++++++++++++++++++++ include/xsimd/arch/xsimd_avx_128.hpp | 31 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7e8b0d05a..232667ef7 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -133,6 +133,43 @@ namespace xsimd return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); } + // Runtime-mask load for 32/64-bit integers on AVX2-128. 8/16-bit + // integers fall back to the scalar common path: there is no native + // _mm_maskload for those widths, and a load-then-blend would break + // fault-suppression at page boundaries (the main reason callers ask + // for a masked load). Both aligned_mode and unaligned_mode route to + // the same intrinsic — masked-off lanes do not fault regardless of + // alignment. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), __m128i(mask)); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), __m128i(mask)); + } + } + + // Runtime-mask store for 32/64-bit integers on AVX2-128. Same + // fault-suppression semantics as the masked loads above. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + } + } + // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index af17568e1..98642dac6 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,22 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load for float/double on AVX-128. Both aligned_mode and + // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +144,21 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store for float/double on AVX-128. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template ::value && sizeof(T) == sizeof(ITy)>> XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept
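Usage sketch for the head/tail helpers (illustration only, not part of the diff): the same remainder pattern as the runtime-mask example shown after PATCH 1/3, but passing the element count directly so no mask has to be built by hand. Assumes a float batch on the default architecture; load_head / store_head are the free functions added in PATCH 2/3.

    #include <cstddef>
    #include <xsimd/xsimd.hpp>

    // In-place scale of an arbitrary-length buffer; only buf[0, n) is touched.
    void scale(float* buf, std::size_t n, float factor)
    {
        using batch = xsimd::batch<float>;
        std::size_t i = 0;
        for (; i + batch::size <= n; i += batch::size)
        {
            (batch::load_unaligned(buf + i) * factor).store_unaligned(buf + i);
        }
        if (i < n)
        {
            // Remainder: the low (n - i) lanes only. Inactive lanes read as
            // zero on load and are left untouched in memory on store.
            auto v = xsimd::load_head(buf + i, n - i, xsimd::unaligned_mode());
            xsimd::store_head(buf + i, n - i, v * factor, xsimd::unaligned_mode());
        }
    }

Because the helpers forward to the runtime-mask path, every architecture with a native predicated load/store inherits its intrinsic here; the remaining targets take the scalar fallback, so the same caveat about hot inner loops applies.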