From b57a766745c7a1d69d4de81c8066a4be48458342 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 30 Apr 2026 18:51:37 -0400 Subject: [PATCH 1/3] feat: add runtime batch_bool mask overloads for load_masked/store_masked Add runtime-mask overloads of xsimd::load_masked and xsimd::store_masked across AVX2, AVX-512, SSE, SVE, RVV, and NEON. The generic common-path fallback is collapsed to a whole-vector select, and the unaligned page-cross fast path is dropped since the underlying intrinsics suppress faults on masked-off lanes regardless of alignment. Also: forward SVE compile-time masked load/store through the runtime path so the per-lane predicate is correct on SVE wider than 128 bits (the previous svdupq_b* path replicates a 128-bit chunk pattern across the vector). --- docs/source/api/data_transfer.rst | 17 ++- .../xsimd/arch/common/xsimd_common_memory.hpp | 39 ++++++ include/xsimd/arch/xsimd_avx.hpp | 33 +++++ include/xsimd/arch/xsimd_avx2.hpp | 33 ++++- include/xsimd/arch/xsimd_common_fwd.hpp | 4 + include/xsimd/arch/xsimd_rvv.hpp | 24 ++++ include/xsimd/arch/xsimd_sve.hpp | 43 +++++- include/xsimd/types/xsimd_api.hpp | 124 ++++++++++++++++++ include/xsimd/types/xsimd_batch.hpp | 53 +++++++- include/xsimd/types/xsimd_utils.hpp | 10 ++ test/test_load_store.cpp | 85 ++++++++++++ 11 files changed, 452 insertions(+), 13 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index 815f56293..db63fbc39 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -12,7 +12,7 @@ Data Transfers From memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`load` | load values from memory (optionally masked) | +| :cpp:func:`load` | load values from memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -32,7 +32,7 @@ From a scalar: To memory: +---------------------------------------+----------------------------------------------------+ -| :cpp:func:`store` | store values to memory (optionally masked) | +| :cpp:func:`store` | store values to memory (optionally masked) [#m]_ | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ @@ -84,3 +84,16 @@ The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd + +.. rubric:: Footnotes + +.. [#m] Masked ``load`` / ``store`` come in two flavours. The + :cpp:class:`batch_bool_constant` overload encodes the mask in the type, is + resolved at compile time and is always efficient. The runtime + :cpp:class:`batch_bool` overload, by contrast, falls back to a per-lane + scalar loop on architectures without a native masked load/store + instruction — SSE2 through SSE4.2, NEON/NEON64, VSX, S390x, and WASM. + AVX, AVX2, AVX-512, SVE and RVV use native masked instructions and pay no + such penalty. Prefer the compile-time mask whenever the selection is known + at compile time, and avoid runtime-mask loads/stores in hot inner loops on + the affected architectures. 
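A minimal usage sketch (illustration only, not part of the diff): a loop remainder handled with the runtime-mask overloads this patch introduces. It assumes a float batch on the default architecture and that the remainder count fits in the 64-bit lane mask; batch_bool::from_mask and the batch_bool overloads of xsimd::load / xsimd::store are the APIs added above, everything else is stock xsimd.

    #include <cstdint>
    #include <xsimd/xsimd.hpp>

    // y[i] = a * x[i] + y[i] for an arbitrary n, with no scalar tail loop.
    void saxpy(float* y, const float* x, float a, std::size_t n)
    {
        using batch = xsimd::batch<float>;
        using mask_t = xsimd::batch_bool<float>;
        std::size_t i = 0;
        for (; i + batch::size <= n; i += batch::size)
        {
            auto yv = batch::load_unaligned(y + i);
            auto xv = batch::load_unaligned(x + i);
            (a * xv + yv).store_unaligned(y + i);
        }
        if (i < n)
        {
            // Active lanes are the low (n - i) lanes; inactive lanes are
            // neither read nor written, so a partial buffer is safe.
            const mask_t m = mask_t::from_mask((uint64_t(1) << (n - i)) - 1);
            auto yv = xsimd::load(y + i, m, xsimd::unaligned_mode());
            auto xv = xsimd::load(x + i, m, xsimd::unaligned_mode());
            xsimd::store(y + i, a * xv + yv, m, xsimd::unaligned_mode());
        }
    }

Per the footnote above, on SSE2 through SSE4.2 and NEON this remainder lowers to the scalar fallback, while AVX2/AVX-512/SVE/RVV turn it into a single predicated load/store pair.
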
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index c8038334a..d3834583a 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" @@ -374,6 +375,25 @@ namespace xsimd return batch::load(buffer.data(), aligned_mode {}); } + template + XSIMD_INLINE batch + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + // Per-lane validity contract: only active lanes of ``mem`` are + // required to be addressable. An unconditional whole-vector load + // would touch inactive lanes and trip ASan/Valgrind on partial + // buffers, so stay scalar. Arches with hardware predicated loads + // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single + // intrinsic that suppresses inactive-lane reads in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array buffer {}; + const uint64_t bits = mask.mask(); + for (std::size_t i = 0; i < size; ++i) + if ((bits >> i) & uint64_t(1)) + buffer[i] = mem[i]; + return batch::load_aligned(buffer.data()); + } + template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept @@ -388,6 +408,25 @@ namespace xsimd } } + template + XSIMD_INLINE void + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + // Per-lane validity contract (matches native masked-store APIs): + // only active lanes of ``mem`` are touched. A load+select+store + // RMW would both read and write inactive bytes, breaking that + // contract — stay scalar. Arches with hardware predicated stores + // override this with a single intrinsic that suppresses inactive + // lanes in hardware. + constexpr std::size_t size = batch::size; + alignas(A::alignment()) std::array src_buf; + src.store_aligned(src_buf.data()); + const uint64_t bits = mask.mask(); + for (std::size_t i = 0; i < size; ++i) + if ((bits >> i) & uint64_t(1)) + mem[i] = src_buf[i]; + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 305041f11..429637784 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1015,6 +1015,23 @@ namespace xsimd } } + // Runtime-mask load for float/double on AVX. Both aligned_mode and + // unaligned_mode map to _mm256_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_ps(mem, _mm256_castps_si256(mask)); + } + + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm256_maskload_pd(mem, _mm256_castpd_si256(mask)); + } + // store_masked namespace detail { @@ -1031,6 +1048,22 @@ namespace xsimd } } + // Runtime-mask store for float/double on AVX. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), src); + } + + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), src); + } + template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index ebffd910b..d3e9330be 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -119,7 +119,6 @@ namespace xsimd } // load_masked - // AVX2 low-level helpers (operate on raw SIMD registers) namespace detail { XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept @@ -138,14 +137,12 @@ namespace xsimd } } - // single templated implementation for integer masked loads (32/64-bit) template XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2"); using int_t = std::conditional_t; - // Use the raw register-level maskload helpers for the remaining cases. return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } @@ -175,6 +172,20 @@ namespace xsimd return bitwise_cast(r); } + // Runtime-mask load for 32/64-bit integers on AVX2. 8/16-bit integers + // fall back to the scalar common path: AVX2 has no native maskload for + // those widths, and a load-then-blend would break fault-suppression at + // page boundaries (the main reason callers ask for a masked load). + // Both aligned_mode and unaligned_mode route to the same intrinsic — + // masked-off lanes do not fault regardless of alignment. 
+ template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using int_t = std::conditional_t; + return detail::maskload(reinterpret_cast(mem), __m256i(mask)); + } + // store_masked namespace detail { @@ -196,14 +207,12 @@ namespace xsimd { constexpr size_t lanes_per_half = batch::size / 2; - // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } - // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); @@ -230,6 +239,20 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm256_maskstore_epi32(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + else + { + _mm256_maskstore_epi64(reinterpret_cast(mem), __m256i(mask), __m256i(src)); + } + } + // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index e78864f6e..1474aeeb8 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -79,8 +79,12 @@ namespace xsimd XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; + template + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; + template + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp index 3ae649fdb..df64d3c1b 100644 --- a/include/xsimd/arch/xsimd_rvv.hpp +++ b/include/xsimd/arch/xsimd_rvv.hpp @@ -409,6 +409,11 @@ namespace xsimd { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + // Masked load (mask-undisturbed with zero passthrough): inactive lanes read as 0, + // no memory access is performed for inactive lanes (page-fault safe). + XSIMD_RVV_OVERLOAD(rvvle_mu, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM _mu), , vec(bvec, vec, T const*)) + // Masked store: inactive lanes are not written. + XSIMD_RVV_OVERLOAD(rvvse_m, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM _m), , void(bvec, T*, vec)) } template = 0> @@ -423,6 +428,16 @@ namespace xsimd return load_aligned(src, convert(), rvv {}); } + // load_masked (runtime mask): native vle*.v vd, (rs1), v0.t with zero-init + // passthrough so inactive lanes read as 0, matching xsimd's contract. 
+ template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + const auto zero = detail_rvv::rvvmv_splat(proj_t {}); + return detail_rvv::rvvle_mu(mask, zero, reinterpret_cast(mem)); + } + // load_complex namespace detail_rvv { @@ -500,6 +515,15 @@ namespace xsimd store_aligned(dst, src, rvv {}); } + // store_masked (runtime mask): native vse*.v vd, (rs1), v0.t — inactive lanes + // are not written (page-fault safe). + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + using proj_t = map_to_sized_type_t; + detail_rvv::rvvse_m(mask, reinterpret_cast(mem), src); + } + /****************** * scatter/gather * ******************/ diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp index c15404dd9..e5729e65e 100644 --- a/include/xsimd/arch/xsimd_sve.hpp +++ b/include/xsimd/arch/xsimd_sve.hpp @@ -101,11 +101,28 @@ namespace xsimd return load_aligned(src, convert(), sve {}); } - // load_masked - template = 0> - XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept + // load_masked (compile-time mask): build a runtime predicate from + // the constant mask and reuse the runtime-mask path. ``pmask`` only + // constructs a 128-bit chunk predicate (svdupq_b{8,16,32,64}), which + // is replication-based and does not correctly express a per-lane + // mask on SVE wider than 128 bits — going through ``as_batch_bool`` + // gives the right predicate for every vector width. ``int32``/ + // ``int64``/``uint32``/``uint64`` are excluded so the common-arch + // dispatchers that reinterpret to ``float``/``double`` win partial + // ordering (otherwise we'd be ambiguous with ``requires_arch``). + template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode m, requires_arch) noexcept { - return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); + return load_masked(mem, mask.as_batch_bool(), convert {}, m, sve {}); + } + + // load_masked (runtime mask) + template = 0> + XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return svld1(mask, reinterpret_cast const*>(mem)); } // load_complex @@ -141,6 +158,24 @@ namespace xsimd store_aligned(dst, src, sve {}); } + // store_masked (compile-time mask): forward to the runtime-mask + // path for the same reason as load_masked above; same exclusion of + // 32/64-bit integers to defer to the common dispatchers. 
+ template = 0, + std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8)), int> = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode m, requires_arch) noexcept + { + store_masked(mem, src, mask.as_batch_bool(), m, sve {}); + } + + // store_masked (runtime mask) + template = 0> + XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + svst1(mask, reinterpret_cast*>(mem), src); + } + // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 6a7206116..85bb72899 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1550,6 +1550,36 @@ namespace xsimd return batch::load(ptr, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Lanes whose mask bit is \c false do not fault, + * so partial loads across a page boundary are safe. \c stream_mode is not + * supported. + * + * \warning Runtime-mask loads carry a significant performance penalty on + * architectures without a native masked load instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked load out of hot inner loops. + * @param ptr the memory buffer to read. Must be aligned. + * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -1570,6 +1600,37 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Creates a batch from the buffer \c ptr using a runtime mask. Elements + * corresponding to \c false in the mask are not accessed in memory and are + * zero-initialized in the resulting batch. No type conversion is performed: + * \c ptr must point to \c T. Lanes whose mask bit is \c false do not fault, + * so partial loads across a page boundary are safe. \c stream_mode is not + * supported. + * + * \warning Runtime-mask loads carry a significant performance penalty on + * architectures without a native masked load instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked load out of hot inner loops. + * @param ptr the memory buffer to read. The buffer does not need to be + * aligned. 
+ * @param mask runtime selection mask for the elements to load + * @return a new batch instance + */ + template + XSIMD_INLINE batch load(T const* ptr, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load(ptr, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2663,6 +2724,37 @@ namespace xsimd val.store(mem, mask, aligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion is + * performed: \c mem must point to \c T. Lanes whose mask bit is \c false do + * not fault, so partial stores across a page boundary are safe. \c + * stream_mode is not supported. + * + * \warning Runtime-mask stores carry a significant performance penalty on + * architectures without a native masked store instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked store out of hot inner loops. + * @param mem the memory buffer to write to. Must be aligned. + * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, aligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2683,6 +2775,38 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Copy selected elements of batch \c val to the buffer \c mem using a + * runtime mask. Elements corresponding to \c false in the mask are not + * written and leave the contents of \c mem untouched. No type conversion is + * performed: \c mem must point to \c T. Lanes whose mask bit is \c false do + * not fault, so partial stores across a page boundary are safe. \c + * stream_mode is not supported. + * + * \warning Runtime-mask stores carry a significant performance penalty on + * architectures without a native masked store instruction (SSE2 through + * SSE4.2, NEON/NEON64, VSX, S390x, WASM). On those targets the + * implementation falls back to a per-lane scalar loop. AVX, AVX2, AVX-512, + * SVE and RVV use native masked instructions and are not affected. Prefer + * the \c batch_bool_constant overload whenever the mask is known at + * compile time, or hoist the masked store out of hot inner loops. + * @param mem the memory buffer to write to. The buffer does not need to be + * aligned. 
+ * @param val the batch to copy from + * @param mask runtime selection mask for the elements to store + */ + template + XSIMD_INLINE void store(T* mem, + batch const& val, + batch_bool mask, + unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store(mem, mask, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index b584a2d81..8826e35c7 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -144,9 +144,12 @@ namespace xsimd template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; + /** \brief Runtime-mask store; see xsimd::store(T*, batch const&, batch_bool, Mode). */ + template + XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; @@ -156,9 +159,12 @@ namespace xsimd XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; - // Compile-time mask overloads + // Masked overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + /** \brief Runtime-mask load; see xsimd::load(T const*, batch_bool, Mode). */ + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(T const* mem, batch_bool mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; @@ -722,6 +728,26 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(T const* mem, batch_bool mask, Mode mode) noexcept + { + detail::static_check_supported_config(); + static_assert(std::is_same::value || std::is_same::value, + "supported load mode"); + constexpr uint64_t full_mask = details::full_mask(size); + const auto bits = mask.mask(); + if (bits == 0) + { + return broadcast(0); + } + if (bits == full_mask) + { + return load(mem, mode); + } + return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); + } + template template XSIMD_INLINE void batch::store(U* mem, @@ -745,6 +771,29 @@ namespace xsimd } } + template + template + XSIMD_INLINE void batch::store(T* mem, + batch_bool mask, + Mode mode) const noexcept + { + detail::static_check_supported_config(); + static_assert(std::is_same::value || std::is_same::value, + "supported store mode"); + constexpr uint64_t full_mask = details::full_mask(size); + const auto bits = mask.mask(); + if (bits == 0) + { + return; + } + if (bits == full_mask) + { + store(mem, mode); + return; + } + kernel::store_masked(mem, *this, mask, mode, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp index 6af62c1a0..940ee084f 100644 --- a/include/xsimd/types/xsimd_utils.hpp +++ b/include/xsimd/types/xsimd_utils.hpp @@ -457,6 +457,16 @@ namespace xsimd template using complex_batch_type_t = typename complex_batch_type::type; + + namespace details + { + // Returns a bitmask with the lowest \c size bits set. Used by masked + // load/store fast paths to detect "all lanes active". + inline constexpr uint64_t full_mask(std::size_t size) noexcept + { + return size >= 64 ? 
~uint64_t(0) : ((uint64_t(1) << size) - 1); + } + } } #endif diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index a5266eeb3..411b69472 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -105,6 +105,20 @@ struct load_store_test static constexpr bool get(std::size_t, std::size_t) noexcept { return true; } }; + template + static batch_bool_type make_runtime_mask() noexcept + { + uint64_t bits = 0; + for (std::size_t i = 0; i < size; ++i) + { + if (Generator::get(i, size)) + { + bits |= uint64_t(1) << i; + } + } + return batch_bool_type::from_mask(bits); + } + int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; @@ -377,6 +391,16 @@ struct load_store_test run_load_mask_pattern(v, name, b, expected, " masked odd elements"); run_load_mask_pattern(v, name, b, expected, " masked pseudo random"); run_load_mask_pattern(v, name, b, expected, " masked all elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked none"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first element"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first half"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked last half"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked first N"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked last N"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked even elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked odd elements"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked pseudo random"); + run_load_runtime_mask_pattern(v, name, b, expected, " runtime masked all elements"); } template @@ -404,6 +428,26 @@ struct load_store_test CHECK_BATCH_EQ(b, expected_masked); } + template + void run_load_runtime_mask_pattern(const V& v, const std::string& name, batch_type& b, const array_type& expected, const std::string& label) + { + const auto mask = make_runtime_mask(); + array_type expected_masked { 0 }; + + for (std::size_t i = 0; i < size; ++i) + { + const bool active = Generator::get(i, size); + expected_masked[i] = active ? expected[i] : value_type(); + } + + b = xsimd::load(v.data(), mask, xsimd::aligned_mode()); + INFO(name, label + " aligned"); + CHECK_BATCH_EQ(b, expected_masked); + b = xsimd::load(v.data(), mask, xsimd::unaligned_mode()); + INFO(name, label + " unaligned"); + CHECK_BATCH_EQ(b, expected_masked); + } + template void run_store_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) { @@ -422,6 +466,24 @@ struct load_store_test CHECK_VECTOR_EQ(res, expected_masked); } + template + void run_store_runtime_mask_pattern(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, const std::string& label) + { + const auto mask = make_runtime_mask(); + for (std::size_t i = 0; i < size; ++i) + { + expected_masked[i] = Generator::get(i, size) ? 
v[i] : value_type(); + } + std::fill(res.begin(), res.end(), value_type()); + xsimd::store(res.data(), b, mask, xsimd::aligned_mode()); + INFO(name, label + " aligned"); + CHECK_VECTOR_EQ(res, expected_masked); + std::fill(res.begin(), res.end(), value_type()); + xsimd::store(res.data(), b, mask, xsimd::unaligned_mode()); + INFO(name, label + " unaligned"); + CHECK_VECTOR_EQ(res, expected_masked); + } + template void run_store_mask_tests(const V& v, const std::string& name, batch_type& b, V& res, V& expected_masked, std::true_type) { @@ -434,6 +496,15 @@ struct load_store_test run_store_mask_pattern(v, name, b, res, expected_masked, " masked odd elements"); run_store_mask_pattern(v, name, b, res, expected_masked, " masked pseudo random"); run_store_mask_pattern(v, name, b, res, expected_masked, " masked all elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first element"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first half"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked last half"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked first N"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked last N"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked even elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked odd elements"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked pseudo random"); + run_store_runtime_mask_pattern(v, name, b, res, expected_masked, " runtime masked all elements"); } template @@ -453,6 +524,7 @@ struct load_store_test V sentinel_expected(size, sentinel); auto zero_mask = xsimd::make_batch_bool_constant(); + auto runtime_zero_mask = make_runtime_mask(); std::fill(res.begin(), res.end(), sentinel); b.store(res.data(), zero_mask, xsimd::aligned_mode()); INFO(name, " masked none aligned store"); @@ -470,6 +542,19 @@ struct load_store_test CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) { return v == sentinel; })); + std::fill(res.begin(), res.end(), sentinel); + xsimd::store(res.data(), b, runtime_zero_mask, xsimd::aligned_mode()); + INFO(name, " runtime masked none aligned store"); + CHECK_VECTOR_EQ(res, sentinel_expected); + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store(scratch_ptr, b, runtime_zero_mask, xsimd::unaligned_mode()); + INFO(name, " runtime masked none unaligned store"); + std::copy(scratch_ptr, scratch_ptr + scratch_slice.size(), scratch_slice.begin()); + CHECK_VECTOR_EQ(scratch_slice, sentinel_expected); + CHECK(std::all_of(scratch.begin(), scratch.end(), [](const value_type v) + { return v == sentinel; })); + run_store_mask_tests(v, name, b, res, expected_masked, std::true_type {}); } From e22734657bbba4015b985264b8cb620635044c89 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 30 Apr 2026 18:34:44 -0400 Subject: [PATCH 2/3] feat: add load_head / load_tail / store_head / store_tail APIs Sugar over runtime-mask load/store for loop head/tail remainders. Take ``n`` directly instead of a constructed batch_bool; only ``mem[0, n)`` is touched. ``head`` uses mask ``(1 << n) - 1``; ``tail`` uses ``((1 << n) - 1) << (size - n)`` with a base-pointer offset (via uintptr_t to dodge -Warray-bounds), so every arch with native predicated load/store inherits its intrinsic for free. 
Tested on sse2/sse41/avx2/avx512f/emulated256 native and neon64/rvv under qemu. --- docs/source/api/data_transfer.rst | 14 ++ .../xsimd/arch/common/xsimd_common_memory.hpp | 68 ++++++-- include/xsimd/arch/xsimd_common_fwd.hpp | 10 ++ include/xsimd/types/xsimd_api.hpp | 89 ++++++++++ include/xsimd/types/xsimd_batch.hpp | 112 ++++++++++++ test/test_load_store.cpp | 165 ++++++++++++++++++ 6 files changed, 446 insertions(+), 12 deletions(-) diff --git a/docs/source/api/data_transfer.rst b/docs/source/api/data_transfer.rst index db63fbc39..74aa4b512 100644 --- a/docs/source/api/data_transfer.rst +++ b/docs/source/api/data_transfer.rst @@ -20,6 +20,10 @@ From memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_as` | load values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`load_head` | load the first ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`load_tail` | load the last ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ From a scalar: @@ -40,6 +44,10 @@ To memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_as` | store values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`store_head` | store the first ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`store_tail` | store the last ``n`` contiguous elements [#h]_ | ++---------------------------------------+----------------------------------------------------+ In place: @@ -97,3 +105,9 @@ The following empty types are used for tag dispatching: such penalty. Prefer the compile-time mask whenever the selection is known at compile time, and avoid runtime-mask loads/stores in hot inner loops on the affected architectures. + +.. [#h] ``load_head`` / ``store_head`` / ``load_tail`` / ``store_tail`` + take a runtime element count ``n`` instead of a constructed mask; + they are sugar for the runtime-mask ``load`` / ``store`` with a + contiguous-prefix or contiguous-suffix mask, and inherit its + contract and per-arch codegen. diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index d3834583a..92a1155b5 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -379,12 +379,8 @@ namespace xsimd XSIMD_INLINE batch load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept { - // Per-lane validity contract: only active lanes of ``mem`` are - // required to be addressable. An unconditional whole-vector load - // would touch inactive lanes and trip ASan/Valgrind on partial - // buffers, so stay scalar. Arches with hardware predicated loads - // (AVX2 32/64-bit, AVX-512, SVE, RVV) override this with a single - // intrinsic that suppresses inactive-lane reads in hardware. + // Per-lane validity contract: only active lanes are read. + // Arches with hardware predicated loads override this. 
constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer {}; const uint64_t bits = mask.mask(); @@ -412,12 +408,8 @@ namespace xsimd XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept { - // Per-lane validity contract (matches native masked-store APIs): - // only active lanes of ``mem`` are touched. A load+select+store - // RMW would both read and write inactive bytes, breaking that - // contract — stay scalar. Arches with hardware predicated stores - // override this with a single intrinsic that suppresses inactive - // lanes in hardware. + // Per-lane validity contract: only active lanes are written. + // Arches with hardware predicated stores override this. constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array src_buf; src.store_aligned(src_buf.data()); @@ -427,6 +419,58 @@ namespace xsimd mem[i] = src_buf[i]; } + // Head/tail forward to the runtime-mask path. ``tail`` offsets + // the base pointer back by ``(size - n)`` so the active high-``n`` + // lanes land at ``[mem, mem + n)``; the offset goes through + // ``uintptr_t`` to dodge ``-Warray-bounds`` on small buffers. + namespace detail + { + template + XSIMD_INLINE T const* offset_back(T const* p, std::size_t k) noexcept + { + return reinterpret_cast(reinterpret_cast(p) - k * sizeof(T)); + } + template + XSIMD_INLINE T* offset_back(T* p, std::size_t k) noexcept + { + return reinterpret_cast(reinterpret_cast(p) - k * sizeof(T)); + } + } + + template + XSIMD_INLINE batch + load_head(T const* mem, std::size_t n, Mode, requires_arch) noexcept + { + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n)); + return load_masked(mem, mask, convert {}, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void + store_head(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept + { + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n)); + store_masked(mem, src, mask, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch + load_tail(T const* mem, std::size_t n, Mode, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n) << (size - n)); + return load_masked(detail::offset_back(mem, size - n), mask, convert {}, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void + store_tail(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + const auto mask = batch_bool::from_mask(::xsimd::details::full_mask(n) << (size - n)); + store_masked(detail::offset_back(mem, size - n), src, mask, unaligned_mode {}, A {}); + } + template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 1474aeeb8..42106267c 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -102,6 +102,16 @@ namespace xsimd template XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + // Head/tail: contiguous prefix/suffix variants of the masked load/store. 
+ template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, Mode, requires_arch) noexcept; + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, Mode, requires_arch) noexcept; + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept; + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& src, Mode, requires_arch) noexcept; + // Forward declarations for pack-level helpers namespace detail { diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 85bb72899..e3849f981 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -1631,6 +1631,51 @@ namespace xsimd return batch::load(ptr, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Loads the prefix \c mem[0, n) into the low \c n lanes; remaining + * lanes are zero. Sugar for a runtime-mask load with mask + * (1 << n) - 1; same contract — only \c mem[0, n) is read. + * \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load_head(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load_head(mem, n, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Loads \c mem[0, n) into the high \c n lanes (lanes + * [size - n, size)); remaining low lanes are zero. Same + * contract as \ref load_head. \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + return batch::load_tail(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + return batch::load_tail(mem, n, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2807,6 +2852,50 @@ namespace xsimd val.store(mem, mask, unaligned_mode {}); } + /** + * @ingroup batch_data_transfer + * + * Stores the low \c n lanes of \c val to \c mem[0, n). Sugar for a + * runtime-mask store with mask (1 << n) - 1; same contract — + * only \c mem[0, n) is written. \c n is clamped to \c batch::size. + */ + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& val, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store_head(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE void store_head(T* mem, std::size_t n, batch const& val, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store_head(mem, n, unaligned_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Stores the high \c n lanes (lanes [size - n, size)) of + * \c val to \c mem[0, n). Same contract as \ref store_head. \c n is + * clamped to \c batch::size. 
+ */ + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& val, aligned_mode = {}) noexcept + { + detail::static_check_supported_config(); + val.store_tail(mem, n, aligned_mode {}); + } + + /// \overload + template + XSIMD_INLINE void store_tail(T* mem, std::size_t n, batch const& val, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + val.store_tail(mem, n, unaligned_mode {}); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 8826e35c7..b99d66c70 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -151,6 +151,12 @@ namespace xsimd template XSIMD_INLINE void store(T* mem, batch_bool mask, Mode = {}) const noexcept; + // Head/tail: contiguous prefix/suffix variants of the runtime-mask store. + XSIMD_INLINE void store_head(T* mem, std::size_t n, aligned_mode) const noexcept; + XSIMD_INLINE void store_head(T* mem, std::size_t n, unaligned_mode) const noexcept; + XSIMD_INLINE void store_tail(T* mem, std::size_t n, aligned_mode) const noexcept; + XSIMD_INLINE void store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept; + template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; template @@ -168,6 +174,12 @@ namespace xsimd template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; + // Head/tail: contiguous prefix/suffix variants of the runtime-mask load. + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, aligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_head(T const* mem, std::size_t n, unaligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, aligned_mode) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept; + template XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch const& index) noexcept; template @@ -794,6 +806,106 @@ namespace xsimd kernel::store_masked(mem, *this, mask, mode, A {}); } + template + XSIMD_INLINE batch batch::load_head(T const* mem, std::size_t n, aligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_aligned(mem); + return kernel::load_head(mem, n, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_head(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_unaligned(mem); + return kernel::load_head(mem, n, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_tail(T const* mem, std::size_t n, aligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_aligned(mem); + return kernel::load_tail(mem, n, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE batch batch::load_tail(T const* mem, std::size_t n, unaligned_mode) noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return broadcast(0); + if (n >= size) + return load_unaligned(mem); + return kernel::load_tail(mem, n, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_head(T* mem, std::size_t n, aligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + 
store_aligned(mem); + return; + } + kernel::store_head(mem, n, *this, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_head(T* mem, std::size_t n, unaligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_unaligned(mem); + return; + } + kernel::store_head(mem, n, *this, unaligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_tail(T* mem, std::size_t n, aligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_aligned(mem); + return; + } + kernel::store_tail(mem, n, *this, aligned_mode {}, A {}); + } + + template + XSIMD_INLINE void batch::store_tail(T* mem, std::size_t n, unaligned_mode) const noexcept + { + detail::static_check_supported_config(); + if (n == 0) + return; + if (n >= size) + { + store_unaligned(mem); + return; + } + kernel::store_tail(mem, n, *this, unaligned_mode {}, A {}); + } + template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 411b69472..5837d4061 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -238,6 +238,169 @@ struct load_store_test #endif } + template + void run_load_head_tail(const V&, const std::string&, std::false_type) + { + } + + template + void run_load_head_tail(const V& v, const std::string& name) + { + run_load_head_tail(v, name, std::is_same {}); + } + + template + void run_store_head_tail(const V&, const std::string&, std::false_type) + { + } + + template + void run_store_head_tail(const V& v, const std::string& name) + { + run_store_head_tail(v, name, std::is_same {}); + } + + template + void run_load_head_tail(const V& v, const std::string& name, std::true_type) + { + const array_type ref = [&] + { + array_type a; + std::copy(v.cbegin(), v.cend(), a.begin()); + return a; + }(); + + for (std::size_t n = 0; n <= size; ++n) + { + array_type expected_head {}; + for (std::size_t i = 0; i < n; ++i) + expected_head[i] = ref[i]; + array_type expected_tail {}; + for (std::size_t i = 0; i < n; ++i) + expected_tail[size - n + i] = ref[i]; + + const value_type* mem = v.data(); + using arch_t = typename batch_type::arch_type; + batch_type b = xsimd::load_head(mem, n, xsimd::aligned_mode()); + INFO(name, " load_head aligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = xsimd::load_head(mem, n, xsimd::unaligned_mode()); + INFO(name, " load_head unaligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = batch_type::load_head(mem, n, xsimd::aligned_mode()); + INFO(name, " batch::load_head aligned n=", n); + CHECK_BATCH_EQ(b, expected_head); + + b = xsimd::load_tail(mem, n, xsimd::aligned_mode()); + INFO(name, " load_tail aligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + + b = xsimd::load_tail(mem, n, xsimd::unaligned_mode()); + INFO(name, " load_tail unaligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + + b = batch_type::load_tail(mem, n, xsimd::aligned_mode()); + INFO(name, " batch::load_tail aligned n=", n); + CHECK_BATCH_EQ(b, expected_tail); + } + } + + template + void run_store_head_tail(const V& v, const std::string& name, std::true_type) + { + static constexpr value_type sentinel = static_cast(91); + batch_type b = batch_type::load_aligned(v.data()); + V scratch(size); + + for (std::size_t n = 0; n <= size; ++n) + { + // store_head: low n lanes -> mem[0, n) + std::fill(scratch.begin(), scratch.end(), sentinel); + 
xsimd::store_head(scratch.data(), n, b, xsimd::aligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_head aligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_head aligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_head(scratch.data(), n, b, xsimd::unaligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_head unaligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_head unaligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + // store_tail: high n lanes -> mem[0, n) + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_tail(scratch.data(), n, b, xsimd::aligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_tail aligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[size - n + i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_tail aligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + + std::fill(scratch.begin(), scratch.end(), sentinel); + xsimd::store_tail(scratch.data(), n, b, xsimd::unaligned_mode()); + for (std::size_t i = 0; i < n; ++i) + { + INFO(name, " store_tail unaligned n=", n, " i=", i); + CHECK_EQ(scratch[i], v[size - n + i]); + } + for (std::size_t i = n; i < size; ++i) + { + INFO(name, " store_tail unaligned untouched n=", n, " i=", i); + CHECK_EQ(scratch[i], sentinel); + } + } + } + + void test_head_tail() + { + run_load_head_tail(i8_vec, "head/tail int8_t"); + run_load_head_tail(ui8_vec, "head/tail uint8_t"); + run_load_head_tail(i16_vec, "head/tail int16_t"); + run_load_head_tail(ui16_vec, "head/tail uint16_t"); + run_load_head_tail(i32_vec, "head/tail int32_t"); + run_load_head_tail(ui32_vec, "head/tail uint32_t"); + run_load_head_tail(i64_vec, "head/tail int64_t"); + run_load_head_tail(ui64_vec, "head/tail uint64_t"); + run_load_head_tail(f_vec, "head/tail float"); +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 + run_load_head_tail(d_vec, "head/tail double"); +#endif + + run_store_head_tail(i8_vec, "head/tail int8_t"); + run_store_head_tail(ui8_vec, "head/tail uint8_t"); + run_store_head_tail(i16_vec, "head/tail int16_t"); + run_store_head_tail(ui16_vec, "head/tail uint16_t"); + run_store_head_tail(i32_vec, "head/tail int32_t"); + run_store_head_tail(ui32_vec, "head/tail uint32_t"); + run_store_head_tail(i64_vec, "head/tail int64_t"); + run_store_head_tail(ui64_vec, "head/tail uint64_t"); + run_store_head_tail(f_vec, "head/tail float"); +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 + run_store_head_tail(d_vec, "head/tail double"); +#endif + } + void test_masked() { using arch = typename B::arch_type; @@ -688,6 +851,8 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES) SUBCASE("scatter") { Test.test_scatter(); } SUBCASE("masked") { Test.test_masked(); } + + SUBCASE("head_tail") { Test.test_head_tail(); } } #endif From d5f21c701993596fb18132680bbbf10f3bea363e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 1 May 2026 15:51:26 -0400 Subject: [PATCH 3/3] feat: add runtime batch_bool mask overloads for avx_128 / avx2_128 Mirror the AVX/AVX2 runtime-mask load_masked / store_masked overloads on the new 128-bit SSE-register variants of those ISAs: - avx_128: float / double via _mm_maskload_ps/pd, _mm_maskstore_ps/pd - avx2_128: 32/64-bit integers via _mm_maskload_epi32/64, 
_mm_maskstore_epi32/64 8/16-bit integers continue to fall through to the scalar common path (no native maskload/store intrinsic at those widths). Both alignment modes route to the same intrinsic since masked-off lanes do not fault. --- include/xsimd/arch/xsimd_avx2_128.hpp | 37 +++++++++++++++++++++++++++ include/xsimd/arch/xsimd_avx_128.hpp | 31 ++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7e8b0d05a..232667ef7 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -133,6 +133,43 @@ namespace xsimd return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); } + // Runtime-mask load for 32/64-bit integers on AVX2-128. 8/16-bit + // integers fall back to the scalar common path: there is no native + // _mm_maskload for those widths, and a load-then-blend would break + // fault-suppression at page boundaries (the main reason callers ask + // for a masked load). Both aligned_mode and unaligned_mode route to + // the same intrinsic — masked-off lanes do not fault regardless of + // alignment. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), batch> + load_masked(T const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_maskload_epi32(reinterpret_cast(mem), __m128i(mask)); + } + else + { + return _mm_maskload_epi64(reinterpret_cast(mem), __m128i(mask)); + } + } + + // Runtime-mask store for 32/64-bit integers on AVX2-128. Same + // fault-suppression semantics as the masked loads above. + template + XSIMD_INLINE std::enable_if_t::value && (sizeof(T) == 4 || sizeof(T) == 8), void> + store_masked(T* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + _mm_maskstore_epi32(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + } + else + { + _mm_maskstore_epi64(reinterpret_cast(mem), __m128i(mask), __m128i(src)); + } + } + // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index af17568e1..98642dac6 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -115,6 +115,22 @@ namespace xsimd return _mm_maskload_pd(mem, mask.as_batch()); } + // Runtime-mask load for float/double on AVX-128. Both aligned_mode and + // unaligned_mode map to _mm_maskload_* — the intrinsic does not fault + // on masked-off lanes, so partial loads across page boundaries are safe. + template + XSIMD_INLINE batch + load_masked(float const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, _mm_castps_si128(mask)); + } + template + XSIMD_INLINE batch + load_masked(double const* mem, batch_bool mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, _mm_castpd_si128(mask)); + } + // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept @@ -128,6 +144,21 @@ namespace xsimd return _mm_maskstore_pd(mem, mask.as_batch(), src); } + // Runtime-mask store for float/double on AVX-128. Same fault-suppression + // semantics as the masked loads above; alignment mode is irrelevant. 
+ template + XSIMD_INLINE void + store_masked(float* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_ps(mem, _mm_castps_si128(mask), src); + } + template + XSIMD_INLINE void + store_masked(double* mem, batch const& src, batch_bool mask, Mode, requires_arch) noexcept + { + _mm_maskstore_pd(mem, _mm_castpd_si128(mask), src); + } + // swizzle (dynamic mask) template ::value && sizeof(T) == sizeof(ITy)>> XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept
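Usage sketch for the head/tail helpers (illustration only, not part of the diff): the same remainder pattern as the runtime-mask example shown after PATCH 1/3, but passing the element count directly so no mask has to be built by hand. Assumes a float batch on the default architecture; load_head / store_head are the free functions added in PATCH 2/3.

    #include <cstddef>
    #include <xsimd/xsimd.hpp>

    // In-place scale of an arbitrary-length buffer; only buf[0, n) is touched.
    void scale(float* buf, std::size_t n, float factor)
    {
        using batch = xsimd::batch<float>;
        std::size_t i = 0;
        for (; i + batch::size <= n; i += batch::size)
        {
            (batch::load_unaligned(buf + i) * factor).store_unaligned(buf + i);
        }
        if (i < n)
        {
            // Remainder: the low (n - i) lanes only. Inactive lanes read as
            // zero on load and are left untouched in memory on store.
            auto v = xsimd::load_head(buf + i, n - i, xsimd::unaligned_mode());
            xsimd::store_head(buf + i, n - i, v * factor, xsimd::unaligned_mode());
        }
    }

Because the helpers forward to the runtime-mask path, every architecture with a native predicated load/store inherits its intrinsic here; the remaining targets take the scalar fallback, so the same caveat about hot inner loops applies.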