From c8ab083bb2f0e5e22667724e382cb495c4733f2d Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Tue, 23 Sep 2025 18:14:53 -0400
Subject: [PATCH 1/2] 1. Adding stream API for non temporal data transfers 2.
 Adding xsimd::fence as a wrapper around std atomic for cache coherence 3.
 Adding tests

---
 .../xsimd/arch/common/xsimd_common_memory.hpp | 36 ++++++++
 include/xsimd/arch/xsimd_avx.hpp              | 17 ++++
 include/xsimd/arch/xsimd_avx2.hpp             | 17 ++++
 include/xsimd/arch/xsimd_avx512f.hpp          | 34 ++++++++
 include/xsimd/arch/xsimd_sse2.hpp             | 17 ++++
 include/xsimd/arch/xsimd_sse4_1.hpp           | 17 ++++
 include/xsimd/memory/xsimd_alignment.hpp      | 11 +++
 include/xsimd/types/xsimd_api.hpp             | 84 +++++++++++++++++++
 include/xsimd/types/xsimd_batch.hpp           | 62 ++++++++++++++
 test/test_load_store.cpp                      | 55 ++++++++++++
 10 files changed, 350 insertions(+)
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index 6a301dd44..6ead88f99 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -292,6 +292,12 @@ namespace xsimd
             return load_unaligned(mem, b, A {});
         }
 
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
+        {
+            return load_aligned(mem, b, A {});
+        }
+
         // load_aligned
         namespace detail
         {
@@ -438,6 +444,12 @@ namespace xsimd
             store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
+        {
+            return load_aligned<A>(mem, cvt, A {});
+        }
+
         // rotate_right
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -679,6 +691,12 @@ namespace xsimd
                 mem[i] = bool(buffer[i]);
         }
 
+        template <class A, class T>
+        XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
+        {
+            store(self, mem, A {});
+        }
+
         // store_aligned
         template <class A, class T_in, class T_out>
         XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -697,6 +715,12 @@ namespace xsimd
             return store_aligned<A>(mem, self, common {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
+        {
+            store_aligned<A>(mem, self, A {});
+        }
+
         // swizzle
         template <class A, class T, class ITy, ITy... Vs>
         XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -778,6 +802,12 @@ namespace xsimd
             return detail::load_complex(hi, lo, A {});
         }
 
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
+        {
+            return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
+        }
+
         // store_complex_aligned
         template <class A, class T_out, class T_in>
         XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -802,6 +832,12 @@ namespace xsimd
             hi.store_unaligned(buffer + real_batch::size);
         }
 
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
+        {
+            store_complex_aligned<A>(dst, src, A {});
+        }
+
         // transpose
         template <class A, class T>
         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 4af728e07..441371643 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -1515,6 +1515,23 @@ namespace xsimd
             return _mm256_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_pd(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_si256((__m256i*)mem, self);
+        }
+
         // sub
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index 448a7f7bc..c172b73e6 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -229,6 +229,23 @@ namespace xsimd
             store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_stream_load_si256((__m256i const*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+
         // bitwise_and
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 5ccf165f1..fe8d33d99 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -1513,6 +1513,23 @@ namespace xsimd
             return _mm512_loadu_pd(mem);
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_stream_load_si512((__m512i*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem));
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -2285,6 +2302,23 @@ namespace xsimd
             return _mm512_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_si512((__m512i*)mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_pd(mem, self);
+        }
+
         // sub
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index cccba8144..eb92d53ba 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1927,6 +1927,23 @@ namespace xsimd
             return _mm_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_ps(mem, self);
+        }
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_si128((__m128i*)mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_pd(mem, self);
+        }
+
         // sub
         template <class A>
         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index 18b2a09d0..6536fad4b 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -228,6 +228,23 @@ namespace xsimd
             }
         }
 
+        // load_stream
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_stream_load_si128((__m128i*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem));
+        }
+
         // min
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
diff --git a/include/xsimd/memory/xsimd_alignment.hpp b/include/xsimd/memory/xsimd_alignment.hpp
index 2d59ac1fc..fd1918bea 100644
--- a/include/xsimd/memory/xsimd_alignment.hpp
+++ b/include/xsimd/memory/xsimd_alignment.hpp
@@ -33,6 +33,17 @@ namespace xsimd
     {
     };
 
+    /**
+     * @struct stream_mode
+     * @brief tag for load and store of aligned non-temporal memory.
+     *
+     * Streaming accesses expect aligned pointers. When no architecture-specific
+     * implementation is available, they fall back to aligned semantics.
+     */
+    struct stream_mode
+    {
+    };
+
     /***********************
      * Allocator alignment *
      ***********************/
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index aa64df4da..f551cfea0 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -12,6 +12,7 @@
 #ifndef XSIMD_API_HPP
 #define XSIMD_API_HPP
 
+#include <atomic>
 #include <complex>
 #include <cstddef>
 #include <limits>
@@ -1334,6 +1335,30 @@ namespace xsimd
         return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+    template <class To, class A = default_arch>
+    XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_stream(ptr);
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From, bool i3ec>
     XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1367,14 @@ namespace xsimd
         detail::static_check_supported_config<From, A>();
         return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From, bool i3ec>
+    XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
+    }
 #endif
 
     /**
@@ -1416,6 +1449,13 @@ namespace xsimd
         return load_as<From, A>(ptr, unaligned_mode {});
     }
 
+    template <class A = default_arch, class From>
+    XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, stream_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
@@ -2420,12 +2460,40 @@ namespace xsimd
         kernel::store_complex_aligned<A>(dst, src, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        kernel::store_complex_stream<A>(dst, src, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From, bool i3ec>
     XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
     {
         store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From, bool i3ec>
+    XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
+    }
 #endif
 
     /**
@@ -2494,6 +2562,22 @@ namespace xsimd
         store_as<T, A>(mem, val, unaligned_mode {});
     }
 
+    template <class A, class T>
+    XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
+    {
+        store_as<T, A>(mem, val, stream_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Issues a sequentially consistent memory fence.
+     */
+    XSIMD_INLINE void fence() noexcept
+    {
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index 5ff525a11..4e0b71844 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -144,6 +144,8 @@ namespace xsimd
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
@@ -160,6 +162,8 @@ namespace xsimd
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant<T, A, Values...> mask, Mode = {}) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
 
         template <class U, class V>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
@@ -323,8 +327,10 @@ namespace xsimd
         // memory operators
         XSIMD_INLINE void store_aligned(bool* mem) const noexcept;
         XSIMD_INLINE void store_unaligned(bool* mem) const noexcept;
+        XSIMD_INLINE void store_stream(bool* mem) const noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_stream(bool const* mem) noexcept;
 
         XSIMD_INLINE bool get(std::size_t i) const noexcept;
 
@@ -417,12 +423,16 @@ namespace xsimd
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant<value_type, A, Values...> mask, Mode = {}) noexcept;
         template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
+        template <class U>
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_INLINE void store(U* mem, batch_bool_constant<value_type, A, Values...> mask, Mode = {}) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         XSIMD_INLINE real_batch real() const noexcept;
         XSIMD_INLINE real_batch imag() const noexcept;
@@ -634,6 +644,16 @@ namespace xsimd
 
     // masked store free functions are provided in xsimd_api.hpp
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<T, A>::store(U* mem, stream_mode) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        kernel::store_stream<A>(mem, *this, A {});
+    }
+
     /**
      * Loading from aligned memory. May involve a conversion if \c U is different
      * from \c T.
@@ -728,6 +748,16 @@ namespace xsimd
         }
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        return kernel::load_stream<A>(mem, kernel::convert<T> {}, A {});
+    }
+
     /**
      * Create a new batch gathering elements starting at address \c src and
      * offset by each element in \c index.
@@ -1051,6 +1081,12 @@ namespace xsimd
         store_aligned(mem);
     }
 
+    template <class T, class A>
+    XSIMD_INLINE void batch_bool<T, A>::store_stream(bool* mem) const noexcept
+    {
+        kernel::store_stream<A>(*this, mem, A {});
+    }
+
     template <class T, class A>
     XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
     {
@@ -1063,6 +1099,12 @@ namespace xsimd
         return kernel::load_unaligned<A>(mem, batch_bool<T, A>(), A {});
     }
 
+    template <class T, class A>
+    XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_stream(bool const* mem) noexcept
+    {
+        return kernel::load_stream<A>(mem, batch_bool<T, A>(), A {});
+    }
+
     /**
      * Extract a scalar mask representation from this @c batch_bool.
      *
@@ -1327,6 +1369,16 @@ namespace xsimd
         return kernel::load_masked<A>(mem, mask, kernel::convert<value_type> {}, mode, A {});
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, stream_mode) noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned");
+        auto* ptr = reinterpret_cast<value_type const*>(mem);
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<value_type> {}, A {});
+    }
+
     template <class T, class A>
     template <class U>
     XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
@@ -1341,6 +1393,16 @@ namespace xsimd
         return store_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, stream_mode) const noexcept
+    {
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned");
+        auto* ptr = reinterpret_cast<value_type*>(mem);
+        return kernel::store_complex_stream(ptr, *this, A {});
+    }
+
     template <class T, class A>
     XSIMD_INLINE auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
     {
diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp
index 9fa7dbff8..ba0452531 100644
--- a/test/test_load_store.cpp
+++ b/test/test_load_store.cpp
@@ -303,6 +303,33 @@ struct load_store_test
     };
 #endif
 
+    template <class Ptr>
+    void stream_load_if_same(Ptr const* ptr, batch_type& b, array_type const& expected_values, const std::string& name,
+                             std::true_type) const
+    {
+        b = xsimd::load(ptr, xsimd::stream_mode());
+        INFO(name, " stream (load)");
+        CHECK_BATCH_EQ(b, expected_values);
+    }
+
+    template <class Ptr>
+    void stream_load_if_same(Ptr const*, batch_type&, array_type const&, const std::string&, std::false_type) const
+    {
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec& res, batch_type const& b, Vec const& reference, const std::string& name, std::true_type) const
+    {
+        xsimd::store(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store)");
+        CHECK_VECTOR_EQ(res, reference);
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec&, batch_type const&, Vec const&, const std::string&, std::false_type) const
+    {
+    }
+
     template <class V>
     void test_load_impl(const V& v, const std::string& name)
     {
@@ -316,6 +343,10 @@ struct load_store_test
         INFO(name, " aligned");
         CHECK_BATCH_EQ(b, expected);
 
+        b = batch_type::load(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::load)");
+        CHECK_BATCH_EQ(b, expected);
+
         b = xsimd::load_as<value_type>(v.data(), xsimd::unaligned_mode());
         INFO(name, " unaligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
@@ -324,6 +355,13 @@ struct load_store_test
         INFO(name, " aligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
 
+        b = xsimd::load_as<value_type>(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (load_as)");
+        CHECK_BATCH_EQ(b, expected);
+
+        stream_load_if_same(v.data(), b, expected, name,
+                            std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
+
         run_mask_tests(v, name, b, expected, std::is_same<typename V::value_type, value_type> {});
     }
 
@@ -474,6 +512,17 @@ struct load_store_test
         INFO(name, " aligned (store_as)");
         CHECK_VECTOR_EQ(res, v);
 
+        b.store(res.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::store)");
+        CHECK_VECTOR_EQ(res, v);
+
+        xsimd::store_as(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store_as)");
+        CHECK_VECTOR_EQ(res, v);
+
+        stream_store_if_same(res, b, v, name,
+                             std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
+
         V expected_masked(size);
 
         run_store_mask_section(v, name, b, res, expected_masked, std::is_same<typename V::value_type, value_type> {});
@@ -556,4 +605,10 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
 
     SUBCASE("masked") { Test.test_masked(); }
 }
+
+TEST_CASE("[fence] sequential consistency")
+{
+    xsimd::fence();
+    CHECK(true);
+}
 #endif

From 47dee36e3577c6db66d5cf3d8792cfaeb0b114c3 Mon Sep 17 00:00:00 2001
From: Marco Barbone <mbarbone@flatironinstitute.org>
Date: Tue, 3 Mar 2026 11:52:10 -0500
Subject: [PATCH 2/2] Add AArch64 NEON non-temporal load/store (ldnp/stnp)

Implement store_stream and load_stream for neon64 using inline asm
with LDNP/STNP instructions, providing non-temporal cache hints on
AArch64. Covers float, double, and integral types. Guarded behind
__GNUC__ so MSVC ARM64 falls back to aligned load/store.

Also remove xsimd::fence (std::atomic wrapper) and its test, which
were unrelated additions from a prior commit.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 include/xsimd/arch/xsimd_neon64.hpp | 84 +++++++++++++++++++++++++++++
 include/xsimd/types/xsimd_api.hpp   | 11 ----
 test/test_load_store.cpp            |  5 --
 3 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
index 9f3c4bce8..1847261f5 100644
--- a/include/xsimd/arch/xsimd_neon64.hpp
+++ b/include/xsimd/arch/xsimd_neon64.hpp
@@ -14,6 +14,7 @@
 
 #include <complex>
 #include <cstddef>
+#include <cstring>
 #include <tuple>
 #include <utility>
 
@@ -178,6 +179,89 @@ namespace xsimd
             return store_aligned<A>(dst, src, A {});
         }
 
+        /****************
+         * store_stream *
+         ****************/
+
+#if defined(__GNUC__)
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& val, requires_arch<neon64>) noexcept
+        {
+            float32x2_t lo = vget_low_f32(val);
+            float32x2_t hi = vget_high_f32(val);
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory");
+        }
+
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& val, requires_arch<neon64>) noexcept
+        {
+            float64x1_t lo = vget_low_f64(val);
+            float64x1_t hi = vget_high_f64(val);
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory");
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& val, requires_arch<neon64>) noexcept
+        {
+            uint64x2_t u64;
+            std::memcpy(&u64, &val, sizeof(u64));
+            uint64x1_t lo = vget_low_u64(u64);
+            uint64x1_t hi = vget_high_u64(u64);
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]"
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory");
+        }
+#endif
+
+        /***************
+         * load_stream *
+         ***************/
+
+#if defined(__GNUC__)
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<neon64>) noexcept
+        {
+            float32x2_t lo, hi;
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            return vcombine_f32(lo, hi);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<neon64>) noexcept
+        {
+            float64x1_t lo, hi;
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            return vcombine_f64(lo, hi);
+        }
+
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<neon64>) noexcept
+        {
+            uint64x1_t lo, hi;
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]"
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            uint64x2_t u64 = vcombine_u64(lo, hi);
+            batch<T, A> result;
+            std::memcpy(&result, &u64, sizeof(u64));
+            return result;
+        }
+#endif
+
         /*********************
          * store<batch_bool> *
          *********************/
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index f551cfea0..a31d3e337 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -12,7 +12,6 @@
 #ifndef XSIMD_API_HPP
 #define XSIMD_API_HPP
 
-#include <atomic>
 #include <complex>
 #include <cstddef>
 #include <limits>
@@ -2568,16 +2567,6 @@ namespace xsimd
         store_as<T, A>(mem, val, stream_mode {});
     }
 
-    /**
-     * @ingroup batch_data_transfer
-     *
-     * Issues a sequentially consistent memory fence.
-     */
-    XSIMD_INLINE void fence() noexcept
-    {
-        std::atomic_thread_fence(std::memory_order_seq_cst);
-    }
-
     /**
      * @ingroup batch_data_transfer
      *
diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp
index ba0452531..353527779 100644
--- a/test/test_load_store.cpp
+++ b/test/test_load_store.cpp
@@ -606,9 +606,4 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
     SUBCASE("masked") { Test.test_masked(); }
 }
 
-TEST_CASE("[fence] sequential consistency")
-{
-    xsimd::fence();
-    CHECK(true);
-}
 #endif