From c8ab083bb2f0e5e22667724e382cb495c4733f2d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Sep 2025 18:14:53 -0400 Subject: [PATCH 1/2] 1. Adding stream API for non temporal data transfers 2. Adding xsimd::fence as a wrapper around std atomic for cache coherence 3. Adding tests --- .../xsimd/arch/common/xsimd_common_memory.hpp | 36 ++++++++ include/xsimd/arch/xsimd_avx.hpp | 17 ++++ include/xsimd/arch/xsimd_avx2.hpp | 17 ++++ include/xsimd/arch/xsimd_avx512f.hpp | 34 ++++++++ include/xsimd/arch/xsimd_sse2.hpp | 17 ++++ include/xsimd/arch/xsimd_sse4_1.hpp | 17 ++++ include/xsimd/memory/xsimd_alignment.hpp | 11 +++ include/xsimd/types/xsimd_api.hpp | 84 +++++++++++++++++++ include/xsimd/types/xsimd_batch.hpp | 62 ++++++++++++++ test/test_load_store.cpp | 55 ++++++++++++ 10 files changed, 350 insertions(+) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 6a301dd44..6ead88f99 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -292,6 +292,12 @@ namespace xsimd return load_unaligned(mem, b, A {}); } + template + XSIMD_INLINE batch_bool load_stream(bool const* mem, batch_bool b, requires_arch) noexcept + { + return load_aligned(mem, b, A {}); + } + // load_aligned namespace detail { @@ -438,6 +444,12 @@ namespace xsimd store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } + template + XSIMD_INLINE batch load_stream(T_in const* mem, convert cvt, requires_arch) noexcept + { + return load_aligned(mem, cvt, A {}); + } + // rotate_right template XSIMD_INLINE batch rotate_right(batch const& self, requires_arch) noexcept @@ -679,6 +691,12 @@ namespace xsimd mem[i] = bool(buffer[i]); } + template + XSIMD_INLINE void store_stream(batch_bool const& self, bool* mem, requires_arch) noexcept + { + store(self, mem, A {}); + } + // store_aligned template XSIMD_INLINE void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept @@ -697,6 +715,12 @@ namespace xsimd return store_aligned(mem, self, common {}); } + template + XSIMD_INLINE void store_stream(T_out* mem, batch const& self, requires_arch) noexcept + { + store_aligned(mem, self, A {}); + } + // swizzle template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant mask, requires_arch) noexcept @@ -778,6 +802,12 @@ namespace xsimd return detail::load_complex(hi, lo, A {}); } + template + XSIMD_INLINE batch, A> load_complex_stream(std::complex const* mem, convert>, requires_arch) noexcept + { + return load_complex_aligned(mem, kernel::convert> {}, A {}); + } + // store_complex_aligned template XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept @@ -802,6 +832,12 @@ namespace xsimd hi.store_unaligned(buffer + real_batch::size); } + template + XSIMD_INLINE void store_complex_stream(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, A {}); + } + // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 4af728e07..441371643 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -1515,6 +1515,23 @@ namespace xsimd return _mm256_storeu_pd(mem, self); } + // store_stream + template + XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_ps(mem, self); + } + template + XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_pd(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_si256((__m256i*)mem, self); + } + // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index 448a7f7bc..c172b73e6 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -229,6 +229,23 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + // load_stream + template ::value, void>::type> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + return _mm256_stream_load_si256((__m256i const*)mem); + } + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem)); + } + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem)); + } + // bitwise_and template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 5ccf165f1..fe8d33d99 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -1513,6 +1513,23 @@ namespace xsimd return _mm512_loadu_pd(mem); } + // load_stream + template ::value, void>::type> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + return _mm512_stream_load_si512((__m512i*)mem); + } + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem)); + } + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem)); + } + // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept @@ -2285,6 +2302,23 @@ namespace xsimd return _mm512_storeu_pd(mem, self); } + // store_stream + template ::value, void>::type> + XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_si512((__m512i*)mem, self); + } + template + XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_ps(mem, self); + } + template + XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_pd(mem, self); + } + // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index cccba8144..eb92d53ba 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -1927,6 +1927,23 @@ namespace xsimd return _mm_storeu_pd(mem, self); } + // store_stream + template + XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept + { + _mm_stream_ps(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept + { + _mm_stream_si128((__m128i*)mem, self); + } + template + XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept + { + _mm_stream_pd(mem, self); + } + // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp index 18b2a09d0..6536fad4b 100644 --- a/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/include/xsimd/arch/xsimd_sse4_1.hpp @@ -228,6 +228,23 @@ namespace xsimd } } + // load_stream + template ::value, void>::type> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + return _mm_stream_load_si128((__m128i*)mem); + } + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem)); + } + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem)); + } + // min template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/memory/xsimd_alignment.hpp b/include/xsimd/memory/xsimd_alignment.hpp index 2d59ac1fc..fd1918bea 100644 --- a/include/xsimd/memory/xsimd_alignment.hpp +++ b/include/xsimd/memory/xsimd_alignment.hpp @@ -33,6 +33,17 @@ namespace xsimd { }; + /** + * @struct stream_mode + * @brief tag for load and store of aligned non-temporal memory. + * + * Streaming accesses expect aligned pointers. When no architecture-specific + * implementation is available, they fall back to aligned semantics. + */ + struct stream_mode + { + }; + /*********************** * Allocator alignment * ***********************/ diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index aa64df4da..f551cfea0 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -12,6 +12,7 @@ #ifndef XSIMD_API_HPP #define XSIMD_API_HPP +#include #include #include #include @@ -1334,6 +1335,30 @@ namespace xsimd return kernel::load_complex_aligned(ptr, kernel::convert {}, A {}); } + template + XSIMD_INLINE simd_return_type load_as(From const* ptr, stream_mode) noexcept + { + using batch_value_type = typename simd_return_type::value_type; + detail::static_check_supported_config(); + detail::static_check_supported_config(); + return kernel::load_stream(ptr, kernel::convert {}, A {}); + } + + template + XSIMD_INLINE simd_return_type load_as(bool const* ptr, stream_mode) noexcept + { + detail::static_check_supported_config(); + return simd_return_type::load_stream(ptr); + } + + template + XSIMD_INLINE simd_return_type, To, A> load_as(std::complex const* ptr, stream_mode) noexcept + { + detail::static_check_supported_config(); + using batch_value_type = typename simd_return_type, To, A>::value_type; + return kernel::load_complex_stream(ptr, kernel::convert {}, A {}); + } + #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE simd_return_type, To, A> load_as(xtl::xcomplex const* ptr, aligned_mode) noexcept @@ -1342,6 +1367,14 @@ namespace xsimd detail::static_check_supported_config(); return load_as(reinterpret_cast const*>(ptr), aligned_mode()); } + + template + XSIMD_INLINE simd_return_type, To, A> load_as(xtl::xcomplex const* ptr, stream_mode) noexcept + { + detail::static_check_supported_config(); + detail::static_check_supported_config(); + return load_as(reinterpret_cast const*>(ptr), stream_mode()); + } #endif /** @@ -1416,6 +1449,13 @@ namespace xsimd return load_as(ptr, unaligned_mode {}); } + template + XSIMD_INLINE batch load(From const* ptr, stream_mode) noexcept + { + detail::static_check_supported_config(); + return load_as(ptr, stream_mode {}); + } + /** * @ingroup batch_data_transfer * @@ -2420,12 +2460,40 @@ namespace xsimd kernel::store_complex_aligned(dst, src, A {}); } + template + XSIMD_INLINE void store_as(To* dst, batch const& src, stream_mode) noexcept + { + detail::static_check_supported_config(); + kernel::store_stream(dst, src, A {}); + } + + template + XSIMD_INLINE void store_as(bool* dst, batch_bool const& src, stream_mode) noexcept + { + detail::static_check_supported_config(); + kernel::store_stream(src, dst, A {}); + } + + template + XSIMD_INLINE void store_as(std::complex* dst, batch, A> const& src, stream_mode) noexcept + { + detail::static_check_supported_config, A>(); + kernel::store_complex_stream(dst, src, A {}); + } + #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE void store_as(xtl::xcomplex* dst, batch, A> const& src, aligned_mode) noexcept { store_as(reinterpret_cast*>(dst), src, aligned_mode()); } + + template + XSIMD_INLINE void store_as(xtl::xcomplex* dst, batch, A> const& src, stream_mode) noexcept + { + detail::static_check_supported_config, A>(); + store_as(reinterpret_cast*>(dst), src, stream_mode()); + } #endif /** @@ -2494,6 +2562,22 @@ namespace xsimd store_as(mem, val, unaligned_mode {}); } + template + XSIMD_INLINE void store(T* mem, batch const& val, stream_mode) noexcept + { + store_as(mem, val, stream_mode {}); + } + + /** + * @ingroup batch_data_transfer + * + * Issues a sequentially consistent memory fence. + */ + XSIMD_INLINE void fence() noexcept + { + std::atomic_thread_fence(std::memory_order_seq_cst); + } + /** * @ingroup batch_data_transfer * diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 5ff525a11..4e0b71844 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -144,6 +144,8 @@ namespace xsimd XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept; template XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept; + template + XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; // Compile-time mask overloads template @@ -160,6 +162,8 @@ namespace xsimd // Compile-time mask overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; + template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch const& index) noexcept; @@ -323,8 +327,10 @@ namespace xsimd // memory operators XSIMD_INLINE void store_aligned(bool* mem) const noexcept; XSIMD_INLINE void store_unaligned(bool* mem) const noexcept; + XSIMD_INLINE void store_stream(bool* mem) const noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept; + XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_stream(bool const* mem) noexcept; XSIMD_INLINE bool get(std::size_t i) const noexcept; @@ -417,12 +423,16 @@ namespace xsimd template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; template + XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; + template XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept; template XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept; // Compile-time mask overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode = {}) const noexcept; + template + XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; XSIMD_INLINE real_batch real() const noexcept; XSIMD_INLINE real_batch imag() const noexcept; @@ -634,6 +644,16 @@ namespace xsimd // masked store free functions are provided in xsimd_api.hpp + template + template + XSIMD_INLINE void batch::store(U* mem, stream_mode) const noexcept + { + detail::static_check_supported_config(); + assert(((reinterpret_cast(mem) % A::alignment()) == 0) + && "store location is not properly aligned"); + kernel::store_stream(mem, *this, A {}); + } + /** * Loading from aligned memory. May involve a conversion if \c U is different * from \c T. @@ -728,6 +748,16 @@ namespace xsimd } } + template + template + XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept + { + detail::static_check_supported_config(); + assert(((reinterpret_cast(mem) % A::alignment()) == 0) + && "loaded pointer is not properly aligned"); + return kernel::load_stream(mem, kernel::convert {}, A {}); + } + /** * Create a new batch gathering elements starting at address \c src and * offset by each element in \c index. @@ -1051,6 +1081,12 @@ namespace xsimd store_aligned(mem); } + template + XSIMD_INLINE void batch_bool::store_stream(bool* mem) const noexcept + { + kernel::store_stream(*this, mem, A {}); + } + template XSIMD_INLINE batch_bool batch_bool::load_aligned(bool const* mem) noexcept { @@ -1063,6 +1099,12 @@ namespace xsimd return kernel::load_unaligned(mem, batch_bool(), A {}); } + template + XSIMD_INLINE batch_bool batch_bool::load_stream(bool const* mem) noexcept + { + return kernel::load_stream(mem, batch_bool(), A {}); + } + /** * Extract a scalar mask representation from this @c batch_bool. * @@ -1327,6 +1369,16 @@ namespace xsimd return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); } + template + template + XSIMD_INLINE batch, A> batch, A>::load(U const* mem, stream_mode) noexcept + { + assert(((reinterpret_cast(mem) % A::alignment()) == 0) + && "loaded pointer is not properly aligned"); + auto* ptr = reinterpret_cast(mem); + return kernel::load_complex_stream(ptr, kernel::convert {}, A {}); + } + template template XSIMD_INLINE void batch, A>::store(U* mem, aligned_mode) const noexcept @@ -1341,6 +1393,16 @@ namespace xsimd return store_unaligned(mem); } + template + template + XSIMD_INLINE void batch, A>::store(U* mem, stream_mode) const noexcept + { + assert(((reinterpret_cast(mem) % A::alignment()) == 0) + && "store location is not properly aligned"); + auto* ptr = reinterpret_cast(mem); + return kernel::store_complex_stream(ptr, *this, A {}); + } + template XSIMD_INLINE auto batch, A>::real() const noexcept -> real_batch { diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 9fa7dbff8..ba0452531 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -303,6 +303,33 @@ struct load_store_test }; #endif + template + void stream_load_if_same(Ptr const* ptr, batch_type& b, array_type const& expected_values, const std::string& name, + std::true_type) const + { + b = xsimd::load(ptr, xsimd::stream_mode()); + INFO(name, " stream (load)"); + CHECK_BATCH_EQ(b, expected_values); + } + + template + void stream_load_if_same(Ptr const*, batch_type&, array_type const&, const std::string&, std::false_type) const + { + } + + template + void stream_store_if_same(Vec& res, batch_type const& b, Vec const& reference, const std::string& name, std::true_type) const + { + xsimd::store(res.data(), b, xsimd::stream_mode()); + INFO(name, " stream (store)"); + CHECK_VECTOR_EQ(res, reference); + } + + template + void stream_store_if_same(Vec&, batch_type const&, Vec const&, const std::string&, std::false_type) const + { + } + template void test_load_impl(const V& v, const std::string& name) { @@ -316,6 +343,10 @@ struct load_store_test INFO(name, " aligned"); CHECK_BATCH_EQ(b, expected); + b = batch_type::load(v.data(), xsimd::stream_mode()); + INFO(name, " stream (batch::load)"); + CHECK_BATCH_EQ(b, expected); + b = xsimd::load_as(v.data(), xsimd::unaligned_mode()); INFO(name, " unaligned (load_as)"); CHECK_BATCH_EQ(b, expected); @@ -324,6 +355,13 @@ struct load_store_test INFO(name, " aligned (load_as)"); CHECK_BATCH_EQ(b, expected); + b = xsimd::load_as(v.data(), xsimd::stream_mode()); + INFO(name, " stream (load_as)"); + CHECK_BATCH_EQ(b, expected); + + stream_load_if_same(v.data(), b, expected, name, + std::integral_constant::value> {}); + run_mask_tests(v, name, b, expected, std::is_same {}); } @@ -474,6 +512,17 @@ struct load_store_test INFO(name, " aligned (store_as)"); CHECK_VECTOR_EQ(res, v); + b.store(res.data(), xsimd::stream_mode()); + INFO(name, " stream (batch::store)"); + CHECK_VECTOR_EQ(res, v); + + xsimd::store_as(res.data(), b, xsimd::stream_mode()); + INFO(name, " stream (store_as)"); + CHECK_VECTOR_EQ(res, v); + + stream_store_if_same(res, b, v, name, + std::integral_constant::value> {}); + V expected_masked(size); run_store_mask_section(v, name, b, res, expected_masked, std::is_same {}); @@ -556,4 +605,10 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES) SUBCASE("masked") { Test.test_masked(); } } + +TEST_CASE("[fence] sequential consistency") +{ + xsimd::fence(); + CHECK(true); +} #endif From 47dee36e3577c6db66d5cf3d8792cfaeb0b114c3 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 3 Mar 2026 11:52:10 -0500 Subject: [PATCH 2/2] Add AArch64 NEON non-temporal load/store (ldnp/stnp) Implement store_stream and load_stream for neon64 using inline asm with LDNP/STNP instructions, providing non-temporal cache hints on AArch64. Covers float, double, and integral types. Guarded behind __GNUC__ so MSVC ARM64 falls back to aligned load/store. Also remove xsimd::fence (std::atomic wrapper) and its test, which were unrelated additions from a prior commit. Co-Authored-By: Claude Opus 4.6 --- include/xsimd/arch/xsimd_neon64.hpp | 84 +++++++++++++++++++++++++++++ include/xsimd/types/xsimd_api.hpp | 11 ---- test/test_load_store.cpp | 5 -- 3 files changed, 84 insertions(+), 16 deletions(-) diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index 9f3c4bce8..1847261f5 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -178,6 +179,89 @@ namespace xsimd return store_aligned(dst, src, A {}); } + /**************** + * store_stream * + ****************/ + +#if defined(__GNUC__) + template + XSIMD_INLINE void store_stream(float* mem, batch const& val, requires_arch) noexcept + { + float32x2_t lo = vget_low_f32(val); + float32x2_t hi = vget_high_f32(val); + __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" + : + : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) + : "memory"); + } + + template + XSIMD_INLINE void store_stream(double* mem, batch const& val, requires_arch) noexcept + { + float64x1_t lo = vget_low_f64(val); + float64x1_t hi = vget_high_f64(val); + __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" + : + : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) + : "memory"); + } + + template ::value, void>::type> + XSIMD_INLINE void store_stream(T* mem, batch const& val, requires_arch) noexcept + { + uint64x2_t u64; + std::memcpy(&u64, &val, sizeof(u64)); + uint64x1_t lo = vget_low_u64(u64); + uint64x1_t hi = vget_high_u64(u64); + __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" + : + : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) + : "memory"); + } +#endif + + /*************** + * load_stream * + ***************/ + +#if defined(__GNUC__) + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + float32x2_t lo, hi; + __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" + : [lo] "=w"(lo), [hi] "=w"(hi) + : [mem] "r"(mem) + : "memory"); + return vcombine_f32(lo, hi); + } + + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + float64x1_t lo, hi; + __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" + : [lo] "=w"(lo), [hi] "=w"(hi) + : [mem] "r"(mem) + : "memory"); + return vcombine_f64(lo, hi); + } + + template ::value, void>::type> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + uint64x1_t lo, hi; + __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" + : [lo] "=w"(lo), [hi] "=w"(hi) + : [mem] "r"(mem) + : "memory"); + uint64x2_t u64 = vcombine_u64(lo, hi); + batch result; + std::memcpy(&result, &u64, sizeof(u64)); + return result; + } +#endif + /********************* * store * *********************/ diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index f551cfea0..a31d3e337 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -12,7 +12,6 @@ #ifndef XSIMD_API_HPP #define XSIMD_API_HPP -#include #include #include #include @@ -2568,16 +2567,6 @@ namespace xsimd store_as(mem, val, stream_mode {}); } - /** - * @ingroup batch_data_transfer - * - * Issues a sequentially consistent memory fence. - */ - XSIMD_INLINE void fence() noexcept - { - std::atomic_thread_fence(std::memory_order_seq_cst); - } - /** * @ingroup batch_data_transfer * diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index ba0452531..353527779 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -606,9 +606,4 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES) SUBCASE("masked") { Test.test_masked(); } } -TEST_CASE("[fence] sequential consistency") -{ - xsimd::fence(); - CHECK(true); -} #endif