From 1482e8e83a0a0a0039d5ccdcbb428cfa517d1008 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 12:17:18 +0100 Subject: [PATCH 01/16] Refactor x86 CPU features --- include/xsimd/config/xsimd_config.hpp | 11 + include/xsimd/config/xsimd_cpuid.hpp | 162 +++------- include/xsimd/types/xsimd_all_registers.hpp | 2 + include/xsimd/xsimd_cpu_features_x86.hpp | 321 ++++++++++++++++++++ 4 files changed, 371 insertions(+), 125 deletions(-) create mode 100644 include/xsimd/xsimd_cpu_features_x86.hpp diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index e81dd8053..49af1b179 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -33,6 +33,17 @@ * @defgroup xsimd_config_macro Instruction Set Detection */ +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if the target is the x86 architecture family. + */ +#if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) +#define XSIMD_TARGET_X86 1 +#else +#define XSIMD_TARGET_X86 0 +#endif + /** * @ingroup xsimd_config_macro * diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 1be4f018a..8a969db16 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -12,9 +12,12 @@ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP -#include #include +#include "../types/xsimd_all_registers.hpp" +#include "../xsimd_cpu_features_x86.hpp" +#include "xsimd_inline.hpp" + #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include #include @@ -25,13 +28,6 @@ #endif -#if defined(_MSC_VER) -// Contains the definition of __cpuidex -#include -#endif - -#include "../types/xsimd_all_registers.hpp" - namespace xsimd { namespace detail @@ -122,138 +118,54 @@ namespace xsimd #endif rvv = bool(getauxval(AT_HWCAP) & HWCAP_V); #endif - -#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - - auto get_xcr0_low = []() noexcept - { - uint32_t xcr0; - -#if defined(_MSC_VER) && _MSC_VER >= 1400 - - xcr0 = (uint32_t)_xgetbv(0); - -#elif defined(__GNUC__) - - __asm__( - "xorl %%ecx, %%ecx\n" - "xgetbv\n" - : "=a"(xcr0) - : -#if defined(__i386__) - : "ecx", "edx" -#else - : "rcx", "rdx" -#endif - ); - -#else /* _MSC_VER < 1400 */ -#error "_MSC_VER < 1400 is not supported" -#endif /* _MSC_VER && _MSC_VER >= 1400 */ - return xcr0; - }; - - auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept - { - -#if defined(_MSC_VER) - __cpuidex(reg, level, count); - -#elif defined(__INTEL_COMPILER) - __cpuid(reg, level); - -#elif defined(__GNUC__) || defined(__clang__) - -#if defined(__i386__) && defined(__PIC__) - // %ebx may be the PIC register - __asm__("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); - -#else - __asm__("cpuid\n\t" - : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); -#endif - -#else -#error "Unsupported configuration" #endif - }; - - int regs1[4]; - - get_cpuid(regs1, 0x1); - - // OS can explicitly disable the usage of SSE/AVX extensions - // by setting an appropriate flag in CR0 register - // - // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + const auto cpuid = xsimd::x86_cpu_id::read(); + auto xcr0 = xsimd::x86_xcr0::make_false(); - unsigned sse_state_os_enabled = 1; + bool sse_state_os_enabled = true; // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), // AVX state won't be preserved across context switches, so AVX cannot be used. - unsigned avx_state_os_enabled = 0; - unsigned avx512_state_os_enabled = 0; + bool avx_state_os_enabled = false; + bool avx512_state_os_enabled = false; - // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit - // 18] to enable XSETBV/XGETBV instructions to access XCR0 and - // to support processor extended state management using - // XSAVE/XRSTOR. - bool osxsave = regs1[2] >> 27 & 1; - if (osxsave) + if (cpuid.osxsave()) { + xcr0 = xsimd::x86_xcr0::read(); - uint32_t xcr0 = get_xcr0_low(); - - sse_state_os_enabled = xcr0 >> 1 & 1; - avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled; - avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled; + sse_state_os_enabled = xcr0.sse_state_os_enabled(); + avx_state_os_enabled = xcr0.avx_state_os_enabled(); + avx512_state_os_enabled = xcr0.avx512_state_os_enabled(); } - sse2 = regs1[3] >> 26 & sse_state_os_enabled; - sse3 = regs1[2] >> 0 & sse_state_os_enabled; - ssse3 = regs1[2] >> 9 & sse_state_os_enabled; - sse4_1 = regs1[2] >> 19 & sse_state_os_enabled; - sse4_2 = regs1[2] >> 20 & sse_state_os_enabled; - fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled; - - avx = regs1[2] >> 28 & avx_state_os_enabled; - fma3_avx = avx && fma3_sse42; - - int regs8[4]; - get_cpuid(regs8, 0x80000001); - fma4 = regs8[2] >> 16 & avx_state_os_enabled; - - // sse4a = regs[2] >> 6 & 1; + sse2 = cpuid.sse2() && sse_state_os_enabled; + sse3 = cpuid.sse3() && sse_state_os_enabled; + ssse3 = cpuid.ssse3() && sse_state_os_enabled; + sse4_1 = cpuid.sse4_1() && sse_state_os_enabled; + sse4_2 = cpuid.sse4_2() && sse_state_os_enabled; + fma3_sse42 = cpuid.fma3() && sse_state_os_enabled; - // xop = regs[2] >> 11 & 1; - - int regs7[4]; - get_cpuid(regs7, 0x7); - avx2 = regs7[1] >> 5 & avx_state_os_enabled; - - int regs7a[4]; - get_cpuid(regs7a, 0x7, 0x1); - avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; + // sse4a not implemented in cpu_id yet + // xop not implemented in cpu_id yet + avx = cpuid.avx() && avx_state_os_enabled; + fma3_avx = avx && fma3_sse42; + fma4 = cpuid.fma4() && avx_state_os_enabled; + avx2 = cpuid.avx2() && avx_state_os_enabled; + avxvnni = cpuid.avxvnni() && avx_state_os_enabled; fma3_avx2 = avx2 && fma3_sse42; - avx512f = regs7[1] >> 16 & avx512_state_os_enabled; - avx512cd = regs7[1] >> 28 & avx512_state_os_enabled; - avx512dq = regs7[1] >> 17 & avx512_state_os_enabled; - avx512bw = regs7[1] >> 30 & avx512_state_os_enabled; - avx512er = regs7[1] >> 27 & avx512_state_os_enabled; - avx512pf = regs7[1] >> 26 & avx512_state_os_enabled; - avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled; - avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled; - avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled; - avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled; + avx512f = cpuid.avx512f() && avx512_state_os_enabled; + avx512cd = cpuid.avx512cd() && avx512_state_os_enabled; + avx512dq = cpuid.avx512dq() && avx512_state_os_enabled; + avx512bw = cpuid.avx512bw() && avx512_state_os_enabled; + avx512er = cpuid.avx512er() && avx512_state_os_enabled; + avx512pf = cpuid.avx512pf() && avx512_state_os_enabled; + avx512ifma = cpuid.avx512ifma() && avx512_state_os_enabled; + avx512vbmi = cpuid.avx512vbmi() && avx512_state_os_enabled; + avx512vbmi2 = cpuid.avx512vbmi2() && avx512_state_os_enabled; + avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_state_os_enabled; avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; -#endif } }; } // namespace detail diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index 33f9b465d..da87df2e4 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -9,6 +9,8 @@ * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ +#include "../config/xsimd_inline.hpp" + #include "xsimd_fma3_sse_register.hpp" #include "xsimd_fma4_register.hpp" #include "xsimd_sse2_register.hpp" diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp new file mode 100644 index 000000000..185c08c30 --- /dev/null +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -0,0 +1,321 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_CPU_FEATURES_X86_HPP +#define XSIMD_CPU_FEATURES_X86_HPP + +#include + +#include "./config/xsimd_config.hpp" + +#if XSIMD_TARGET_X86 && defined(_MSC_VER) +// Contains the definition of __cpuidex +#include +#endif + +namespace xsimd +{ + namespace detail + { + template + constexpr I make_bit_mask(I bit) + { + return static_cast(I { 1 } << bit); + } + + template + constexpr I make_bit_mask(I bit, Args... bits) + { + return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); + } + + template + constexpr bool bit_is_set(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bits)...); + return (value & mask) == mask; + } + + inline void get_cpuid(int reg[4], int level, int count = 0) noexcept; + + inline std::uint32_t get_xcr0_low() noexcept; + } + + /* + * Extended Control Register 0 (XCR0). + * + * Operating systems can explicitly disable the usage of instruction set (such + * as SSE or AVX extensions) by setting an appropriate flag in XCR0 register. + * This utility parses such bit values. + * + * @see https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + */ + class x86_xcr0 + { + public: + using reg_t = std::uint32_t; + + /** Parse a XCR0 value into individual components. */ + constexpr explicit x86_xcr0(reg_t low) noexcept + : m_low(low) + { + } + + /** Create an object that has all features set to false. */ + static constexpr x86_xcr0 make_false() + { + return x86_xcr0(0); + } + + /** Read the XCR0 register from the CPU if on the correct architecture. */ + static x86_xcr0 read() + { + return x86_xcr0(detail::get_xcr0_low()); + } + + constexpr bool sse_state_os_enabled() const noexcept + { + return detail::bit_is_set<1>(m_low); + } + + constexpr bool avx_state_os_enabled() const noexcept + { + // Check both SSE and AVX bits even though AVX must imply SSE + return detail::bit_is_set<1, 2>(m_low); + } + + constexpr bool avx512_state_os_enabled() const noexcept + { + // Check all SSE, AVX, and AVX52 bits even though AVX512 must + // imply AVX and SSE + return detail::bit_is_set<1, 2, 6>(m_low); + } + + private: + std::uint32_t m_low = {}; + }; + + class x86_cpu_id + { + public: + struct cpu_id_regs + { + using reg_t = int[4]; + + reg_t reg1 = {}; + reg_t reg7 = {}; + reg_t reg7a = {}; + reg_t reg8 = {}; + }; + + /** Parse CpuInfo register values into individual components. */ + constexpr explicit x86_cpu_id(const cpu_id_regs& regs) noexcept + : m_regs(regs) + { + } + + /** + * Read the CpuId registers from the CPU if on the correct architecture. + * + * This is only safe to call if bit 18 of CR4.OSXSAVE has been set. + * + * @see cpu_id::osxsave + */ + static x86_cpu_id read() + { + cpu_id_regs regs = {}; + detail::get_cpuid(regs.reg1, 0x1); + detail::get_cpuid(regs.reg7, 0x7); + detail::get_cpuid(regs.reg7a, 0x7, 0x1); + detail::get_cpuid(regs.reg8, 0x80000001); + return x86_cpu_id(regs); + } + + constexpr bool sse2() const noexcept + { + return detail::bit_is_set<26>(m_regs.reg1[3]); + } + + constexpr bool sse3() const noexcept + { + return detail::bit_is_set<0>(m_regs.reg1[2]); + } + + constexpr bool ssse3() const noexcept + { + return detail::bit_is_set<9>(m_regs.reg1[2]); + } + + constexpr bool sse4_1() const noexcept + { + return detail::bit_is_set<19>(m_regs.reg1[2]); + } + + constexpr bool sse4_2() const noexcept + { + return detail::bit_is_set<20>(m_regs.reg1[2]); + } + + constexpr bool fma3() const noexcept + { + return detail::bit_is_set<12>(m_regs.reg1[2]); + } + + /** + * Indicates whether the OS has enabled extended state management. + * + * When true, the OS has set bit 18 (OSXSAVE) in the CR4 control register, + * enabling the XGETBV/XSETBV instructions to access XCR0 and support + * processor extended state management using XSAVE/XRSTOR. + * + * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects + * the state of CR4.OSXSAVE. + */ + constexpr bool osxsave() const noexcept + { + return detail::bit_is_set<27>(m_regs.reg1[2]); + } + + constexpr bool avx() const noexcept + { + return detail::bit_is_set<28>(m_regs.reg1[2]); + } + + constexpr bool avx2() const noexcept + { + return detail::bit_is_set<5>(m_regs.reg7[1]); + } + + constexpr bool avx512f() const noexcept + { + return detail::bit_is_set<16>(m_regs.reg7[1]); + } + + constexpr bool avx512dq() const noexcept + { + return detail::bit_is_set<17>(m_regs.reg7[1]); + } + + constexpr bool avx512ifma() const noexcept + { + return detail::bit_is_set<21>(m_regs.reg7[1]); + } + + constexpr bool avx512pf() const noexcept + { + return detail::bit_is_set<26>(m_regs.reg7[1]); + } + + constexpr bool avx512er() const noexcept + { + return detail::bit_is_set<27>(m_regs.reg7[1]); + } + + constexpr bool avx512cd() const noexcept + { + return detail::bit_is_set<28>(m_regs.reg7[1]); + } + + constexpr bool avx512bw() const noexcept + { + return detail::bit_is_set<30>(m_regs.reg7[1]); + } + + constexpr bool avx512vbmi() const noexcept + { + return detail::bit_is_set<1>(m_regs.reg7[2]); + } + + constexpr bool avx512vbmi2() const noexcept + { + return detail::bit_is_set<6>(m_regs.reg7[2]); + } + + constexpr bool avx512vnni_bw() const noexcept + { + return detail::bit_is_set<11>(m_regs.reg7[2]); + } + + constexpr bool avxvnni() const noexcept + { + return detail::bit_is_set<4>(m_regs.reg7a[0]); + } + + constexpr bool fma4() const noexcept + { + return detail::bit_is_set<16>(m_regs.reg8[2]); + } + + private: + cpu_id_regs m_regs = {}; + }; + + namespace detail + { + inline void get_cpuid(int reg[4], int level, int count) noexcept + { +#if !XSIMD_TARGET_X86 + reg = {}; // Fill zeros + +#elif defined(_MSC_VER) + __cpuidex(reg, level, count); + +#elif defined(__INTEL_COMPILER) + __cpuid(reg, level); + +#elif defined(__GNUC__) || defined(__clang__) + +#if defined(__i386__) && defined(__PIC__) + // %ebx may be the PIC register + __asm__("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(level), "2"(count)); + +#else + __asm__("cpuid\n\t" + : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(level), "2"(count)); +#endif +#endif + } + + inline std::uint32_t get_xcr0_low() noexcept + { +#if !XSIMD_TARGET_X86 + return {}; // return 0; + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + return static_cast(_xgetbv(0)); + +#elif defined(__GNUC__) + std::uint32_t xcr0 = {}; + __asm__( + "xorl %%ecx, %%ecx\n" + "xgetbv\n" + : "=a"(xcr0) + : +#if defined(__i386__) + : "ecx", "edx" +#else + : "rcx", "rdx" +#endif + ); + return xcr0; + +#else /* _MSC_VER < 1400 */ +#error "_MSC_VER < 1400 is not supported" +#endif /* _MSC_VER && _MSC_VER >= 1400 */ + }; + } +} +#endif From 44ab310150413be92c389d0652cc9fc1e2db2364 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 12:23:06 +0100 Subject: [PATCH 02/16] Remove unsafe memset --- include/xsimd/config/xsimd_cpuid.hpp | 6 +----- include/xsimd/types/xsimd_common_arch.hpp | 2 ++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 8a969db16..c64bfa6f8 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -12,8 +12,6 @@ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP -#include - #include "../types/xsimd_all_registers.hpp" #include "../xsimd_cpu_features_x86.hpp" #include "xsimd_inline.hpp" @@ -36,7 +34,7 @@ namespace xsimd { #define ARCH_FIELD_EX(arch, field_name) \ - unsigned field_name; \ + unsigned field_name = 0; \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } #define ARCH_FIELD_EX_REUSE(arch, field_name) \ @@ -86,8 +84,6 @@ namespace xsimd XSIMD_INLINE supported_arch() noexcept { - memset(this, 0, sizeof(supported_arch)); - #if XSIMD_WITH_WASM wasm = 1; #endif diff --git a/include/xsimd/types/xsimd_common_arch.hpp b/include/xsimd/types/xsimd_common_arch.hpp index 28491aeda..a33c868ea 100644 --- a/include/xsimd/types/xsimd_common_arch.hpp +++ b/include/xsimd/types/xsimd_common_arch.hpp @@ -12,6 +12,8 @@ #ifndef XSIMD_COMMON_ARCH_HPP #define XSIMD_COMMON_ARCH_HPP +#include + #include "../config/xsimd_config.hpp" /** From 3c0af6fb86b00cc0c975b9bc7707a9c2eda97f3c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 12:35:20 +0100 Subject: [PATCH 03/16] Fix warning --- include/xsimd/xsimd_cpu_features_x86.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index 185c08c30..348c8b2c8 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -263,7 +263,12 @@ namespace xsimd inline void get_cpuid(int reg[4], int level, int count) noexcept { #if !XSIMD_TARGET_X86 - reg = {}; // Fill zeros + reg[0] = 0; + reg[1] = 0; + reg[2] = 0; + reg[3] = 0; + (void)level; + (void)count; #elif defined(_MSC_VER) __cpuidex(reg, level, count); From f4a8faec0f8d06e1d6e2441a424dd2725e2b6877 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 12:40:15 +0100 Subject: [PATCH 04/16] Fix inline --- include/xsimd/xsimd_cpu_features_x86.hpp | 4 ++-- test/check_inline_specifier.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index 348c8b2c8..99e6b6eb3 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -76,7 +76,7 @@ namespace xsimd } /** Read the XCR0 register from the CPU if on the correct architecture. */ - static x86_xcr0 read() + inline static x86_xcr0 read() { return x86_xcr0(detail::get_xcr0_low()); } @@ -129,7 +129,7 @@ namespace xsimd * * @see cpu_id::osxsave */ - static x86_cpu_id read() + inline static x86_cpu_id read() { cpu_id_regs regs = {}; detail::get_cpuid(regs.reg1, 0x1); diff --git a/test/check_inline_specifier.sh b/test/check_inline_specifier.sh index 1ccdda130..2337b3707 100755 --- a/test/check_inline_specifier.sh +++ b/test/check_inline_specifier.sh @@ -3,7 +3,7 @@ # Usage: $0 top_srcdir # # This script walks all headers in $top_srcdir/include and makes sure that all -# functions declared tehre are marked as inline or constexpr (which implies +# functions declared there are marked as inline or constexpr (which implies # inline). This makes sure the xsimd headers does not define symbol with global # linkage, and somehow convey our itnent to have all functions in xsimd being # inlined by the compiler. From e27e3fe89450f4ce76d2aa58eb844e187460bfaa Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 14:54:33 +0100 Subject: [PATCH 05/16] Refactor bit utils --- include/xsimd/utils/bits.hpp | 40 +++++++++++++ include/xsimd/xsimd_cpu_features_x86.hpp | 71 +++++++++--------------- 2 files changed, 66 insertions(+), 45 deletions(-) create mode 100644 include/xsimd/utils/bits.hpp diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp new file mode 100644 index 000000000..e3ad83e30 --- /dev/null +++ b/include/xsimd/utils/bits.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ***************************************************************************/ + +#ifndef XSIMD_CPUID_UTILS_HPP +#define XSIMD_CPUID_UTILS_HPP + +namespace xsimd +{ + namespace utils + { + template + constexpr I make_bit_mask(I bit) + { + return static_cast(I { 1 } << bit); + } + + template + constexpr I make_bit_mask(I bit, Args... bits) + { + return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); + } + + template + constexpr bool bit_is_set(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bits)...); + return (value & mask) == mask; + } + } +} + +#endif diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index 99e6b6eb3..ba816d2cd 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -15,35 +15,16 @@ #include #include "./config/xsimd_config.hpp" +#include "./utils/bits.hpp" #if XSIMD_TARGET_X86 && defined(_MSC_VER) -// Contains the definition of __cpuidex -#include +#include // Contains the definition of __cpuidex #endif namespace xsimd { namespace detail { - template - constexpr I make_bit_mask(I bit) - { - return static_cast(I { 1 } << bit); - } - - template - constexpr I make_bit_mask(I bit, Args... bits) - { - return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); - } - - template - constexpr bool bit_is_set(I value) - { - constexpr I mask = make_bit_mask(static_cast(Bits)...); - return (value & mask) == mask; - } - inline void get_cpuid(int reg[4], int level, int count = 0) noexcept; inline std::uint32_t get_xcr0_low() noexcept; @@ -83,20 +64,20 @@ namespace xsimd constexpr bool sse_state_os_enabled() const noexcept { - return detail::bit_is_set<1>(m_low); + return utils::bit_is_set<1>(m_low); } constexpr bool avx_state_os_enabled() const noexcept { // Check both SSE and AVX bits even though AVX must imply SSE - return detail::bit_is_set<1, 2>(m_low); + return utils::bit_is_set<1, 2>(m_low); } constexpr bool avx512_state_os_enabled() const noexcept { // Check all SSE, AVX, and AVX52 bits even though AVX512 must // imply AVX and SSE - return detail::bit_is_set<1, 2, 6>(m_low); + return utils::bit_is_set<1, 2, 6>(m_low); } private: @@ -141,32 +122,32 @@ namespace xsimd constexpr bool sse2() const noexcept { - return detail::bit_is_set<26>(m_regs.reg1[3]); + return utils::bit_is_set<26>(m_regs.reg1[3]); } constexpr bool sse3() const noexcept { - return detail::bit_is_set<0>(m_regs.reg1[2]); + return utils::bit_is_set<0>(m_regs.reg1[2]); } constexpr bool ssse3() const noexcept { - return detail::bit_is_set<9>(m_regs.reg1[2]); + return utils::bit_is_set<9>(m_regs.reg1[2]); } constexpr bool sse4_1() const noexcept { - return detail::bit_is_set<19>(m_regs.reg1[2]); + return utils::bit_is_set<19>(m_regs.reg1[2]); } constexpr bool sse4_2() const noexcept { - return detail::bit_is_set<20>(m_regs.reg1[2]); + return utils::bit_is_set<20>(m_regs.reg1[2]); } constexpr bool fma3() const noexcept { - return detail::bit_is_set<12>(m_regs.reg1[2]); + return utils::bit_is_set<12>(m_regs.reg1[2]); } /** @@ -181,77 +162,77 @@ namespace xsimd */ constexpr bool osxsave() const noexcept { - return detail::bit_is_set<27>(m_regs.reg1[2]); + return utils::bit_is_set<27>(m_regs.reg1[2]); } constexpr bool avx() const noexcept { - return detail::bit_is_set<28>(m_regs.reg1[2]); + return utils::bit_is_set<28>(m_regs.reg1[2]); } constexpr bool avx2() const noexcept { - return detail::bit_is_set<5>(m_regs.reg7[1]); + return utils::bit_is_set<5>(m_regs.reg7[1]); } constexpr bool avx512f() const noexcept { - return detail::bit_is_set<16>(m_regs.reg7[1]); + return utils::bit_is_set<16>(m_regs.reg7[1]); } constexpr bool avx512dq() const noexcept { - return detail::bit_is_set<17>(m_regs.reg7[1]); + return utils::bit_is_set<17>(m_regs.reg7[1]); } constexpr bool avx512ifma() const noexcept { - return detail::bit_is_set<21>(m_regs.reg7[1]); + return utils::bit_is_set<21>(m_regs.reg7[1]); } constexpr bool avx512pf() const noexcept { - return detail::bit_is_set<26>(m_regs.reg7[1]); + return utils::bit_is_set<26>(m_regs.reg7[1]); } constexpr bool avx512er() const noexcept { - return detail::bit_is_set<27>(m_regs.reg7[1]); + return utils::bit_is_set<27>(m_regs.reg7[1]); } constexpr bool avx512cd() const noexcept { - return detail::bit_is_set<28>(m_regs.reg7[1]); + return utils::bit_is_set<28>(m_regs.reg7[1]); } constexpr bool avx512bw() const noexcept { - return detail::bit_is_set<30>(m_regs.reg7[1]); + return utils::bit_is_set<30>(m_regs.reg7[1]); } constexpr bool avx512vbmi() const noexcept { - return detail::bit_is_set<1>(m_regs.reg7[2]); + return utils::bit_is_set<1>(m_regs.reg7[2]); } constexpr bool avx512vbmi2() const noexcept { - return detail::bit_is_set<6>(m_regs.reg7[2]); + return utils::bit_is_set<6>(m_regs.reg7[2]); } constexpr bool avx512vnni_bw() const noexcept { - return detail::bit_is_set<11>(m_regs.reg7[2]); + return utils::bit_is_set<11>(m_regs.reg7[2]); } constexpr bool avxvnni() const noexcept { - return detail::bit_is_set<4>(m_regs.reg7a[0]); + return utils::bit_is_set<4>(m_regs.reg7a[0]); } constexpr bool fma4() const noexcept { - return detail::bit_is_set<16>(m_regs.reg8[2]); + return utils::bit_is_set<16>(m_regs.reg8[2]); } private: From a5b1732a85347fa18aa25e6952c0e1eda93f45d6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 17:32:56 +0100 Subject: [PATCH 06/16] Fix inline header in wrong location --- include/xsimd/types/xsimd_all_registers.hpp | 2 -- include/xsimd/types/xsimd_register.hpp | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index da87df2e4..33f9b465d 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -9,8 +9,6 @@ * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#include "../config/xsimd_inline.hpp" - #include "xsimd_fma3_sse_register.hpp" #include "xsimd_fma4_register.hpp" #include "xsimd_sse2_register.hpp" diff --git a/include/xsimd/types/xsimd_register.hpp b/include/xsimd/types/xsimd_register.hpp index 018418af6..b14962e5b 100644 --- a/include/xsimd/types/xsimd_register.hpp +++ b/include/xsimd/types/xsimd_register.hpp @@ -14,6 +14,8 @@ #include +#include "../config/xsimd_inline.hpp" + namespace xsimd { namespace types From a5f933819a5cd6244df232d3c3ff2bb61da82a28 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 17:43:44 +0100 Subject: [PATCH 07/16] Fix cpuid aliases --- include/xsimd/xsimd_cpu_features_x86.hpp | 36 ++++++++++++------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index ba816d2cd..ec579aaec 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -12,6 +12,7 @@ #ifndef XSIMD_CPU_FEATURES_X86_HPP #define XSIMD_CPU_FEATURES_X86_HPP +#include #include #include "./config/xsimd_config.hpp" @@ -25,9 +26,11 @@ namespace xsimd { namespace detail { - inline void get_cpuid(int reg[4], int level, int count = 0) noexcept; + using cpuid_reg_t = std::array; + inline cpuid_reg_t get_cpuid(int level, int count = 0) noexcept; - inline std::uint32_t get_xcr0_low() noexcept; + using xcr0_reg_t = std::uint32_t; + inline xcr0_reg_t get_xcr0_low() noexcept; } /* @@ -42,7 +45,7 @@ namespace xsimd class x86_xcr0 { public: - using reg_t = std::uint32_t; + using reg_t = detail::xcr0_reg_t; /** Parse a XCR0 value into individual components. */ constexpr explicit x86_xcr0(reg_t low) noexcept @@ -81,7 +84,7 @@ namespace xsimd } private: - std::uint32_t m_low = {}; + reg_t m_low = {}; }; class x86_cpu_id @@ -89,7 +92,7 @@ namespace xsimd public: struct cpu_id_regs { - using reg_t = int[4]; + using reg_t = detail::cpuid_reg_t; reg_t reg1 = {}; reg_t reg7 = {}; @@ -113,10 +116,10 @@ namespace xsimd inline static x86_cpu_id read() { cpu_id_regs regs = {}; - detail::get_cpuid(regs.reg1, 0x1); - detail::get_cpuid(regs.reg7, 0x7); - detail::get_cpuid(regs.reg7a, 0x7, 0x1); - detail::get_cpuid(regs.reg8, 0x80000001); + regs.reg1 = detail::get_cpuid(0x1); + regs.reg7 = detail::get_cpuid(0x7); + regs.reg7a = detail::get_cpuid(0x7, 0x1); + regs.reg8 = detail::get_cpuid(0x80000001); return x86_cpu_id(regs); } @@ -241,15 +244,12 @@ namespace xsimd namespace detail { - inline void get_cpuid(int reg[4], int level, int count) noexcept + inline cpuid_reg_t get_cpuid(int level, int count) noexcept { #if !XSIMD_TARGET_X86 - reg[0] = 0; - reg[1] = 0; - reg[2] = 0; - reg[3] = 0; (void)level; (void)count; + return {}; // All bits to zero #elif defined(_MSC_VER) __cpuidex(reg, level, count); @@ -275,16 +275,16 @@ namespace xsimd #endif } - inline std::uint32_t get_xcr0_low() noexcept + inline xcr0_reg_t get_xcr0_low() noexcept { #if !XSIMD_TARGET_X86 - return {}; // return 0; + return {}; // All bits to zero #elif defined(_MSC_VER) && _MSC_VER >= 1400 - return static_cast(_xgetbv(0)); + return static_cast(_xgetbv(0)); #elif defined(__GNUC__) - std::uint32_t xcr0 = {}; + xcr0_reg_t xcr0 = {}; __asm__( "xorl %%ecx, %%ecx\n" "xgetbv\n" From d3243519a87d82d00ac2bf52ce281a9471d61308 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 17:53:01 +0100 Subject: [PATCH 08/16] Add future TODO --- include/xsimd/utils/bits.hpp | 1 + include/xsimd/xsimd_cpu_features_x86.hpp | 1 + 2 files changed, 2 insertions(+) diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp index e3ad83e30..a8a862219 100644 --- a/include/xsimd/utils/bits.hpp +++ b/include/xsimd/utils/bits.hpp @@ -25,6 +25,7 @@ namespace xsimd template constexpr I make_bit_mask(I bit, Args... bits) { + // TODO(C++17): Use fold expression return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); } diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index ec579aaec..7af58fcd0 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -116,6 +116,7 @@ namespace xsimd inline static x86_cpu_id read() { cpu_id_regs regs = {}; + // TODO(C++20): Use designated initializer regs.reg1 = detail::get_cpuid(0x1); regs.reg7 = detail::get_cpuid(0x7); regs.reg7a = detail::get_cpuid(0x7, 0x1); From 1691bad3368939d70cf086f340004e4334dcbe4e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 30 Jan 2026 17:55:51 +0100 Subject: [PATCH 09/16] Fix missing var --- include/xsimd/xsimd_cpu_features_x86.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index 7af58fcd0..cad9e2580 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -78,7 +78,7 @@ namespace xsimd constexpr bool avx512_state_os_enabled() const noexcept { - // Check all SSE, AVX, and AVX52 bits even though AVX512 must + // Check all SSE, AVX, and AVX512 bits even though AVX512 must // imply AVX and SSE return utils::bit_is_set<1, 2, 6>(m_low); } @@ -247,16 +247,18 @@ namespace xsimd { inline cpuid_reg_t get_cpuid(int level, int count) noexcept { + cpuid_reg_t reg = {}; + #if !XSIMD_TARGET_X86 (void)level; (void)count; return {}; // All bits to zero #elif defined(_MSC_VER) - __cpuidex(reg, level, count); + __cpuidex(reg.data(), level, count); #elif defined(__INTEL_COMPILER) - __cpuid(reg, level); + __cpuid(reg.data(), level); #elif defined(__GNUC__) || defined(__clang__) @@ -274,6 +276,7 @@ namespace xsimd : "0"(level), "2"(count)); #endif #endif + return reg; } inline xcr0_reg_t get_xcr0_low() noexcept From 525e8f8db2fec29234ac8c358c718ebfcf7ff50d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 2 Feb 2026 16:18:27 +0100 Subject: [PATCH 10/16] Com[act lines --- include/xsimd/xsimd_cpu_features_x86.hpp | 105 +++++------------------ 1 file changed, 21 insertions(+), 84 deletions(-) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index cad9e2580..b725d93c4 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -124,35 +124,17 @@ namespace xsimd return x86_cpu_id(regs); } - constexpr bool sse2() const noexcept - { - return utils::bit_is_set<26>(m_regs.reg1[3]); - } + constexpr bool sse2() const noexcept { return utils::bit_is_set<26>(m_regs.reg1[3]); } - constexpr bool sse3() const noexcept - { - return utils::bit_is_set<0>(m_regs.reg1[2]); - } + constexpr bool sse3() const noexcept { return utils::bit_is_set<0>(m_regs.reg1[2]); } - constexpr bool ssse3() const noexcept - { - return utils::bit_is_set<9>(m_regs.reg1[2]); - } + constexpr bool ssse3() const noexcept { return utils::bit_is_set<9>(m_regs.reg1[2]); } - constexpr bool sse4_1() const noexcept - { - return utils::bit_is_set<19>(m_regs.reg1[2]); - } + constexpr bool sse4_1() const noexcept { return utils::bit_is_set<19>(m_regs.reg1[2]); } - constexpr bool sse4_2() const noexcept - { - return utils::bit_is_set<20>(m_regs.reg1[2]); - } + constexpr bool sse4_2() const noexcept { return utils::bit_is_set<20>(m_regs.reg1[2]); } - constexpr bool fma3() const noexcept - { - return utils::bit_is_set<12>(m_regs.reg1[2]); - } + constexpr bool fma3() const noexcept { return utils::bit_is_set<12>(m_regs.reg1[2]); } /** * Indicates whether the OS has enabled extended state management. @@ -164,80 +146,35 @@ namespace xsimd * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects * the state of CR4.OSXSAVE. */ - constexpr bool osxsave() const noexcept - { - return utils::bit_is_set<27>(m_regs.reg1[2]); - } + constexpr bool osxsave() const noexcept { return utils::bit_is_set<27>(m_regs.reg1[2]); } - constexpr bool avx() const noexcept - { - return utils::bit_is_set<28>(m_regs.reg1[2]); - } + constexpr bool avx() const noexcept { return utils::bit_is_set<28>(m_regs.reg1[2]); } - constexpr bool avx2() const noexcept - { - return utils::bit_is_set<5>(m_regs.reg7[1]); - } + constexpr bool avx2() const noexcept { return utils::bit_is_set<5>(m_regs.reg7[1]); } - constexpr bool avx512f() const noexcept - { - return utils::bit_is_set<16>(m_regs.reg7[1]); - } + constexpr bool avx512f() const noexcept { return utils::bit_is_set<16>(m_regs.reg7[1]); } - constexpr bool avx512dq() const noexcept - { - return utils::bit_is_set<17>(m_regs.reg7[1]); - } + constexpr bool avx512dq() const noexcept { return utils::bit_is_set<17>(m_regs.reg7[1]); } - constexpr bool avx512ifma() const noexcept - { - return utils::bit_is_set<21>(m_regs.reg7[1]); - } + constexpr bool avx512ifma() const noexcept { return utils::bit_is_set<21>(m_regs.reg7[1]); } - constexpr bool avx512pf() const noexcept - { - return utils::bit_is_set<26>(m_regs.reg7[1]); - } + constexpr bool avx512pf() const noexcept { return utils::bit_is_set<26>(m_regs.reg7[1]); } - constexpr bool avx512er() const noexcept - { - return utils::bit_is_set<27>(m_regs.reg7[1]); - } + constexpr bool avx512er() const noexcept { return utils::bit_is_set<27>(m_regs.reg7[1]); } - constexpr bool avx512cd() const noexcept - { - return utils::bit_is_set<28>(m_regs.reg7[1]); - } + constexpr bool avx512cd() const noexcept { return utils::bit_is_set<28>(m_regs.reg7[1]); } - constexpr bool avx512bw() const noexcept - { - return utils::bit_is_set<30>(m_regs.reg7[1]); - } + constexpr bool avx512bw() const noexcept { return utils::bit_is_set<30>(m_regs.reg7[1]); } - constexpr bool avx512vbmi() const noexcept - { - return utils::bit_is_set<1>(m_regs.reg7[2]); - } + constexpr bool avx512vbmi() const noexcept { return utils::bit_is_set<1>(m_regs.reg7[2]); } - constexpr bool avx512vbmi2() const noexcept - { - return utils::bit_is_set<6>(m_regs.reg7[2]); - } + constexpr bool avx512vbmi2() const noexcept { return utils::bit_is_set<6>(m_regs.reg7[2]); } - constexpr bool avx512vnni_bw() const noexcept - { - return utils::bit_is_set<11>(m_regs.reg7[2]); - } + constexpr bool avx512vnni_bw() const noexcept { return utils::bit_is_set<11>(m_regs.reg7[2]); } - constexpr bool avxvnni() const noexcept - { - return utils::bit_is_set<4>(m_regs.reg7a[0]); - } + constexpr bool avxvnni() const noexcept { return utils::bit_is_set<4>(m_regs.reg7a[0]); } - constexpr bool fma4() const noexcept - { - return utils::bit_is_set<16>(m_regs.reg8[2]); - } + constexpr bool fma4() const noexcept { return utils::bit_is_set<16>(m_regs.reg8[2]); } private: cpu_id_regs m_regs = {}; From a5cdb36337f6de24f390c7b48ffadbfde74fcd18 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 2 Feb 2026 16:24:20 +0100 Subject: [PATCH 11/16] Add doc --- include/xsimd/xsimd_cpu_features_x86.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index b725d93c4..eb691cf57 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -87,6 +87,15 @@ namespace xsimd reg_t m_low = {}; }; + /** + * CPU Identification (CPUID) instruction results. + * + * The CPUID instruction provides detailed information about the processor, + * including supported instruction set extensions (SSE, AVX, AVX-512, etc.). + * This utility parses CPUID leaf values to detect available CPU features. + * + * @see https://en.wikipedia.org/wiki/CPUID + */ class x86_cpu_id { public: From f629a99250ac35be1e0366460ae61c388ae11a9c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 2 Feb 2026 16:32:05 +0100 Subject: [PATCH 12/16] Shorter name --- include/xsimd/config/xsimd_cpuid.hpp | 52 ++++++++++++------------ include/xsimd/xsimd_cpu_features_x86.hpp | 6 +-- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index c64bfa6f8..b0f1ac47b 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -118,49 +118,49 @@ namespace xsimd const auto cpuid = xsimd::x86_cpu_id::read(); auto xcr0 = xsimd::x86_xcr0::make_false(); - bool sse_state_os_enabled = true; + bool sse_enabled = true; // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), // AVX state won't be preserved across context switches, so AVX cannot be used. - bool avx_state_os_enabled = false; - bool avx512_state_os_enabled = false; + bool avx_enabled = false; + bool avx512_enabled = false; if (cpuid.osxsave()) { xcr0 = xsimd::x86_xcr0::read(); - sse_state_os_enabled = xcr0.sse_state_os_enabled(); - avx_state_os_enabled = xcr0.avx_state_os_enabled(); - avx512_state_os_enabled = xcr0.avx512_state_os_enabled(); + sse_enabled = xcr0.sse_enabled(); + avx_enabled = xcr0.avx_enabled(); + avx512_enabled = xcr0.avx512_enabled(); } - sse2 = cpuid.sse2() && sse_state_os_enabled; - sse3 = cpuid.sse3() && sse_state_os_enabled; - ssse3 = cpuid.ssse3() && sse_state_os_enabled; - sse4_1 = cpuid.sse4_1() && sse_state_os_enabled; - sse4_2 = cpuid.sse4_2() && sse_state_os_enabled; - fma3_sse42 = cpuid.fma3() && sse_state_os_enabled; + sse2 = cpuid.sse2() && sse_enabled; + sse3 = cpuid.sse3() && sse_enabled; + ssse3 = cpuid.ssse3() && sse_enabled; + sse4_1 = cpuid.sse4_1() && sse_enabled; + sse4_2 = cpuid.sse4_2() && sse_enabled; + fma3_sse42 = cpuid.fma3() && sse_enabled; // sse4a not implemented in cpu_id yet // xop not implemented in cpu_id yet - avx = cpuid.avx() && avx_state_os_enabled; + avx = cpuid.avx() && avx_enabled; fma3_avx = avx && fma3_sse42; - fma4 = cpuid.fma4() && avx_state_os_enabled; - avx2 = cpuid.avx2() && avx_state_os_enabled; - avxvnni = cpuid.avxvnni() && avx_state_os_enabled; + fma4 = cpuid.fma4() && avx_enabled; + avx2 = cpuid.avx2() && avx_enabled; + avxvnni = cpuid.avxvnni() && avx_enabled; fma3_avx2 = avx2 && fma3_sse42; - avx512f = cpuid.avx512f() && avx512_state_os_enabled; - avx512cd = cpuid.avx512cd() && avx512_state_os_enabled; - avx512dq = cpuid.avx512dq() && avx512_state_os_enabled; - avx512bw = cpuid.avx512bw() && avx512_state_os_enabled; - avx512er = cpuid.avx512er() && avx512_state_os_enabled; - avx512pf = cpuid.avx512pf() && avx512_state_os_enabled; - avx512ifma = cpuid.avx512ifma() && avx512_state_os_enabled; - avx512vbmi = cpuid.avx512vbmi() && avx512_state_os_enabled; - avx512vbmi2 = cpuid.avx512vbmi2() && avx512_state_os_enabled; - avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_state_os_enabled; + avx512f = cpuid.avx512f() && avx512_enabled; + avx512cd = cpuid.avx512cd() && avx512_enabled; + avx512dq = cpuid.avx512dq() && avx512_enabled; + avx512bw = cpuid.avx512bw() && avx512_enabled; + avx512er = cpuid.avx512er() && avx512_enabled; + avx512pf = cpuid.avx512pf() && avx512_enabled; + avx512ifma = cpuid.avx512ifma() && avx512_enabled; + avx512vbmi = cpuid.avx512vbmi() && avx512_enabled; + avx512vbmi2 = cpuid.avx512vbmi2() && avx512_enabled; + avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_enabled; avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; } }; diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index eb691cf57..7837d92d8 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -65,18 +65,18 @@ namespace xsimd return x86_xcr0(detail::get_xcr0_low()); } - constexpr bool sse_state_os_enabled() const noexcept + constexpr bool sse_enabled() const noexcept { return utils::bit_is_set<1>(m_low); } - constexpr bool avx_state_os_enabled() const noexcept + constexpr bool avx_enabled() const noexcept { // Check both SSE and AVX bits even though AVX must imply SSE return utils::bit_is_set<1, 2>(m_low); } - constexpr bool avx512_state_os_enabled() const noexcept + constexpr bool avx512_enabled() const noexcept { // Check all SSE, AVX, and AVX512 bits even though AVX512 must // imply AVX and SSE From db51ffef566c8462caf0d5d43c3a4c8eb76e3101 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 3 Feb 2026 09:55:03 +0100 Subject: [PATCH 13/16] Add default minimal factory for xcr0 --- include/xsimd/config/xsimd_cpuid.hpp | 61 +++++++++--------------- include/xsimd/utils/bits.hpp | 7 +++ include/xsimd/xsimd_cpu_features_x86.hpp | 25 +++++++--- 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index b0f1ac47b..5b2a16636 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -115,52 +115,37 @@ namespace xsimd rvv = bool(getauxval(AT_HWCAP) & HWCAP_V); #endif #endif + // Safe on all platforms, we simply be false const auto cpuid = xsimd::x86_cpu_id::read(); - auto xcr0 = xsimd::x86_xcr0::make_false(); - - bool sse_enabled = true; - // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. - // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), - // AVX state won't be preserved across context switches, so AVX cannot be used. - bool avx_enabled = false; - bool avx512_enabled = false; - - if (cpuid.osxsave()) - { - xcr0 = xsimd::x86_xcr0::read(); - - sse_enabled = xcr0.sse_enabled(); - avx_enabled = xcr0.avx_enabled(); - avx512_enabled = xcr0.avx512_enabled(); - } - - sse2 = cpuid.sse2() && sse_enabled; - sse3 = cpuid.sse3() && sse_enabled; - ssse3 = cpuid.ssse3() && sse_enabled; - sse4_1 = cpuid.sse4_1() && sse_enabled; - sse4_2 = cpuid.sse4_2() && sse_enabled; - fma3_sse42 = cpuid.fma3() && sse_enabled; + const auto xcr0 = cpuid.osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default(); + + sse2 = cpuid.sse2() && xcr0.sse_enabled(); + sse3 = cpuid.sse3() && xcr0.sse_enabled(); + ssse3 = cpuid.ssse3() && xcr0.sse_enabled(); + sse4_1 = cpuid.sse4_1() && xcr0.sse_enabled(); + sse4_2 = cpuid.sse4_2() && xcr0.sse_enabled(); + fma3_sse42 = cpuid.fma3() && xcr0.sse_enabled(); // sse4a not implemented in cpu_id yet // xop not implemented in cpu_id yet - avx = cpuid.avx() && avx_enabled; + avx = cpuid.avx() && xcr0.avx_enabled(); fma3_avx = avx && fma3_sse42; - fma4 = cpuid.fma4() && avx_enabled; - avx2 = cpuid.avx2() && avx_enabled; - avxvnni = cpuid.avxvnni() && avx_enabled; + fma4 = cpuid.fma4() && xcr0.avx_enabled(); + avx2 = cpuid.avx2() && xcr0.avx_enabled(); + avxvnni = cpuid.avxvnni() && xcr0.avx_enabled(); fma3_avx2 = avx2 && fma3_sse42; - avx512f = cpuid.avx512f() && avx512_enabled; - avx512cd = cpuid.avx512cd() && avx512_enabled; - avx512dq = cpuid.avx512dq() && avx512_enabled; - avx512bw = cpuid.avx512bw() && avx512_enabled; - avx512er = cpuid.avx512er() && avx512_enabled; - avx512pf = cpuid.avx512pf() && avx512_enabled; - avx512ifma = cpuid.avx512ifma() && avx512_enabled; - avx512vbmi = cpuid.avx512vbmi() && avx512_enabled; - avx512vbmi2 = cpuid.avx512vbmi2() && avx512_enabled; - avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_enabled; + avx512f = cpuid.avx512f() && xcr0.avx512_enabled(); + avx512cd = cpuid.avx512cd() && xcr0.avx512_enabled(); + avx512dq = cpuid.avx512dq() && xcr0.avx512_enabled(); + avx512bw = cpuid.avx512bw() && xcr0.avx512_enabled(); + avx512er = cpuid.avx512er() && xcr0.avx512_enabled(); + avx512pf = cpuid.avx512pf() && xcr0.avx512_enabled(); + avx512ifma = cpuid.avx512ifma() && xcr0.avx512_enabled(); + avx512vbmi = cpuid.avx512vbmi() && xcr0.avx512_enabled(); + avx512vbmi2 = cpuid.avx512vbmi2() && xcr0.avx512_enabled(); + avx512vnni_bw = cpuid.avx512vnni_bw() && xcr0.avx512_enabled(); avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; } }; diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp index a8a862219..f9e6c8af1 100644 --- a/include/xsimd/utils/bits.hpp +++ b/include/xsimd/utils/bits.hpp @@ -35,6 +35,13 @@ namespace xsimd constexpr I mask = make_bit_mask(static_cast(Bits)...); return (value & mask) == mask; } + + template + constexpr I set_bit(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bit)); + return value | mask; + } } } diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp index 7837d92d8..4872f81c9 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -47,16 +47,29 @@ namespace xsimd public: using reg_t = detail::xcr0_reg_t; + static constexpr reg_t sse_bit = 1; + static constexpr reg_t avx_bit = 2; + static constexpr reg_t avx512_bit = 6; + /** Parse a XCR0 value into individual components. */ constexpr explicit x86_xcr0(reg_t low) noexcept : m_low(low) { } - /** Create an object that has all features set to false. */ - static constexpr x86_xcr0 make_false() + /** + * Create a default value with only SSE enabled. + * + * AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. + * If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), AVX state won't + * be preserved across context switches, so AVX cannot be used. + * SSE is therefore the only value safe to assume. + */ + constexpr static x86_xcr0 safe_default() noexcept { - return x86_xcr0(0); + reg_t low = {}; + low = utils::set_bit(low); + return x86_xcr0(low); } /** Read the XCR0 register from the CPU if on the correct architecture. */ @@ -67,20 +80,20 @@ namespace xsimd constexpr bool sse_enabled() const noexcept { - return utils::bit_is_set<1>(m_low); + return utils::bit_is_set(m_low); } constexpr bool avx_enabled() const noexcept { // Check both SSE and AVX bits even though AVX must imply SSE - return utils::bit_is_set<1, 2>(m_low); + return utils::bit_is_set(m_low); } constexpr bool avx512_enabled() const noexcept { // Check all SSE, AVX, and AVX512 bits even though AVX512 must // imply AVX and SSE - return utils::bit_is_set<1, 2, 6>(m_low); + return utils::bit_is_set(m_low); } private: From bde1481c6b1cc6a6acefa2d91d7ac139e538df91 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 3 Feb 2026 09:57:36 +0100 Subject: [PATCH 14/16] Move cpu features to config --- include/xsimd/{ => config}/xsimd_cpu_features_x86.hpp | 4 ++-- include/xsimd/config/xsimd_cpuid.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename include/xsimd/{ => config}/xsimd_cpu_features_x86.hpp (99%) diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp similarity index 99% rename from include/xsimd/xsimd_cpu_features_x86.hpp rename to include/xsimd/config/xsimd_cpu_features_x86.hpp index 4872f81c9..6edd86e09 100644 --- a/include/xsimd/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -15,8 +15,8 @@ #include #include -#include "./config/xsimd_config.hpp" -#include "./utils/bits.hpp" +#include "../utils/bits.hpp" +#include "./xsimd_config.hpp" #if XSIMD_TARGET_X86 && defined(_MSC_VER) #include // Contains the definition of __cpuidex diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 5b2a16636..c23a161ee 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -13,7 +13,7 @@ #define XSIMD_CPUID_HPP #include "../types/xsimd_all_registers.hpp" -#include "../xsimd_cpu_features_x86.hpp" +#include "./xsimd_cpu_features_x86.hpp" #include "xsimd_inline.hpp" #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) From 8083e0c8ae6ad5eaaa72df98c6a3a589aad34dec Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Feb 2026 17:46:50 +0100 Subject: [PATCH 15/16] Make register class implementation private --- .../xsimd/config/xsimd_cpu_features_x86.hpp | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index 6edd86e09..6d5af94f2 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -45,18 +45,6 @@ namespace xsimd class x86_xcr0 { public: - using reg_t = detail::xcr0_reg_t; - - static constexpr reg_t sse_bit = 1; - static constexpr reg_t avx_bit = 2; - static constexpr reg_t avx512_bit = 6; - - /** Parse a XCR0 value into individual components. */ - constexpr explicit x86_xcr0(reg_t low) noexcept - : m_low(low) - { - } - /** * Create a default value with only SSE enabled. * @@ -78,6 +66,9 @@ namespace xsimd return x86_xcr0(detail::get_xcr0_low()); } + /** Create a value which return false to everything. */ + constexpr x86_xcr0() noexcept = default; + constexpr bool sse_enabled() const noexcept { return utils::bit_is_set(m_low); @@ -97,6 +88,17 @@ namespace xsimd } private: + using reg_t = detail::xcr0_reg_t; + + static constexpr reg_t sse_bit = 1; + static constexpr reg_t avx_bit = 2; + static constexpr reg_t avx512_bit = 6; + + /** Parse a XCR0 value into individual components. */ + constexpr explicit x86_xcr0(reg_t low) noexcept + : m_low(low) + { + } reg_t m_low = {}; }; @@ -112,22 +114,6 @@ namespace xsimd class x86_cpu_id { public: - struct cpu_id_regs - { - using reg_t = detail::cpuid_reg_t; - - reg_t reg1 = {}; - reg_t reg7 = {}; - reg_t reg7a = {}; - reg_t reg8 = {}; - }; - - /** Parse CpuInfo register values into individual components. */ - constexpr explicit x86_cpu_id(const cpu_id_regs& regs) noexcept - : m_regs(regs) - { - } - /** * Read the CpuId registers from the CPU if on the correct architecture. * @@ -146,6 +132,9 @@ namespace xsimd return x86_cpu_id(regs); } + /** Create a value which return false to everything. */ + constexpr x86_cpu_id() noexcept = default; + constexpr bool sse2() const noexcept { return utils::bit_is_set<26>(m_regs.reg1[3]); } constexpr bool sse3() const noexcept { return utils::bit_is_set<0>(m_regs.reg1[2]); } @@ -199,6 +188,21 @@ namespace xsimd constexpr bool fma4() const noexcept { return utils::bit_is_set<16>(m_regs.reg8[2]); } private: + struct cpu_id_regs + { + using reg_t = detail::cpuid_reg_t; + + reg_t reg1 = {}; + reg_t reg7 = {}; + reg_t reg7a = {}; + reg_t reg8 = {}; + }; + + /** Parse CpuInfo register values into individual components. */ + constexpr explicit x86_cpu_id(const cpu_id_regs& regs) noexcept + : m_regs(regs) + { + } cpu_id_regs m_regs = {}; }; From 363258b213b36db9932d2fa4d78215f6060dc97e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Feb 2026 18:04:26 +0100 Subject: [PATCH 16/16] Document all xcr0 bits --- .../xsimd/config/xsimd_cpu_features_x86.hpp | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp index 6d5af94f2..5ad0bce79 100644 --- a/include/xsimd/config/xsimd_cpu_features_x86.hpp +++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp @@ -56,7 +56,7 @@ namespace xsimd constexpr static x86_xcr0 safe_default() noexcept { reg_t low = {}; - low = utils::set_bit(low); + low = utils::set_bit(bit::sse)>(low); return x86_xcr0(low); } @@ -71,28 +71,52 @@ namespace xsimd constexpr bool sse_enabled() const noexcept { - return utils::bit_is_set(m_low); + return bit_is_set(m_low); } constexpr bool avx_enabled() const noexcept { // Check both SSE and AVX bits even though AVX must imply SSE - return utils::bit_is_set(m_low); + return bit_is_set(m_low); } constexpr bool avx512_enabled() const noexcept { // Check all SSE, AVX, and AVX512 bits even though AVX512 must // imply AVX and SSE - return utils::bit_is_set(m_low); + return bit_is_set(m_low); } private: using reg_t = detail::xcr0_reg_t; - static constexpr reg_t sse_bit = 1; - static constexpr reg_t avx_bit = 2; - static constexpr reg_t avx512_bit = 6; + enum class bit : reg_t + { + /** x87 FPU/MMX support (must be 1). */ + x87 = 0, + /** XSAVE support for MXCSR and XMM registers. */ + sse = 1, + /** AVX enabled and XSAVE support for upper halves of YMM registers. */ + avx = 2, + /** MPX enabled and XSAVE support for BND0-BND3 registers. */ + bndreg = 3, + /** MPX enabled and XSAVE support for BNDCFGU and BNDSTATUS registers. */ + bndcsr = 4, + /** AVX-512 enabled and XSAVE support for opmask registers k0-k7. */ + opmask = 5, + /** AVX-512 enabled and XSAVE support for upper halves of lower ZMM registers. */ + zmm_hi256 = 6, + /** AVX-512 enabled and XSAVE support for upper ZMM registers. */ + hi16_zmm = 7, + /** XSAVE support for PKRU register. */ + pkru = 9, + }; + + template + static constexpr bool bit_is_set(reg_t value) noexcept + { + return utils::bit_is_set(Bits)...>(value); + } /** Parse a XCR0 value into individual components. */ constexpr explicit x86_xcr0(reg_t low) noexcept