From 934bddce4fcd7379f50670ddf8b1f4d27bd7baea Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 21 May 2026 06:33:55 -0400 Subject: [PATCH 1/3] ci: support XSIMD_DEFAULT_ARCH override and verify default_arch The CI job that exercises avx512vl_128 as the default arch was passing the variable but the test build never received it; this fixes both halves of the loop (linux.yml typo $CXX_FLAGS->$CXXFLAGS, CMakeLists propagation, and a static_assert in test_arch). --- .github/workflows/linux.yml | 2 +- test/CMakeLists.txt | 3 +++ test/test_arch.cpp | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03a914bda..b66c9a97c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -100,7 +100,7 @@ jobs: fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" + CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 662dcdc3f..b9a6d2968 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -49,6 +49,9 @@ if (TARGET_EMULATED) message(STATUS "Using emulated target: ${TARGET_EMULATED}") set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1) unset(TARGET_ARCH CACHE) +elseif (DEFINED XSIMD_DEFAULT_ARCH AND NOT "${XSIMD_DEFAULT_ARCH}" STREQUAL "") + message(STATUS "Forcing default arch to xsimd::${XSIMD_DEFAULT_ARCH}") + set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${XSIMD_DEFAULT_ARCH}) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") diff --git a/test/test_arch.cpp b/test/test_arch.cpp index d2d0df249..5d479bbaf 100644 --- a/test/test_arch.cpp +++ b/test/test_arch.cpp @@ -23,6 +23,12 @@ static_assert(xsimd::default_arch::supported(), "default arch must be supported" static_assert(std::is_same::value, "default arch is the best available"); static_assert(xsimd::supported_architectures::contains(), "default arch is supported"); static_assert(xsimd::all_architectures::contains(), "default arch is a valid arch"); +#else +namespace xsimd +{ + static_assert(std::is_same::value, + "default_arch does not match XSIMD_DEFAULT_ARCH"); +} #endif #if !XSIMD_WITH_SVE From 7c36cbca55b1205f28f6d556a3cb553072ad4d86 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 21 May 2026 06:35:19 -0400 Subject: [PATCH 2/3] chore: misc small fixes touched while debugging masked memory - sse2 store_masked: align call signature to the (Mode, common{}) shape - avx512vl_register: doc comment AVX512DQ -> AVX512VL typo - isa.hpp: include _128 headers before wider arch so the AVX half-fold's recursive call resolves at parse time (load-bearing for the upcoming perf change); wrap in clang-format off/on - avx_128 swizzle: split the dynamic-mask overload into per-type ovlds for true C++14 builds where XSIMD_IF_CONSTEXPR is plain 'if' and both branches must type-check --- include/xsimd/arch/xsimd_avx_128.hpp | 28 +++++++++---------- include/xsimd/arch/xsimd_isa.hpp | 9 ++++-- include/xsimd/arch/xsimd_sse2.hpp | 2 +- .../xsimd/types/xsimd_avx512vl_register.hpp | 2 +- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 46fc9acb7..07dafd78b 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -129,20 +129,20 @@ namespace xsimd } // swizzle (dynamic mask) - template ::value && sizeof(T) == sizeof(ITy)>> - XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - return _mm_permutevar_ps(self, mask); - } - else - { - // VPERMILPD's variable control reads bit 1 of each 64-bit selector - // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. - // Negation is a cheap alternative to a left shift by 1. - return _mm_permutevar_pd(self, -mask); - } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(float) == sizeof(ITy), "index type must match value width"); + return _mm_permutevar_ps(self, mask); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(double) == sizeof(ITy), "index type must match value width"); + // VPERMILPD's variable control reads bit 1 of each 64-bit selector + // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. + // Negation is a cheap alternative to a left shift by 1. + return _mm_permutevar_pd(self, -mask); } // swizzle (constant mask) diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index cf88f64d7..06edfa98f 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -48,8 +48,11 @@ #endif #if XSIMD_WITH_AVX -#include "./xsimd_avx.hpp" +// clang-format off +// _128 first: avx half-fold recursive call needs avx_128 visible at parse time. #include "./xsimd_avx_128.hpp" +#include "./xsimd_avx.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX @@ -61,8 +64,10 @@ #endif #if XSIMD_WITH_AVX2 -#include "./xsimd_avx2.hpp" +// clang-format off #include "./xsimd_avx2_128.hpp" +#include "./xsimd_avx2.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX2 diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c6cfb5f07..0a95aae8b 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -2331,7 +2331,7 @@ namespace xsimd } else { - store_masked(mem, src, mask, requires_arch {}); + store_masked(mem, src, mask, aligned_mode {}, common {}); } } diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp index c73c2a963..46d887333 100644 --- a/include/xsimd/types/xsimd_avx512vl_register.hpp +++ b/include/xsimd/types/xsimd_avx512vl_register.hpp @@ -20,7 +20,7 @@ namespace xsimd /** * @ingroup architectures * - * AVX512DQ instructions + * AVX512VL instructions */ struct avx512vl : avx512cd { From ea882e639777b95a2aa268cce180dd11317d05ca Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 21 May 2026 07:13:32 -0400 Subject: [PATCH 3/3] fix(avx-512vl): masked load/store reach per-arch EVEX intrinsics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three coupled changes; gcc-10 partial-ordering forces them into a single commit: 1) Rewrite avx512vl_128 / avx512vl_256 masked load/store. Adds the missing int64/uint64/float/double load_masked ovlds, corrects the batch_bool_constant typing on store_masked (was uint32_t/uint64_t for signed-int/float/double stores, now matches the value type), and branches aligned vs. unaligned to the right EVEX intrinsic. Unsigned ovlds delegate to the signed one via bitwise_cast. 2) Constrain the non-VL master ovlds (avx_128 float/double, avx2_128 int32/uint32 + int64/uint64, avx2 templated and int32/uint32/int64/ uint64) and the common-memory int<->float bridges with !is_base_of. gcc-10's partial ordering otherwise sees a concrete requires_arch and the inherited concrete requires_arch (Y a base of X) as equally specialized, likewise for templated bridge vs. native when A is VL. gcc-14 handles both cases naturally so this is a no-op there. The avx native gains an is_floating_point SFINAE and the avx2 templated gains is_integral && sizeof>=4 so the new half-fold dispatch (half_arch = avx for floats, avx2 for ints in a 512-bit batch) is unambiguous on gcc-10. 3) Resolve the half-fold target arch in avx / avx2 / avx512f through make_sized_batch_t::arch_type so the dispatch picks avx512vl_128 / avx512vl_256 when available and emits EVEX vmovdqu32{k}{z} instead of VEX vpmaskmovd / vmaskmovps. (Without (3), (2)'s is_integral SFINAE on the avx2 templated form leaves the pre-existing avx512f.hpp:339 'store_masked(float*, __m256, ...)' callsite with no matching ovld on gcc-10.) The xsimd_batch dispatch drops the explicit args on the kernel::store_masked call so the SFINAE'd overload set can be resolved by ADL, and adds a fwd decl of make_sized_batch ahead of xsimd_isa.hpp so the half-fold sites can see the type at parse time. bridge_not_vl lives in xsimd_common_fwd next to the bridge fwd-decls; fwd.hpp now pulls xsimd_avx512vl_register so the trait sees complete types. The 4 redundant register.hpp includes that would otherwise be added at the point-of-use are dropped — they're reachable transitively through fwd.hpp. --- .../xsimd/arch/common/xsimd_common_memory.hpp | 23 +-- include/xsimd/arch/xsimd_avx.hpp | 38 ++--- include/xsimd/arch/xsimd_avx2.hpp | 46 +++--- include/xsimd/arch/xsimd_avx2_128.hpp | 24 ++-- include/xsimd/arch/xsimd_avx512f.hpp | 31 ++-- include/xsimd/arch/xsimd_avx512vl_128.hpp | 134 ++++++++++++++---- include/xsimd/arch/xsimd_avx512vl_256.hpp | 134 ++++++++++++++---- include/xsimd/arch/xsimd_avx_128.hpp | 12 +- include/xsimd/arch/xsimd_common_fwd.hpp | 24 ++-- include/xsimd/types/xsimd_batch.hpp | 7 +- 10 files changed, 333 insertions(+), 140 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 7a1ed73a3..e584c5151 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -388,22 +388,27 @@ namespace xsimd } } + // Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide + // their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge + // and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp) template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value, batch> + load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value, batch> + load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template - XSIMD_INLINE std::enable_if_t::value, batch> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); @@ -411,7 +416,7 @@ namespace xsimd } template - XSIMD_INLINE std::enable_if_t::value, batch> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); @@ -419,26 +424,28 @@ namespace xsimd } template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value> + store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value> + store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE std::enable_if_t::value> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE std::enable_if_t::value> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 1ee0c5b89..d2d527fec 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -993,19 +993,20 @@ namespace xsimd { using int_t = as_integer_t; constexpr size_t half_size = batch::size / 2; + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); - const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, avx_128 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); + const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, half_arch {}); return bitwise_cast(batch(_mm256_zextsi128_si256(lo))); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, avx_128 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, half_arch {}); return detail::zero_extend(hi); } else @@ -1019,36 +1020,39 @@ namespace xsimd namespace detail { template - XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(float* mem, batch, A> const& mask, batch const& src) noexcept { _mm256_maskstore_ps(mem, mask, src); } template - XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(double* mem, batch, A> const& mask, batch const& src) noexcept { _mm256_maskstore_pd(mem, mask, src); } } - template + template ::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half_size, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index e2c223cc7..ee0227066 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -138,7 +138,8 @@ namespace xsimd } // single templated implementation for integer masked loads (32/64-bit) - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { @@ -148,26 +149,30 @@ namespace xsimd return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); return bitwise_cast(r); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); @@ -190,24 +195,27 @@ namespace xsimd } } - template + template ::value && (sizeof(T) >= 4) && std::is_base_of::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to SSE + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to SSE + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); } else { @@ -215,14 +223,16 @@ namespace xsimd } } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template ::value && !std::is_base_of::value>> + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s32 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s32, mask, Mode {}, avx2 {}); + store_masked(reinterpret_cast(mem), s32, batch_bool_constant {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s64 = bitwise_cast(src); diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7a590c74f..f31900482 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -90,44 +90,52 @@ namespace xsimd } // load_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi32(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi64(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); } // store_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi32(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi64(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..cc057eacf 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -305,16 +305,17 @@ namespace xsimd convert, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = load_masked(mem, mlo, convert {}, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = load_masked(mem, mlo, convert {}, Mode {}, half_arch {}); return detail::load_masked(lo); // zero-extend low half } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, half_arch {}); return detail::load_masked(hi, detail::high_tag {}); } else @@ -332,17 +333,19 @@ namespace xsimd Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 155338425..855870af3 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_pd(mem, mask.mask(), src); + } } // max diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index a5ea546bc..c0b4a568e 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm256_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm256_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_pd(mem, mask.mask(), src); + } } // max diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 07dafd78b..30697d874 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -104,25 +104,29 @@ namespace xsimd } // load_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_ps(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_pd(mem, mask.as_batch()); } // store_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_ps(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_pd(mem, mask.as_batch(), src); diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index f5a7f4ffe..98249be90 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -13,6 +13,8 @@ #ifndef XSIMD_COMMON_FWD_HPP #define XSIMD_COMMON_FWD_HPP +#include "../types/xsimd_avx512vl_register.hpp" + #include #include #include @@ -43,6 +45,12 @@ namespace xsimd namespace kernel { + template + struct bridge_not_vl + : std::integral_constant::value && !std::is_base_of::value> + { + }; + // forward declaration template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept; @@ -88,21 +96,21 @@ namespace xsimd template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value> store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value> store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 970483150..5ae7006b0 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -544,6 +544,11 @@ namespace xsimd "Please use batch, A> initialized from xtl::xcomplex instead"); }; #endif + + template + struct make_sized_batch; + template + using make_sized_batch_t = typename make_sized_batch::type; } #include "../arch/xsimd_isa.hpp" @@ -763,7 +768,7 @@ namespace xsimd } else { - kernel::store_masked(mem, *this, mask, mode, A {}); + kernel::store_masked(mem, *this, mask, mode, A {}); } }