diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 03a914bda..b66c9a97c 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -100,7 +100,7 @@ jobs: fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" - CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" + CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 7a1ed73a3..e584c5151 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -388,22 +388,27 @@ namespace xsimd } } + // Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide + // their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge + // and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp) template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value, batch> + load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value, batch> + load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template - XSIMD_INLINE std::enable_if_t::value, batch> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); @@ -411,7 +416,7 @@ namespace xsimd } template - XSIMD_INLINE std::enable_if_t::value, batch> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); @@ -419,26 +424,28 @@ namespace xsimd } template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value> + store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + XSIMD_INLINE std::enable_if_t::value> + store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE std::enable_if_t::value> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template - XSIMD_INLINE std::enable_if_t::value> + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 1ee0c5b89..d2d527fec 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -993,19 +993,20 @@ namespace xsimd { using int_t = as_integer_t; constexpr size_t half_size = batch::size / 2; + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); - const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, avx_128 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); + const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, half_arch {}); return bitwise_cast(batch(_mm256_zextsi128_si256(lo))); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, avx_128 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, half_arch {}); return detail::zero_extend(hi); } else @@ -1019,36 +1020,39 @@ namespace xsimd namespace detail { template - XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(float* mem, batch, A> const& mask, batch const& src) noexcept { _mm256_maskstore_ps(mem, mask, src); } template - XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept + XSIMD_INLINE void maskstore(double* mem, batch, A> const& mask, batch const& src) noexcept { _mm256_maskstore_pd(mem, mask, src); } } - template + template ::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to 128 bit + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to 128 bit + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half_size, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index e2c223cc7..ee0227066 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -138,7 +138,8 @@ namespace xsimd } // single templated implementation for integer masked loads (32/64-bit) - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch> load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { @@ -148,26 +149,30 @@ namespace xsimd return detail::maskload(reinterpret_cast(mem), mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); return bitwise_cast(r); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return load_masked(mem, mask, convert {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {}); @@ -190,24 +195,27 @@ namespace xsimd } } - template + template ::value && (sizeof(T) >= 4) && std::is_base_of::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; - // confined to lower 128-bit half → forward to SSE + // lower 128-bit half XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - // confined to upper 128-bit half → forward to SSE + // upper 128-bit half else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); } else { @@ -215,14 +223,16 @@ namespace xsimd } } - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template ::value && !std::is_base_of::value>> + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s32 = bitwise_cast(src); - store_masked(reinterpret_cast(mem), s32, mask, Mode {}, avx2 {}); + store_masked(reinterpret_cast(mem), s32, batch_bool_constant {}, Mode {}, avx2 {}); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s64 = bitwise_cast(src); diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp index 7a590c74f..f31900482 100644 --- a/include/xsimd/arch/xsimd_avx2_128.hpp +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -90,44 +90,52 @@ namespace xsimd } // load_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi32(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi64(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); } // store_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi32(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi64(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 6a7316722..cc057eacf 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -305,16 +305,17 @@ namespace xsimd convert, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_arch = typename ::xsimd::make_sized_batch_t::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = load_masked(mem, mlo, convert {}, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const auto lo = load_masked(mem, mlo, convert {}, Mode {}, half_arch {}); return detail::load_masked(lo); // zero-extend low half } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, half_arch {}); return detail::load_masked(hi, detail::high_tag {}); } else @@ -332,17 +333,19 @@ namespace xsimd Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; - XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding + using half_batch = ::xsimd::make_sized_batch_t; + using half_arch = typename half_batch::arch_type; + XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower 256-bit half { - constexpr auto mlo = ::xsimd::detail::lower_half(mask); - const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, avx2 {}); + constexpr auto mlo = ::xsimd::detail::lower_half(mask); + const half_batch lo = detail::lower_half(src); + store_masked(mem, lo, mlo, Mode {}, half_arch {}); } - else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding + else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper 256-bit half { - constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = detail::upper_half(src); - store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); + constexpr auto mhi = ::xsimd::detail::upper_half(mask); + const half_batch hi = detail::upper_half(src); + store_masked(mem + half, hi, mhi, Mode {}, half_arch {}); } else { diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp index 155338425..855870af3 100644 --- a/include/xsimd/arch/xsimd_avx512vl_128.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_128 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_128 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm_mask_storeu_pd(mem, mask.mask(), src); + } } // max diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp index a5ea546bc..c0b4a568e 100644 --- a/include/xsimd/arch/xsimd_avx512vl_256.hpp +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -188,53 +188,125 @@ namespace xsimd return _mm256_abs_epi64(self); } - // load masked - template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + // Per-type masked load/store — partial ordering picks these over the + // avx2 bridges this arch inherits. Unsigned overloads reinterpret to + // the signed EVEX intrinsic. + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi32(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi32(mask.mask(), mem); + } } - template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - constexpr auto imm_mask = mask.mask(); - return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - // store masked - template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_epi64(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_epi64(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi32(mem, mask.mask(), src); + return bitwise_cast(load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx512vl_256 {})); } - - template - XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_ps(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_ps(mask.mask(), mem); + } + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm256_maskz_load_pd(mask.mask(), mem); + } + else + { + return _mm256_maskz_loadu_pd(mask.mask(), mem); + } } - template - XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_epi64(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi32(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi32(mem, mask.mask(), src); + } } - template - XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { - _mm256_mask_storeu_ps(mem, mask.mask(), src); + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); } - - template - XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_epi64(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_epi64(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept + { + store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, avx512vl_256 {}); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { - _mm256_mask_storeu_pd(mem, mask.mask(), src); + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_ps(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_ps(mem, mask.mask(), src); + } + } + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + _mm256_mask_store_pd(mem, mask.mask(), src); + } + else + { + _mm256_mask_storeu_pd(mem, mask.mask(), src); + } } // max diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp index 46fc9acb7..30697d874 100644 --- a/include/xsimd/arch/xsimd_avx_128.hpp +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -104,45 +104,49 @@ namespace xsimd } // load_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_ps(mem, mask.as_batch()); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { return _mm_maskload_pd(mem, mask.as_batch()); } // store_masked - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_ps(mem, mask.as_batch(), src); } - template + template ::value && !std::is_base_of::value>> XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { return _mm_maskstore_pd(mem, mask.as_batch(), src); } // swizzle (dynamic mask) - template ::value && sizeof(T) == sizeof(ITy)>> - XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(std::is_same::value) - { - return _mm_permutevar_ps(self, mask); - } - else - { - // VPERMILPD's variable control reads bit 1 of each 64-bit selector - // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. - // Negation is a cheap alternative to a left shift by 1. - return _mm_permutevar_pd(self, -mask); - } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(float) == sizeof(ITy), "index type must match value width"); + return _mm_permutevar_ps(self, mask); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + static_assert(sizeof(double) == sizeof(ITy), "index type must match value width"); + // VPERMILPD's variable control reads bit 1 of each 64-bit selector + // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. + // Negation is a cheap alternative to a left shift by 1. + return _mm_permutevar_pd(self, -mask); } // swizzle (constant mask) diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index f5a7f4ffe..98249be90 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -13,6 +13,8 @@ #ifndef XSIMD_COMMON_FWD_HPP #define XSIMD_COMMON_FWD_HPP +#include "../types/xsimd_avx512vl_register.hpp" + #include #include #include @@ -43,6 +45,12 @@ namespace xsimd namespace kernel { + template + struct bridge_not_vl + : std::integral_constant::value && !std::is_base_of::value> + { + }; + // forward declaration template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept; @@ -88,21 +96,21 @@ namespace xsimd template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; template - XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template - XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value> store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template - XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value> store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; template - XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; + XSIMD_INLINE std::enable_if_t::value && types::has_simd_register::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index cf88f64d7..06edfa98f 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -48,8 +48,11 @@ #endif #if XSIMD_WITH_AVX -#include "./xsimd_avx.hpp" +// clang-format off +// _128 first: avx half-fold recursive call needs avx_128 visible at parse time. #include "./xsimd_avx_128.hpp" +#include "./xsimd_avx.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX @@ -61,8 +64,10 @@ #endif #if XSIMD_WITH_AVX2 -#include "./xsimd_avx2.hpp" +// clang-format off #include "./xsimd_avx2_128.hpp" +#include "./xsimd_avx2.hpp" +// clang-format on #endif #if XSIMD_WITH_FMA3_AVX2 diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp index c6cfb5f07..0a95aae8b 100644 --- a/include/xsimd/arch/xsimd_sse2.hpp +++ b/include/xsimd/arch/xsimd_sse2.hpp @@ -2331,7 +2331,7 @@ namespace xsimd } else { - store_masked(mem, src, mask, requires_arch {}); + store_masked(mem, src, mask, aligned_mode {}, common {}); } } diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp index c73c2a963..46d887333 100644 --- a/include/xsimd/types/xsimd_avx512vl_register.hpp +++ b/include/xsimd/types/xsimd_avx512vl_register.hpp @@ -20,7 +20,7 @@ namespace xsimd /** * @ingroup architectures * - * AVX512DQ instructions + * AVX512VL instructions */ struct avx512vl : avx512cd { diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index 970483150..5ae7006b0 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -544,6 +544,11 @@ namespace xsimd "Please use batch, A> initialized from xtl::xcomplex instead"); }; #endif + + template + struct make_sized_batch; + template + using make_sized_batch_t = typename make_sized_batch::type; } #include "../arch/xsimd_isa.hpp" @@ -763,7 +768,7 @@ namespace xsimd } else { - kernel::store_masked(mem, *this, mask, mode, A {}); + kernel::store_masked(mem, *this, mask, mode, A {}); } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 662dcdc3f..b9a6d2968 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -49,6 +49,9 @@ if (TARGET_EMULATED) message(STATUS "Using emulated target: ${TARGET_EMULATED}") set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1) unset(TARGET_ARCH CACHE) +elseif (DEFINED XSIMD_DEFAULT_ARCH AND NOT "${XSIMD_DEFAULT_ARCH}" STREQUAL "") + message(STATUS "Forcing default arch to xsimd::${XSIMD_DEFAULT_ARCH}") + set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${XSIMD_DEFAULT_ARCH}) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") diff --git a/test/test_arch.cpp b/test/test_arch.cpp index d2d0df249..5d479bbaf 100644 --- a/test/test_arch.cpp +++ b/test/test_arch.cpp @@ -23,6 +23,12 @@ static_assert(xsimd::default_arch::supported(), "default arch must be supported" static_assert(std::is_same::value, "default arch is the best available"); static_assert(xsimd::supported_architectures::contains(), "default arch is supported"); static_assert(xsimd::all_architectures::contains(), "default arch is a valid arch"); +#else +namespace xsimd +{ + static_assert(std::is_same::value, + "default_arch does not match XSIMD_DEFAULT_ARCH"); +} #endif #if !XSIMD_WITH_SVE