-
Notifications
You must be signed in to change notification settings - Fork 301
fix: avx512vl masked load/store #1353
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -388,57 +388,64 @@ namespace xsimd | |
| } | ||
| } | ||
|
|
||
| // Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide | ||
| // their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge | ||
| // and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp) | ||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there should not be any arch-specific code in |
||
| load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {}); | ||
| return bitwise_cast<int32_t>(f); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>> | ||
| load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {}); | ||
| return bitwise_cast<uint32_t>(f); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>> | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>> | ||
| load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {}); | ||
| return bitwise_cast<int64_t>(d); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>> | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>> | ||
| load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {}); | ||
| return bitwise_cast<uint64_t>(d); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value> | ||
| store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value> | ||
| store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value> | ||
| store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value> | ||
| XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value> | ||
| store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept | ||
| { | ||
| store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {}); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -993,19 +993,20 @@ namespace xsimd | |
| { | ||
| using int_t = as_integer_t<T>; | ||
| constexpr size_t half_size = batch<T, A>::size / 2; | ||
| using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type; | ||
|
|
||
| // confined to lower 128-bit half → forward to 128 bit | ||
| // lower 128-bit half | ||
| XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) | ||
| { | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {}); | ||
| const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {}); | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {}); | ||
| const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {}); | ||
| return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo))); | ||
| } | ||
| // confined to upper 128-bit half → forward to 128 bit | ||
| // upper 128-bit half | ||
| else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) | ||
| { | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask); | ||
| const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {}); | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask); | ||
| const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {}); | ||
| return detail::zero_extend<A>(hi); | ||
| } | ||
| else | ||
|
|
@@ -1019,36 +1020,39 @@ namespace xsimd | |
| namespace detail | ||
| { | ||
| template <class A> | ||
| XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept | ||
| XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept | ||
| { | ||
| _mm256_maskstore_ps(mem, mask, src); | ||
| } | ||
|
|
||
| template <class A> | ||
| XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept | ||
| XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wgy that change? In my mental model, the masked store take a bool mask, not an integer mask |
||
| { | ||
| _mm256_maskstore_pd(mem, mask, src); | ||
| } | ||
| } | ||
|
|
||
| template <class A, class T, bool... Values, class Mode> | ||
| template <class A, class T, bool... Values, class Mode, | ||
| typename = std::enable_if_t<std::is_floating_point<T>::value>> | ||
| XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept | ||
| { | ||
| constexpr size_t half_size = batch<T, A>::size / 2; | ||
| using half_batch = ::xsimd::make_sized_batch_t<T, half_size>; | ||
| using half_arch = typename half_batch::arch_type; | ||
|
|
||
| // confined to lower 128-bit half → forward to 128 bit | ||
| // lower 128-bit half | ||
| XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) | ||
| { | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask); | ||
| const auto lo = detail::lower_half(src); | ||
| store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {}); | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask); | ||
| const half_batch lo = detail::lower_half(src); | ||
| store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {}); | ||
| } | ||
| // confined to upper 128-bit half → forward to 128 bit | ||
| // upper 128-bit half | ||
| else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) | ||
| { | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask); | ||
| const auto hi = detail::upper_half(src); | ||
| store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask); | ||
| const half_batch hi = detail::upper_half(src); | ||
| store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {}); | ||
| } | ||
| else | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -138,7 +138,8 @@ namespace xsimd | |
| } | ||
|
|
||
| // single templated implementation for integer masked loads (32/64-bit) | ||
| template <class A, class T, bool... Values, class Mode> | ||
| template <class A, class T, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I quite dislike the fact that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried, I'll have another look. Without this constraint all compilers work fine except gcc-10 :( |
||
| XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>> | ||
| load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
|
|
@@ -148,26 +149,30 @@ namespace xsimd | |
| return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch()); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {}); | ||
| return bitwise_cast<uint32_t>(r); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {}); | ||
|
|
@@ -190,39 +195,44 @@ namespace xsimd | |
| } | ||
| } | ||
|
|
||
| template <class A, class T, bool... Values, class Mode> | ||
| template <class A, class T, bool... Values, class Mode, | ||
| typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| constexpr size_t lanes_per_half = batch<T, A>::size / 2; | ||
| using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>; | ||
| using half_arch = typename half_batch::arch_type; | ||
|
|
||
| // confined to lower 128-bit half → forward to SSE | ||
| // lower 128-bit half | ||
| XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) | ||
| { | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask); | ||
| const auto lo = detail::lower_half(src); | ||
| store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {}); | ||
| constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask); | ||
| const half_batch lo = detail::lower_half(src); | ||
| store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {}); | ||
| } | ||
| // confined to upper 128-bit half → forward to SSE | ||
| // upper 128-bit half | ||
| else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) | ||
| { | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask); | ||
| const auto hi = detail::upper_half(src); | ||
| store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); | ||
| constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask); | ||
| const half_batch hi = detail::upper_half(src); | ||
| store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {}); | ||
| } | ||
| else | ||
| { | ||
| detail::maskstore<T, A>(mem, mask.as_batch(), src); | ||
| } | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| const auto s32 = bitwise_cast<int32_t>(src); | ||
| store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {}); | ||
| store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {}); | ||
| } | ||
|
|
||
| template <class A, bool... Values, class Mode> | ||
| template <class A, bool... Values, class Mode, | ||
| class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>> | ||
| XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept | ||
| { | ||
| const auto s64 = bitwise_cast<int64_t>(src); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oopsie. Thanks for fixing this one.