Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ jobs:
fi
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
CXXFLAGS="$CXXFLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oopsie. Thanks for fixing this one.

fi
if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then
CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
Expand Down
23 changes: 15 additions & 8 deletions include/xsimd/arch/common/xsimd_common_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -388,57 +388,64 @@ namespace xsimd
}
}

// Integer→float reinterpret bridges. Excluded for AVX-512VL archs which provide
// their own EVEX masked integer ovlds; without the exclusion gcc-10 sees the bridge
// and the VL native as equally specialized for A=avx512vl_*. (bridge_not_vl in fwd.hpp)
template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<int32_t, A>>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there should not be any arch-specific code in xsimd_common_memory.hpp. Could you find another approach?

load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...>, convert<int32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<int32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value, batch<uint32_t, A>>
load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<A>) noexcept
{
const auto f = load_masked<A>(reinterpret_cast<const float*>(mem), batch_bool_constant<float, A, Values...> {}, convert<float> {}, Mode {}, A {});
return bitwise_cast<uint32_t>(f);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<int64_t, A>>
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<int64_t, A>>
load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...>, convert<int64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<int64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value, batch<uint64_t, A>>
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value, batch<uint64_t, A>>
load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<A>) noexcept
{
const auto d = load_masked<A>(reinterpret_cast<const double*>(mem), batch_bool_constant<double, A, Values...> {}, convert<double> {}, Mode {}, A {});
return bitwise_cast<uint64_t>(d);
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value>
store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<float*>(mem), bitwise_cast<float>(src), batch_bool_constant<float, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE std::enable_if_t<types::has_simd_register<double, A>::value>
XSIMD_INLINE std::enable_if_t<bridge_not_vl<A>::value && types::has_simd_register<double, A>::value>
store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<A>) noexcept
{
store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
Expand Down
38 changes: 21 additions & 17 deletions include/xsimd/arch/xsimd_avx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -993,19 +993,20 @@ namespace xsimd
{
using int_t = as_integer_t<T>;
constexpr size_t half_size = batch<T, A>::size / 2;
using half_arch = typename ::xsimd::make_sized_batch_t<T, half_size>::arch_type;

// confined to lower 128-bit half → forward to 128 bit
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(batch_bool_constant<int_t, A, Values...> {});
const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, half_arch {});
return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
}
// confined to upper 128-bit half → forward to 128 bit
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, half_arch {});
return detail::zero_extend<A>(hi);
}
else
Expand All @@ -1019,36 +1020,39 @@ namespace xsimd
namespace detail
{
template <class A>
XSIMD_INLINE void maskstore(float* mem, batch_bool<float, A> const& mask, batch<float, A> const& src) noexcept
XSIMD_INLINE void maskstore(float* mem, batch<as_integer_t<float>, A> const& mask, batch<float, A> const& src) noexcept
{
_mm256_maskstore_ps(mem, mask, src);
}

template <class A>
XSIMD_INLINE void maskstore(double* mem, batch_bool<double, A> const& mask, batch<double, A> const& src) noexcept
XSIMD_INLINE void maskstore(double* mem, batch<as_integer_t<double>, A> const& mask, batch<double, A> const& src) noexcept
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wgy that change? In my mental model, the masked store take a bool mask, not an integer mask

{
_mm256_maskstore_pd(mem, mask, src);
}
}

template <class A, class T, bool... Values, class Mode>
template <class A, class T, bool... Values, class Mode,
typename = std::enable_if_t<std::is_floating_point<T>::value>>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx>) noexcept
{
constexpr size_t half_size = batch<T, A>::size / 2;
using half_batch = ::xsimd::make_sized_batch_t<T, half_size>;
using half_arch = typename half_batch::arch_type;

// confined to lower 128-bit half → forward to 128 bit
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
const half_batch lo = detail::lower_half(src);
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
}
// confined to upper 128-bit half → forward to 128 bit
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const half_batch hi = detail::upper_half(src);
store_masked<half_arch>(mem + half_size, hi, mhi, Mode {}, half_arch {});
}
else
{
Expand Down
46 changes: 28 additions & 18 deletions include/xsimd/arch/xsimd_avx2.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ namespace xsimd
}

// single templated implementation for integer masked loads (32/64-bit)
template <class A, class T, bool... Values, class Mode>
template <class A, class T, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I quite dislike the fact that xsimd_avx2.hpp needs to know stuff about avx512vl

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried, I'll have another look. Without this constraint all compilers work fine except gcc-10 :(

XSIMD_INLINE std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4), batch<T, A>>
load_masked(T const* mem, batch_bool_constant<T, A, Values...> mask, convert<T>, Mode, requires_arch<avx2>) noexcept
{
Expand All @@ -148,26 +149,30 @@ namespace xsimd
return detail::maskload(reinterpret_cast<const int_t*>(mem), mask.as_batch());
}

template <class A, bool... Values, class Mode>
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2>) noexcept
{
return load_masked<A, int32_t>(mem, mask, convert<int32_t> {}, Mode {}, avx2 {});
}

template <class A, bool... Values, class Mode>
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...>, convert<uint32_t>, Mode, requires_arch<avx2>) noexcept
{
const auto r = load_masked<A, int32_t>(reinterpret_cast<int32_t const*>(mem), batch_bool_constant<int32_t, A, Values...> {}, convert<int32_t> {}, Mode {}, avx2 {});
return bitwise_cast<uint32_t>(r);
}

template <class A, bool... Values, class Mode>
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2>) noexcept
{
return load_masked<A, int64_t>(mem, mask, convert<int64_t> {}, Mode {}, avx2 {});
}

template <class A, bool... Values, class Mode>
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...>, convert<uint64_t>, Mode, requires_arch<avx2>) noexcept
{
const auto r = load_masked<A, int64_t>(reinterpret_cast<int64_t const*>(mem), batch_bool_constant<int64_t, A, Values...> {}, convert<int64_t> {}, Mode {}, avx2 {});
Expand All @@ -190,39 +195,44 @@ namespace xsimd
}
}

template <class A, class T, bool... Values, class Mode>
template <class A, class T, bool... Values, class Mode,
typename = std::enable_if_t<std::is_integral<T>::value && (sizeof(T) >= 4) && std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE void store_masked(T* mem, batch<T, A> const& src, batch_bool_constant<T, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
{
constexpr size_t lanes_per_half = batch<T, A>::size / 2;
using half_batch = ::xsimd::make_sized_batch_t<T, lanes_per_half>;
using half_arch = typename half_batch::arch_type;

// confined to lower 128-bit half → forward to SSE
// lower 128-bit half
XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half)
{
constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
const auto lo = detail::lower_half(src);
store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
constexpr auto mlo = ::xsimd::detail::lower_half<half_arch>(mask);
const half_batch lo = detail::lower_half(src);
store_masked<half_arch>(mem, lo, mlo, Mode {}, half_arch {});
}
// confined to upper 128-bit half → forward to SSE
// upper 128-bit half
else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half)
{
constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
const auto hi = detail::upper_half(src);
store_masked<sse4_2>(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {});
constexpr auto mhi = ::xsimd::detail::upper_half<half_arch>(mask);
const half_batch hi = detail::upper_half(src);
store_masked<half_arch>(mem + lanes_per_half, hi, mhi, Mode {}, half_arch {});
}
else
{
detail::maskstore<T, A>(mem, mask.as_batch(), src);
}
}

template <class A, bool... Values, class Mode>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2>) noexcept
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
{
const auto s32 = bitwise_cast<int32_t>(src);
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, mask, Mode {}, avx2 {});
store_masked<A>(reinterpret_cast<int32_t*>(mem), s32, batch_bool_constant<int32_t, A, Values...> {}, Mode {}, avx2 {});
}

template <class A, bool... Values, class Mode>
template <class A, bool... Values, class Mode,
class = std::enable_if_t<std::is_base_of<avx2, A>::value && !std::is_base_of<avx512vl_256, A>::value>>
XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...>, Mode, requires_arch<avx2>) noexcept
{
const auto s64 = bitwise_cast<int64_t>(src);
Expand Down
Loading
Loading