diff --git a/.drone.jsonnet b/.drone.jsonnet index d1bd4d0ed..8d0326729 100644 --- a/.drone.jsonnet +++ b/.drone.jsonnet @@ -435,8 +435,14 @@ local windows_pipeline(name, image, environment, arch = "amd64") = ), windows_pipeline( - "Windows VS2026 msvc-14.5", + "Windows VS2026 msvc-14.5 64-bit", "cppalliance/dronevs2026:1", - { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '32,64' }, + { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '64' }, ), + + windows_pipeline( + "Windows VS2026 msvc-14.5 32-bit", + "cppalliance/dronevs2026:1", + { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '32' }, + ), ] diff --git a/include/boost/decimal/decimal128_t.hpp b/include/boost/decimal/decimal128_t.hpp index 77b8a0569..d598ad945 100644 --- a/include/boost/decimal/decimal128_t.hpp +++ b/include/boost/decimal/decimal128_t.hpp @@ -598,6 +598,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(const int128::uint128_t rhs) noexcep return result; } +namespace detail { + +// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal128_t, +// skipping the constructor's bounds check + dead-branch handling. For d128 +// the precision (34 digits, 113 bits) always fits in not_11_significand_mask, +// so no combination-field branch is needed. Caller guarantees +// coeff <= d128_max_significand_value and (exp + bias) is in [0, d128_max_biased_exponent]. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d128(int128::uint128_t coeff, T2 exp, bool sign) noexcept -> decimal128_t +{ + const auto biased_exp {static_cast(static_cast(exp) + bias_v)}; + int128::uint128_t bits {coeff & d128_not_11_significand_mask}; + bits.high |= (sign ? d128_sign_mask : UINT64_C(0)); + bits.high |= (biased_exp << d128_not_11_exp_high_word_shift) & d128_not_11_exp_mask; + return from_bits(bits); +} + +// Definition of the pack_in_range overload declared in +// add_impl.hpp. Lives here so the `decimal128_t{coeff, exp, sign}` fallback +// is parsed only after decimal128_t is complete (see add_impl.hpp rationale). +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal128_t> +{ + const auto biased_exp_check {static_cast(exp) + bias_v}; + if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0 + && biased_exp_check <= static_cast(max_biased_exp_v))) + { + return direct_pack_d128(static_cast(coeff), exp, sign); + } + return decimal128_t{coeff, exp, sign}; +} + +} // namespace detail + BOOST_DECIMAL_CUDA_CONSTEXPR auto decimal128_t::unbiased_exponent() const noexcept -> exponent_type { exponent_type expval {}; @@ -819,9 +854,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const } else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision_v) { + // Same overflow-fold pattern as d32/d64: post-shift coeff is <= max_significand_v + // and biased_exp lands in [0, max], so pack_in_range routes to direct_pack. exp -= digit_delta; reduced_coeff *= detail::pow10(static_cast(digit_delta)); - *this = decimal128_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else if (coeff_digits + biased_exp <= detail::precision_v) { @@ -856,10 +893,12 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const } else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision_v) { + // Expand to use the full precision; biased_exp ends up in [0, max] and + // coeff <= max_significand_v. pack_in_range routes to direct_pack. const auto offset {detail::precision_v - coeff_digits}; exp -= offset; reduced_coeff *= detail::pow10(static_cast(offset)); - *this = decimal128_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else { @@ -1337,6 +1376,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<(const decimal128_t& lhs, const decim } #endif + if (BOOST_DECIMAL_UNLIKELY(lhs.bits_ == rhs.bits_)) + { + return false; + } + return less_parts_impl(lhs.full_significand(), lhs.biased_exponent(), lhs.isneg(), rhs.full_significand(), rhs.biased_exponent(), rhs.isneg()); } @@ -1469,16 +1513,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal128_t& lhs, const dec { return std::partial_ordering::less; } - else if (lhs > rhs) + if (rhs < lhs) { return std::partial_ordering::greater; } - else if (lhs == rhs) + #ifndef BOOST_DECIMAL_FAST_MATH + if (isnan(lhs) || isnan(rhs)) { - return std::partial_ordering::equivalent; + return std::partial_ordering::unordered; } - - return std::partial_ordering::unordered; + #endif + return std::partial_ordering::equivalent; } template @@ -1682,11 +1727,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal128_t& lhs, const decim { return from_bits(detail::d128_nan_mask); } - + return detail::check_non_finite(lhs, rhs); } #endif + // Two fast paths (see decimal64_t.hpp:operator+ for full explanation). + // Both gated on non-zero operands so zero short-circuit logic is preserved + // by falling through to d128_add_impl_new. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 75 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 75) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + return detail::aligned_add_kernel( + lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; @@ -1741,11 +1820,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal128_t& lhs, const decim { return -rhs; } - + return detail::check_non_finite(lhs, rhs); } #endif + // Two fast paths; see operator+ above. Both gated on non-zero operands so + // zero short-circuit logic is preserved by falling through. For operator-, + // rhs sign is flipped before kernel dispatch. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 75 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 75) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + return detail::aligned_add_kernel( + lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), !rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; @@ -1861,14 +1974,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator*(const decimal128_t lhs, const Intege auto lhs_sig {lhs.full_significand()}; auto lhs_exp {lhs.biased_exponent()}; - const auto lhs_zeros {detail::remove_trailing_zeros(lhs_sig)}; - lhs_sig = lhs_zeros.trimmed_number; - lhs_exp += static_cast(lhs_zeros.number_of_removed_zeros); + detail::expand_significand(lhs_sig, lhs_exp); auto rhs_sig {static_cast(detail::make_positive_unsigned(rhs))}; - const auto rhs_zeros {detail::remove_trailing_zeros(rhs_sig)}; - rhs_sig = rhs_zeros.trimmed_number; - const auto rhs_exp = static_cast(rhs_zeros.number_of_removed_zeros); + exp_type rhs_exp {0}; + detail::normalize(rhs_sig, rhs_exp); return detail::d128_mul_impl( lhs_sig, lhs_exp, lhs.isneg(), diff --git a/include/boost/decimal/decimal32_t.hpp b/include/boost/decimal/decimal32_t.hpp index 9f8ab8c40..18dedecb1 100644 --- a/include/boost/decimal/decimal32_t.hpp +++ b/include/boost/decimal/decimal32_t.hpp @@ -750,9 +750,14 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal32_t::decimal32_t(T1 coeff, T2 exp, const de } else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision) { + // After the shift, coeff has coeff_digits+digit_delta <= precision digits + // (so coeff <= max_significand_v) and biased_exp lands in [0, max] because + // we just subtracted exactly the overflow delta from exp. pack_in_range + // hits direct_pack on this in-range case; the fallback constructor is a + // safety belt if the analysis is wrong (same as the original recursion). exp -= digit_delta; reduced_coeff *= detail::pow10(static_cast(digit_delta)); - *this = decimal32_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else if (coeff_digits + biased_exp <= detail::precision) { @@ -786,11 +791,14 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal32_t::decimal32_t(T1 coeff, T2 exp, const de } else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision) { - // We can expand the coefficient to use the maximum number of digits + // We can expand the coefficient to use the maximum number of digits. + // After expansion, coeff has precision digits (<= max_significand_v) and + // biased_exp lands in [0, max] (we shift toward the valid range). Same + // pack_in_range pattern as the digit_delta > 0 branch above. const auto offset {detail::precision - coeff_digits}; exp -= offset; reduced_coeff *= detail::pow10(static_cast(offset)); - *this = decimal32_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else { @@ -828,6 +836,55 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(const std::uint32_t bits) noexcept - namespace detail { +// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal32_t, +// skipping the constructor's bounds check + dead-branch handling. Caller +// guarantees coeff <= d32_max_significand_value and (exp + bias) is in +// [0, d32_max_biased_exponent]. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d32(T1 coeff, T2 exp, bool sign) noexcept -> decimal32_t +{ + const auto reduced_coeff {static_cast(coeff)}; + const auto biased_exp {static_cast(static_cast(exp) + bias_v)}; + std::uint32_t bits {sign ? d32_sign_mask : UINT32_C(0)}; + + // The non-combination encoding holds coefficients up to d32_biggest_no_combination_significand, + // which covers ~95%+ of post-arithmetic coefficients. The combination-field branch is cold. + if (BOOST_DECIMAL_LIKELY(reduced_coeff <= d32_biggest_no_combination_significand)) + { + bits |= (reduced_coeff & d32_not_11_significand_mask); + bits |= (biased_exp << d32_not_11_exp_shift) & d32_not_11_exp_mask; + } + else + { + bits |= (d32_comb_11_mask | (reduced_coeff & d32_11_significand_mask)); + bits |= (biased_exp << d32_11_exp_shift) & d32_11_exp_mask; + } + return from_bits(bits); +} + +// Definition of the pack_in_range overload declared in +// add_impl.hpp. Lives here because the body's `decimal32_t{coeff, exp, sign}` +// fallback constructor call requires decimal32_t to be a complete type -- +// both C++14 (regular if) and nvcc (eager template body parse for device +// codegen) would otherwise reject the constructor call when the function +// was defined at the add_impl.hpp include site (where decimal32_t is still +// forward-declared via fwd.hpp). +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal32_t> +{ + const auto biased_exp_check {static_cast(exp) + bias_v}; + if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0 + && biased_exp_check <= static_cast(max_biased_exp_v))) + { + return direct_pack_d32(static_cast(coeff), exp, sign); + } + return decimal32_t{coeff, exp, sign}; +} + +} // namespace detail (close to make the existing detail namespace block work) +namespace detail { + template class numeric_limits_impl32 { @@ -1025,6 +1082,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal32_t lhs, const decimal } #endif + // Two fast paths (see decimal64_t.hpp:operator+ for full explanation). + // Both gated on non-zero operands so zero short-circuit logic is preserved + // by falling through to add_impl. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 17 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 17) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + return detail::aligned_add_kernel( + static_cast(lhs_sig), static_cast(rhs_sig), + lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; @@ -1130,6 +1222,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal32_t lhs, const decimal } #endif + // Two fast paths; see operator+ above. Both gated on non-zero operands so + // zero short-circuit logic is preserved by falling through. For operator-, + // rhs sign is flipped before kernel dispatch. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 17 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 17) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + return detail::aligned_add_kernel( + static_cast(lhs_sig), static_cast(rhs_sig), + lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), !rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; @@ -1485,16 +1612,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal32_t lhs, const decim { return std::partial_ordering::less; } - else if (lhs > rhs) + if (rhs < lhs) { return std::partial_ordering::greater; } - else if (lhs == rhs) + #ifndef BOOST_DECIMAL_FAST_MATH + if (isnan(lhs) || isnan(rhs)) { - return std::partial_ordering::equivalent; + return std::partial_ordering::unordered; } - - return std::partial_ordering::unordered; + #endif + return std::partial_ordering::equivalent; } template diff --git a/include/boost/decimal/decimal64_t.hpp b/include/boost/decimal/decimal64_t.hpp index 3e13fa369..51f2d0ca8 100644 --- a/include/boost/decimal/decimal64_t.hpp +++ b/include/boost/decimal/decimal64_t.hpp @@ -603,6 +603,52 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(std::uint64_t bits) noexcept -> deci return result; } +namespace detail { + +// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal64_t, +// skipping the constructor's bounds check + dead-branch handling. Caller +// guarantees coeff <= d64_max_significand_value and the (exp + bias) value +// fits in [0, d64_max_biased_exponent]. Saves ~10 cycles per fast-path call. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d64(T1 coeff, T2 exp, bool sign) noexcept -> decimal64_t +{ + const auto reduced_coeff {static_cast(coeff)}; + const auto biased_exp {static_cast(static_cast(exp) + bias_v)}; + std::uint64_t bits {sign ? d64_sign_mask : UINT64_C(0)}; + + // The non-combination encoding holds coefficients up to d64_biggest_no_combination_significand, + // which covers ~95%+ of post-arithmetic coefficients. The combination-field branch is cold. + if (BOOST_DECIMAL_LIKELY(reduced_coeff <= d64_biggest_no_combination_significand)) + { + bits |= (reduced_coeff & d64_not_11_significand_mask); + bits |= (biased_exp << d64_not_11_exp_shift) & d64_not_11_exp_mask; + } + else + { + bits |= (d64_combination_field_mask | (reduced_coeff & d64_11_significand_mask)); + bits |= (biased_exp << d64_11_exp_shift) & d64_11_exp_mask; + } + return from_bits(bits); +} + +// Definition of the pack_in_range overload declared in +// add_impl.hpp. Lives here so the `decimal64_t{coeff, exp, sign}` fallback +// is parsed only after decimal64_t is complete (see add_impl.hpp rationale). +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal64_t> +{ + const auto biased_exp_check {static_cast(exp) + bias_v}; + if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0 + && biased_exp_check <= static_cast(max_biased_exp_v))) + { + return direct_pack_d64(static_cast(coeff), exp, sign); + } + return decimal64_t{coeff, exp, sign}; +} + +} // namespace detail + constexpr auto to_bits(decimal64_t rhs) noexcept -> std::uint64_t { return rhs.bits_; @@ -745,9 +791,12 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal64_t::decimal64_t(T1 coeff, T2 exp, const de } else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision_v) { + // Coeff stays in range (<= max_significand_v) by the branch's digit budget, + // and biased_exp lands in [0, max] by construction. pack_in_range hits + // direct_pack on the in-range case; fallback constructor is a safety belt. exp -= digit_delta; reduced_coeff *= detail::pow10(static_cast(digit_delta)); - *this = decimal64_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else if (coeff_digits + biased_exp <= detail::precision_v) { @@ -781,20 +830,26 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal64_t::decimal64_t(T1 coeff, T2 exp, const de } else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision_v) { + // Expand to use the full precision; biased_exp ends up in [0, max] and + // coeff <= max_significand_v. pack_in_range routes to direct_pack. const auto offset {detail::precision_v - coeff_digits}; exp -= offset; reduced_coeff *= detail::pow10(static_cast(offset)); - *this = decimal64_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else if (biased_exp > detail::max_biased_exp_v) { - // Similar to subnormals, but for extremely large values + // Similar to subnormals, but for extremely large values: fold the + // overflow into the coefficient via trailing zeros when there's room. const auto available_space {detail::precision_v - coeff_digits}; if (available_space >= exp_delta) { + // available_space >= exp_delta means after subtracting exp_delta from + // biased_exp it lands in [0, max], and coeff has coeff_digits + available_space + // <= precision digits, so <= max_significand_v. pack_in_range applies. reduced_coeff *= detail::pow10(static_cast(available_space)); exp -= available_space; - *this = decimal64_t(reduced_coeff, exp, is_negative); + *this = detail::pack_in_range(reduced_coeff, exp, is_negative); } else { @@ -1472,16 +1527,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal64_t lhs, const decim { return std::partial_ordering::less; } - else if (lhs > rhs) + if (rhs < lhs) { return std::partial_ordering::greater; } - else if (lhs == rhs) + #ifndef BOOST_DECIMAL_FAST_MATH + if (isnan(lhs) || isnan(rhs)) { - return std::partial_ordering::equivalent; + return std::partial_ordering::unordered; } - - return std::partial_ordering::unordered; + #endif + return std::partial_ordering::equivalent; } template @@ -1662,6 +1718,47 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal64_t lhs, const decimal } #endif + // Two fast paths under default rounding, gated by user-facing exp_diff: + // 1. exp_diff > 36: slow path would return dominant operand unchanged; skip + // to_components/expand_significand and return it directly. + // 2. exp_diff <= 3: aligned_add_kernel can do the whole add in uint64 (max_sig + // 16 digits * 10^3 = 19 digits < 2^64); skip to_components/expand_significand + // and the add_impl dispatch entirely. + // The 4..36 band falls through to the existing slow path. Both fast paths + // require both operands to be non-zero: zero short-circuit logic + // (preferred-quantum result exponent, sign-of-opposite-sign-zeros) lives in + // add_impl and is not duplicated here. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 36 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 36) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + return detail::aligned_add_kernel( + lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; @@ -1725,6 +1822,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal64_t lhs, const decimal } #endif + // Two fast paths (see operator+ above). Both gated on non-zero operands so + // zero short-circuit logic (preferred-quantum, sign-of-zero) is preserved + // by falling through to add_impl. For operator-, the rhs sign is flipped + // before dispatching to the kernel (subtraction = add with negated rhs). + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 36 || exp_diff <= 3) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + if (exp_diff > 36) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + return detail::aligned_add_kernel( + lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast(exp_diff), + lhs.isneg(), !rhs.isneg()); + } + } + } + } + auto lhs_components {lhs.to_components()}; detail::expand_significand(lhs_components.sig, lhs_components.exp); auto rhs_components {rhs.to_components()}; diff --git a/include/boost/decimal/decimal_fast128_t.hpp b/include/boost/decimal/decimal_fast128_t.hpp index 5cfebd1d0..cb5f2b035 100644 --- a/include/boost/decimal/decimal_fast128_t.hpp +++ b/include/boost/decimal/decimal_fast128_t.hpp @@ -1074,6 +1074,36 @@ constexpr auto operator+(const decimal_fast128_t& lhs, const decimal_fast128_t& } #endif + // Dominant-operand short-circuit: for d128 d128_add_impl_new uses u256 + // promoted type with max_shift = digits10(u256) - precision - 1 + // = 77 - 34 - 1 = 42. Above that the slow path returns one operand + // unchanged anyway. Fast types maintain canonical significands so no + // expansion buffer is needed. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 42) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + } + } + } + return detail::d128_add_impl_new(lhs, rhs); } @@ -1124,6 +1154,32 @@ constexpr auto operator-(const decimal_fast128_t& lhs, const decimal_fast128_t& } #endif + // Dominant-operand short-circuit; see operator+ above. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 42) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + } + } + } + return detail::d128_add_impl_new(lhs, -rhs); } @@ -1228,7 +1284,7 @@ constexpr auto operator*(const decimal_fast128_t& lhs, const Integer rhs) noexce exp_type rhs_exp {0}; detail::normalize(rhs_sig, rhs_exp); - return detail::d128_fast_mul_impl( + return detail::d128_mul_impl( lhs.significand_, lhs.biased_exponent(), lhs.sign_, rhs_sig, rhs_exp, (rhs < 0)); } diff --git a/include/boost/decimal/decimal_fast32_t.hpp b/include/boost/decimal/decimal_fast32_t.hpp index 53167ae38..7493a5790 100644 --- a/include/boost/decimal/decimal_fast32_t.hpp +++ b/include/boost/decimal/decimal_fast32_t.hpp @@ -1034,11 +1034,40 @@ constexpr auto operator+(const decimal_fast32_t lhs, const decimal_fast32_t rhs) { return direct_init(detail::d32_fast_qnan, UINT8_C((0))); } - + return detail::check_non_finite(lhs, rhs); } #endif + // Dominant-operand short-circuit: for d32 add_impl uses uint64 promoted + // type with max_shift = digits10(uint64) - precision - 1 = 19 - 7 - 1 = 11. + // Above that the slow path returns one operand unchanged anyway. Fast + // types maintain canonical significands so no expansion buffer is needed. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 11) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + } + } + } + return detail::add_impl(lhs, rhs); } @@ -1088,11 +1117,37 @@ constexpr auto operator-(const decimal_fast32_t lhs, decimal_fast32_t rhs) noexc { return -rhs; } - + return detail::check_non_finite(lhs, rhs); } #endif + // Dominant-operand short-circuit; see operator+ above. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 11) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + } + } + } + rhs.sign_ = !rhs.sign_; return detail::add_impl(lhs, rhs); diff --git a/include/boost/decimal/decimal_fast64_t.hpp b/include/boost/decimal/decimal_fast64_t.hpp index e83e0543b..b282e8c8d 100644 --- a/include/boost/decimal/decimal_fast64_t.hpp +++ b/include/boost/decimal/decimal_fast64_t.hpp @@ -1173,6 +1173,36 @@ constexpr auto operator+(const decimal_fast64_t lhs, const decimal_fast64_t rhs) } #endif + // Dominant-operand short-circuit: when exp_diff exceeds max_shift in + // add_impl (= digits10(uint128) - precision - 1 = 21 for d64) the slow path + // returns one operand unchanged anyway. Skip the whole add_impl call. Fast + // types maintain canonical significands so the threshold has no + // worst-case-expansion buffer (unlike the IEEE 36 threshold). + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 21) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : rhs; + } + } + } + } + return detail::add_impl(lhs, rhs); } @@ -1227,6 +1257,32 @@ constexpr auto operator-(const decimal_fast64_t lhs, decimal_fast64_t rhs) noexc } #endif + // Dominant-operand short-circuit; see operator+ above. + { + const auto lhs_sig {lhs.full_significand()}; + const auto rhs_sig {rhs.full_significand()}; + if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U)) + { + const auto lhs_exp {lhs.biased_exponent()}; + const auto rhs_exp {rhs.biased_exponent()}; + const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp}; + if (exp_diff > 21) + { + auto round {_boost_decimal_global_rounding_mode}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + round = _boost_decimal_global_runtime_rounding_mode; + } + #endif + if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest)) + { + return lhs_exp > rhs_exp ? lhs : -rhs; + } + } + } + } + rhs.sign_ = !rhs.sign_; return detail::add_impl(lhs, rhs); diff --git a/include/boost/decimal/detail/add_impl.hpp b/include/boost/decimal/detail/add_impl.hpp index c5ab9f590..7545f1ed7 100644 --- a/include/boost/decimal/detail/add_impl.hpp +++ b/include/boost/decimal/detail/add_impl.hpp @@ -17,6 +17,7 @@ #ifndef BOOST_DECIMAL_BUILD_MODULE #include +#include #endif namespace boost { @@ -28,6 +29,217 @@ namespace detail { # pragma warning(disable : 4127) // Conditional expression is constant #endif +// Returns the low 64 bits of an integer for parity testing. Specialized so the +// templated kernel below can use one expression regardless of SigType width. +BOOST_DECIMAL_CUDA_CONSTEXPR inline auto low64(std::uint64_t v) noexcept -> std::uint64_t { return v; } +BOOST_DECIMAL_CUDA_CONSTEXPR inline auto low64(const int128::uint128_t& v) noexcept -> std::uint64_t { return v.low; } + +// Forward declarations of the per-type IEEE direct-pack helpers (defined in +// decimal{32,64,128}_t.hpp after the bit-mask constants). These skip the +// generic constructor's bounds check + dead-branch handling for known-in-range +// inputs, saving ~5-10 cycles per call. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d32(T1 coeff, T2 exp, bool sign) noexcept -> decimal32_t; +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d64(T1 coeff, T2 exp, bool sign) noexcept -> decimal64_t; +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d128(int128::uint128_t coeff, T2 exp, bool sign) noexcept -> decimal128_t; + +// Dispatch helper: for IEEE return types use the corresponding direct_pack +// helper if (exp + bias) is in the valid biased-exponent range; otherwise fall +// back to the regular constructor which handles overflow-to-infinity and +// subnormal flushing/re-canonicalization. For fast types always uses the +// regular constructor. +// +// SFINAE-disjoint overloads rather than an `if constexpr` chain because +// BOOST_DECIMAL_IF_CONSTEXPR falls back to plain `if` in C++14, which would +// force the d128 branch's `direct_pack_d128` body to be type-checked from +// add_impl.hpp's include site (where decimal128_t is still forward-declared). +// +// The IEEE-type overloads are declared here but DEFINED in their respective +// decimal*_t.hpp headers, after each class is complete. This avoids parsing +// `decimal32_t{coeff, exp, sign}` (etc.) constructor calls in an environment +// where the type is incomplete -- both C++14 (regular if) and nvcc, which +// eagerly compiles function template bodies for device codegen, would +// otherwise reject the incomplete-type expressions. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal32_t>; + +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal64_t>; + +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value, decimal128_t>; + +// Fallback for fast types and components -- always uses the regular constructor. +// Defined here because its body doesn't reference any specific IEEE decimal type. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept + -> std::enable_if_t::value + && !std::is_same::value + && !std::is_same::value, ReturnType> +{ + return ReturnType{coeff, exp, sign}; +} + +// Inline shrink + pack for a sum/diff that exceeds max_significand_v. For 1-3 +// extra digits the bounds are determined by a chained comparison against the +// precomputed thresholds. For >3 extra digits (post-alignment shifts up to ~21 +// for d64), a single num_digits call computes extra and pow10 looks up the +// divisor. Either way the divide is one divmod_pow10 + round-half-to-even, +// avoiding the constructor's coefficient_rounding dispatch. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto aligned_shrink_and_pack( + SigType mag, ExpType result_exp, bool result_sign) noexcept -> ReturnType +{ + constexpr SigType ten_to_p { + static_cast(max_significand_v) + SigType{1}}; + constexpr SigType ten_to_p_plus_1 {static_cast(ten_to_p * SigType{10U})}; + constexpr SigType ten_to_p_plus_2 {static_cast(ten_to_p_plus_1 * SigType{10U})}; + constexpr SigType ten_to_p_plus_3 {static_cast(ten_to_p_plus_2 * SigType{10U})}; + + unsigned extra {1U}; + SigType pow_extra {SigType{10U}}; + SigType half {SigType{5U}}; + if (mag >= ten_to_p_plus_1) + { + extra = 2U; + pow_extra = SigType{100U}; + half = SigType{50U}; + if (mag >= ten_to_p_plus_2) + { + extra = 3U; + pow_extra = SigType{1000U}; + half = SigType{500U}; + if (BOOST_DECIMAL_UNLIKELY(mag >= ten_to_p_plus_3)) + { + // Tall overflow (more than 3 extra digits): post-alignment + // shifts up to ~21 can land here for d64. Compute extra via + // num_digits and look up pow10. + const auto digits {num_digits(mag)}; + extra = static_cast(digits - detail::precision_v); + pow_extra = detail::pow10(static_cast(extra)); + half = static_cast(pow_extra / SigType{2U}); + } + } + } + + const auto dr {impl::divmod_pow10_dispatch(mag, static_cast(extra), pow_extra)}; + auto q {static_cast(dr.quotient)}; + const auto r {static_cast(dr.remainder)}; + + if (r > half || (r == half && (low64(q) & UINT64_C(1)) != 0U)) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p)) + { + q = static_cast(ten_to_p / SigType{10U}); + ++extra; + } + } + return pack_in_range(q, result_exp + static_cast(extra), result_sign); +} + +// Aligned-add kernel that stays in the narrower integer width (uint64 for d64, +// uint128 for d128) instead of the kernel's normal promoted width (uint128 or +// u256). Two precision-p operands shifted by up to `narrow_max_shift` digits +// still fit in the narrow type. The kernel also handles the overflow shrink +// inline via Wang/Schulte injection-style rounding (precomputed half-constants, +// single divmod, round-half-to-even), avoiding the constructor's +// num_digits + coefficient_rounding dispatch (~50 cycles per overflow case). +// +// Caller must guarantee: +// - default rounding mode is fe_dec_to_nearest +// - shift in [0, 3] where 0 means same-exponent (no multiply) +// - both operands non-zero (zero short-circuit happens upstream) +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto aligned_add_kernel( + SigType big_lhs, SigType big_rhs, + ExpType lhs_exp, ExpType rhs_exp, unsigned shift, + bool lhs_sign, bool rhs_sign) noexcept -> ReturnType +{ + SigType a {}; + SigType b {}; + ExpType result_exp {}; + if (shift == 0U) + { + a = big_lhs; + b = big_rhs; + result_exp = lhs_exp; + } + else + { + const auto pow_shift {detail::pow10(static_cast(shift))}; + if (lhs_exp < rhs_exp) + { + a = big_lhs; + b = static_cast(big_rhs * pow_shift); + result_exp = lhs_exp; + } + else + { + a = static_cast(big_lhs * pow_shift); + b = big_rhs; + result_exp = rhs_exp; + } + } + + // Same-sign: magnitudes add. Sum can overflow by up to `shift` digits + // (1 for shift=0 same-exp, up to `shift` for small-diff). + if (lhs_sign == rhs_sign) + { + const SigType sum {static_cast(a + b)}; + constexpr SigType ten_to_p_ss { + static_cast(max_significand_v) + SigType{1}}; + if (sum < ten_to_p_ss) + { + return pack_in_range(sum, result_exp, lhs_sign); + } + return aligned_shrink_and_pack(sum, result_exp, lhs_sign); + } + + // Opposite signs: magnitudes subtract. For shift=0 the result <= max(a, b) <= max_sig + // so no overflow check is needed; for shift>0 the result can be up to + // max_sig * 10^shift and may need shrink. + SigType mag {}; + bool result_sign {}; + if (a >= b) + { + mag = static_cast(a - b); + result_sign = lhs_sign; + } + else + { + mag = static_cast(b - a); + result_sign = rhs_sign; + } + if (shift == 0U) + { + // No overflow possible in same-exp opposite-sign subtraction. + return pack_in_range(mag, result_exp, result_sign); + } + constexpr SigType ten_to_p_os { + static_cast(max_significand_v) + SigType{1}}; + if (mag < ten_to_p_os) + { + return pack_in_range(mag, result_exp, result_sign); + } + return aligned_shrink_and_pack(mag, result_exp, result_sign); +} + +// Backwards-compatible aliases for the d128 callers that still use the old name. +template +BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_uint128_aligned_kernel( + int128::uint128_t big_lhs, int128::uint128_t big_rhs, + ExpType lhs_exp, ExpType rhs_exp, unsigned shift, + bool lhs_sign, bool rhs_sign) noexcept -> ReturnType +{ + return aligned_add_kernel(big_lhs, big_rhs, lhs_exp, rhs_exp, shift, lhs_sign, rhs_sign); +} + template BOOST_DECIMAL_CUDA_CONSTEXPR auto add_impl(const T& lhs, const T& rhs) noexcept -> ReturnType { @@ -64,6 +276,53 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto add_impl(const T& lhs, const T& rhs) noexcept return ReturnType{lhs.full_significand(), lhs.biased_exponent(), lhs.isneg()}; } + // Phase 3 fast paths for d64 (uint128 promoted) and d32 (uint64 promoted): + // dispatch via aligned_add_kernel which handles same-exp + small-diff + the + // overflow shrink inline (avoiding the constructor's coefficient_rounding). + // + // Two width-buckets: + // - shift <= 3: stay in uint64 (max_sig 16 digits * 10^3 = 19 digits < 2^64). + // - shift in [4, max_shift]: stay in uint128 (the existing promoted width, + // but with inline shrink instead of the signed-add-then-construct path). + // + // Guard: only enter when inputs are at the return type's precision + // (operator+/- callers); FMA passes wider promoted-precision components and + // must use the slow path. + BOOST_DECIMAL_IF_CONSTEXPR (detail::precision_v <= 16 + && std::is_same::value) + { + constexpr unsigned u64_small_diff_limit {3U}; + constexpr auto u128_max_shift {detail::make_positive_unsigned( + std::numeric_limits::digits10 - detail::precision_v - 1)}; + const auto exp_diff {lhs_exp - rhs_exp}; + const auto shift_abs {static_cast(exp_diff < 0 ? -exp_diff : exp_diff)}; + if (shift_abs <= static_cast(u128_max_shift)) + { + bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest); + } + #endif + if (BOOST_DECIMAL_LIKELY(default_rounding)) + { + if (shift_abs <= u64_small_diff_limit) + { + return aligned_add_kernel( + static_cast(big_lhs), + static_cast(big_rhs), + lhs_exp, rhs_exp, shift_abs, + lhs.isneg(), rhs.isneg()); + } + return aligned_add_kernel( + big_lhs, big_rhs, + lhs_exp, rhs_exp, shift_abs, + lhs.isneg(), rhs.isneg()); + } + } + } + // Align to larger exponent if (lhs_exp != rhs_exp) { @@ -255,12 +514,66 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_add_impl_new(const T& lhs, const T& rhs) return ReturnType{lhs.full_significand(), lhs.biased_exponent(), lhs.isneg()}; } + // Phase 1 fast path guard: only enter when inputs are at the return type's + // precision (operator+/- callers); FMA passes promoted components (wider sigs). + constexpr bool fast_path_eligible { + std::is_same::value}; + + // Phase 1 same-exp fast path: stays in uint128 (no u256 promotion, no u256 trailing add). + // Default rounding only; non-default rounding falls through to the existing u256 path. + BOOST_DECIMAL_IF_CONSTEXPR (fast_path_eligible) + { + if (lhs_exp == rhs_exp) + { + bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest); + } + #endif + if (BOOST_DECIMAL_LIKELY(default_rounding)) + { + return d128_uint128_aligned_kernel( + big_lhs, big_rhs, lhs_exp, rhs_exp, 0U, + lhs.isneg(), rhs.isneg()); + } + } + } + // Align to larger exponent if (lhs_exp != rhs_exp) { constexpr auto max_shift {detail::make_positive_unsigned(std::numeric_limits::digits10 - detail::precision_v - 1)}; const auto shift {detail::make_positive_unsigned(lhs_exp - rhs_exp)}; + // Phase 1 small-diff fast path: for shifts in [1, 3], the aligned multiply + // still fits in uint128 (precision 34 + shift 3 = 37 <= digits10(uint128) = 38). + // Skipping u256 saves ~50 cycles per op vs the u256 path below. Only taken + // under default rounding; the u256 slow path covers other modes. + // No LIKELY hint: random-exp workloads have shift >> 3, accumulation has + // shift <= 3, so neither prediction wins universally. + BOOST_DECIMAL_IF_CONSTEXPR (fast_path_eligible) + { + constexpr auto u128_small_diff_limit {3U}; + if (shift <= u128_small_diff_limit) + { + bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs)) + { + default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest); + } + #endif + if (BOOST_DECIMAL_LIKELY(default_rounding)) + { + return d128_uint128_aligned_kernel( + big_lhs, big_rhs, lhs_exp, rhs_exp, static_cast(shift), + lhs.isneg(), rhs.isneg()); + } + } + } + if (shift > max_shift) { auto round {_boost_decimal_global_rounding_mode}; diff --git a/include/boost/decimal/detail/comparison.hpp b/include/boost/decimal/detail/comparison.hpp index 09258b35c..c8b069da3 100644 --- a/include/boost/decimal/detail/comparison.hpp +++ b/include/boost/decimal/detail/comparison.hpp @@ -77,7 +77,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto equality_impl(Decim const auto delta_exp {lhs_exp - rhs_exp}; - if (delta_exp > detail::precision_v || delta_exp < -detail::precision_v) + if (BOOST_DECIMAL_UNLIKELY(delta_exp > detail::precision_v || delta_exp < -detail::precision_v)) { return false; } @@ -206,7 +206,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto equal_parts_impl(T1 lhs_sig, U1 lhs_exp, bool // Check the value of delta exp to avoid to large a value for pow10 // Also if only one of the significands is 0 then we know the values have to be mismatched - if (delta_exp > detail::precision_v || delta_exp < -detail::precision_v) + if (BOOST_DECIMAL_UNLIKELY(delta_exp > detail::precision_v || delta_exp < -detail::precision_v)) { return false; } @@ -462,64 +462,60 @@ BOOST_DECIMAL_FORCE_INLINE constexpr auto fast_type_less_parts_impl(T lhs_sig, U } template -BOOST_DECIMAL_CUDA_CONSTEXPR auto sequential_less_impl(DecimalType lhs, DecimalType rhs) noexcept -> bool +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto sequential_less_impl(DecimalType lhs, DecimalType rhs) noexcept -> bool { using comp_type = std::conditional_t < 64, std::uint_fast64_t, int128::uint128_t>; - // Step 1: Handle our non-finite values in their own calling functions - - // Step 2: Check if they are bitwise equal: - /* + // Non-finite handling is performed by the callers. With NaN ruled out, + // identical bit patterns encode the same finite value, so lhs < rhs is false. if (lhs.bits_ == rhs.bits_) { return false; } - */ - // Step 3: Decode and compare signs first: - const auto lhs_sign {lhs.isneg()}; - const auto rhs_sign {rhs.isneg()}; + // One combination-field branch per operand instead of three. + const auto lhs_components {lhs.to_components()}; + const auto rhs_components {rhs.to_components()}; - if (lhs_sign != rhs_sign) - { - return lhs_sign; - } + const auto lhs_sign {lhs_components.sign}; + const auto rhs_sign {rhs_components.sign}; + + auto lhs_sig {static_cast(lhs_components.sig)}; + auto rhs_sig {static_cast(rhs_components.sig)}; - // Step 4: Decode the significand and do a trivial comp - auto lhs_sig {static_cast(lhs.full_significand())}; - auto rhs_sig {static_cast(rhs.full_significand())}; + // Signed zeros: +0 == -0, neither is less. if (lhs_sig == static_cast(0) || rhs_sig == static_cast(0)) { - return (lhs_sig == rhs_sig) ? false : (lhs_sig == static_cast(0) ? !rhs_sign : lhs_sign); + if (lhs_sig == rhs_sig) + { + return false; + } + return lhs_sig == static_cast(0) ? !rhs_sign : lhs_sign; } - // Step 5: Decode the exponent and see if we can even compare the significands - auto lhs_exp {lhs.biased_exponent()}; - auto rhs_exp {rhs.biased_exponent()}; + if (lhs_sign != rhs_sign) + { + return lhs_sign; + } + const auto lhs_exp {lhs_components.exp}; + const auto rhs_exp {rhs_components.exp}; const auto delta_exp {lhs_exp - rhs_exp}; + constexpr auto max_delta_diff {std::numeric_limits::digits10 - detail::precision_v}; - if (delta_exp > max_delta_diff || delta_exp < -max_delta_diff) + if (BOOST_DECIMAL_UNLIKELY(delta_exp > max_delta_diff || delta_exp < -max_delta_diff)) { - return rhs_sign ? rhs_exp < lhs_exp : rhs_exp > lhs_exp; + return lhs_sign ? lhs_exp > rhs_exp : lhs_exp < rhs_exp; } - // Step 6: Approximate normalization if we need to and then get the answer - if (delta_exp >= 0) + if (delta_exp > 0) { lhs_sig *= detail::pow10(static_cast(delta_exp)); - lhs_exp -= delta_exp; } - else + else if (delta_exp < 0) { rhs_sig *= detail::pow10(static_cast(-delta_exp)); - rhs_exp += delta_exp; - } - - if (lhs_exp != rhs_exp) - { - return lhs_sign ? lhs_exp > rhs_exp : lhs_exp < rhs_exp; } return lhs_sign ? lhs_sig > rhs_sig : lhs_sig < rhs_sig; diff --git a/include/boost/decimal/detail/components.hpp b/include/boost/decimal/detail/components.hpp index 3b6c1e648..24f41c48e 100644 --- a/include/boost/decimal/detail/components.hpp +++ b/include/boost/decimal/detail/components.hpp @@ -60,6 +60,11 @@ struct decimal_components return sign; } + BOOST_DECIMAL_CUDA_CONSTEXPR auto to_components() const -> decimal_components + { + return *this; + } + template explicit BOOST_DECIMAL_CUDA_CONSTEXPR operator decimal_components() const { diff --git a/include/boost/decimal/detail/mul_impl.hpp b/include/boost/decimal/detail/mul_impl.hpp index bfa5b5e7e..771071548 100644 --- a/include/boost/decimal/detail/mul_impl.hpp +++ b/include/boost/decimal/detail/mul_impl.hpp @@ -8,7 +8,8 @@ #include #include #include -#include "int128.hpp" +#include +#include #include #include #include @@ -22,182 +23,503 @@ namespace boost { namespace decimal { namespace detail { -// Each type has two different multiplication impls -// 1) Returns a decimal type and lets the constructor handle with shrinking the significand -// 2) Returns a struct of the constituent components (used with FMAs) +namespace impl { + +// Wide-product shrink + round-half-to-even + pack helpers used by the IEEE +// finite multiplication path under default rounding (fe_dec_to_nearest). +// +// After multiplying two precision-p significands (both expanded to full +// precision), the wide product has either 2p-1 or 2p digits. We shrink to +// exactly p digits by branching on the threshold 10^(2p-1), then dividing by +// 10^(p-1) or 10^p with a round-half-to-even rule and a carry handler for the +// rare q == 10^p case. Finally pack_in_range emits the IEEE bit pattern via +// direct_pack_d* when the biased exponent is in range, falling back to the +// constructor for overflow/underflow paths. +// +// Caller guarantees that both operand significands are non-zero (zero needs +// the constructor's cohort-canonicalization path), at full precision p, and +// that the active rounding mode is fe_dec_to_nearest. Non-default rounding +// modes go through the constructor path which honors fenv_round dispatch. + +// uint64 product -> decimal{32,fast32}_t. Product spans [10^12, ~10^14). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u64( + std::uint_fast64_t product, ExpType result_exp, bool result_sign) noexcept -> ReturnType +{ + int extra {6}; + if (product >= UINT64_C(10000000000000)) // 10^13 + { + extra = 7; + } + + const auto pow_extra {detail::pow10(static_cast(extra))}; + auto q {product / pow_extra}; + const auto r {product - q * pow_extra}; + const auto half {pow_extra >> 1}; -template -BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType + if (r > half || (r == half && (q & UINT64_C(1)) != 0U)) + { + ++q; + if (BOOST_DECIMAL_UNLIKELY(q == UINT64_C(10000000))) // 10^7 + { + q = UINT64_C(1000000); // 10^6 + ++extra; + } + } + + return detail::pack_in_range(static_cast(q), + result_exp + static_cast(extra), + result_sign); +} + +// uint128 product -> decimal{64,fast64}_t. Product spans [10^30, ~10^32). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u128( + int128::uint128_t product, ExpType result_exp, bool result_sign) noexcept -> ReturnType { - using mul_type = std::conditional_t < 64, std::uint_fast64_t, int128::uint128_t>; + constexpr auto threshold {detail::pow10(int128::uint128_t{UINT64_C(31)})}; - // The constructor needs to calculate the number of digits in the significand which for uint128 is slow - // Since we know the value of res_sig is constrained to [1'000'000^2, 9'999'999^2] which equates to - // either 13 or 14 decimal digits we can use a single division to make binary search occur with - // uint32_t instead. 14 - 5 = 9 or 13 - 5 = 8 which are both still greater than or equal to - // digits10 + 1 for rounding which is 8 decimal digits + int extra {15}; + if (product >= threshold) + { + extra = 16; + } - auto res_sig {(static_cast(lhs.full_significand()) * static_cast(rhs.full_significand()))}; - auto res_exp {lhs.biased_exponent() + rhs.biased_exponent()}; + const auto pow_extra {detail::pow10(int128::uint128_t{static_cast(extra)})}; + const auto dr {detail::impl::divmod_pow10_dispatch(product, extra, pow_extra)}; + auto q {dr.quotient}; + const auto r {dr.remainder}; + const auto half {pow_extra >> 1}; - return {res_sig, res_exp, lhs.isneg() != rhs.isneg()}; + if (r > half || (r == half && (q.low & UINT64_C(1)) != 0U)) + { + ++q; + constexpr auto ten_p {detail::pow10(int128::uint128_t{UINT64_C(16)})}; + if (BOOST_DECIMAL_UNLIKELY(q == ten_p)) + { + q = detail::pow10(int128::uint128_t{UINT64_C(15)}); + ++extra; + } + } + + return detail::pack_in_range(q.low, + result_exp + static_cast(extra), + result_sign); } -template -BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign, - T rhs_sig, U rhs_exp, bool rhs_sign) noexcept -> ReturnType +// u256 product -> decimal{128,fast128}_t. Product spans [10^66, ~10^68). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u256( + const u256& product, ExpType result_exp, bool result_sign) noexcept -> ReturnType { - using mul_type = std::uint_fast64_t; + int extra {33}; + if (product >= detail::pow10(u256{UINT64_C(67)})) + { + extra = 34; + } + + const auto pow_extra {detail::pow10(u256{static_cast(extra)})}; + const auto dr {detail::impl::divmod_pow10_dispatch(product, extra, pow_extra)}; + auto q {dr.quotient}; + const auto r {dr.remainder}; + const auto half {pow_extra >> 1}; - // The constructor needs to calculate the number of digits in the significand which for uint128 is slow - // Since we know the value of res_sig is constrained to [1'000'000^2, 9'999'999^2] which equates to - // either 13 or 14 decimal digits we can use a single division to make binary search occur with - // uint32_t instead. 14 - 5 = 9 or 13 - 5 = 8 which are both still greater than or equal to - // digits10 + 1 for rounding which is 8 decimal digits + if (r > half || (r == half && (q.bytes[0] & UINT64_C(1)) != 0U)) + { + ++q; + constexpr auto ten_p {detail::pow10(u256{UINT64_C(34)})}; + if (BOOST_DECIMAL_UNLIKELY(q == ten_p)) + { + q = detail::pow10(u256{UINT64_C(33)}); + ++extra; + } + } - constexpr auto ten_pow_five {pow10(static_cast(5))}; - auto res_sig {(static_cast(lhs_sig) * static_cast(rhs_sig)) / ten_pow_five}; - auto res_exp {lhs_exp + rhs_exp + static_cast(5)}; + const int128::uint128_t q_u128 {q.bytes[1], q.bytes[0]}; + return detail::pack_in_range(q_u128, + result_exp + static_cast(extra), + result_sign); +} - return {static_cast(res_sig), res_exp, lhs_sign != rhs_sign}; +// Default-rounding gate matching add_impl's pattern: check the global rounding +// mode at compile-time and runtime; the fast path is only safe when the active +// mode is fe_dec_to_nearest (the round-half-to-even logic in mul_finalize_*). +// For any other mode, callers fall back to the constructor (which dispatches +// to fenv_round, honoring the runtime mode correctly). +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_default_rounding(const Anchor& anchor) noexcept -> bool +{ + static_cast(anchor); + bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest}; + #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION + if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(anchor)) + { + default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest); + } + #endif + return default_rounding; } -// In the fast case we are better served doing our 128-bit division here since we are at a know starting point +} // namespace impl + +// Generic multiplication kernel. The body dispatches via SFINAE-disjoint +// helpers (impl::mul_impl_ieee / impl::mul_impl_components) so each helper's +// body is type-checked only for the ReturnType it actually serves +// (BOOST_DECIMAL_IF_CONSTEXPR falls back to plain `if` in C++14 and would +// otherwise force the FMA-only branch to resolve precision_v / +// expand_significand / mul_finalize_* against component structs that don't +// satisfy is_decimal_floating_point_v). The outer mul_impl signature stays +// `-> ReturnType` so existing friend declarations in decimal_fast*_t headers +// still match. +// +// (a) ReturnType is one of decimal{32,64,128}_t or decimal_fast{32,64,128}_t. +// Expands both operands to full precision (no-op if already canonical), +// multiplies, then shrinks to exactly precision digits with +// round-half-to-even and packs via pack_in_range (direct_pack fast path +// for IEEE types; constructor fallback for fast types). +// (b) ReturnType is a *_components struct (FMA consumes the unrounded wider +// product before its add step). Returns the raw product packed into the +// wider components type without shrinking. + +namespace impl { + +// Tag-dispatch helpers receive already-extracted components (so they don't +// need friend access to the fast types' private to_components). The outer +// mul_impl in the detail namespace below provides that access. Tag dispatch +// (std::integral_constant) is used instead of `if constexpr` so the wrong +// dispatcher isn't instantiated for any given ReturnType -- BOOST_DECIMAL_IF_CONSTEXPR +// falls back to plain `if` in C++14 and would otherwise force both branches' +// bodies to type-check. + +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl_dispatch( + Components lhs_c, Components rhs_c, std::true_type /*is_decimal_floating_point*/) noexcept -> ReturnType +{ + using mul_type = std::conditional_t; + const bool sign {lhs_c.sign != rhs_c.sign}; + + // d32 IEEE: revert to the simple multiply + constructor handoff. The + // hardware uint64 divide the constructor's coefficient_rounding does + // is already cheap enough that the expand + pre-shrink + direct_pack + // path's overhead doesn't pay off here. d32_fast still benefits and + // continues to use the optimized path below. + BOOST_DECIMAL_IF_CONSTEXPR (std::is_same::value) + { + const auto res_sig {static_cast(lhs_c.sig) * static_cast(rhs_c.sig)}; + const auto res_exp {lhs_c.exp + rhs_c.exp}; + return ReturnType{res_sig, res_exp, sign}; + } + else + { + // Small-product fast path: when at least one operand is + // non-canonical (sig below 10^(p-1)) AND the un-expanded product + // fits in p digits, the product itself is the result. Skip + // expand+shrink entirely and let pack_in_range route to direct_pack + // (or to the constructor for out-of-range biased_exp). Gated on + // sig comparisons so the canonical path pays only two compares. + using sig_t = typename ReturnType::significand_type; + constexpr auto min_normal_sig {detail::pow10(static_cast(detail::precision_v - 1))}; + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig < min_normal_sig || rhs_c.sig < min_normal_sig)) + { + const auto small_prod {static_cast(lhs_c.sig) * static_cast(rhs_c.sig)}; + constexpr mul_type small_threshold {static_cast(detail::max_significand_v) + mul_type{1U}}; + if (small_prod < small_threshold) + { + return detail::pack_in_range(static_cast(small_prod), + lhs_c.exp + rhs_c.exp, sign); + } + } + + // Fast types store significands at full precision already; expansion + // is a no-op for them so we skip the calls entirely. IEEE types can + // hold non-canonical cohorts and need the expansion to bring the + // significand up to precision digits before the threshold-based + // shrink in mul_finalize_*. + BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v) + { + detail::expand_significand(lhs_c.sig, lhs_c.exp); + detail::expand_significand(rhs_c.sig, rhs_c.exp); + } + + const auto res_sig {static_cast(lhs_c.sig) * static_cast(rhs_c.sig)}; + const auto res_exp {lhs_c.exp + rhs_c.exp}; + + // Zero canonicalization: a zero significand must go through the + // constructor so the IEEE zero-cohort encoding is emitted. + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U || rhs_c.sig == 0U)) + { + return ReturnType{res_sig, res_exp, sign}; + } + if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_c.sig))) + { + // Non-default rounding mode: constructor's coefficient_rounding + // dispatches to fenv_round which honors the runtime mode. + return ReturnType{res_sig, res_exp, sign}; + } + BOOST_DECIMAL_IF_CONSTEXPR (TVal < 64) + { + // The static_cast is identity in this branch (mul_type is uint_fast64_t + // when TVal < 64) but is required to make the d64/d128 instantiation + // type-check in C++14, where BOOST_DECIMAL_IF_CONSTEXPR is plain `if` + // and both branches must be valid even though the wide-width branch + // is unreachable at runtime. + return impl::mul_finalize_u64(static_cast(res_sig), res_exp, sign); + } + else + { + return impl::mul_finalize_u128(res_sig, res_exp, sign); + } + } +} + +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl_dispatch( + Components lhs_c, Components rhs_c, std::false_type /*is_decimal_floating_point*/) noexcept -> ReturnType +{ + // FMA components path: return wide product unrounded. ReturnType is a + // decimal_components struct (decimal_val_v defined, + // but no precision_v / max_significand_v / direct_pack_*), so the + // optimized path's helpers above don't apply. + using mul_type = std::conditional_t; + + const bool sign {lhs_c.sign != rhs_c.sign}; + const auto res_sig {static_cast(lhs_c.sig) * static_cast(rhs_c.sig)}; + const auto res_exp {lhs_c.exp + rhs_c.exp}; + + return {res_sig, res_exp, sign}; +} + +} // namespace impl + +// Outer mul_impl: keeps the original `-> ReturnType` signature so friend +// declarations in decimal_fast*_t headers match. Extracts components via the +// (friend-accessible) to_components() and tag-dispatches to SFINAE-disjoint +// impl helpers. Tag dispatch (rather than if-constexpr) ensures only the +// matching helper body is instantiated for any given ReturnType -- BOOST_DECIMAL_IF_CONSTEXPR +// falls back to plain `if` in C++14, which would force both branches to +// type-check. template -BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType { - using unsigned_int128_type = boost::int128::uint128_t; + return impl::mul_impl_dispatch>( + lhs.to_components(), rhs.to_components(), + std::integral_constant>{}); +} + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable : 4702) // Complains that unlikely branches are unreachable +#endif + +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(T lhs_sig, U lhs_exp, const bool lhs_sign, + T rhs_sig, U rhs_exp, const bool rhs_sign) noexcept -> ReturnType +{ + // d32 * Integer (sole caller) always passes an IEEE/fast decimal ReturnType. + // The body references mul_finalize_u64 and expand_significand, + // both valid for is_decimal_floating_point_v==true types. No + // SFINAE constraint needed because no FMA path uses this tuple form + // (FMA's mul_impl call passes pre-extracted components to the dec-ref form). + using mul_type = std::uint_fast64_t; - // Once we have the normalized significands and exponents all we have to do is - // multiply the significands and add the exponents - // - // The constructor needs to calculate the number of digits in the significand which for uint128 is slow - // Since we know the value of res_sig is constrained to [(10^16)^2, (10^17 - 1)^2] which equates to - // either 31 or 32 decimal digits we can use a single division to make binary search occur with - // uint_fast64_t instead. 32 - 13 = 19 or 31 - 13 = 18 which are both still greater than - // digits10 + 1 for rounding which is 17 decimal digits + const bool sign {lhs_sign != rhs_sign}; - constexpr auto ten_pow_13 {pow10(static_cast(13))}; - const auto res_sig {static_cast((static_cast(lhs.full_significand()) * static_cast(rhs.full_significand())) / ten_pow_13)}; - const auto res_exp {lhs.biased_exponent() + rhs.biased_exponent() + 13}; + // d32 IEEE: revert to the simple multiply + constructor handoff (see + // generic mul_impl above for rationale). + BOOST_DECIMAL_IF_CONSTEXPR (std::is_same::value) + { + const auto res_sig {static_cast(lhs_sig) * static_cast(rhs_sig)}; + const auto res_exp {lhs_exp + rhs_exp}; + return ReturnType{res_sig, res_exp, sign}; + } + + BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v) + { + detail::expand_significand(lhs_sig, lhs_exp); + detail::expand_significand(rhs_sig, rhs_exp); + } - return ReturnType{res_sig, res_exp, lhs.isneg() != rhs.isneg()}; + const auto res_sig {static_cast(lhs_sig) * static_cast(rhs_sig)}; + const auto res_exp {lhs_exp + rhs_exp}; + + if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U)) + { + return ReturnType{res_sig, res_exp, sign}; + } + if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig))) + { + return ReturnType{res_sig, res_exp, sign}; + } + + return impl::mul_finalize_u64(res_sig, res_exp, sign); } -template -BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign, - T rhs_sig, U rhs_exp, bool rhs_sign) noexcept - -> std::enable_if_t, ReturnType> +#ifdef _MSC_VER +# pragma warning(pop) +#endif + +// d64 specialization. Used by `decimal_fast64_t * decimal_fast64_t` (which +// passes decimal types directly without to_components conversion) and by the +// integer-rhs overloads of `decimal64_t * Integer` and +// `decimal_fast64_t * Integer`. + +template +BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType { using unsigned_int128_type = boost::int128::uint128_t; - // Once we have the normalized significands and exponents all we have to do is - // multiply the significands and add the exponents - // - // The constructor needs to calculate the number of digits in the significand which for uint128 is slow - // Since we know the value of res_sig is constrained to [(10^16)^2, (10^17 - 1)^2] which equates to - // either 31 or 32 decimal digits we can use a single division to make binary search occur with - // uint_fast64_t instead. 32 - 13 = 19 or 31 - 13 = 18 which are both still greater than - // digits10 + 1 for rounding which is 17 decimal digits + auto lhs_c {lhs.to_components()}; + auto rhs_c {rhs.to_components()}; + const bool sign {lhs_c.sign != rhs_c.sign}; - constexpr auto ten_pow_13 {pow10(static_cast(13))}; - auto res_sig {(static_cast(lhs_sig) * static_cast(rhs_sig)) / ten_pow_13}; - auto res_exp {lhs_exp + rhs_exp + static_cast(13)}; + BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v) + { + detail::expand_significand(lhs_c.sig, lhs_c.exp); + detail::expand_significand(rhs_c.sig, rhs_c.exp); + } + + const auto res_sig {static_cast(lhs_c.sig) * static_cast(rhs_c.sig)}; + const auto res_exp {lhs_c.exp + rhs_c.exp}; + + if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U || rhs_c.sig == 0U)) + { + return ReturnType{res_sig, res_exp, sign}; + } + if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs))) + { + return ReturnType{res_sig, res_exp, sign}; + } - return {static_cast(res_sig), res_exp, lhs_sign != rhs_sign}; + return impl::mul_finalize_u128(res_sig, res_exp, sign); } template BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign, T rhs_sig, U rhs_exp, bool rhs_sign) noexcept - -> std::enable_if_t, ReturnType> + -> std::enable_if_t, ReturnType> { using unsigned_int128_type = boost::int128::uint128_t; - constexpr auto comp_value {detail::pow10(static_cast(31))}; - - #ifdef BOOST_DECIMAL_DEBUG - std::cerr << "sig lhs: " << lhs_sig - << "\nexp lhs: " << lhs_exp - << "\nsig rhs: " << rhs_sig - << "\nexp rhs: " << rhs_exp; - #endif - bool sign {lhs_sign != rhs_sign}; - - // Once we have the normalized significands and exponents all we have to do is - // multiply the significands and add the exponents - - auto res_sig {static_cast(lhs_sig) * static_cast(rhs_sig)}; - auto res_exp {static_cast(lhs_exp + rhs_exp)}; + const bool sign {lhs_sign != rhs_sign}; - const auto sig_dig {res_sig >= comp_value ? 32 : 31}; - constexpr auto max_dig {std::numeric_limits::digits10}; - res_sig /= detail::pow10(static_cast(sig_dig - max_dig)); - res_exp += sig_dig - max_dig; + BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v) + { + detail::expand_significand(lhs_sig, lhs_exp); + detail::expand_significand(rhs_sig, rhs_exp); + } - const auto res_sig_64 {static_cast(res_sig)}; + const auto res_sig {static_cast(lhs_sig) * static_cast(rhs_sig)}; + const auto res_exp {lhs_exp + rhs_exp}; - #ifdef BOOST_DECIMAL_DEBUG - std::cerr << "\nres sig: " << res_sig_64 - << "\nres exp: " << res_exp << std::endl; - #endif + if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U)) + { + return ReturnType{res_sig, res_exp, sign}; + } + if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig))) + { + return ReturnType{res_sig, res_exp, sign}; + } - return {res_sig_64, res_exp, sign}; + return impl::mul_finalize_u128(res_sig, res_exp, sign); } template BOOST_DECIMAL_FORCE_INLINE -BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_mul_impl(const T1& lhs_sig, const U1 lhs_exp, const bool lhs_sign, - const T2& rhs_sig, const U2 rhs_exp, const bool rhs_sign) noexcept -> ReturnType +BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_mul_impl(const T1& lhs_sig_in, const U1 lhs_exp_in, const bool lhs_sign, + const T2& rhs_sig_in, const U2 rhs_exp_in, const bool rhs_sign) noexcept -> ReturnType { using sig_type = T1; + static_assert(std::is_same::value, "Should have a common type by this point"); const bool sign {lhs_sign != rhs_sign}; - auto res_sig {detail::umul256(lhs_sig, rhs_sig)}; - auto res_exp {lhs_exp + rhs_exp}; - const auto sig_dig {detail::num_digits(res_sig)}; + // Small-product fast path: when both operands have at most precision/2 + // digits, the un-expanded product is guaranteed to fit in uint128 (no + // umul256 needed) and may even fit in p digits, in which case the + // product itself is the result. Gated on both sigs being below the + // sqrt(max_sig) threshold so the uint128 multiply cannot overflow. + constexpr auto small_sig_threshold {detail::pow10(static_cast(detail::precision_v / 2))}; + if (BOOST_DECIMAL_UNLIKELY(lhs_sig_in <= small_sig_threshold && rhs_sig_in <= small_sig_threshold)) + { + const auto small_prod {lhs_sig_in * rhs_sig_in}; + constexpr auto small_threshold {static_cast(detail::max_significand_v) + sig_type{1U}}; + if (small_prod < small_threshold) + { + const auto small_exp {static_cast(lhs_exp_in + rhs_exp_in)}; + return detail::pack_in_range(small_prod, small_exp, sign); + } + } - // 34 is the number of digits in the d128 significand - // this way we can skip rounding in the constructor a second time - const auto digit_delta {sig_dig - std::numeric_limits::digits10}; - if (BOOST_DECIMAL_LIKELY(digit_delta > 0)) + // Subnormal/underflow fast-bailout: when the operand exponents already sum + // to deep into the subnormal range, the optimized path below wastes a u256 + // divmod_pow10 (16 multiplies) producing a value the constructor would + // flush to zero anyway. The legacy umul256 + num_digits + coefficient_rounding + // path has an early-out (shift > digits10 -> coeff=0) that skips the + // divmod. Benchmarks show this saves ~7% on the d128 IEEE multiplication + // hot loop where ~25% of random inputs fall into this range. Kept for + // d128 only; the d64/d32 paths don't see the same benefit because + // divmod_pow10_uint128 is much cheaper than divmod_pow10_u256. + constexpr int subnormal_guard {-detail::bias_v + detail::precision_v}; + if (BOOST_DECIMAL_UNLIKELY(static_cast(lhs_exp_in) + static_cast(rhs_exp_in) < subnormal_guard)) { - auto biased_exp {res_exp + detail::bias_v}; - detail::coefficient_rounding(res_sig, res_exp, biased_exp, sign, sig_dig); + auto res_sig {detail::umul256(lhs_sig_in, rhs_sig_in)}; + auto res_exp_mut {static_cast(lhs_exp_in + rhs_exp_in)}; + const auto sig_dig {detail::num_digits(res_sig)}; + const auto digit_delta {sig_dig - std::numeric_limits::digits10}; + if (BOOST_DECIMAL_LIKELY(digit_delta > 0)) + { + auto biased_exp {res_exp_mut + detail::bias_v}; + detail::coefficient_rounding(res_sig, res_exp_mut, biased_exp, sign, sig_dig); + } + // coefficient_rounding leaves res_sig at exactly precision digits (<= max_significand_v), + // so pack_in_range can use direct_pack when biased_exp is in [0, max] and skip the + // constructor's redundant num_digits + branch tree. Falls back to the constructor + // for the (rarer) genuinely-subnormal output where its boundary handling is needed. + return detail::pack_in_range(int128::uint128_t{res_sig[1], res_sig[0]}, res_exp_mut, sign); } - BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U); - return {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign}; -} + auto lhs_sig {lhs_sig_in}; + auto lhs_exp {lhs_exp_in}; + auto rhs_sig {rhs_sig_in}; + auto rhs_exp {rhs_exp_in}; -template -BOOST_DECIMAL_FORCE_INLINE -BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_fast_mul_impl(const T1& lhs_sig, const U1 lhs_exp, const bool lhs_sign, - const T2& rhs_sig, const U2 rhs_exp, const bool rhs_sign) noexcept -> ReturnType -{ - const bool sign {lhs_sign != rhs_sign}; + BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v) + { + detail::expand_significand(lhs_sig, lhs_exp); + detail::expand_significand(rhs_sig, rhs_exp); + } - // Once we have the normalized significands and exponents all we have to do is - // multiply the significands and add the exponents - auto res_sig {detail::umul256(lhs_sig, rhs_sig)}; - const auto res_exp {lhs_exp + rhs_exp + 30}; + const auto res_exp {static_cast(lhs_exp + rhs_exp)}; + + if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U)) + { + return ReturnType{int128::uint128_t{0}, res_exp, sign}; + } - constexpr auto ten_pow_30 {detail::pow10(static_cast(30))}; - res_sig /= ten_pow_30; + auto res_sig {detail::umul256(lhs_sig, rhs_sig)}; - BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U); // LCOV_EXCL_LINE - return {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign}; -} + if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig))) + { + // Non-default rounding mode: use coefficient_rounding which dispatches + // to fenv_round and honors the runtime mode. Pack via pack_in_range so + // the in-range case takes direct_pack (no redundant num_digits in the + // constructor); out-of-range biased_exp falls back to the constructor. + const auto sig_dig {detail::num_digits(res_sig)}; + const auto digit_delta {sig_dig - std::numeric_limits::digits10}; + auto res_exp_mut {res_exp}; + + if (BOOST_DECIMAL_LIKELY(digit_delta > 0)) + { + auto biased_exp {res_exp_mut + detail::bias_v}; + detail::coefficient_rounding(res_sig, res_exp_mut, biased_exp, sign, sig_dig); + } + + return detail::pack_in_range(int128::uint128_t{res_sig[1], res_sig[0]}, res_exp_mut, sign); + } -template -BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const decimal128_t_components& lhs, const decimal128_t_components& rhs) noexcept -> ReturnType -{ - return d128_mul_impl(lhs.sig, lhs.exp, lhs.sign, - rhs.sig, rhs.exp, rhs.sign); + return impl::mul_finalize_u256(res_sig, res_exp, sign); } } // namespace detail diff --git a/include/boost/decimal/detail/normalize.hpp b/include/boost/decimal/detail/normalize.hpp index c9a75c273..b7fafdca5 100644 --- a/include/boost/decimal/detail/normalize.hpp +++ b/include/boost/decimal/detail/normalize.hpp @@ -46,13 +46,23 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto normalize(T1& significand, T2& exp, bool sign #endif // This is a branchless version of the above which is used for implementing basic operations, -// since we know that the values in the decimal type are never larger than target_precision +// since we know that the values in the decimal type are never larger than target_precision. +// Fast path: when significand is already in the normalized range [10^(p-1), 10^p - 1] +// no expansion is needed and we skip the num_digits + pow10 multiply (~20 cycles saved). +// This is the dominant case for results of prior arithmetic that were packed at full precision. template BOOST_DECIMAL_CUDA_CONSTEXPR auto expand_significand(T1& significand, T2& exp) noexcept -> void { constexpr auto target_precision {detail::precision_v}; - const auto digits {num_digits(significand)}; + constexpr T1 min_normal_sig {static_cast(pow10(static_cast(target_precision - 1)))}; + if (significand >= min_normal_sig) + { + // Already has exactly target_precision digits; expansion is a no-op. + return; + } + + const auto digits {num_digits(significand)}; const auto zeros_needed {target_precision - digits}; significand *= pow10(static_cast(zeros_needed)); exp -= zeros_needed; diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp index 58fe602cf..aa524af15 100644 --- a/include/boost/decimal/detail/u256.hpp +++ b/include/boost/decimal/detail/u256.hpp @@ -900,9 +900,19 @@ BOOST_DECIMAL_CUDA_CONSTEXPR u256 operator*(const UnsignedInteger lhs, const u25 { return impl::default_mul(rhs, lhs); } +BOOST_DECIMAL_CUDA_CONSTEXPR u256 mul128_by_64(const int128::uint128_t& a, const std::uint64_t b) noexcept; BOOST_DECIMAL_CUDA_CONSTEXPR u256 umul256(const int128::uint128_t& a, const int128::uint128_t& b) noexcept { + if (BOOST_DECIMAL_UNLIKELY(b.high == 0U)) + { + return mul128_by_64(a, b.low); + } + if (BOOST_DECIMAL_UNLIKELY(a.high == 0U)) + { + return mul128_by_64(b, a.low); + } + u256 result{}; const int128::uint128_t a_low {a.low}; diff --git a/test/Jamfile b/test/Jamfile index 2bb437ca1..903d91d9f 100644 --- a/test/Jamfile +++ b/test/Jamfile @@ -51,6 +51,7 @@ project : requirements run-fail benchmarks.cpp ; run-fail benchmark_uint256.cpp ; +run-fail bench_accumulation.cpp ; run compare_dec128_and_fast.cpp ; compile-fail concepts_test.cpp ; diff --git a/test/bench_accumulation.cpp b/test/bench_accumulation.cpp new file mode 100644 index 000000000..c04f7e8ac --- /dev/null +++ b/test/bench_accumulation.cpp @@ -0,0 +1,158 @@ +// Copyright 2026 Matt Borland +// Distributed under the Boost Software License, Version 1.0. +// https://www.boost.org/LICENSE_1_0.txt +// +// Benchmark for the small-exponent-difference alignment kernel in add_impl. +// The companion benchmarks.cpp uses random exponents across the full range, +// which makes the dominant-operand short-circuit fire in ~94% of operations +// and leaves the alignment kernel under-exercised. This benchmark generates +// same-magnitude operands so the kernel actually has to align and add. + +#define BOOST_DECIMAL_DETAIL_INT128_ALLOW_SIGN_CONVERSION +#define BOOST_DECIMAL_DETAIL_INT128_ALLOW_SIGN_COMPARE + +#include +#include +#include +#include +#include +#include +#include + +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wold-style-cast" +# pragma clang diagnostic ignored "-Wundef" +# pragma clang diagnostic ignored "-Wconversion" +# pragma clang diagnostic ignored "-Wsign-conversion" +# pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wold-style-cast" +# pragma GCC diagnostic ignored "-Wundef" +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wsign-conversion" +# pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + +#include + +#ifdef BOOST_DECIMAL_RUN_BENCHMARKS + +using namespace boost::decimal; +using namespace std::chrono_literals; + +#ifdef __clang__ +# define BOOST_DECIMAL_NO_INLINE __attribute__ ((__noinline__)) +#elif defined(_MSC_VER) +# define BOOST_DECIMAL_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) +# define BOOST_DECIMAL_NO_INLINE __attribute__ ((__noinline__)) +#endif + +constexpr unsigned N = 20'000'000U; +constexpr int K = 5; + +// Same-exponent workload: every operand has biased_exponent == fixed_exp. +// Exercises the same-exp pass-through inside add_impl. +template +std::vector generate_same_exp_vector(int fixed_exp, unsigned seed = 42U) +{ + std::vector v(N); + std::mt19937_64 gen(seed); + boost::random::uniform_int_distribution + sig_dis(0U, std::numeric_limits::max() / 1000U); + for (std::size_t i = 0; i < v.size(); ++i) + { + v[i] = T{sig_dis(gen), fixed_exp}; + } + return v; +} + +// Small-diff workload: significands random, exponents drawn from a narrow window +// so the difference stays inside max_shift but is rarely zero. Forces the +// alignment-multiply-then-add path inside add_impl. +template +std::vector generate_small_diff_vector(int base_exp, int half_window, unsigned seed = 42U) +{ + std::vector v(N); + std::mt19937_64 gen(seed); + boost::random::uniform_int_distribution + sig_dis(0U, std::numeric_limits::max() / 1000U); + std::uniform_int_distribution exp_dis(base_exp - half_window, base_exp + half_window); + for (std::size_t i = 0; i < v.size(); ++i) + { + v[i] = T{sig_dis(gen), exp_dis(gen)}; + } + return v; +} + +template +BOOST_DECIMAL_NO_INLINE void test_two_element(const std::vector& data, Func op, + const char* label, const char* type) +{ + const auto t1 = std::chrono::steady_clock::now(); + std::size_t s = 0; + for (std::size_t k = 0; k < K; ++k) + { + for (std::size_t i = 0; i < data.size() - 1U; ++i) + { + s += static_cast(op(data[i], data[i + 1])); + } + } + const auto t2 = std::chrono::steady_clock::now(); + std::cerr << std::left << std::setw(11) << label << "<" << std::setw(13) << type + << ">: " << std::setw(10) << (t2 - t1) / 1us << " us (s=" << s << ")\n"; +} + +template +void run_same_exp(int fixed_exp, const char* type) +{ + const auto v = generate_same_exp_vector(fixed_exp); + test_two_element(v, std::plus<>(), "AddSameExp", type); + test_two_element(v, std::minus<>(), "SubSameExp", type); +} + +template +void run_small_diff(int base_exp, int half_window, const char* type) +{ + const auto v = generate_small_diff_vector(base_exp, half_window); + test_two_element(v, std::plus<>(), "AddSmallDiff", type); + test_two_element(v, std::minus<>(), "SubSmallDiff", type); +} + +int main() +{ + std::cerr << "\n===== Same-exponent (alignment kernel pass-through) =====\n"; + run_same_exp(0, "decimal32_t"); + run_same_exp(0, "decimal64_t"); + run_same_exp(0, "decimal128_t"); + run_same_exp(0, "dec32_fast"); + run_same_exp(0, "dec64_fast"); + run_same_exp(0, "dec128_fast"); + + // For d64 max_shift = 21; window of 5 keeps diffs in [0, 10] so the small-diff + // alignment kernel fires every time. For d128 max_shift = 43 in u256 mode but + // 3 in the planned uint128 fast path; window of 2 keeps diffs in [0, 4] so we + // exercise both paths (until Phase 1 lands, after which we'd want 0-3). + std::cerr << "\n===== Small exp diff (alignment kernel multiply+add) =====\n"; + run_small_diff(0, 5, "decimal32_t"); + run_small_diff(0, 5, "decimal64_t"); + run_small_diff(0, 2, "decimal128_t"); + run_small_diff(0, 5, "dec32_fast"); + run_small_diff(0, 5, "dec64_fast"); + run_small_diff(0, 2, "dec128_fast"); + + std::cerr << std::endl; + return 1; +} + +#else + +int main() +{ + std::cerr << "Benchmarks not run" << std::endl; + return 1; +} + +#endif