diff --git a/.drone.jsonnet b/.drone.jsonnet
index d1bd4d0ed..8d0326729 100644
--- a/.drone.jsonnet
+++ b/.drone.jsonnet
@@ -435,8 +435,14 @@ local windows_pipeline(name, image, environment, arch = "amd64") =
     ),
 
     windows_pipeline(
-        "Windows VS2026 msvc-14.5",
+        "Windows VS2026 msvc-14.5 64-bit",
         "cppalliance/dronevs2026:1",
-        { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '32,64' },
+        { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '64' },
     ),
+
+    windows_pipeline(
+            "Windows VS2026 msvc-14.5 32-bit",
+            "cppalliance/dronevs2026:1",
+            { TOOLSET: 'msvc-14.5', CXXSTD: '14,17,20,latest', ADDRMD: '32' },
+        ),
 ]
diff --git a/include/boost/decimal/decimal128_t.hpp b/include/boost/decimal/decimal128_t.hpp
index 77b8a0569..d598ad945 100644
--- a/include/boost/decimal/decimal128_t.hpp
+++ b/include/boost/decimal/decimal128_t.hpp
@@ -598,6 +598,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(const int128::uint128_t rhs) noexcep
     return result;
 }
 
+namespace detail {
+
+// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal128_t,
+// skipping the constructor's bounds check + dead-branch handling. For d128
+// the precision (34 digits, 113 bits) always fits in not_11_significand_mask,
+// so no combination-field branch is needed. Caller guarantees
+// coeff <= d128_max_significand_value and (exp + bias) is in [0, d128_max_biased_exponent].
+template <typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d128(int128::uint128_t coeff, T2 exp, bool sign) noexcept -> decimal128_t
+{
+    const auto biased_exp {static_cast<std::uint64_t>(static_cast<int>(exp) + bias_v<decimal128_t>)};
+    int128::uint128_t bits {coeff & d128_not_11_significand_mask};
+    bits.high |= (sign ? d128_sign_mask : UINT64_C(0));
+    bits.high |= (biased_exp << d128_not_11_exp_high_word_shift) & d128_not_11_exp_mask;
+    return from_bits(bits);
+}
+
+// Definition of the pack_in_range<decimal128_t> overload declared in
+// add_impl.hpp. Lives here so the `decimal128_t{coeff, exp, sign}` fallback
+// is parsed only after decimal128_t is complete (see add_impl.hpp rationale).
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal128_t>::value, decimal128_t>
+{
+    const auto biased_exp_check {static_cast<int>(exp) + bias_v<decimal128_t>};
+    if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0
+        && biased_exp_check <= static_cast<int>(max_biased_exp_v<decimal128_t>)))
+    {
+        return direct_pack_d128(static_cast<int128::uint128_t>(coeff), exp, sign);
+    }
+    return decimal128_t{coeff, exp, sign};
+}
+
+} // namespace detail
+
 BOOST_DECIMAL_CUDA_CONSTEXPR auto decimal128_t::unbiased_exponent() const noexcept -> exponent_type
 {
     exponent_type expval {};
@@ -819,9 +854,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const
         }
         else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision_v<decimal128_t>)
         {
+            // Same overflow-fold pattern as d32/d64: post-shift coeff is <= max_significand_v
+            // and biased_exp lands in [0, max], so pack_in_range routes to direct_pack.
             exp -= digit_delta;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(digit_delta));
-            *this = decimal128_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal128_t>(reduced_coeff, exp, is_negative);
         }
         else if (coeff_digits + biased_exp <= detail::precision_v<decimal128_t>)
         {
@@ -856,10 +893,12 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal128_t::decimal128_t(T1 coeff, T2 exp, const
         }
         else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision_v<decimal128_t>)
         {
+            // Expand to use the full precision; biased_exp ends up in [0, max] and
+            // coeff <= max_significand_v. pack_in_range routes to direct_pack.
             const auto offset {detail::precision_v<decimal128_t> - coeff_digits};
             exp -= offset;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(offset));
-            *this = decimal128_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal128_t>(reduced_coeff, exp, is_negative);
         }
         else
         {
@@ -1337,6 +1376,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<(const decimal128_t& lhs, const decim
     }
     #endif
 
+    if (BOOST_DECIMAL_UNLIKELY(lhs.bits_ == rhs.bits_))
+    {
+        return false;
+    }
+
     return less_parts_impl<decimal128_t>(lhs.full_significand(), lhs.biased_exponent(), lhs.isneg(),
                                        rhs.full_significand(), rhs.biased_exponent(), rhs.isneg());
 }
@@ -1469,16 +1513,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal128_t& lhs, const dec
     {
         return std::partial_ordering::less;
     }
-    else if (lhs > rhs)
+    if (rhs < lhs)
     {
         return std::partial_ordering::greater;
     }
-    else if (lhs == rhs)
+    #ifndef BOOST_DECIMAL_FAST_MATH
+    if (isnan(lhs) || isnan(rhs))
     {
-        return std::partial_ordering::equivalent;
+        return std::partial_ordering::unordered;
     }
-
-    return std::partial_ordering::unordered;
+    #endif
+    return std::partial_ordering::equivalent;
 }
 
 template <typename Integer>
@@ -1682,11 +1727,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal128_t& lhs, const decim
         {
             return from_bits(detail::d128_nan_mask);
         }
-        
+
         return detail::check_non_finite(lhs, rhs);
     }
     #endif
 
+    // Two fast paths (see decimal64_t.hpp:operator+ for full explanation).
+    // Both gated on non-zero operands so zero short-circuit logic is preserved
+    // by falling through to d128_add_impl_new.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 75 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 75)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal128_t, int128::uint128_t>(
+                        lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal128_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
@@ -1741,11 +1820,45 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal128_t& lhs, const decim
         {
             return -rhs;
         }
-        
+
         return detail::check_non_finite(lhs, rhs);
     }
     #endif
 
+    // Two fast paths; see operator+ above. Both gated on non-zero operands so
+    // zero short-circuit logic is preserved by falling through. For operator-,
+    // rhs sign is flipped before kernel dispatch.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 75 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 75)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : -rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal128_t, int128::uint128_t>(
+                        lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), !rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal128_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
@@ -1861,14 +1974,11 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator*(const decimal128_t lhs, const Intege
 
     auto lhs_sig {lhs.full_significand()};
     auto lhs_exp {lhs.biased_exponent()};
-    const auto lhs_zeros {detail::remove_trailing_zeros(lhs_sig)};
-    lhs_sig = lhs_zeros.trimmed_number;
-    lhs_exp += static_cast<std::int32_t>(lhs_zeros.number_of_removed_zeros);
+    detail::expand_significand<decimal128_t>(lhs_sig, lhs_exp);
 
     auto rhs_sig {static_cast<int128::uint128_t>(detail::make_positive_unsigned(rhs))};
-    const auto rhs_zeros {detail::remove_trailing_zeros(rhs_sig)};
-    rhs_sig = rhs_zeros.trimmed_number;
-    const auto rhs_exp = static_cast<exp_type>(rhs_zeros.number_of_removed_zeros);
+    exp_type rhs_exp {0};
+    detail::normalize<decimal128_t>(rhs_sig, rhs_exp);
 
     return detail::d128_mul_impl<decimal128_t>(
             lhs_sig, lhs_exp, lhs.isneg(),
diff --git a/include/boost/decimal/decimal32_t.hpp b/include/boost/decimal/decimal32_t.hpp
index 9f8ab8c40..18dedecb1 100644
--- a/include/boost/decimal/decimal32_t.hpp
+++ b/include/boost/decimal/decimal32_t.hpp
@@ -750,9 +750,14 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal32_t::decimal32_t(T1 coeff, T2 exp, const de
         }
         else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision)
         {
+            // After the shift, coeff has coeff_digits+digit_delta <= precision digits
+            // (so coeff <= max_significand_v) and biased_exp lands in [0, max] because
+            // we just subtracted exactly the overflow delta from exp. pack_in_range
+            // hits direct_pack on this in-range case; the fallback constructor is a
+            // safety belt if the analysis is wrong (same as the original recursion).
             exp -= digit_delta;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(digit_delta));
-            *this = decimal32_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal32_t>(reduced_coeff, exp, is_negative);
         }
         else if (coeff_digits + biased_exp <= detail::precision)
         {
@@ -786,11 +791,14 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal32_t::decimal32_t(T1 coeff, T2 exp, const de
         }
         else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision)
         {
-            // We can expand the coefficient to use the maximum number of digits
+            // We can expand the coefficient to use the maximum number of digits.
+            // After expansion, coeff has precision digits (<= max_significand_v) and
+            // biased_exp lands in [0, max] (we shift toward the valid range). Same
+            // pack_in_range pattern as the digit_delta > 0 branch above.
             const auto offset {detail::precision - coeff_digits};
             exp -= offset;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(offset));
-            *this = decimal32_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal32_t>(reduced_coeff, exp, is_negative);
         }
         else
         {
@@ -828,6 +836,55 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(const std::uint32_t bits) noexcept -
 
 namespace detail {
 
+// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal32_t,
+// skipping the constructor's bounds check + dead-branch handling. Caller
+// guarantees coeff <= d32_max_significand_value and (exp + bias) is in
+// [0, d32_max_biased_exponent].
+template <typename T1, typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d32(T1 coeff, T2 exp, bool sign) noexcept -> decimal32_t
+{
+    const auto reduced_coeff {static_cast<std::uint32_t>(coeff)};
+    const auto biased_exp {static_cast<std::uint32_t>(static_cast<int>(exp) + bias_v<decimal32_t>)};
+    std::uint32_t bits {sign ? d32_sign_mask : UINT32_C(0)};
+
+    // The non-combination encoding holds coefficients up to d32_biggest_no_combination_significand,
+    // which covers ~95%+ of post-arithmetic coefficients. The combination-field branch is cold.
+    if (BOOST_DECIMAL_LIKELY(reduced_coeff <= d32_biggest_no_combination_significand))
+    {
+        bits |= (reduced_coeff & d32_not_11_significand_mask);
+        bits |= (biased_exp << d32_not_11_exp_shift) & d32_not_11_exp_mask;
+    }
+    else
+    {
+        bits |= (d32_comb_11_mask | (reduced_coeff & d32_11_significand_mask));
+        bits |= (biased_exp << d32_11_exp_shift) & d32_11_exp_mask;
+    }
+    return from_bits(bits);
+}
+
+// Definition of the pack_in_range<decimal32_t> overload declared in
+// add_impl.hpp. Lives here because the body's `decimal32_t{coeff, exp, sign}`
+// fallback constructor call requires decimal32_t to be a complete type --
+// both C++14 (regular if) and nvcc (eager template body parse for device
+// codegen) would otherwise reject the constructor call when the function
+// was defined at the add_impl.hpp include site (where decimal32_t is still
+// forward-declared via fwd.hpp).
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal32_t>::value, decimal32_t>
+{
+    const auto biased_exp_check {static_cast<int>(exp) + bias_v<decimal32_t>};
+    if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0
+        && biased_exp_check <= static_cast<int>(max_biased_exp_v<decimal32_t>)))
+    {
+        return direct_pack_d32(static_cast<std::uint32_t>(coeff), exp, sign);
+    }
+    return decimal32_t{coeff, exp, sign};
+}
+
+} // namespace detail (close to make the existing detail namespace block work)
+namespace detail {
+
 template <bool>
 class numeric_limits_impl32
 {
@@ -1025,6 +1082,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal32_t lhs, const decimal
     }
     #endif
 
+    // Two fast paths (see decimal64_t.hpp:operator+ for full explanation).
+    // Both gated on non-zero operands so zero short-circuit logic is preserved
+    // by falling through to add_impl.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 17 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 17)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal32_t, std::uint64_t>(
+                        static_cast<std::uint64_t>(lhs_sig), static_cast<std::uint64_t>(rhs_sig),
+                        lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal32_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
@@ -1130,6 +1222,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal32_t lhs, const decimal
     }
     #endif
 
+    // Two fast paths; see operator+ above. Both gated on non-zero operands so
+    // zero short-circuit logic is preserved by falling through. For operator-,
+    // rhs sign is flipped before kernel dispatch.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 17 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 17)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : -rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal32_t, std::uint64_t>(
+                        static_cast<std::uint64_t>(lhs_sig), static_cast<std::uint64_t>(rhs_sig),
+                        lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), !rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal32_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
@@ -1485,16 +1612,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal32_t lhs, const decim
     {
         return std::partial_ordering::less;
     }
-    else if (lhs > rhs)
+    if (rhs < lhs)
     {
         return std::partial_ordering::greater;
     }
-    else if (lhs == rhs)
+    #ifndef BOOST_DECIMAL_FAST_MATH
+    if (isnan(lhs) || isnan(rhs))
     {
-        return std::partial_ordering::equivalent;
+        return std::partial_ordering::unordered;
     }
-
-    return std::partial_ordering::unordered;
+    #endif
+    return std::partial_ordering::equivalent;
 }
 
 template <typename Integer>
diff --git a/include/boost/decimal/decimal64_t.hpp b/include/boost/decimal/decimal64_t.hpp
index 3e13fa369..51f2d0ca8 100644
--- a/include/boost/decimal/decimal64_t.hpp
+++ b/include/boost/decimal/decimal64_t.hpp
@@ -603,6 +603,52 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto from_bits(std::uint64_t bits) noexcept -> deci
     return result;
 }
 
+namespace detail {
+
+// IEEE-pack a known-in-range (coeff, exp, sign) triple into a decimal64_t,
+// skipping the constructor's bounds check + dead-branch handling. Caller
+// guarantees coeff <= d64_max_significand_value and the (exp + bias) value
+// fits in [0, d64_max_biased_exponent]. Saves ~10 cycles per fast-path call.
+template <typename T1, typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d64(T1 coeff, T2 exp, bool sign) noexcept -> decimal64_t
+{
+    const auto reduced_coeff {static_cast<std::uint64_t>(coeff)};
+    const auto biased_exp {static_cast<std::uint64_t>(static_cast<int>(exp) + bias_v<decimal64_t>)};
+    std::uint64_t bits {sign ? d64_sign_mask : UINT64_C(0)};
+
+    // The non-combination encoding holds coefficients up to d64_biggest_no_combination_significand,
+    // which covers ~95%+ of post-arithmetic coefficients. The combination-field branch is cold.
+    if (BOOST_DECIMAL_LIKELY(reduced_coeff <= d64_biggest_no_combination_significand))
+    {
+        bits |= (reduced_coeff & d64_not_11_significand_mask);
+        bits |= (biased_exp << d64_not_11_exp_shift) & d64_not_11_exp_mask;
+    }
+    else
+    {
+        bits |= (d64_combination_field_mask | (reduced_coeff & d64_11_significand_mask));
+        bits |= (biased_exp << d64_11_exp_shift) & d64_11_exp_mask;
+    }
+    return from_bits(bits);
+}
+
+// Definition of the pack_in_range<decimal64_t> overload declared in
+// add_impl.hpp. Lives here so the `decimal64_t{coeff, exp, sign}` fallback
+// is parsed only after decimal64_t is complete (see add_impl.hpp rationale).
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal64_t>::value, decimal64_t>
+{
+    const auto biased_exp_check {static_cast<int>(exp) + bias_v<decimal64_t>};
+    if (BOOST_DECIMAL_LIKELY(biased_exp_check >= 0
+        && biased_exp_check <= static_cast<int>(max_biased_exp_v<decimal64_t>)))
+    {
+        return direct_pack_d64(static_cast<std::uint64_t>(coeff), exp, sign);
+    }
+    return decimal64_t{coeff, exp, sign};
+}
+
+} // namespace detail
+
 constexpr auto to_bits(decimal64_t rhs) noexcept -> std::uint64_t
 {
     return rhs.bits_;
@@ -745,9 +791,12 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal64_t::decimal64_t(T1 coeff, T2 exp, const de
         }
         else if (digit_delta > 0 && coeff_digits + digit_delta <= detail::precision_v<decimal64_t>)
         {
+            // Coeff stays in range (<= max_significand_v) by the branch's digit budget,
+            // and biased_exp lands in [0, max] by construction. pack_in_range hits
+            // direct_pack on the in-range case; fallback constructor is a safety belt.
             exp -= digit_delta;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(digit_delta));
-            *this = decimal64_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal64_t>(reduced_coeff, exp, is_negative);
         }
         else if (coeff_digits + biased_exp <= detail::precision_v<decimal64_t>)
         {
@@ -781,20 +830,26 @@ BOOST_DECIMAL_CUDA_CONSTEXPR decimal64_t::decimal64_t(T1 coeff, T2 exp, const de
         }
         else if (digit_delta < 0 && coeff_digits - digit_delta <= detail::precision_v<decimal64_t>)
         {
+            // Expand to use the full precision; biased_exp ends up in [0, max] and
+            // coeff <= max_significand_v. pack_in_range routes to direct_pack.
             const auto offset {detail::precision_v<decimal64_t> - coeff_digits};
             exp -= offset;
             reduced_coeff *= detail::pow10(static_cast<significand_type>(offset));
-            *this = decimal64_t(reduced_coeff, exp, is_negative);
+            *this = detail::pack_in_range<decimal64_t>(reduced_coeff, exp, is_negative);
         }
         else if (biased_exp > detail::max_biased_exp_v<decimal64_t>)
         {
-            // Similar to subnormals, but for extremely large values
+            // Similar to subnormals, but for extremely large values: fold the
+            // overflow into the coefficient via trailing zeros when there's room.
             const auto available_space {detail::precision_v<decimal64_t> - coeff_digits};
             if (available_space >= exp_delta)
             {
+                // available_space >= exp_delta means after subtracting exp_delta from
+                // biased_exp it lands in [0, max], and coeff has coeff_digits + available_space
+                // <= precision digits, so <= max_significand_v. pack_in_range applies.
                 reduced_coeff *= detail::pow10(static_cast<significand_type>(available_space));
                 exp -= available_space;
-                *this = decimal64_t(reduced_coeff, exp, is_negative);
+                *this = detail::pack_in_range<decimal64_t>(reduced_coeff, exp, is_negative);
             }
             else
             {
@@ -1472,16 +1527,17 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator<=>(const decimal64_t lhs, const decim
     {
         return std::partial_ordering::less;
     }
-    else if (lhs > rhs)
+    if (rhs < lhs)
     {
         return std::partial_ordering::greater;
     }
-    else if (lhs == rhs)
+    #ifndef BOOST_DECIMAL_FAST_MATH
+    if (isnan(lhs) || isnan(rhs))
     {
-        return std::partial_ordering::equivalent;
+        return std::partial_ordering::unordered;
     }
-
-    return std::partial_ordering::unordered;
+    #endif
+    return std::partial_ordering::equivalent;
 }
 
 template <typename Integer>
@@ -1662,6 +1718,47 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator+(const decimal64_t lhs, const decimal
     }
     #endif
 
+    // Two fast paths under default rounding, gated by user-facing exp_diff:
+    //   1. exp_diff > 36: slow path would return dominant operand unchanged; skip
+    //      to_components/expand_significand and return it directly.
+    //   2. exp_diff <= 3: aligned_add_kernel can do the whole add in uint64 (max_sig
+    //      16 digits * 10^3 = 19 digits < 2^64); skip to_components/expand_significand
+    //      and the add_impl dispatch entirely.
+    // The 4..36 band falls through to the existing slow path. Both fast paths
+    // require both operands to be non-zero: zero short-circuit logic
+    // (preferred-quantum result exponent, sign-of-opposite-sign-zeros) lives in
+    // add_impl and is not duplicated here.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 36 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 36)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal64_t, std::uint64_t>(
+                        lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal64_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
@@ -1725,6 +1822,41 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto operator-(const decimal64_t lhs, const decimal
     }
     #endif
 
+    // Two fast paths (see operator+ above). Both gated on non-zero operands so
+    // zero short-circuit logic (preferred-quantum, sign-of-zero) is preserved
+    // by falling through to add_impl. For operator-, the rhs sign is flipped
+    // before dispatching to the kernel (subtraction = add with negated rhs).
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 36 || exp_diff <= 3)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    if (exp_diff > 36)
+                    {
+                        return lhs_exp > rhs_exp ? lhs : -rhs;
+                    }
+                    return detail::aligned_add_kernel<decimal64_t, std::uint64_t>(
+                        lhs_sig, rhs_sig, lhs_exp, rhs_exp, static_cast<unsigned>(exp_diff),
+                        lhs.isneg(), !rhs.isneg());
+                }
+            }
+        }
+    }
+
     auto lhs_components {lhs.to_components()};
     detail::expand_significand<decimal64_t>(lhs_components.sig, lhs_components.exp);
     auto rhs_components {rhs.to_components()};
diff --git a/include/boost/decimal/decimal_fast128_t.hpp b/include/boost/decimal/decimal_fast128_t.hpp
index 5cfebd1d0..cb5f2b035 100644
--- a/include/boost/decimal/decimal_fast128_t.hpp
+++ b/include/boost/decimal/decimal_fast128_t.hpp
@@ -1074,6 +1074,36 @@ constexpr auto operator+(const decimal_fast128_t& lhs, const decimal_fast128_t&
     }
     #endif
 
+    // Dominant-operand short-circuit: for d128 d128_add_impl_new uses u256
+    // promoted type with max_shift = digits10(u256) - precision - 1
+    // = 77 - 34 - 1 = 42. Above that the slow path returns one operand
+    // unchanged anyway. Fast types maintain canonical significands so no
+    // expansion buffer is needed.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 42)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : rhs;
+                }
+            }
+        }
+    }
+
     return detail::d128_add_impl_new<decimal_fast128_t>(lhs, rhs);
 }
 
@@ -1124,6 +1154,32 @@ constexpr auto operator-(const decimal_fast128_t& lhs, const decimal_fast128_t&
     }
     #endif
 
+    // Dominant-operand short-circuit; see operator+ above.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 42)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : -rhs;
+                }
+            }
+        }
+    }
+
     return detail::d128_add_impl_new<decimal_fast128_t>(lhs, -rhs);
 }
 
@@ -1228,7 +1284,7 @@ constexpr auto operator*(const decimal_fast128_t& lhs, const Integer rhs) noexce
     exp_type rhs_exp {0};
     detail::normalize<decimal_fast128_t>(rhs_sig, rhs_exp);
 
-    return detail::d128_fast_mul_impl<decimal_fast128_t>(
+    return detail::d128_mul_impl<decimal_fast128_t>(
             lhs.significand_, lhs.biased_exponent(), lhs.sign_,
             rhs_sig, rhs_exp, (rhs < 0));
 }
diff --git a/include/boost/decimal/decimal_fast32_t.hpp b/include/boost/decimal/decimal_fast32_t.hpp
index 53167ae38..7493a5790 100644
--- a/include/boost/decimal/decimal_fast32_t.hpp
+++ b/include/boost/decimal/decimal_fast32_t.hpp
@@ -1034,11 +1034,40 @@ constexpr auto operator+(const decimal_fast32_t lhs, const decimal_fast32_t rhs)
         {
             return direct_init(detail::d32_fast_qnan, UINT8_C((0)));
         }
-        
+
         return detail::check_non_finite(lhs, rhs);
     }
     #endif
 
+    // Dominant-operand short-circuit: for d32 add_impl uses uint64 promoted
+    // type with max_shift = digits10(uint64) - precision - 1 = 19 - 7 - 1 = 11.
+    // Above that the slow path returns one operand unchanged anyway. Fast
+    // types maintain canonical significands so no expansion buffer is needed.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 11)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : rhs;
+                }
+            }
+        }
+    }
+
     return detail::add_impl<decimal_fast32_t>(lhs, rhs);
 }
 
@@ -1088,11 +1117,37 @@ constexpr auto operator-(const decimal_fast32_t lhs, decimal_fast32_t rhs) noexc
         {
             return -rhs;
         }
-        
+
         return detail::check_non_finite(lhs, rhs);
     }
     #endif
 
+    // Dominant-operand short-circuit; see operator+ above.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 11)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : -rhs;
+                }
+            }
+        }
+    }
+
     rhs.sign_ = !rhs.sign_;
 
     return detail::add_impl<decimal_fast32_t>(lhs, rhs);
diff --git a/include/boost/decimal/decimal_fast64_t.hpp b/include/boost/decimal/decimal_fast64_t.hpp
index e83e0543b..b282e8c8d 100644
--- a/include/boost/decimal/decimal_fast64_t.hpp
+++ b/include/boost/decimal/decimal_fast64_t.hpp
@@ -1173,6 +1173,36 @@ constexpr auto operator+(const decimal_fast64_t lhs, const decimal_fast64_t rhs)
     }
     #endif
 
+    // Dominant-operand short-circuit: when exp_diff exceeds max_shift in
+    // add_impl (= digits10(uint128) - precision - 1 = 21 for d64) the slow path
+    // returns one operand unchanged anyway. Skip the whole add_impl call. Fast
+    // types maintain canonical significands so the threshold has no
+    // worst-case-expansion buffer (unlike the IEEE 36 threshold).
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 21)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : rhs;
+                }
+            }
+        }
+    }
+
     return detail::add_impl<decimal_fast64_t>(lhs, rhs);
 }
 
@@ -1227,6 +1257,32 @@ constexpr auto operator-(const decimal_fast64_t lhs, decimal_fast64_t rhs) noexc
     }
     #endif
 
+    // Dominant-operand short-circuit; see operator+ above.
+    {
+        const auto lhs_sig {lhs.full_significand()};
+        const auto rhs_sig {rhs.full_significand()};
+        if (BOOST_DECIMAL_LIKELY(lhs_sig != 0U && rhs_sig != 0U))
+        {
+            const auto lhs_exp {lhs.biased_exponent()};
+            const auto rhs_exp {rhs.biased_exponent()};
+            const auto exp_diff {lhs_exp > rhs_exp ? lhs_exp - rhs_exp : rhs_exp - lhs_exp};
+            if (exp_diff > 21)
+            {
+                auto round {_boost_decimal_global_rounding_mode};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    round = _boost_decimal_global_runtime_rounding_mode;
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(round == rounding_mode::fe_dec_to_nearest))
+                {
+                    return lhs_exp > rhs_exp ? lhs : -rhs;
+                }
+            }
+        }
+    }
+
     rhs.sign_ = !rhs.sign_;
 
     return detail::add_impl<decimal_fast64_t>(lhs, rhs);
diff --git a/include/boost/decimal/detail/add_impl.hpp b/include/boost/decimal/detail/add_impl.hpp
index c5ab9f590..7545f1ed7 100644
--- a/include/boost/decimal/detail/add_impl.hpp
+++ b/include/boost/decimal/detail/add_impl.hpp
@@ -17,6 +17,7 @@
 
 #ifndef BOOST_DECIMAL_BUILD_MODULE
 #include <cstdint>
+#include <type_traits>
 #endif
 
 namespace boost {
@@ -28,6 +29,217 @@ namespace detail {
 #  pragma warning(disable : 4127) // Conditional expression is constant
 #endif
 
+// Returns the low 64 bits of an integer for parity testing. Specialized so the
+// templated kernel below can use one expression regardless of SigType width.
+BOOST_DECIMAL_CUDA_CONSTEXPR inline auto low64(std::uint64_t v) noexcept -> std::uint64_t { return v; }
+BOOST_DECIMAL_CUDA_CONSTEXPR inline auto low64(const int128::uint128_t& v) noexcept -> std::uint64_t { return v.low; }
+
+// Forward declarations of the per-type IEEE direct-pack helpers (defined in
+// decimal{32,64,128}_t.hpp after the bit-mask constants). These skip the
+// generic constructor's bounds check + dead-branch handling for known-in-range
+// inputs, saving ~5-10 cycles per call.
+template <typename T1, typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d32(T1 coeff, T2 exp, bool sign) noexcept -> decimal32_t;
+template <typename T1, typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d64(T1 coeff, T2 exp, bool sign) noexcept -> decimal64_t;
+template <typename T2>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto direct_pack_d128(int128::uint128_t coeff, T2 exp, bool sign) noexcept -> decimal128_t;
+
+// Dispatch helper: for IEEE return types use the corresponding direct_pack
+// helper if (exp + bias) is in the valid biased-exponent range; otherwise fall
+// back to the regular constructor which handles overflow-to-infinity and
+// subnormal flushing/re-canonicalization. For fast types always uses the
+// regular constructor.
+//
+// SFINAE-disjoint overloads rather than an `if constexpr` chain because
+// BOOST_DECIMAL_IF_CONSTEXPR falls back to plain `if` in C++14, which would
+// force the d128 branch's `direct_pack_d128` body to be type-checked from
+// add_impl.hpp's include site (where decimal128_t is still forward-declared).
+//
+// The IEEE-type overloads are declared here but DEFINED in their respective
+// decimal*_t.hpp headers, after each class is complete. This avoids parsing
+// `decimal32_t{coeff, exp, sign}` (etc.) constructor calls in an environment
+// where the type is incomplete -- both C++14 (regular if) and nvcc, which
+// eagerly compiles function template bodies for device codegen, would
+// otherwise reject the incomplete-type expressions.
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal32_t>::value, decimal32_t>;
+
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal64_t>::value, decimal64_t>;
+
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<std::is_same<ReturnType, decimal128_t>::value, decimal128_t>;
+
+// Fallback for fast types and components -- always uses the regular constructor.
+// Defined here because its body doesn't reference any specific IEEE decimal type.
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto pack_in_range(SigType coeff, ExpType exp, bool sign) noexcept
+    -> std::enable_if_t<!std::is_same<ReturnType, decimal32_t>::value
+                        && !std::is_same<ReturnType, decimal64_t>::value
+                        && !std::is_same<ReturnType, decimal128_t>::value, ReturnType>
+{
+    return ReturnType{coeff, exp, sign};
+}
+
+// Inline shrink + pack for a sum/diff that exceeds max_significand_v. For 1-3
+// extra digits the bounds are determined by a chained comparison against the
+// precomputed thresholds. For >3 extra digits (post-alignment shifts up to ~21
+// for d64), a single num_digits call computes extra and pow10 looks up the
+// divisor. Either way the divide is one divmod_pow10 + round-half-to-even,
+// avoiding the constructor's coefficient_rounding dispatch.
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto aligned_shrink_and_pack(
+    SigType mag, ExpType result_exp, bool result_sign) noexcept -> ReturnType
+{
+    constexpr SigType ten_to_p {
+        static_cast<SigType>(max_significand_v<ReturnType>) + SigType{1}};
+    constexpr SigType ten_to_p_plus_1 {static_cast<SigType>(ten_to_p * SigType{10U})};
+    constexpr SigType ten_to_p_plus_2 {static_cast<SigType>(ten_to_p_plus_1 * SigType{10U})};
+    constexpr SigType ten_to_p_plus_3 {static_cast<SigType>(ten_to_p_plus_2 * SigType{10U})};
+
+    unsigned extra {1U};
+    SigType pow_extra {SigType{10U}};
+    SigType half {SigType{5U}};
+    if (mag >= ten_to_p_plus_1)
+    {
+        extra = 2U;
+        pow_extra = SigType{100U};
+        half = SigType{50U};
+        if (mag >= ten_to_p_plus_2)
+        {
+            extra = 3U;
+            pow_extra = SigType{1000U};
+            half = SigType{500U};
+            if (BOOST_DECIMAL_UNLIKELY(mag >= ten_to_p_plus_3))
+            {
+                // Tall overflow (more than 3 extra digits): post-alignment
+                // shifts up to ~21 can land here for d64. Compute extra via
+                // num_digits and look up pow10.
+                const auto digits {num_digits(mag)};
+                extra = static_cast<unsigned>(digits - detail::precision_v<ReturnType>);
+                pow_extra = detail::pow10<SigType>(static_cast<SigType>(extra));
+                half = static_cast<SigType>(pow_extra / SigType{2U});
+            }
+        }
+    }
+
+    const auto dr {impl::divmod_pow10_dispatch(mag, static_cast<int>(extra), pow_extra)};
+    auto q {static_cast<SigType>(dr.quotient)};
+    const auto r {static_cast<SigType>(dr.remainder)};
+
+    if (r > half || (r == half && (low64(q) & UINT64_C(1)) != 0U))
+    {
+        ++q;
+        if (BOOST_DECIMAL_UNLIKELY(q == ten_to_p))
+        {
+            q = static_cast<SigType>(ten_to_p / SigType{10U});
+            ++extra;
+        }
+    }
+    return pack_in_range<ReturnType>(q, result_exp + static_cast<ExpType>(extra), result_sign);
+}
+
+// Aligned-add kernel that stays in the narrower integer width (uint64 for d64,
+// uint128 for d128) instead of the kernel's normal promoted width (uint128 or
+// u256). Two precision-p operands shifted by up to `narrow_max_shift` digits
+// still fit in the narrow type. The kernel also handles the overflow shrink
+// inline via Wang/Schulte injection-style rounding (precomputed half-constants,
+// single divmod, round-half-to-even), avoiding the constructor's
+// num_digits + coefficient_rounding dispatch (~50 cycles per overflow case).
+//
+// Caller must guarantee:
+//   - default rounding mode is fe_dec_to_nearest
+//   - shift in [0, 3] where 0 means same-exponent (no multiply)
+//   - both operands non-zero (zero short-circuit happens upstream)
+template <typename ReturnType, typename SigType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto aligned_add_kernel(
+    SigType big_lhs, SigType big_rhs,
+    ExpType lhs_exp, ExpType rhs_exp, unsigned shift,
+    bool lhs_sign, bool rhs_sign) noexcept -> ReturnType
+{
+    SigType a {};
+    SigType b {};
+    ExpType result_exp {};
+    if (shift == 0U)
+    {
+        a = big_lhs;
+        b = big_rhs;
+        result_exp = lhs_exp;
+    }
+    else
+    {
+        const auto pow_shift {detail::pow10<SigType>(static_cast<SigType>(shift))};
+        if (lhs_exp < rhs_exp)
+        {
+            a = big_lhs;
+            b = static_cast<SigType>(big_rhs * pow_shift);
+            result_exp = lhs_exp;
+        }
+        else
+        {
+            a = static_cast<SigType>(big_lhs * pow_shift);
+            b = big_rhs;
+            result_exp = rhs_exp;
+        }
+    }
+
+    // Same-sign: magnitudes add. Sum can overflow by up to `shift` digits
+    // (1 for shift=0 same-exp, up to `shift` for small-diff).
+    if (lhs_sign == rhs_sign)
+    {
+        const SigType sum {static_cast<SigType>(a + b)};
+        constexpr SigType ten_to_p_ss {
+            static_cast<SigType>(max_significand_v<ReturnType>) + SigType{1}};
+        if (sum < ten_to_p_ss)
+        {
+            return pack_in_range<ReturnType>(sum, result_exp, lhs_sign);
+        }
+        return aligned_shrink_and_pack<ReturnType>(sum, result_exp, lhs_sign);
+    }
+
+    // Opposite signs: magnitudes subtract. For shift=0 the result <= max(a, b) <= max_sig
+    // so no overflow check is needed; for shift>0 the result can be up to
+    // max_sig * 10^shift and may need shrink.
+    SigType mag {};
+    bool result_sign {};
+    if (a >= b)
+    {
+        mag = static_cast<SigType>(a - b);
+        result_sign = lhs_sign;
+    }
+    else
+    {
+        mag = static_cast<SigType>(b - a);
+        result_sign = rhs_sign;
+    }
+    if (shift == 0U)
+    {
+        // No overflow possible in same-exp opposite-sign subtraction.
+        return pack_in_range<ReturnType>(mag, result_exp, result_sign);
+    }
+    constexpr SigType ten_to_p_os {
+        static_cast<SigType>(max_significand_v<ReturnType>) + SigType{1}};
+    if (mag < ten_to_p_os)
+    {
+        return pack_in_range<ReturnType>(mag, result_exp, result_sign);
+    }
+    return aligned_shrink_and_pack<ReturnType>(mag, result_exp, result_sign);
+}
+
+// Backwards-compatible aliases for the d128 callers that still use the old name.
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_uint128_aligned_kernel(
+    int128::uint128_t big_lhs, int128::uint128_t big_rhs,
+    ExpType lhs_exp, ExpType rhs_exp, unsigned shift,
+    bool lhs_sign, bool rhs_sign) noexcept -> ReturnType
+{
+    return aligned_add_kernel<ReturnType>(big_lhs, big_rhs, lhs_exp, rhs_exp, shift, lhs_sign, rhs_sign);
+}
+
 template <typename ReturnType, typename T>
 BOOST_DECIMAL_CUDA_CONSTEXPR auto add_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
 {
@@ -64,6 +276,53 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto add_impl(const T& lhs, const T& rhs) noexcept
         return ReturnType{lhs.full_significand(), lhs.biased_exponent(), lhs.isneg()};
     }
 
+    // Phase 3 fast paths for d64 (uint128 promoted) and d32 (uint64 promoted):
+    // dispatch via aligned_add_kernel which handles same-exp + small-diff + the
+    // overflow shrink inline (avoiding the constructor's coefficient_rounding).
+    //
+    // Two width-buckets:
+    //   - shift <= 3: stay in uint64 (max_sig 16 digits * 10^3 = 19 digits < 2^64).
+    //   - shift in [4, max_shift]: stay in uint128 (the existing promoted width,
+    //     but with inline shrink instead of the signed-add-then-construct path).
+    //
+    // Guard: only enter when inputs are at the return type's precision
+    // (operator+/- callers); FMA passes wider promoted-precision components and
+    // must use the slow path.
+    BOOST_DECIMAL_IF_CONSTEXPR (detail::precision_v<ReturnType> <= 16
+        && std::is_same<typename T::significand_type, typename ReturnType::significand_type>::value)
+    {
+        constexpr unsigned u64_small_diff_limit {3U};
+        constexpr auto u128_max_shift {detail::make_positive_unsigned(
+            std::numeric_limits<promoted_sig_type>::digits10 - detail::precision_v<ReturnType> - 1)};
+        const auto exp_diff {lhs_exp - rhs_exp};
+        const auto shift_abs {static_cast<unsigned>(exp_diff < 0 ? -exp_diff : exp_diff)};
+        if (shift_abs <= static_cast<unsigned>(u128_max_shift))
+        {
+            bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest};
+            #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+            if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+            {
+                default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest);
+            }
+            #endif
+            if (BOOST_DECIMAL_LIKELY(default_rounding))
+            {
+                if (shift_abs <= u64_small_diff_limit)
+                {
+                    return aligned_add_kernel<ReturnType, std::uint64_t>(
+                        static_cast<std::uint64_t>(big_lhs),
+                        static_cast<std::uint64_t>(big_rhs),
+                        lhs_exp, rhs_exp, shift_abs,
+                        lhs.isneg(), rhs.isneg());
+                }
+                return aligned_add_kernel<ReturnType, promoted_sig_type>(
+                    big_lhs, big_rhs,
+                    lhs_exp, rhs_exp, shift_abs,
+                    lhs.isneg(), rhs.isneg());
+            }
+        }
+    }
+
     // Align to larger exponent
     if (lhs_exp != rhs_exp)
     {
@@ -255,12 +514,66 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_add_impl_new(const T& lhs, const T& rhs)
         return ReturnType{lhs.full_significand(), lhs.biased_exponent(), lhs.isneg()};
     }
 
+    // Phase 1 fast path guard: only enter when inputs are at the return type's
+    // precision (operator+/- callers); FMA passes promoted components (wider sigs).
+    constexpr bool fast_path_eligible {
+        std::is_same<typename T::significand_type, typename ReturnType::significand_type>::value};
+
+    // Phase 1 same-exp fast path: stays in uint128 (no u256 promotion, no u256 trailing add).
+    // Default rounding only; non-default rounding falls through to the existing u256 path.
+    BOOST_DECIMAL_IF_CONSTEXPR (fast_path_eligible)
+    {
+        if (lhs_exp == rhs_exp)
+        {
+            bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest};
+            #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+            if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+            {
+                default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest);
+            }
+            #endif
+            if (BOOST_DECIMAL_LIKELY(default_rounding))
+            {
+                return d128_uint128_aligned_kernel<ReturnType>(
+                    big_lhs, big_rhs, lhs_exp, rhs_exp, 0U,
+                    lhs.isneg(), rhs.isneg());
+            }
+        }
+    }
+
     // Align to larger exponent
     if (lhs_exp != rhs_exp)
     {
         constexpr auto max_shift {detail::make_positive_unsigned(std::numeric_limits<promoted_sig_type>::digits10 - detail::precision_v<ReturnType> - 1)};
         const auto shift {detail::make_positive_unsigned(lhs_exp - rhs_exp)};
 
+        // Phase 1 small-diff fast path: for shifts in [1, 3], the aligned multiply
+        // still fits in uint128 (precision 34 + shift 3 = 37 <= digits10(uint128) = 38).
+        // Skipping u256 saves ~50 cycles per op vs the u256 path below. Only taken
+        // under default rounding; the u256 slow path covers other modes.
+        // No LIKELY hint: random-exp workloads have shift >> 3, accumulation has
+        // shift <= 3, so neither prediction wins universally.
+        BOOST_DECIMAL_IF_CONSTEXPR (fast_path_eligible)
+        {
+            constexpr auto u128_small_diff_limit {3U};
+            if (shift <= u128_small_diff_limit)
+            {
+                bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest};
+                #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+                if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(lhs))
+                {
+                    default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest);
+                }
+                #endif
+                if (BOOST_DECIMAL_LIKELY(default_rounding))
+                {
+                    return d128_uint128_aligned_kernel<ReturnType>(
+                        big_lhs, big_rhs, lhs_exp, rhs_exp, static_cast<unsigned>(shift),
+                        lhs.isneg(), rhs.isneg());
+                }
+            }
+        }
+
         if (shift > max_shift)
         {
             auto round {_boost_decimal_global_rounding_mode};
diff --git a/include/boost/decimal/detail/comparison.hpp b/include/boost/decimal/detail/comparison.hpp
index 09258b35c..c8b069da3 100644
--- a/include/boost/decimal/detail/comparison.hpp
+++ b/include/boost/decimal/detail/comparison.hpp
@@ -77,7 +77,7 @@ BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto equality_impl(Decim
 
     const auto delta_exp {lhs_exp - rhs_exp};
 
-    if (delta_exp > detail::precision_v<DecimalType> || delta_exp < -detail::precision_v<DecimalType>)
+    if (BOOST_DECIMAL_UNLIKELY(delta_exp > detail::precision_v<DecimalType> || delta_exp < -detail::precision_v<DecimalType>))
     {
         return false;
     }
@@ -206,7 +206,7 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto equal_parts_impl(T1 lhs_sig, U1 lhs_exp, bool
 
     // Check the value of delta exp to avoid to large a value for pow10
     // Also if only one of the significands is 0 then we know the values have to be mismatched
-    if (delta_exp > detail::precision_v<DecimalType> || delta_exp < -detail::precision_v<DecimalType>)
+    if (BOOST_DECIMAL_UNLIKELY(delta_exp > detail::precision_v<DecimalType> || delta_exp < -detail::precision_v<DecimalType>))
     {
         return false;
     }
@@ -462,64 +462,60 @@ BOOST_DECIMAL_FORCE_INLINE constexpr auto fast_type_less_parts_impl(T lhs_sig, U
 }
 
 template <BOOST_DECIMAL_DECIMAL_FLOATING_TYPE DecimalType>
-BOOST_DECIMAL_CUDA_CONSTEXPR auto sequential_less_impl(DecimalType lhs, DecimalType rhs) noexcept -> bool
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto sequential_less_impl(DecimalType lhs, DecimalType rhs) noexcept -> bool
 {
     using comp_type = std::conditional_t<detail::decimal_val_v<DecimalType> < 64, std::uint_fast64_t, int128::uint128_t>;
 
-    // Step 1: Handle our non-finite values in their own calling functions
-
-    // Step 2: Check if they are bitwise equal:
-    /*
+    // Non-finite handling is performed by the callers. With NaN ruled out,
+    // identical bit patterns encode the same finite value, so lhs < rhs is false.
     if (lhs.bits_ == rhs.bits_)
     {
         return false;
     }
-    */
 
-    // Step 3: Decode and compare signs first:
-    const auto lhs_sign {lhs.isneg()};
-    const auto rhs_sign {rhs.isneg()};
+    // One combination-field branch per operand instead of three.
+    const auto lhs_components {lhs.to_components()};
+    const auto rhs_components {rhs.to_components()};
 
-    if (lhs_sign != rhs_sign)
-    {
-        return lhs_sign;
-    }
+    const auto lhs_sign {lhs_components.sign};
+    const auto rhs_sign {rhs_components.sign};
+
+    auto lhs_sig {static_cast<comp_type>(lhs_components.sig)};
+    auto rhs_sig {static_cast<comp_type>(rhs_components.sig)};
 
-    // Step 4: Decode the significand and do a trivial comp
-    auto lhs_sig {static_cast<comp_type>(lhs.full_significand())};
-    auto rhs_sig {static_cast<comp_type>(rhs.full_significand())};
+    // Signed zeros: +0 == -0, neither is less.
     if (lhs_sig == static_cast<comp_type>(0) || rhs_sig == static_cast<comp_type>(0))
     {
-        return (lhs_sig == rhs_sig) ? false : (lhs_sig == static_cast<comp_type>(0) ? !rhs_sign : lhs_sign);
+        if (lhs_sig == rhs_sig)
+        {
+            return false;
+        }
+        return lhs_sig == static_cast<comp_type>(0) ? !rhs_sign : lhs_sign;
     }
 
-    // Step 5: Decode the exponent and see if we can even compare the significands
-    auto lhs_exp {lhs.biased_exponent()};
-    auto rhs_exp {rhs.biased_exponent()};
+    if (lhs_sign != rhs_sign)
+    {
+        return lhs_sign;
+    }
 
+    const auto lhs_exp {lhs_components.exp};
+    const auto rhs_exp {rhs_components.exp};
     const auto delta_exp {lhs_exp - rhs_exp};
+
     constexpr auto max_delta_diff {std::numeric_limits<comp_type>::digits10 - detail::precision_v<DecimalType>};
 
-    if (delta_exp > max_delta_diff || delta_exp < -max_delta_diff)
+    if (BOOST_DECIMAL_UNLIKELY(delta_exp > max_delta_diff || delta_exp < -max_delta_diff))
     {
-        return rhs_sign ? rhs_exp < lhs_exp : rhs_exp > lhs_exp;
+        return lhs_sign ? lhs_exp > rhs_exp : lhs_exp < rhs_exp;
     }
 
-    // Step 6: Approximate normalization if we need to and then get the answer
-    if (delta_exp >= 0)
+    if (delta_exp > 0)
     {
         lhs_sig *= detail::pow10(static_cast<comp_type>(delta_exp));
-        lhs_exp -= delta_exp;
     }
-    else
+    else if (delta_exp < 0)
     {
         rhs_sig *= detail::pow10(static_cast<comp_type>(-delta_exp));
-        rhs_exp += delta_exp;
-    }
-
-    if (lhs_exp != rhs_exp)
-    {
-        return lhs_sign ? lhs_exp > rhs_exp : lhs_exp < rhs_exp;
     }
 
     return lhs_sign ? lhs_sig > rhs_sig : lhs_sig < rhs_sig;
diff --git a/include/boost/decimal/detail/components.hpp b/include/boost/decimal/detail/components.hpp
index 3b6c1e648..24f41c48e 100644
--- a/include/boost/decimal/detail/components.hpp
+++ b/include/boost/decimal/detail/components.hpp
@@ -60,6 +60,11 @@ struct decimal_components
         return sign;
     }
 
+    BOOST_DECIMAL_CUDA_CONSTEXPR auto to_components() const -> decimal_components
+    {
+        return *this;
+    }
+
     template <typename T1, typename T2>
     explicit BOOST_DECIMAL_CUDA_CONSTEXPR operator decimal_components<T1, T2>() const
     {
diff --git a/include/boost/decimal/detail/mul_impl.hpp b/include/boost/decimal/detail/mul_impl.hpp
index bfa5b5e7e..771071548 100644
--- a/include/boost/decimal/detail/mul_impl.hpp
+++ b/include/boost/decimal/detail/mul_impl.hpp
@@ -8,7 +8,8 @@
 #include <boost/decimal/detail/attributes.hpp>
 #include <boost/decimal/detail/apply_sign.hpp>
 #include <boost/decimal/detail/fenv_rounding.hpp>
-#include "int128.hpp"
+#include <boost/decimal/detail/add_impl.hpp>
+#include <boost/decimal/detail/normalize.hpp>
 #include <boost/decimal/detail/u256.hpp>
 #include <boost/decimal/detail/power_tables.hpp>
 #include <boost/decimal/detail/components.hpp>
@@ -22,182 +23,503 @@ namespace boost {
 namespace decimal {
 namespace detail {
 
-// Each type has two different multiplication impls
-// 1) Returns a decimal type and lets the constructor handle with shrinking the significand
-// 2) Returns a struct of the constituent components (used with FMAs)
+namespace impl {
+
+// Wide-product shrink + round-half-to-even + pack helpers used by the IEEE
+// finite multiplication path under default rounding (fe_dec_to_nearest).
+//
+// After multiplying two precision-p significands (both expanded to full
+// precision), the wide product has either 2p-1 or 2p digits. We shrink to
+// exactly p digits by branching on the threshold 10^(2p-1), then dividing by
+// 10^(p-1) or 10^p with a round-half-to-even rule and a carry handler for the
+// rare q == 10^p case. Finally pack_in_range emits the IEEE bit pattern via
+// direct_pack_d* when the biased exponent is in range, falling back to the
+// constructor for overflow/underflow paths.
+//
+// Caller guarantees that both operand significands are non-zero (zero needs
+// the constructor's cohort-canonicalization path), at full precision p, and
+// that the active rounding mode is fe_dec_to_nearest. Non-default rounding
+// modes go through the constructor path which honors fenv_round dispatch.
+
+// uint64 product -> decimal{32,fast32}_t. Product spans [10^12, ~10^14).
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u64(
+    std::uint_fast64_t product, ExpType result_exp, bool result_sign) noexcept -> ReturnType
+{
+    int extra {6};
+    if (product >= UINT64_C(10000000000000)) // 10^13
+    {
+        extra = 7;
+    }
+
+    const auto pow_extra {detail::pow10<std::uint64_t>(static_cast<std::uint64_t>(extra))};
+    auto q {product / pow_extra};
+    const auto r {product - q * pow_extra};
+    const auto half {pow_extra >> 1};
 
-template <typename ReturnType, typename T>
-BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
+    if (r > half || (r == half && (q & UINT64_C(1)) != 0U))
+    {
+        ++q;
+        if (BOOST_DECIMAL_UNLIKELY(q == UINT64_C(10000000))) // 10^7
+        {
+            q = UINT64_C(1000000); // 10^6
+            ++extra;
+        }
+    }
+
+    return detail::pack_in_range<ReturnType>(static_cast<std::uint32_t>(q),
+                                             result_exp + static_cast<ExpType>(extra),
+                                             result_sign);
+}
+
+// uint128 product -> decimal{64,fast64}_t. Product spans [10^30, ~10^32).
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u128(
+    int128::uint128_t product, ExpType result_exp, bool result_sign) noexcept -> ReturnType
 {
-    using mul_type = std::conditional_t<decimal_val_v<T> < 64, std::uint_fast64_t, int128::uint128_t>;
+    constexpr auto threshold {detail::pow10<int128::uint128_t>(int128::uint128_t{UINT64_C(31)})};
 
-    // The constructor needs to calculate the number of digits in the significand which for uint128 is slow
-    // Since we know the value of res_sig is constrained to [1'000'000^2, 9'999'999^2] which equates to
-    // either 13 or 14 decimal digits we can use a single division to make binary search occur with
-    // uint32_t instead. 14 - 5 = 9 or 13 - 5 = 8 which are both still greater than or equal to
-    // digits10 + 1 for rounding which is 8 decimal digits
+    int extra {15};
+    if (product >= threshold)
+    {
+        extra = 16;
+    }
 
-    auto res_sig {(static_cast<mul_type>(lhs.full_significand()) * static_cast<mul_type>(rhs.full_significand()))};
-    auto res_exp {lhs.biased_exponent() + rhs.biased_exponent()};
+    const auto pow_extra {detail::pow10<int128::uint128_t>(int128::uint128_t{static_cast<std::uint64_t>(extra)})};
+    const auto dr {detail::impl::divmod_pow10_dispatch(product, extra, pow_extra)};
+    auto q {dr.quotient};
+    const auto r {dr.remainder};
+    const auto half {pow_extra >> 1};
 
-    return {res_sig, res_exp, lhs.isneg() != rhs.isneg()};
+    if (r > half || (r == half && (q.low & UINT64_C(1)) != 0U))
+    {
+        ++q;
+        constexpr auto ten_p {detail::pow10<int128::uint128_t>(int128::uint128_t{UINT64_C(16)})};
+        if (BOOST_DECIMAL_UNLIKELY(q == ten_p))
+        {
+            q = detail::pow10<int128::uint128_t>(int128::uint128_t{UINT64_C(15)});
+            ++extra;
+        }
+    }
+
+    return detail::pack_in_range<ReturnType>(q.low,
+                                             result_exp + static_cast<ExpType>(extra),
+                                             result_sign);
 }
 
-template <typename ReturnType, typename T, typename U>
-BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
-                                                   T rhs_sig, U rhs_exp, bool rhs_sign) noexcept -> ReturnType
+// u256 product -> decimal{128,fast128}_t. Product spans [10^66, ~10^68).
+template <typename ReturnType, typename ExpType>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_finalize_u256(
+    const u256& product, ExpType result_exp, bool result_sign) noexcept -> ReturnType
 {
-    using mul_type = std::uint_fast64_t;
+    int extra {33};
+    if (product >= detail::pow10(u256{UINT64_C(67)}))
+    {
+        extra = 34;
+    }
+
+    const auto pow_extra {detail::pow10(u256{static_cast<std::uint64_t>(extra)})};
+    const auto dr {detail::impl::divmod_pow10_dispatch(product, extra, pow_extra)};
+    auto q {dr.quotient};
+    const auto r {dr.remainder};
+    const auto half {pow_extra >> 1};
 
-    // The constructor needs to calculate the number of digits in the significand which for uint128 is slow
-    // Since we know the value of res_sig is constrained to [1'000'000^2, 9'999'999^2] which equates to
-    // either 13 or 14 decimal digits we can use a single division to make binary search occur with
-    // uint32_t instead. 14 - 5 = 9 or 13 - 5 = 8 which are both still greater than or equal to
-    // digits10 + 1 for rounding which is 8 decimal digits
+    if (r > half || (r == half && (q.bytes[0] & UINT64_C(1)) != 0U))
+    {
+        ++q;
+        constexpr auto ten_p {detail::pow10(u256{UINT64_C(34)})};
+        if (BOOST_DECIMAL_UNLIKELY(q == ten_p))
+        {
+            q = detail::pow10(u256{UINT64_C(33)});
+            ++extra;
+        }
+    }
 
-    constexpr auto ten_pow_five {pow10(static_cast<mul_type>(5))};
-    auto res_sig {(static_cast<mul_type>(lhs_sig) * static_cast<mul_type>(rhs_sig)) / ten_pow_five};
-    auto res_exp {lhs_exp + rhs_exp + static_cast<U>(5)};
+    const int128::uint128_t q_u128 {q.bytes[1], q.bytes[0]};
+    return detail::pack_in_range<ReturnType>(q_u128,
+                                             result_exp + static_cast<ExpType>(extra),
+                                             result_sign);
+}
 
-    return {static_cast<std::uint32_t>(res_sig), res_exp, lhs_sign != rhs_sign};
+// Default-rounding gate matching add_impl's pattern: check the global rounding
+// mode at compile-time and runtime; the fast path is only safe when the active
+// mode is fe_dec_to_nearest (the round-half-to-even logic in mul_finalize_*).
+// For any other mode, callers fall back to the constructor (which dispatches
+// to fenv_round, honoring the runtime mode correctly).
+template <typename Anchor>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_default_rounding(const Anchor& anchor) noexcept -> bool
+{
+    static_cast<void>(anchor);
+    bool default_rounding {_boost_decimal_global_rounding_mode == rounding_mode::fe_dec_to_nearest};
+    #ifndef BOOST_DECIMAL_NO_CONSTEVAL_DETECTION
+    if (!BOOST_DECIMAL_IS_CONSTANT_EVALUATED(anchor))
+    {
+        default_rounding = (_boost_decimal_global_runtime_rounding_mode == rounding_mode::fe_dec_to_nearest);
+    }
+    #endif
+    return default_rounding;
 }
 
-// In the fast case we are better served doing our 128-bit division here since we are at a know starting point
+} // namespace impl
+
+// Generic multiplication kernel. The body dispatches via SFINAE-disjoint
+// helpers (impl::mul_impl_ieee / impl::mul_impl_components) so each helper's
+// body is type-checked only for the ReturnType it actually serves
+// (BOOST_DECIMAL_IF_CONSTEXPR falls back to plain `if` in C++14 and would
+// otherwise force the FMA-only branch to resolve precision_v /
+// expand_significand / mul_finalize_* against component structs that don't
+// satisfy is_decimal_floating_point_v). The outer mul_impl signature stays
+// `-> ReturnType` so existing friend declarations in decimal_fast*_t headers
+// still match.
+//
+//   (a) ReturnType is one of decimal{32,64,128}_t or decimal_fast{32,64,128}_t.
+//       Expands both operands to full precision (no-op if already canonical),
+//       multiplies, then shrinks to exactly precision digits with
+//       round-half-to-even and packs via pack_in_range (direct_pack fast path
+//       for IEEE types; constructor fallback for fast types).
+//   (b) ReturnType is a *_components struct (FMA consumes the unrounded wider
+//       product before its add step). Returns the raw product packed into the
+//       wider components type without shrinking.
+
+namespace impl {
+
+// Tag-dispatch helpers receive already-extracted components (so they don't
+// need friend access to the fast types' private to_components). The outer
+// mul_impl in the detail namespace below provides that access. Tag dispatch
+// (std::integral_constant) is used instead of `if constexpr` so the wrong
+// dispatcher isn't instantiated for any given ReturnType -- BOOST_DECIMAL_IF_CONSTEXPR
+// falls back to plain `if` in C++14 and would otherwise force both branches'
+// bodies to type-check.
+
+template <typename ReturnType, std::int32_t TVal, typename Components>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl_dispatch(
+    Components lhs_c, Components rhs_c, std::true_type /*is_decimal_floating_point*/) noexcept -> ReturnType
+{
+    using mul_type = std::conditional_t<TVal < 64, std::uint_fast64_t, int128::uint128_t>;
+    const bool sign {lhs_c.sign != rhs_c.sign};
+
+    // d32 IEEE: revert to the simple multiply + constructor handoff. The
+    // hardware uint64 divide the constructor's coefficient_rounding does
+    // is already cheap enough that the expand + pre-shrink + direct_pack
+    // path's overhead doesn't pay off here. d32_fast still benefits and
+    // continues to use the optimized path below.
+    BOOST_DECIMAL_IF_CONSTEXPR (std::is_same<ReturnType, decimal32_t>::value)
+    {
+        const auto res_sig {static_cast<mul_type>(lhs_c.sig) * static_cast<mul_type>(rhs_c.sig)};
+        const auto res_exp {lhs_c.exp + rhs_c.exp};
+        return ReturnType{res_sig, res_exp, sign};
+    }
+    else
+    {
+        // Small-product fast path: when at least one operand is
+        // non-canonical (sig below 10^(p-1)) AND the un-expanded product
+        // fits in p digits, the product itself is the result. Skip
+        // expand+shrink entirely and let pack_in_range route to direct_pack
+        // (or to the constructor for out-of-range biased_exp). Gated on
+        // sig comparisons so the canonical path pays only two compares.
+        using sig_t = typename ReturnType::significand_type;
+        constexpr auto min_normal_sig {detail::pow10<sig_t>(static_cast<sig_t>(detail::precision_v<ReturnType> - 1))};
+        if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig < min_normal_sig || rhs_c.sig < min_normal_sig))
+        {
+            const auto small_prod {static_cast<mul_type>(lhs_c.sig) * static_cast<mul_type>(rhs_c.sig)};
+            constexpr mul_type small_threshold {static_cast<mul_type>(detail::max_significand_v<ReturnType>) + mul_type{1U}};
+            if (small_prod < small_threshold)
+            {
+                return detail::pack_in_range<ReturnType>(static_cast<sig_t>(small_prod),
+                                                         lhs_c.exp + rhs_c.exp, sign);
+            }
+        }
+
+        // Fast types store significands at full precision already; expansion
+        // is a no-op for them so we skip the calls entirely. IEEE types can
+        // hold non-canonical cohorts and need the expansion to bring the
+        // significand up to precision digits before the threshold-based
+        // shrink in mul_finalize_*.
+        BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v<ReturnType>)
+        {
+            detail::expand_significand<ReturnType>(lhs_c.sig, lhs_c.exp);
+            detail::expand_significand<ReturnType>(rhs_c.sig, rhs_c.exp);
+        }
+
+        const auto res_sig {static_cast<mul_type>(lhs_c.sig) * static_cast<mul_type>(rhs_c.sig)};
+        const auto res_exp {lhs_c.exp + rhs_c.exp};
+
+        // Zero canonicalization: a zero significand must go through the
+        // constructor so the IEEE zero-cohort encoding is emitted.
+        if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U || rhs_c.sig == 0U))
+        {
+            return ReturnType{res_sig, res_exp, sign};
+        }
+        if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_c.sig)))
+        {
+            // Non-default rounding mode: constructor's coefficient_rounding
+            // dispatches to fenv_round which honors the runtime mode.
+            return ReturnType{res_sig, res_exp, sign};
+        }
+        BOOST_DECIMAL_IF_CONSTEXPR (TVal < 64)
+        {
+            // The static_cast is identity in this branch (mul_type is uint_fast64_t
+            // when TVal < 64) but is required to make the d64/d128 instantiation
+            // type-check in C++14, where BOOST_DECIMAL_IF_CONSTEXPR is plain `if`
+            // and both branches must be valid even though the wide-width branch
+            // is unreachable at runtime.
+            return impl::mul_finalize_u64<ReturnType>(static_cast<std::uint_fast64_t>(res_sig), res_exp, sign);
+        }
+        else
+        {
+            return impl::mul_finalize_u128<ReturnType>(res_sig, res_exp, sign);
+        }
+    }
+}
+
+template <typename ReturnType, std::int32_t TVal, typename Components>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl_dispatch(
+    Components lhs_c, Components rhs_c, std::false_type /*is_decimal_floating_point*/) noexcept -> ReturnType
+{
+    // FMA components path: return wide product unrounded. ReturnType is a
+    // decimal_components<SigType, BiasedExpType> struct (decimal_val_v defined,
+    // but no precision_v / max_significand_v / direct_pack_*), so the
+    // optimized path's helpers above don't apply.
+    using mul_type = std::conditional_t<TVal < 64, std::uint_fast64_t, int128::uint128_t>;
+
+    const bool sign {lhs_c.sign != rhs_c.sign};
+    const auto res_sig {static_cast<mul_type>(lhs_c.sig) * static_cast<mul_type>(rhs_c.sig)};
+    const auto res_exp {lhs_c.exp + rhs_c.exp};
+
+    return {res_sig, res_exp, sign};
+}
+
+} // namespace impl
+
+// Outer mul_impl: keeps the original `-> ReturnType` signature so friend
+// declarations in decimal_fast*_t headers match. Extracts components via the
+// (friend-accessible) to_components() and tag-dispatches to SFINAE-disjoint
+// impl helpers. Tag dispatch (rather than if-constexpr) ensures only the
+// matching helper body is instantiated for any given ReturnType -- BOOST_DECIMAL_IF_CONSTEXPR
+// falls back to plain `if` in C++14, which would force both branches to
+// type-check.
 template <typename ReturnType, typename T>
-BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
 {
-    using unsigned_int128_type = boost::int128::uint128_t;
+    return impl::mul_impl_dispatch<ReturnType, decimal_val_v<T>>(
+        lhs.to_components(), rhs.to_components(),
+        std::integral_constant<bool, detail::is_decimal_floating_point_v<ReturnType>>{});
+}
+
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable : 4702) // Complains that unlikely branches are unreachable
+#endif
+
+template <typename ReturnType, typename T, typename U>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(T lhs_sig, U lhs_exp, const bool lhs_sign,
+                                                   T rhs_sig, U rhs_exp, const bool rhs_sign) noexcept -> ReturnType
+{
+    // d32 * Integer (sole caller) always passes an IEEE/fast decimal ReturnType.
+    // The body references mul_finalize_u64<ReturnType> and expand_significand,
+    // both valid for is_decimal_floating_point_v<ReturnType>==true types. No
+    // SFINAE constraint needed because no FMA path uses this tuple form
+    // (FMA's mul_impl call passes pre-extracted components to the dec-ref form).
+    using mul_type = std::uint_fast64_t;
 
-    // Once we have the normalized significands and exponents all we have to do is
-    // multiply the significands and add the exponents
-    //
-    // The constructor needs to calculate the number of digits in the significand which for uint128 is slow
-    // Since we know the value of res_sig is constrained to [(10^16)^2, (10^17 - 1)^2] which equates to
-    // either 31 or 32 decimal digits we can use a single division to make binary search occur with
-    // uint_fast64_t instead. 32 - 13 = 19 or 31 - 13 = 18 which are both still greater than
-    // digits10 + 1 for rounding which is 17 decimal digits
+    const bool sign {lhs_sign != rhs_sign};
 
-    constexpr auto ten_pow_13 {pow10(static_cast<unsigned_int128_type>(13))};
-    const auto res_sig {static_cast<std::uint64_t>((static_cast<unsigned_int128_type>(lhs.full_significand()) * static_cast<unsigned_int128_type>(rhs.full_significand())) / ten_pow_13)};
-    const auto res_exp {lhs.biased_exponent() + rhs.biased_exponent() + 13};
+    // d32 IEEE: revert to the simple multiply + constructor handoff (see
+    // generic mul_impl above for rationale).
+    BOOST_DECIMAL_IF_CONSTEXPR (std::is_same<ReturnType, decimal32_t>::value)
+    {
+        const auto res_sig {static_cast<mul_type>(lhs_sig) * static_cast<mul_type>(rhs_sig)};
+        const auto res_exp {lhs_exp + rhs_exp};
+        return ReturnType{res_sig, res_exp, sign};
+    }
+
+    BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v<ReturnType>)
+    {
+        detail::expand_significand<ReturnType>(lhs_sig, lhs_exp);
+        detail::expand_significand<ReturnType>(rhs_sig, rhs_exp);
+    }
 
-    return ReturnType{res_sig, res_exp, lhs.isneg() != rhs.isneg()};
+    const auto res_sig {static_cast<mul_type>(lhs_sig) * static_cast<mul_type>(rhs_sig)};
+    const auto res_exp {lhs_exp + rhs_exp};
+
+    if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
+    if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig)))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
+
+    return impl::mul_finalize_u64<ReturnType>(res_sig, res_exp, sign);
 }
 
-template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T, BOOST_DECIMAL_INTEGRAL U>
-BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
-                                                       T rhs_sig, U rhs_exp, bool rhs_sign) noexcept
-                                                       -> std::enable_if_t<detail::is_decimal_floating_point_v<ReturnType>, ReturnType>
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+// d64 specialization. Used by `decimal_fast64_t * decimal_fast64_t` (which
+// passes decimal types directly without to_components conversion) and by the
+// integer-rhs overloads of `decimal64_t * Integer` and
+// `decimal_fast64_t * Integer`.
+
+template <typename ReturnType, typename T>
+BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(const T& lhs, const T& rhs) noexcept -> ReturnType
 {
     using unsigned_int128_type = boost::int128::uint128_t;
 
-    // Once we have the normalized significands and exponents all we have to do is
-    // multiply the significands and add the exponents
-    //
-    // The constructor needs to calculate the number of digits in the significand which for uint128 is slow
-    // Since we know the value of res_sig is constrained to [(10^16)^2, (10^17 - 1)^2] which equates to
-    // either 31 or 32 decimal digits we can use a single division to make binary search occur with
-    // uint_fast64_t instead. 32 - 13 = 19 or 31 - 13 = 18 which are both still greater than
-    // digits10 + 1 for rounding which is 17 decimal digits
+    auto lhs_c {lhs.to_components()};
+    auto rhs_c {rhs.to_components()};
+    const bool sign {lhs_c.sign != rhs_c.sign};
 
-    constexpr auto ten_pow_13 {pow10(static_cast<unsigned_int128_type>(13))};
-    auto res_sig {(static_cast<unsigned_int128_type>(lhs_sig) * static_cast<unsigned_int128_type>(rhs_sig)) / ten_pow_13};
-    auto res_exp {lhs_exp + rhs_exp + static_cast<U>(13)};
+    BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v<ReturnType>)
+    {
+        detail::expand_significand<ReturnType>(lhs_c.sig, lhs_c.exp);
+        detail::expand_significand<ReturnType>(rhs_c.sig, rhs_c.exp);
+    }
+
+    const auto res_sig {static_cast<unsigned_int128_type>(lhs_c.sig) * static_cast<unsigned_int128_type>(rhs_c.sig)};
+    const auto res_exp {lhs_c.exp + rhs_c.exp};
+
+    if (BOOST_DECIMAL_UNLIKELY(lhs_c.sig == 0U || rhs_c.sig == 0U))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
+    if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs)))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
 
-    return {static_cast<std::uint64_t>(res_sig), res_exp, lhs_sign != rhs_sign};
+    return impl::mul_finalize_u128<ReturnType>(res_sig, res_exp, sign);
 }
 
 template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T, BOOST_DECIMAL_INTEGRAL U>
 BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto d64_mul_impl(T lhs_sig, U lhs_exp, bool lhs_sign,
                                                        T rhs_sig, U rhs_exp, bool rhs_sign) noexcept
-                                                       -> std::enable_if_t<!detail::is_decimal_floating_point_v<ReturnType>, ReturnType>
+                                                       -> std::enable_if_t<detail::is_decimal_floating_point_v<ReturnType>, ReturnType>
 {
     using unsigned_int128_type = boost::int128::uint128_t;
-    constexpr auto comp_value {detail::pow10(static_cast<unsigned_int128_type>(31))};
-
-    #ifdef BOOST_DECIMAL_DEBUG
-    std::cerr << "sig lhs: " << lhs_sig
-              << "\nexp lhs: " << lhs_exp
-              << "\nsig rhs: " << rhs_sig
-              << "\nexp rhs: " << rhs_exp;
-    #endif
 
-    bool sign {lhs_sign != rhs_sign};
-
-    // Once we have the normalized significands and exponents all we have to do is
-    // multiply the significands and add the exponents
-
-    auto res_sig {static_cast<unsigned_int128_type>(lhs_sig) * static_cast<unsigned_int128_type>(rhs_sig)};
-    auto res_exp {static_cast<typename ReturnType::biased_exponent_type>(lhs_exp + rhs_exp)};
+    const bool sign {lhs_sign != rhs_sign};
 
-    const auto sig_dig {res_sig >= comp_value ? 32 : 31};
-    constexpr auto max_dig {std::numeric_limits<typename ReturnType::significand_type>::digits10};
-    res_sig /= detail::pow10(static_cast<unsigned_int128_type>(sig_dig - max_dig));
-    res_exp += sig_dig - max_dig;
+    BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v<ReturnType>)
+    {
+        detail::expand_significand<ReturnType>(lhs_sig, lhs_exp);
+        detail::expand_significand<ReturnType>(rhs_sig, rhs_exp);
+    }
 
-    const auto res_sig_64 {static_cast<typename ReturnType::significand_type>(res_sig)};
+    const auto res_sig {static_cast<unsigned_int128_type>(lhs_sig) * static_cast<unsigned_int128_type>(rhs_sig)};
+    const auto res_exp {lhs_exp + rhs_exp};
 
-    #ifdef BOOST_DECIMAL_DEBUG
-    std::cerr << "\nres sig: " << res_sig_64
-              << "\nres exp: " << res_exp << std::endl;
-    #endif
+    if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
+    if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig)))
+    {
+        return ReturnType{res_sig, res_exp, sign};
+    }
 
-    return {res_sig_64, res_exp, sign};
+    return impl::mul_finalize_u128<ReturnType>(res_sig, res_exp, sign);
 }
 
 template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T1, BOOST_DECIMAL_INTEGRAL U1,
                                BOOST_DECIMAL_INTEGRAL T2, BOOST_DECIMAL_INTEGRAL U2>
 BOOST_DECIMAL_FORCE_INLINE
-BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_mul_impl(const T1& lhs_sig, const U1 lhs_exp, const bool lhs_sign,
-                             const T2& rhs_sig, const U2 rhs_exp, const bool rhs_sign) noexcept -> ReturnType
+BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_mul_impl(const T1& lhs_sig_in, const U1 lhs_exp_in, const bool lhs_sign,
+                             const T2& rhs_sig_in, const U2 rhs_exp_in, const bool rhs_sign) noexcept -> ReturnType
 {
     using sig_type = T1;
+
     static_assert(std::is_same<sig_type, T2>::value, "Should have a common type by this point");
 
     const bool sign {lhs_sign != rhs_sign};
 
-    auto res_sig {detail::umul256(lhs_sig, rhs_sig)};
-    auto res_exp {lhs_exp + rhs_exp};
-    const auto sig_dig {detail::num_digits(res_sig)};
+    // Small-product fast path: when both operands have at most precision/2
+    // digits, the un-expanded product is guaranteed to fit in uint128 (no
+    // umul256 needed) and may even fit in p digits, in which case the
+    // product itself is the result. Gated on both sigs being below the
+    // sqrt(max_sig) threshold so the uint128 multiply cannot overflow.
+    constexpr auto small_sig_threshold {detail::pow10<sig_type>(static_cast<sig_type>(detail::precision_v<ReturnType> / 2))};
+    if (BOOST_DECIMAL_UNLIKELY(lhs_sig_in <= small_sig_threshold && rhs_sig_in <= small_sig_threshold))
+    {
+        const auto small_prod {lhs_sig_in * rhs_sig_in};
+        constexpr auto small_threshold {static_cast<sig_type>(detail::max_significand_v<ReturnType>) + sig_type{1U}};
+        if (small_prod < small_threshold)
+        {
+            const auto small_exp {static_cast<typename ReturnType::biased_exponent_type>(lhs_exp_in + rhs_exp_in)};
+            return detail::pack_in_range<ReturnType>(small_prod, small_exp, sign);
+        }
+    }
 
-    // 34 is the number of digits in the d128 significand
-    // this way we can skip rounding in the constructor a second time
-    const auto digit_delta {sig_dig - std::numeric_limits<sig_type>::digits10};
-    if (BOOST_DECIMAL_LIKELY(digit_delta > 0))
+    // Subnormal/underflow fast-bailout: when the operand exponents already sum
+    // to deep into the subnormal range, the optimized path below wastes a u256
+    // divmod_pow10 (16 multiplies) producing a value the constructor would
+    // flush to zero anyway. The legacy umul256 + num_digits + coefficient_rounding
+    // path has an early-out (shift > digits10 -> coeff=0) that skips the
+    // divmod. Benchmarks show this saves ~7% on the d128 IEEE multiplication
+    // hot loop where ~25% of random inputs fall into this range. Kept for
+    // d128 only; the d64/d32 paths don't see the same benefit because
+    // divmod_pow10_uint128 is much cheaper than divmod_pow10_u256.
+    constexpr int subnormal_guard {-detail::bias_v<ReturnType> + detail::precision_v<ReturnType>};
+    if (BOOST_DECIMAL_UNLIKELY(static_cast<int>(lhs_exp_in) + static_cast<int>(rhs_exp_in) < subnormal_guard))
     {
-        auto biased_exp {res_exp + detail::bias_v<ReturnType>};
-        detail::coefficient_rounding<ReturnType>(res_sig, res_exp, biased_exp, sign, sig_dig);
+        auto res_sig {detail::umul256(lhs_sig_in, rhs_sig_in)};
+        auto res_exp_mut {static_cast<typename ReturnType::biased_exponent_type>(lhs_exp_in + rhs_exp_in)};
+        const auto sig_dig {detail::num_digits(res_sig)};
+        const auto digit_delta {sig_dig - std::numeric_limits<sig_type>::digits10};
+        if (BOOST_DECIMAL_LIKELY(digit_delta > 0))
+        {
+            auto biased_exp {res_exp_mut + detail::bias_v<ReturnType>};
+            detail::coefficient_rounding<ReturnType>(res_sig, res_exp_mut, biased_exp, sign, sig_dig);
+        }
+        // coefficient_rounding leaves res_sig at exactly precision digits (<= max_significand_v),
+        // so pack_in_range can use direct_pack when biased_exp is in [0, max] and skip the
+        // constructor's redundant num_digits + branch tree. Falls back to the constructor
+        // for the (rarer) genuinely-subnormal output where its boundary handling is needed.
+        return detail::pack_in_range<ReturnType>(int128::uint128_t{res_sig[1], res_sig[0]}, res_exp_mut, sign);
     }
 
-    BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U);
-    return {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign};
-}
+    auto lhs_sig {lhs_sig_in};
+    auto lhs_exp {lhs_exp_in};
+    auto rhs_sig {rhs_sig_in};
+    auto rhs_exp {rhs_exp_in};
 
-template <typename ReturnType, BOOST_DECIMAL_INTEGRAL T1, BOOST_DECIMAL_INTEGRAL U1,
-                               BOOST_DECIMAL_INTEGRAL T2, BOOST_DECIMAL_INTEGRAL U2>
-BOOST_DECIMAL_FORCE_INLINE
-BOOST_DECIMAL_CUDA_CONSTEXPR auto d128_fast_mul_impl(const T1& lhs_sig, const U1 lhs_exp, const bool lhs_sign,
-                                  const T2& rhs_sig, const U2 rhs_exp, const bool rhs_sign) noexcept -> ReturnType
-{
-    const bool sign {lhs_sign != rhs_sign};
+    BOOST_DECIMAL_IF_CONSTEXPR (!detail::is_fast_type_v<ReturnType>)
+    {
+        detail::expand_significand<ReturnType>(lhs_sig, lhs_exp);
+        detail::expand_significand<ReturnType>(rhs_sig, rhs_exp);
+    }
 
-    // Once we have the normalized significands and exponents all we have to do is
-    // multiply the significands and add the exponents
-    auto res_sig {detail::umul256(lhs_sig, rhs_sig)};
-    const auto res_exp {lhs_exp + rhs_exp + 30};
+    const auto res_exp {static_cast<typename ReturnType::biased_exponent_type>(lhs_exp + rhs_exp)};
+
+    if (BOOST_DECIMAL_UNLIKELY(lhs_sig == 0U || rhs_sig == 0U))
+    {
+        return ReturnType{int128::uint128_t{0}, res_exp, sign};
+    }
 
-    constexpr auto ten_pow_30 {detail::pow10(static_cast<int128::uint128_t>(30))};
-    res_sig /= ten_pow_30;
+    auto res_sig {detail::umul256(lhs_sig, rhs_sig)};
 
-    BOOST_DECIMAL_ASSERT((res_sig[3] | res_sig[2]) == 0U); // LCOV_EXCL_LINE
-    return {int128::uint128_t{res_sig[1], res_sig[0]}, res_exp, sign};
-}
+    if (BOOST_DECIMAL_UNLIKELY(!impl::mul_default_rounding(lhs_sig)))
+    {
+        // Non-default rounding mode: use coefficient_rounding which dispatches
+        // to fenv_round and honors the runtime mode. Pack via pack_in_range so
+        // the in-range case takes direct_pack (no redundant num_digits in the
+        // constructor); out-of-range biased_exp falls back to the constructor.
+        const auto sig_dig {detail::num_digits(res_sig)};
+        const auto digit_delta {sig_dig - std::numeric_limits<sig_type>::digits10};
+        auto res_exp_mut {res_exp};
+
+        if (BOOST_DECIMAL_LIKELY(digit_delta > 0))
+        {
+            auto biased_exp {res_exp_mut + detail::bias_v<ReturnType>};
+            detail::coefficient_rounding<ReturnType>(res_sig, res_exp_mut, biased_exp, sign, sig_dig);
+        }
+
+        return detail::pack_in_range<ReturnType>(int128::uint128_t{res_sig[1], res_sig[0]}, res_exp_mut, sign);
+    }
 
-template <typename ReturnType>
-BOOST_DECIMAL_FORCE_INLINE BOOST_DECIMAL_CUDA_CONSTEXPR auto mul_impl(const decimal128_t_components& lhs, const decimal128_t_components& rhs) noexcept -> ReturnType
-{
-    return d128_mul_impl<ReturnType>(lhs.sig, lhs.exp, lhs.sign,
-                                     rhs.sig, rhs.exp, rhs.sign);
+    return impl::mul_finalize_u256<ReturnType>(res_sig, res_exp, sign);
 }
 
 } // namespace detail
diff --git a/include/boost/decimal/detail/normalize.hpp b/include/boost/decimal/detail/normalize.hpp
index c9a75c273..b7fafdca5 100644
--- a/include/boost/decimal/detail/normalize.hpp
+++ b/include/boost/decimal/detail/normalize.hpp
@@ -46,13 +46,23 @@ BOOST_DECIMAL_CUDA_CONSTEXPR auto normalize(T1& significand, T2& exp, bool sign
 #endif
 
 // This is a branchless version of the above which is used for implementing basic operations,
-// since we know that the values in the decimal type are never larger than target_precision
+// since we know that the values in the decimal type are never larger than target_precision.
+// Fast path: when significand is already in the normalized range [10^(p-1), 10^p - 1]
+// no expansion is needed and we skip the num_digits + pow10 multiply (~20 cycles saved).
+// This is the dominant case for results of prior arithmetic that were packed at full precision.
 template <typename TargetDecimalType, typename T1, typename T2>
 BOOST_DECIMAL_CUDA_CONSTEXPR auto expand_significand(T1& significand, T2& exp) noexcept -> void
 {
     constexpr auto target_precision {detail::precision_v<TargetDecimalType>};
-    const auto digits {num_digits(significand)};
+    constexpr T1 min_normal_sig {static_cast<T1>(pow10(static_cast<T1>(target_precision - 1)))};
 
+    if (significand >= min_normal_sig)
+    {
+        // Already has exactly target_precision digits; expansion is a no-op.
+        return;
+    }
+
+    const auto digits {num_digits(significand)};
     const auto zeros_needed {target_precision - digits};
     significand *= pow10(static_cast<T1>(zeros_needed));
     exp -= zeros_needed;
diff --git a/include/boost/decimal/detail/u256.hpp b/include/boost/decimal/detail/u256.hpp
index 58fe602cf..aa524af15 100644
--- a/include/boost/decimal/detail/u256.hpp
+++ b/include/boost/decimal/detail/u256.hpp
@@ -900,9 +900,19 @@ BOOST_DECIMAL_CUDA_CONSTEXPR u256 operator*(const UnsignedInteger lhs, const u25
 {
     return impl::default_mul(rhs, lhs);
 }
+BOOST_DECIMAL_CUDA_CONSTEXPR u256 mul128_by_64(const int128::uint128_t& a, const std::uint64_t b) noexcept;
 
 BOOST_DECIMAL_CUDA_CONSTEXPR u256 umul256(const int128::uint128_t& a, const int128::uint128_t& b) noexcept
 {
+    if (BOOST_DECIMAL_UNLIKELY(b.high == 0U))
+    {
+        return mul128_by_64(a, b.low);
+    }
+    if (BOOST_DECIMAL_UNLIKELY(a.high == 0U))
+    {
+        return mul128_by_64(b, a.low);
+    }
+
     u256 result{};
 
     const int128::uint128_t a_low {a.low};
diff --git a/test/Jamfile b/test/Jamfile
index 2bb437ca1..903d91d9f 100644
--- a/test/Jamfile
+++ b/test/Jamfile
@@ -51,6 +51,7 @@ project : requirements
 
 run-fail benchmarks.cpp ;
 run-fail benchmark_uint256.cpp ;
+run-fail bench_accumulation.cpp ;
 
 run compare_dec128_and_fast.cpp ;
 compile-fail concepts_test.cpp ;
diff --git a/test/bench_accumulation.cpp b/test/bench_accumulation.cpp
new file mode 100644
index 000000000..c04f7e8ac
--- /dev/null
+++ b/test/bench_accumulation.cpp
@@ -0,0 +1,158 @@
+// Copyright 2026 Matt Borland
+// Distributed under the Boost Software License, Version 1.0.
+// https://www.boost.org/LICENSE_1_0.txt
+//
+// Benchmark for the small-exponent-difference alignment kernel in add_impl.
+// The companion benchmarks.cpp uses random exponents across the full range,
+// which makes the dominant-operand short-circuit fire in ~94% of operations
+// and leaves the alignment kernel under-exercised. This benchmark generates
+// same-magnitude operands so the kernel actually has to align and add.
+
+#define BOOST_DECIMAL_DETAIL_INT128_ALLOW_SIGN_CONVERSION
+#define BOOST_DECIMAL_DETAIL_INT128_ALLOW_SIGN_COMPARE
+
+#include <boost/decimal.hpp>
+#include <chrono>
+#include <random>
+#include <vector>
+#include <type_traits>
+#include <iostream>
+#include <iomanip>
+
+#if defined(__clang__)
+#  pragma clang diagnostic push
+#  pragma clang diagnostic ignored "-Wold-style-cast"
+#  pragma clang diagnostic ignored "-Wundef"
+#  pragma clang diagnostic ignored "-Wconversion"
+#  pragma clang diagnostic ignored "-Wsign-conversion"
+#  pragma clang diagnostic ignored "-Wfloat-equal"
+#elif defined(__GNUC__)
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wold-style-cast"
+#  pragma GCC diagnostic ignored "-Wundef"
+#  pragma GCC diagnostic ignored "-Wconversion"
+#  pragma GCC diagnostic ignored "-Wsign-conversion"
+#  pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+
+#include <boost/random/uniform_int_distribution.hpp>
+
+#ifdef BOOST_DECIMAL_RUN_BENCHMARKS
+
+using namespace boost::decimal;
+using namespace std::chrono_literals;
+
+#ifdef __clang__
+#  define BOOST_DECIMAL_NO_INLINE __attribute__ ((__noinline__))
+#elif defined(_MSC_VER)
+#  define BOOST_DECIMAL_NO_INLINE __declspec(noinline)
+#elif defined(__GNUC__)
+#  define BOOST_DECIMAL_NO_INLINE __attribute__ ((__noinline__))
+#endif
+
+constexpr unsigned N = 20'000'000U;
+constexpr int K = 5;
+
+// Same-exponent workload: every operand has biased_exponent == fixed_exp.
+// Exercises the same-exp pass-through inside add_impl.
+template <typename T>
+std::vector<T> generate_same_exp_vector(int fixed_exp, unsigned seed = 42U)
+{
+    std::vector<T> v(N);
+    std::mt19937_64 gen(seed);
+    boost::random::uniform_int_distribution<typename T::significand_type>
+        sig_dis(0U, std::numeric_limits<typename T::significand_type>::max() / 1000U);
+    for (std::size_t i = 0; i < v.size(); ++i)
+    {
+        v[i] = T{sig_dis(gen), fixed_exp};
+    }
+    return v;
+}
+
+// Small-diff workload: significands random, exponents drawn from a narrow window
+// so the difference stays inside max_shift but is rarely zero. Forces the
+// alignment-multiply-then-add path inside add_impl.
+template <typename T>
+std::vector<T> generate_small_diff_vector(int base_exp, int half_window, unsigned seed = 42U)
+{
+    std::vector<T> v(N);
+    std::mt19937_64 gen(seed);
+    boost::random::uniform_int_distribution<typename T::significand_type>
+        sig_dis(0U, std::numeric_limits<typename T::significand_type>::max() / 1000U);
+    std::uniform_int_distribution<int> exp_dis(base_exp - half_window, base_exp + half_window);
+    for (std::size_t i = 0; i < v.size(); ++i)
+    {
+        v[i] = T{sig_dis(gen), exp_dis(gen)};
+    }
+    return v;
+}
+
+template <typename T, typename Func>
+BOOST_DECIMAL_NO_INLINE void test_two_element(const std::vector<T>& data, Func op,
+                                              const char* label, const char* type)
+{
+    const auto t1 = std::chrono::steady_clock::now();
+    std::size_t s = 0;
+    for (std::size_t k = 0; k < K; ++k)
+    {
+        for (std::size_t i = 0; i < data.size() - 1U; ++i)
+        {
+            s += static_cast<std::size_t>(op(data[i], data[i + 1]));
+        }
+    }
+    const auto t2 = std::chrono::steady_clock::now();
+    std::cerr << std::left << std::setw(11) << label << "<" << std::setw(13) << type
+              << ">: " << std::setw(10) << (t2 - t1) / 1us << " us (s=" << s << ")\n";
+}
+
+template <typename T>
+void run_same_exp(int fixed_exp, const char* type)
+{
+    const auto v = generate_same_exp_vector<T>(fixed_exp);
+    test_two_element(v, std::plus<>(),  "AddSameExp", type);
+    test_two_element(v, std::minus<>(), "SubSameExp", type);
+}
+
+template <typename T>
+void run_small_diff(int base_exp, int half_window, const char* type)
+{
+    const auto v = generate_small_diff_vector<T>(base_exp, half_window);
+    test_two_element(v, std::plus<>(),  "AddSmallDiff", type);
+    test_two_element(v, std::minus<>(), "SubSmallDiff", type);
+}
+
+int main()
+{
+    std::cerr << "\n===== Same-exponent (alignment kernel pass-through) =====\n";
+    run_same_exp<decimal32_t>(0, "decimal32_t");
+    run_same_exp<decimal64_t>(0, "decimal64_t");
+    run_same_exp<decimal128_t>(0, "decimal128_t");
+    run_same_exp<decimal_fast32_t>(0, "dec32_fast");
+    run_same_exp<decimal_fast64_t>(0, "dec64_fast");
+    run_same_exp<decimal_fast128_t>(0, "dec128_fast");
+
+    // For d64 max_shift = 21; window of 5 keeps diffs in [0, 10] so the small-diff
+    // alignment kernel fires every time. For d128 max_shift = 43 in u256 mode but
+    // 3 in the planned uint128 fast path; window of 2 keeps diffs in [0, 4] so we
+    // exercise both paths (until Phase 1 lands, after which we'd want 0-3).
+    std::cerr << "\n===== Small exp diff (alignment kernel multiply+add) =====\n";
+    run_small_diff<decimal32_t>(0, 5, "decimal32_t");
+    run_small_diff<decimal64_t>(0, 5, "decimal64_t");
+    run_small_diff<decimal128_t>(0, 2, "decimal128_t");
+    run_small_diff<decimal_fast32_t>(0, 5, "dec32_fast");
+    run_small_diff<decimal_fast64_t>(0, 5, "dec64_fast");
+    run_small_diff<decimal_fast128_t>(0, 2, "dec128_fast");
+
+    std::cerr << std::endl;
+    return 1;
+}
+
+#else
+
+int main()
+{
+    std::cerr << "Benchmarks not run" << std::endl;
+    return 1;
+}
+
+#endif