diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 742bad21a3346b..1c64bf888f9d27 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -584,6 +584,37 @@ def test_hex_separator_six_bytes(self): self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12') self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12') + def test_hex_simd_boundaries(self): + # Test lengths around the SIMD threshold (16 bytes). + # SIMD processes 16 bytes at a time; smaller inputs use scalar code. + for length in (14, 15, 16, 17, 31, 32, 33, 64, 65): + data = self.type2test(bytes(range(length))) + expected = ''.join(f'{b:02x}' for b in range(length)) + with self.subTest(length=length): + self.assertEqual(data.hex(), expected) + + def test_hex_nibble_boundaries(self): + # Test the nibble value boundary at 9/10 (where '9' becomes 'a'). + # SIMD uses signed comparison for efficiency; verify correctness + # at this boundary for various nibble combinations. + boundary_bytes = self.type2test(bytes([ + 0x09, # both nibbles: 0, 9 + 0x0a, # both nibbles: 0, 10 + 0x90, # both nibbles: 9, 0 + 0x99, # both nibbles: 9, 9 (max all-digit) + 0x9a, # both nibbles: 9, 10 + 0xa0, # both nibbles: 10, 0 + 0xa9, # both nibbles: 10, 9 + 0xaa, # both nibbles: 10, 10 (min all-letter) + 0x00, # min value + 0xff, # max value + ])) + self.assertEqual(boundary_bytes.hex(), '090a90999aa0a9aa00ff') + + # Repeat with 16+ bytes to exercise SIMD path + simd_boundary = self.type2test(boundary_bytes * 2) + self.assertEqual(simd_boundary.hex(), '090a90999aa0a9aa00ff' * 2) + def test_join(self): self.assertEqual(self.type2test(b"").join([]), b"") self.assertEqual(self.type2test(b"").join([b""]), b"") diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-02-22-20-15-00.gh-issue-144015.pystrhex_simd.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-22-20-15-00.gh-issue-144015.pystrhex_simd.rst new file mode 100644 index 00000000000000..122315e031bc87 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-02-22-20-15-00.gh-issue-144015.pystrhex_simd.rst @@ -0,0 +1,5 @@ +Speed up :meth:`bytes.hex`, :meth:`bytearray.hex`, :func:`binascii.hexlify`, +and :mod:`hashlib` ``.hexdigest()`` operations with SIMD on x86-64, ARM64, +and ARM32 with NEON when built with gcc (version 12 or higher) or clang +(version 3 or higher) compilers. Around 1.1-3x faster for common 16-64 byte +inputs such as hashlib hex digests, and up to 8x faster for larger data. diff --git a/Python/pystrhex.c b/Python/pystrhex.c index af2f5c5dce5fca..698e7f26fbaae7 100644 --- a/Python/pystrhex.c +++ b/Python/pystrhex.c @@ -4,6 +4,113 @@ #include "pycore_strhex.h" // _Py_strhex_with_sep() #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() +/* Scalar hexlify: convert len bytes to 2*len hex characters. + Uses table lookup via Py_hexdigits for the conversion. */ +static inline void +_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) +{ + /* Various optimizations like using math instead of a table lookup, + manually unrolling the loop, storing the global table pointer locally, + and doing wider dst writes have been tried and benchmarked; all produced + nearly identical performance on gcc 15. Using a 256 entry uint16_t + table was a bit slower. So we keep our old simple and obvious code. */ + for (Py_ssize_t i = 0; i < len; i++) { + unsigned char c = src[i]; + *dst++ = Py_hexdigits[c >> 4]; + *dst++ = Py_hexdigits[c & 0x0f]; + } +} + +/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions. + Uses __builtin_shufflevector for portable interleave that compiles to + native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always], + NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags + for the target microarch allow it [try -march=native if running 32-bit + on an RPi3 or later]). + + Performance: + - For more common small data it varies between 1.1-3x faster. + - Up to 11x faster on larger data than the scalar code. + + While faster is possible for big data using AVX2 or AVX512, that + adds a ton of complication. Who ever really hexes huge data? + The 16-64 byte boosts align nicely with md5 - sha512 hexdigests. +*/ +#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR + +/* 128-bit vector of 16 unsigned bytes */ +typedef unsigned char v16u8 __attribute__((vector_size(16))); +/* 128-bit vector of 16 signed bytes - for efficient comparison. + Using signed comparison generates pcmpgtb on x86-64 instead of + the slower psubusb+pcmpeqb sequence from unsigned comparison. + ARM NEON performs the same either way. */ +typedef signed char v16s8 __attribute__((vector_size(16))); + +/* Splat a byte value across all 16 lanes */ +static inline v16u8 +v16u8_splat(unsigned char x) +{ + return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}; +} + +static inline v16s8 +v16s8_splat(signed char x) +{ + return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}; +} + +/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration. + Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */ +static void +_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) +{ + const v16u8 mask_0f = v16u8_splat(0x0f); + const v16u8 ascii_0 = v16u8_splat('0'); + const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */ + const v16s8 nine = v16s8_splat(9); + + Py_ssize_t i = 0; + + /* Process 16 bytes at a time */ + for (; i + 16 <= len; i += 16, dst += 32) { + /* Load 16 bytes (memcpy for safe unaligned access) */ + v16u8 data; + memcpy(&data, src + i, 16); + + /* Extract high and low nibbles using vector operators */ + v16u8 hi = (data >> 4) & mask_0f; + v16u8 lo = data & mask_0f; + + /* Compare > 9 using signed comparison for efficient codegen. + Nibble values 0-15 are safely in signed byte range. + This generates pcmpgtb on x86-64, avoiding the slower + psubusb+pcmpeqb sequence from unsigned comparison. */ + v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine); + v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine); + + /* Convert nibbles to hex ASCII */ + hi = hi + ascii_0 + (hi_gt9 & offset); + lo = lo + ascii_0 + (lo_gt9 & offset); + + /* Interleave hi/lo nibbles using portable shufflevector. + This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64, + or vzip on ARM32. */ + v16u8 result0 = __builtin_shufflevector(hi, lo, + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + v16u8 result1 = __builtin_shufflevector(hi, lo, + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + /* Store 32 hex characters */ + memcpy(dst, &result0, 16); + memcpy(dst + 16, &result1, 16); + } + + /* Scalar fallback for remaining 0-15 bytes */ + _Py_hexlify_scalar(src + i, dst, len - i); +} + +#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */ + static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, PyObject* sep, int bytes_per_sep_group, const int return_bytes) @@ -82,13 +189,15 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen, unsigned char c; if (bytes_per_sep_group == 0) { - for (i = j = 0; i < arglen; ++i) { - assert((j + 1) < resultlen); - c = argbuf[i]; - retbuf[j++] = Py_hexdigits[c >> 4]; - retbuf[j++] = Py_hexdigits[c & 0x0f]; +#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR + if (arglen >= 16) { + _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen); + } + else +#endif + { + _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen); } - assert(j == resultlen); } else { /* The number of complete chunk+sep periods */ diff --git a/configure b/configure index 73a758384553b2..98b4af86858673 100755 --- a/configure +++ b/configure @@ -18889,6 +18889,72 @@ then : printf "%s\n" "#define HAVE_BUILTIN_ATOMIC 1" >>confdefs.h +fi + +# Check for __builtin_shufflevector with 128-bit vector support on an +# architecture where it compiles to worthwhile native SIMD instructions. +# Used for SIMD-accelerated bytes.hex() in Python/pystrhex.c. +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for __builtin_shufflevector" >&5 +printf %s "checking for __builtin_shufflevector... " >&6; } +if test ${ac_cv_efficient_builtin_shufflevector+y} +then : + printf %s "(cached) " >&6 +else case e in #( + e) +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + + /* __builtin_shufflevector is available on many platforms, but 128-bit + vector code is only worthwhile on architectures with native SIMD: + x86-64 (SSE2, always available), ARM64 (NEON, always available), + or ARM32 when NEON is enabled via compiler flags (e.g. -march=native + on RPi3+). On ARM32 without NEON (e.g. armv6 builds), the compiler + has the builtin but generates slow scalar code instead. */ + #if !defined(__x86_64__) && !defined(__aarch64__) && \ + !(defined(__arm__) && defined(__ARM_NEON)) + # error "128-bit vector SIMD not worthwhile on this architecture" + #endif + typedef unsigned char v16u8 __attribute__((vector_size(16))); + +int +main (void) +{ + + v16u8 a = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + v16u8 b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + v16u8 c = __builtin_shufflevector(a, b, + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + (void)c; + return 0; + + ; + return 0; +} + +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + ac_cv_efficient_builtin_shufflevector=yes +else case e in #( + e) ac_cv_efficient_builtin_shufflevector=no ;; +esac +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext + ;; +esac +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_efficient_builtin_shufflevector" >&5 +printf "%s\n" "$ac_cv_efficient_builtin_shufflevector" >&6; } + +if test "x$ac_cv_efficient_builtin_shufflevector" = xyes +then : + + +printf "%s\n" "#define HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR 1" >>confdefs.h + + fi # --with-mimalloc diff --git a/configure.ac b/configure.ac index 2ba63b2a8a05e0..34318769fcc29f 100644 --- a/configure.ac +++ b/configure.ac @@ -5017,6 +5017,41 @@ AS_VAR_IF([ac_cv_builtin_atomic], [yes], [ AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Has builtin __atomic_load_n() and __atomic_store_n() functions]) ]) +# Check for __builtin_shufflevector with 128-bit vector support on an +# architecture where it compiles to worthwhile native SIMD instructions. +# Used for SIMD-accelerated bytes.hex() in Python/pystrhex.c. +AC_CACHE_CHECK([for __builtin_shufflevector], [ac_cv_efficient_builtin_shufflevector], [ +AC_LINK_IFELSE([ + AC_LANG_PROGRAM([[ + /* __builtin_shufflevector is available on many platforms, but 128-bit + vector code is only worthwhile on architectures with native SIMD: + x86-64 (SSE2, always available), ARM64 (NEON, always available), + or ARM32 when NEON is enabled via compiler flags (e.g. -march=native + on RPi3+). On ARM32 without NEON (e.g. armv6 builds), the compiler + has the builtin but generates slow scalar code instead. */ + #if !defined(__x86_64__) && !defined(__aarch64__) && \ + !(defined(__arm__) && defined(__ARM_NEON)) + # error "128-bit vector SIMD not worthwhile on this architecture" + #endif + typedef unsigned char v16u8 __attribute__((vector_size(16))); + ]], [[ + v16u8 a = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + v16u8 b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}; + v16u8 c = __builtin_shufflevector(a, b, + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + (void)c; + return 0; + ]]) +],[ac_cv_efficient_builtin_shufflevector=yes],[ac_cv_efficient_builtin_shufflevector=no]) +]) + +AS_VAR_IF([ac_cv_efficient_builtin_shufflevector], [yes], [ + AC_DEFINE([HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR], [1], + [Define if compiler supports __builtin_shufflevector with 128-bit + vectors AND the target architecture has native SIMD (not just API + availability)]) +]) + # --with-mimalloc AC_MSG_CHECKING([for --with-mimalloc]) AC_ARG_WITH([mimalloc], diff --git a/pyconfig.h.in b/pyconfig.h.in index e2009b2d9ee57e..fbd5d4d625908e 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -324,6 +324,10 @@ /* Define to 1 if you have the header file. */ #undef HAVE_EDITLINE_READLINE_H +/* Define if compiler supports __builtin_shufflevector with 128-bit vectors + AND the target architecture has native SIMD (not just API availability) */ +#undef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR + /* Define to 1 if you have the header file. */ #undef HAVE_ENDIAN_H