Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
bbb4a8a
pystrhex: Add AVX2 SIMD optimization for hex conversion
gpshead Jan 18, 2026
90da084
pystrhex: Add AVX-512 SIMD optimization for hex conversion
gpshead Jan 18, 2026
bdceb9c
pystrhex: Add ARM NEON SIMD optimization for hex conversion
gpshead Jan 18, 2026
4f3d60d
pystrhex: Add SSE2 SIMD optimization for hex conversion
gpshead Jan 18, 2026
015cc55
pystrhex: Remove AVX2/AVX-512, keep SSE2-only for simplicity
gpshead Jan 18, 2026
ae3d7be
pystrhex: Replace SSE2/NEON with portable SIMD using GCC vector exten…
gpshead Jan 18, 2026
a22e5ce
pystrhex: Add SIMD optimization for large separator groups
gpshead Jan 18, 2026
2fe987c
measure more
gpshead Jan 18, 2026
e643fb8
pystrhex: Lower SIMD separator threshold to 8 bytes
gpshead Jan 18, 2026
92c281d
pystrhex: Use signed comparison for efficient SIMD codegen
gpshead Jan 18, 2026
190936f
pystrhex: Factor out scalar hexlify into shared inline function
gpshead Jan 18, 2026
b2dd34e
remove benchmark data
gpshead Jan 18, 2026
b6feaba
pystrhex: Enable SIMD on 32-bit ARM with NEON
gpshead Jan 18, 2026
0f94c06
explain more in the comments
gpshead Jan 18, 2026
e8650f3
remove the microbenchmark code
gpshead Jan 18, 2026
5fc294c
test_bytes: Add SIMD-specific test cases for bytes.hex()
gpshead Jan 18, 2026
2a378d4
pystrhex: Remove SIMD path for separator groups
gpshead Feb 22, 2026
8110537
pystrhex: Use configure.ac detection for SIMD support
gpshead Feb 22, 2026
597c675
NEWS entry
gpshead Feb 22, 2026
dfc295f
remove unnecessary info from the code comment
gpshead Feb 22, 2026
37d01ca
Rename HAVE_BUILTIN_SHUFFLEVECTOR to HAVE_EFFICIENT_BUILTIN_SHUFFLEVE…
gpshead Feb 22, 2026
325deb6
Merge branch 'main' into opt-pystrhex
gpshead Feb 22, 2026
9b27e02
reword news, mention hashlib hexdigest
gpshead Feb 22, 2026
348a2f8
aww, goodbye cute "brrr..." comment
gpshead Feb 22, 2026
7e67993
rename the autoconf ac_ function to be consistent
gpshead Feb 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions Lib/test/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,37 @@ def test_hex_separator_six_bytes(self):
self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12')
self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12')

def test_hex_simd_boundaries(self):
    # Exercise input sizes straddling the 16-byte SIMD cutoff: the
    # vectorized path consumes 16 bytes per step, so shorter inputs
    # (and the 0-15 byte tail of longer ones) go through scalar code.
    lengths = (14, 15, 16, 17, 31, 32, 33, 64, 65)
    for n in lengths:
        raw = bytes(range(n))
        with self.subTest(length=n):
            want = ''.join('%02x' % byte for byte in raw)
            self.assertEqual(self.type2test(raw).hex(), want)

def test_hex_nibble_boundaries(self):
    # The '9' -> 'a' transition (nibble value 9 vs. 10) is where the
    # SIMD code's signed "> 9" comparison must flip; cover every
    # digit/letter pairing of high and low nibbles plus the extremes.
    values = [
        0x09,  # nibbles 0, 9
        0x0a,  # nibbles 0, 10
        0x90,  # nibbles 9, 0
        0x99,  # nibbles 9, 9 (largest all-digit byte)
        0x9a,  # nibbles 9, 10
        0xa0,  # nibbles 10, 0
        0xa9,  # nibbles 10, 9
        0xaa,  # nibbles 10, 10 (smallest all-letter byte)
        0x00,  # minimum value
        0xff,  # maximum value
    ]
    expected = '090a90999aa0a9aa00ff'
    boundary_bytes = self.type2test(bytes(values))
    self.assertEqual(boundary_bytes.hex(), expected)

    # Doubling the input crosses the 16-byte mark, so the SIMD path runs.
    simd_boundary = self.type2test(boundary_bytes * 2)
    self.assertEqual(simd_boundary.hex(), expected * 2)

def test_join(self):
self.assertEqual(self.type2test(b"").join([]), b"")
self.assertEqual(self.type2test(b"").join([b""]), b"")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Speed up :meth:`bytes.hex`, :meth:`bytearray.hex`, :func:`binascii.hexlify`,
and :mod:`hashlib` ``.hexdigest()`` operations with SIMD on x86-64, ARM64,
and ARM32 with NEON when built with gcc (version 12 or higher) or clang
(version 3 or higher) compilers. Around 1.1-3x faster for common 16-64 byte
inputs such as hashlib hex digests, and up to 8x faster for larger data.
121 changes: 115 additions & 6 deletions Python/pystrhex.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,113 @@
#include "pycore_strhex.h" // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()

/* Scalar hexlify: expand len input bytes into 2*len lowercase hex
   characters at dst, using the shared Py_hexdigits lookup table. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Alternatives (arithmetic instead of a table lookup, manual loop
       unrolling, caching the global table pointer locally, wider dst
       stores) were benchmarked and performed essentially the same on
       gcc 15; a 256-entry uint16_t table was a bit slower. So the
       simple, obvious byte-at-a-time loop stays. */
    const unsigned char *end = src + len;
    while (src < end) {
        unsigned char byte = *src++;
        dst[0] = Py_hexdigits[byte >> 4];
        dst[1] = Py_hexdigits[byte & 0x0f];
        dst += 2;
    }
}

/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
Uses __builtin_shufflevector for portable interleave that compiles to
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
for the target microarch allow it [try -march=native if running 32-bit
on an RPi3 or later]).
Performance:
- For more common small data it varies between 1.1-3x faster.
- Up to 11x faster on larger data than the scalar code.
While even faster big-data conversion is possible using AVX2 or
AVX-512, that adds a ton of complication, and hexing huge data is
rare in practice.
The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
*/
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR

/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Splat a byte value across all 16 lanes */
static inline v16u8
v16u8_splat(unsigned char x)
{
    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

/* Signed counterpart of v16u8_splat: broadcast x into all 16 lanes of a
   signed-byte vector (used for the "> 9" comparison constant below). */
static inline v16s8
v16s8_splat(signed char x)
{
    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32).
   dst must have room for 2*len bytes; src and dst are assumed not to
   overlap (NOTE(review): callers allocate dst fresh — confirm). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16s8 nine = v16s8_splat(9);

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.
           This generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);

        /* Convert nibbles to hex ASCII: '0' + n, plus an extra 0x27
           in lanes where n > 9 (the comparison yields all-ones in
           true lanes, so ANDing with offset selects the letter
           adjustment per-lane without a branch). */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
           or vzip on ARM32. */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }

    /* Scalar fallback for remaining 0-15 bytes */
    _Py_hexlify_scalar(src + i, dst, len - i);
}

#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */

static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
PyObject* sep, int bytes_per_sep_group,
const int return_bytes)
Expand Down Expand Up @@ -82,13 +189,15 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
unsigned char c;

if (bytes_per_sep_group == 0) {
for (i = j = 0; i < arglen; ++i) {
assert((j + 1) < resultlen);
c = argbuf[i];
retbuf[j++] = Py_hexdigits[c >> 4];
retbuf[j++] = Py_hexdigits[c & 0x0f];
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
if (arglen >= 16) {
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
}
else
#endif
{
_Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
}
assert(j == resultlen);
}
else {
/* The number of complete chunk+sep periods */
Expand Down
66 changes: 66 additions & 0 deletions configure

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -5017,6 +5017,41 @@ AS_VAR_IF([ac_cv_builtin_atomic], [yes], [
AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Has builtin __atomic_load_n() and __atomic_store_n() functions])
])

# Check for __builtin_shufflevector with 128-bit vector support on an
# architecture where it compiles to worthwhile native SIMD instructions.
# Used for SIMD-accelerated bytes.hex() in Python/pystrhex.c.
# A link test (not just a compile test) is used so the builtin must fully
# resolve for the target; the cached result can be pre-seeded via the
# ac_cv_efficient_builtin_shufflevector variable when cross-compiling.
AC_CACHE_CHECK([for __builtin_shufflevector], [ac_cv_efficient_builtin_shufflevector], [
AC_LINK_IFELSE([
  AC_LANG_PROGRAM([[
    /* __builtin_shufflevector is available on many platforms, but 128-bit
       vector code is only worthwhile on architectures with native SIMD:
       x86-64 (SSE2, always available), ARM64 (NEON, always available),
       or ARM32 when NEON is enabled via compiler flags (e.g. -march=native
       on RPi3+). On ARM32 without NEON (e.g. armv6 builds), the compiler
       has the builtin but generates slow scalar code instead. */
    #if !defined(__x86_64__) && !defined(__aarch64__) && \
        !(defined(__arm__) && defined(__ARM_NEON))
    #  error "128-bit vector SIMD not worthwhile on this architecture"
    #endif
    typedef unsigned char v16u8 __attribute__((vector_size(16)));
  ]], [[
    v16u8 a = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    v16u8 b = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
    v16u8 c = __builtin_shufflevector(a, b,
        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
    (void)c;
    return 0;
  ]])
],[ac_cv_efficient_builtin_shufflevector=yes],[ac_cv_efficient_builtin_shufflevector=no])
])

AS_VAR_IF([ac_cv_efficient_builtin_shufflevector], [yes], [
  AC_DEFINE([HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR], [1],
            [Define if compiler supports __builtin_shufflevector with 128-bit
             vectors AND the target architecture has native SIMD (not just API
             availability)])
])

# --with-mimalloc
AC_MSG_CHECKING([for --with-mimalloc])
AC_ARG_WITH([mimalloc],
Expand Down
4 changes: 4 additions & 0 deletions pyconfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,10 @@
/* Define to 1 if you have the <editline/readline.h> header file. */
#undef HAVE_EDITLINE_READLINE_H

/* Define if compiler supports __builtin_shufflevector with 128-bit vectors
AND the target architecture has native SIMD (not just API availability) */
#undef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR

/* Define to 1 if you have the <endian.h> header file. */
#undef HAVE_ENDIAN_H

Expand Down
Loading