gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] #143262
base: main
New NEWS entry:
@@ -0,0 +1,3 @@
+The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
+related codec has been optimized for modern pipelined CPU architectures and
+now performs 2-3x faster across all platforms.
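As a rough way to see where the claimed speedup comes from, here is a standalone micro-benchmark sketch (not part of the PR; the names encode_accumulator and encode_groups are made up for illustration). It compares the old 6-bit accumulator loop with per-group encoding; actual numbers depend on CPU and compiler.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static const char b64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Old approach: shift bytes into an accumulator and emit 6 bits at a time
 * (each step depends on the previous one). */
static void encode_accumulator(const unsigned char *in, size_t n, char *out)
{
    unsigned int leftchar = 0;
    int leftbits = 0;
    while (n--) {
        leftchar = (leftchar << 8) | *in++;
        leftbits += 8;
        while (leftbits >= 6) {
            leftbits -= 6;
            *out++ = b64[(leftchar >> leftbits) & 0x3f];
        }
    }
}

/* New approach: each complete 3-byte group is encoded independently,
 * so iterations carry no data dependency between them. */
static void encode_groups(const unsigned char *in, size_t n, char *out)
{
    for (size_t i = 0; i + 3 <= n; i += 3, out += 4) {
        unsigned int c = ((unsigned int)in[i] << 16) |
                         ((unsigned int)in[i + 1] << 8) | in[i + 2];
        out[0] = b64[(c >> 18) & 0x3f];
        out[1] = b64[(c >> 12) & 0x3f];
        out[2] = b64[(c >> 6) & 0x3f];
        out[3] = b64[c & 0x3f];
    }
}

int main(void)
{
    size_t n = 3u * 1024 * 1024;   /* multiple of 3: both paths emit 4n/3 chars */
    unsigned char *src = malloc(n);
    char *dst = malloc(n / 3 * 4);
    for (size_t i = 0; i < n; i++)
        src[i] = (unsigned char)(i * 131u);

    clock_t t0 = clock();
    for (int r = 0; r < 100; r++)
        encode_accumulator(src, n, dst);
    clock_t t1 = clock();
    for (int r = 0; r < 100; r++)
        encode_groups(src, n, dst);
    clock_t t2 = clock();

    printf("accumulator %.3fs  groups %.3fs  (first byte %c)\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC,
           (double)(t2 - t1) / CLOCKS_PER_SEC, dst[0]);
    free(src);
    free(dst);
    return 0;
}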
Modules/binascii.c:
@@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
 }

-static const unsigned char table_a2b_base64[] = {
+/* Align to 64 bytes to ensure table fits in a single L1 cache line */
Member:
The entire 256-byte table will not fit in a single L1 cache line. It may be worth aligning anyway, but the comment is incorrect.
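For reference (an aside, not part of the review): with 64-byte cache lines, the 256-entry decode table spans 256 / 64 = 4 lines even when 64-byte aligned, versus up to 5 when unaligned; the 64 characters actually read from table_b2a_base64, by contrast, do fit in a single aligned line.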
+static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,

@@ -101,9 +102,101 @@ static const unsigned char table_a2b_base64[] = {
 /* Max binary chunk size; limited only by available memory */
 #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)

-static const unsigned char table_b2a_base64[] =
+/*
+ * Fast base64 encoding/decoding helpers.
+ *
+ * Process complete groups without loop-carried dependencies.
+ */
+
+/* Align to 64 bytes to ensure table fits in a single L1 cache line */
+static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

+/* Encode 3 bytes into 4 base64 characters. */
+static inline void
+base64_encode_trio(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned int combined = ((unsigned int)in[0] << 16) |
+                            ((unsigned int)in[1] << 8) |
+                            (unsigned int)in[2];
+    out[0] = table[(combined >> 18) & 0x3f];
+    out[1] = table[(combined >> 12) & 0x3f];
+    out[2] = table[(combined >> 6) & 0x3f];
+    out[3] = table[combined & 0x3f];
+}
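A worked example of the packing (not in the diff): for the input bytes "Man" (0x4D 0x61 0x6E), combined is 0x4D616E; the four 6-bit slices are 19, 22, 5 and 46, which index 'T', 'W', 'F' and 'u' in the table, so the group encodes to "TWFu".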

+/* Encode multiple complete 3-byte groups.
+ * Returns the number of input bytes processed (always a multiple of 3).
+ */
+static inline Py_ssize_t
+base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_trios = in_len / 3;
+    Py_ssize_t i;
+
+    for (i = 0; i < n_trios; i++) {
+        base64_encode_trio(in + i * 3, out + i * 4, table);

Member:
Is it faster than incrementing the in and out pointers instead?
+    }
+
+    return n_trios * 3;
+}
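Regarding the question above about incrementing the pointers: here is a sketch of that variant (hypothetical, not part of the PR). Compilers frequently lower the indexed form to the same addressing, so whether it measures faster is an open question.

static inline Py_ssize_t
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    Py_ssize_t n_trios = in_len / 3;
    for (Py_ssize_t i = 0; i < n_trios; i++) {
        base64_encode_trio(in, out, table);
        in += 3;    /* bump the pointers instead of recomputing */
        out += 4;   /* in + i * 3 and out + i * 4 every iteration */
    }
    return n_trios * 3;
}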

+/* Decode 4 base64 characters into 3 bytes.
+ * Returns 1 on success, 0 if any character is invalid.
+ */
+static inline int
+base64_decode_quad(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned char v0 = table[in[0]];
+    unsigned char v1 = table[in[1]];
+    unsigned char v2 = table[in[2]];
+    unsigned char v3 = table[in[3]];
+
+    if ((v0 | v1 | v2 | v3) & 0xc0) {
+        return 0;
+    }
+
+    out[0] = (v0 << 2) | (v1 >> 4);
+    out[1] = (v1 << 4) | (v2 >> 2);
+    out[2] = (v2 << 6) | v3;
+    return 1;
+}
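A worked example (not in the diff): decoding "TWFu" gives v0..v3 = 19, 22, 5, 46; the OR of those has no bits in 0xc0 set, so the group is accepted and recombines to 0x4D 0x61 0x6E ("Man"). For something like "TW!u", table['!'] is -1 (0xff as an unsigned char), the 0xc0 test fires, and the quad is rejected.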

+/* Decode multiple complete 4-character groups (no padding allowed).
+ * Returns the number of input characters processed.
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_quads = in_len / 4;
+    Py_ssize_t i;
+
+    for (i = 0; i < n_quads; i++) {
+        const unsigned char *inp = in + i * 4;
+
+        /* Check for padding - exit fast path to handle it properly.
+         * Four independent comparisons let the compiler choose the optimal
+         * approach; on modern pipelined CPUs this is faster than bitmask tricks
+         * like XOR+SUB+AND for zero-detection which have data dependencies.
+         */
+        if (inp[0] == BASE64_PAD || inp[1] == BASE64_PAD ||
+            inp[2] == BASE64_PAD || inp[3] == BASE64_PAD) {
+            break;
+        }

Member (on lines +182 to +190):
For each group we have two checks: one here, comparing all bytes to BASE64_PAD, and the other in base64_decode_quad().

+        if (!base64_decode_quad(inp, out + i * 3, table)) {
+            break;
+        }
+    }
+
+    return i * 4;
+}
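On the two-checks observation above: one possible restructuring (a sketch only, not part of the PR, and it assumes table_a2b_base64 maps '=' to an invalid value rather than to 0) is to let base64_decode_quad reject pads as ordinary invalid input on the fast path, leaving the slow path to tell padding apart from garbage:

    for (i = 0; i < n_quads; i++) {
        const unsigned char *inp = in + i * 4;
        /* Under that assumption a pad hits the 0xc0 test inside
         * base64_decode_quad, so no separate BASE64_PAD comparison is
         * needed on the fast path. */
        if (!base64_decode_quad(inp, out + i * 3, table)) {
            break;
        }
    }
    return i * 4;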

 static const unsigned short crctab_hqx[256] = {
     0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,

@@ -403,10 +496,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
         goto error_end;
     }

+    size_t i = 0;              /* Current position in input */
+
+    /* Fast path: use optimized decoder for complete quads.
+     * This works for both strict and non-strict mode for valid input.
+     * The fast path stops at padding, invalid chars, or incomplete groups.
+     */
+    if (ascii_len >= 4) {
+        Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
+                                                   bin_data, table_a2b_base64);
+        if (fast_chars > 0) {

Member:
Is this condition needed?
+            i = (size_t)fast_chars;
+            bin_data += (fast_chars / 4) * 3;
+        }
+    }

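To illustrate how the two paths split the work (not in the diff): for the input "TWFuTQ==", the fast path decodes the first quad "TWFu" to "Man", stops at the quad that contains '=', and the loop below picks up at i = 4 to decode "TQ==" and validate the padding.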
+    /* Slow path: handle remaining input (padding, invalid chars, partial groups) */
     int quad_pos = 0;
     unsigned char leftchar = 0;
     int pads = 0;
-    for (size_t i = 0; i < ascii_len; i++) {
+    for (; i < ascii_len; i++) {
         unsigned char this_ch = ascii_data[i];

         /* Check for pad sequences and ignore

@@ -533,9 +642,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
 /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
 {
     const unsigned char *bin_data;
-    int leftbits = 0;
-    unsigned char this_ch;
-    unsigned int leftchar = 0;
     Py_ssize_t bin_len;
     binascii_state *state;

@@ -566,26 +672,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
     }
     unsigned char *ascii_data = PyBytesWriter_GetData(writer);

-    for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
-        /* Shift the data into our buffer */
-        leftchar = (leftchar << 8) | *bin_data;
-        leftbits += 8;
-
-        /* See if there are 6-bit groups ready */
-        while ( leftbits >= 6 ) {
-            this_ch = (leftchar >> (leftbits-6)) & 0x3f;
-            leftbits -= 6;
-            *ascii_data++ = table_b2a_base64[this_ch];
-        }
-    }
-    if ( leftbits == 2 ) {
-        *ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
+    /* Use the optimized fast path for complete 3-byte groups */
+    Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
+                                               table_b2a_base64);
+    bin_data += fast_bytes;
+    ascii_data += (fast_bytes / 3) * 4;

Member:
I wonder, would it not be more efficient to return the number of groups, so the division could be avoided? Although the difference may be below the noise.

+    bin_len -= fast_bytes;

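A sketch of that suggestion (hypothetical; base64_encode_fast_groups is a made-up name, not part of the PR): returning the group count lets the caller multiply instead of divide:

    Py_ssize_t n_trios = base64_encode_fast_groups(bin_data, bin_len,
                                                   ascii_data, table_b2a_base64);
    bin_data += n_trios * 3;
    ascii_data += n_trios * 4;
    bin_len -= n_trios * 3;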
+    /* Handle remaining 0-2 bytes */
+    if (bin_len == 1) {
+        /* 1 byte remaining: produces 2 base64 chars + 2 padding */
+        unsigned int val = bin_data[0];
+        *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
+        *ascii_data++ = BASE64_PAD;
+        *ascii_data++ = BASE64_PAD;
-    } else if ( leftbits == 4 ) {
-        *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
     }
+    else if (bin_len == 2) {
+        /* 2 bytes remaining: produces 3 base64 chars + 1 padding */
+        unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
+        *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
+        *ascii_data++ = BASE64_PAD;
+    }

     if (newline)
         *ascii_data++ = '\n';   /* Append a courtesy newline */
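A worked example of the tail handling (not in the diff): one leftover byte 'M' (0x4D) emits 'T', 'Q', '=', '=' ("TQ=="); two leftover bytes "Ma" (0x4D 0x61) emit 'T', 'W', 'E', '=' ("TWE=").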
Comment:
Contributed by ...