gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD] #143262
base: main
New NEWS entry:
@@ -0,0 +1,3 @@
+The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
+related codec has been optimized for modern pipelined CPU architectures and
+now performs 2-3x faster across all platforms.
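As a rough way to see where the claimed speedup comes from, here is a standalone micro-benchmark sketch (not part of the PR; the names encode_accumulator and encode_groups are made up for illustration). It compares the old 6-bit accumulator loop with per-group encoding; actual numbers depend on CPU and compiler.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static const char b64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Old approach: shift bytes into an accumulator and emit 6 bits at a time
 * (each step depends on the previous one). */
static void encode_accumulator(const unsigned char *in, size_t n, char *out)
{
    unsigned int leftchar = 0;
    int leftbits = 0;
    while (n--) {
        leftchar = (leftchar << 8) | *in++;
        leftbits += 8;
        while (leftbits >= 6) {
            leftbits -= 6;
            *out++ = b64[(leftchar >> leftbits) & 0x3f];
        }
    }
}

/* New approach: each complete 3-byte group is encoded independently,
 * so iterations carry no data dependency between them. */
static void encode_groups(const unsigned char *in, size_t n, char *out)
{
    for (size_t i = 0; i + 3 <= n; i += 3, out += 4) {
        unsigned int c = ((unsigned int)in[i] << 16) |
                         ((unsigned int)in[i + 1] << 8) | in[i + 2];
        out[0] = b64[(c >> 18) & 0x3f];
        out[1] = b64[(c >> 12) & 0x3f];
        out[2] = b64[(c >> 6) & 0x3f];
        out[3] = b64[c & 0x3f];
    }
}

int main(void)
{
    size_t n = 3u * 1024 * 1024;   /* multiple of 3: both paths emit 4n/3 chars */
    unsigned char *src = malloc(n);
    char *dst = malloc(n / 3 * 4);
    for (size_t i = 0; i < n; i++)
        src[i] = (unsigned char)(i * 131u);

    clock_t t0 = clock();
    for (int r = 0; r < 100; r++)
        encode_accumulator(src, n, dst);
    clock_t t1 = clock();
    for (int r = 0; r < 100; r++)
        encode_groups(src, n, dst);
    clock_t t2 = clock();

    printf("accumulator %.3fs  groups %.3fs  (first byte %c)\n",
           (double)(t1 - t0) / CLOCKS_PER_SEC,
           (double)(t2 - t1) / CLOCKS_PER_SEC, dst[0]);
    free(src);
    free(dst);
    return 0;
}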
Modules/binascii.c:
@@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
 }

-static const unsigned char table_a2b_base64[] = {
+/* Align to 64 bytes to ensure table fits in a single L1 cache line */
Member:
The entire 256-byte table will not fit in a single L1 cache line. It may be worth aligning anyway, but the comment is incorrect.
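For reference (an aside, not part of the review): with 64-byte cache lines, the 256-entry decode table spans 256 / 64 = 4 lines even when 64-byte aligned, versus up to 5 when unaligned; the 64 characters actually read from table_b2a_base64, by contrast, do fit in a single aligned line.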
+static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,

@@ -101,9 +102,101 @@ static const unsigned char table_a2b_base64[] = {
 /* Max binary chunk size; limited only by available memory */
 #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)

-static const unsigned char table_b2a_base64[] =
+/*
+ * Fast base64 encoding/decoding helpers.
+ *
+ * Process complete groups without loop-carried dependencies.
+ */
+
+/* Align to 64 bytes to ensure table fits in a single L1 cache line */
+static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

+/* Encode 3 bytes into 4 base64 characters. */
+static inline void
+base64_encode_trio(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned int combined = ((unsigned int)in[0] << 16) |
+                            ((unsigned int)in[1] << 8) |
+                            (unsigned int)in[2];
+    out[0] = table[(combined >> 18) & 0x3f];
+    out[1] = table[(combined >> 12) & 0x3f];
+    out[2] = table[(combined >> 6) & 0x3f];
+    out[3] = table[combined & 0x3f];
+}
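A worked example of the packing (not in the diff): for the input bytes "Man" (0x4D 0x61 0x6E), combined is 0x4D616E; the four 6-bit slices are 19, 22, 5 and 46, which index 'T', 'W', 'F' and 'u' in the table, so the group encodes to "TWFu".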

+/* Encode multiple complete 3-byte groups.
+ * Returns the number of input bytes processed (always a multiple of 3).
+ */
+static inline Py_ssize_t
+base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_trios = in_len / 3;
+    Py_ssize_t i;
+
+    for (i = 0; i < n_trios; i++) {
+        base64_encode_trio(in + i * 3, out + i * 4, table);

Member:
Is it faster than incrementing the in and out pointers instead?
+    }
+
+    return n_trios * 3;
+}
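Regarding the question above about incrementing the pointers: here is a sketch of that variant (hypothetical, not part of the PR). Compilers frequently lower the indexed form to the same addressing, so whether it measures faster is an open question.

static inline Py_ssize_t
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    Py_ssize_t n_trios = in_len / 3;
    for (Py_ssize_t i = 0; i < n_trios; i++) {
        base64_encode_trio(in, out, table);
        in += 3;    /* bump the pointers instead of recomputing */
        out += 4;   /* in + i * 3 and out + i * 4 every iteration */
    }
    return n_trios * 3;
}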

+/* Decode 4 base64 characters into 3 bytes.
+ * Returns 1 on success, 0 if any character is invalid.
+ */
+static inline int
+base64_decode_quad(const unsigned char *in, unsigned char *out,
+                   const unsigned char *table)
+{
+    unsigned char v0 = table[in[0]];
+    unsigned char v1 = table[in[1]];
+    unsigned char v2 = table[in[2]];
+    unsigned char v3 = table[in[3]];
+
+    if ((v0 | v1 | v2 | v3) & 0xc0) {
+        return 0;
+    }
+
+    out[0] = (v0 << 2) | (v1 >> 4);
+    out[1] = (v1 << 4) | (v2 >> 2);
+    out[2] = (v2 << 6) | v3;
+    return 1;
+}
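A worked example (not in the diff): decoding "TWFu" gives v0..v3 = 19, 22, 5, 46; the OR of those has no bits in 0xc0 set, so the group is accepted and recombines to 0x4D 0x61 0x6E ("Man"). For something like "TW!u", table['!'] is -1 (0xff as an unsigned char), the 0xc0 test fires, and the quad is rejected.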

+/* Decode multiple complete 4-character groups (no padding allowed).
+ * Returns the number of input characters processed.
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char *table)
+{
+    Py_ssize_t n_quads = in_len / 4;
+    Py_ssize_t i;
+
+    for (i = 0; i < n_quads; i++) {
+        const unsigned char *inp = in + i * 4;
+
+        /* Check for padding - exit fast path to handle it properly.
+         * Four independent comparisons let the compiler choose the optimal
+         * approach; on modern pipelined CPUs this is faster than bitmask tricks
+         * like XOR+SUB+AND for zero-detection which have data dependencies.
+         */
+        if (inp[0] == BASE64_PAD || inp[1] == BASE64_PAD ||
+            inp[2] == BASE64_PAD || inp[3] == BASE64_PAD) {
+            break;
+        }

Member (on lines +182 to +190):
For each group we have two checks: one here, comparing all bytes to BASE64_PAD, and the other in base64_decode_quad().

+        if (!base64_decode_quad(inp, out + i * 3, table)) {
+            break;
+        }
+    }
+
+    return i * 4;
+}
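On the two-checks observation above: one possible restructuring (a sketch only, not part of the PR, and it assumes table_a2b_base64 maps '=' to an invalid value rather than to 0) is to let base64_decode_quad reject pads as ordinary invalid input on the fast path, leaving the slow path to tell padding apart from garbage:

    for (i = 0; i < n_quads; i++) {
        const unsigned char *inp = in + i * 4;
        /* Under that assumption a pad hits the 0xc0 test inside
         * base64_decode_quad, so no separate BASE64_PAD comparison is
         * needed on the fast path. */
        if (!base64_decode_quad(inp, out + i * 3, table)) {
            break;
        }
    }
    return i * 4;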

 static const unsigned short crctab_hqx[256] = {
     0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,

@@ -403,10 +496,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
         goto error_end;
     }

+    size_t i = 0;              /* Current position in input */
+
+    /* Fast path: use optimized decoder for complete quads.
+     * This works for both strict and non-strict mode for valid input.
+     * The fast path stops at padding, invalid chars, or incomplete groups.
+     */
+    if (ascii_len >= 4) {
+        Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
+                                                   bin_data, table_a2b_base64);
+        if (fast_chars > 0) {

Member:
Is this condition needed?
+            i = (size_t)fast_chars;
+            bin_data += (fast_chars / 4) * 3;
+        }
+    }

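To illustrate how the two paths split the work (not in the diff): for the input "TWFuTQ==", the fast path decodes the first quad "TWFu" to "Man", stops at the quad that contains '=', and the loop below picks up at i = 4 to decode "TQ==" and validate the padding.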
+    /* Slow path: handle remaining input (padding, invalid chars, partial groups) */
     int quad_pos = 0;
     unsigned char leftchar = 0;
     int pads = 0;
-    for (size_t i = 0; i < ascii_len; i++) {
+    for (; i < ascii_len; i++) {
         unsigned char this_ch = ascii_data[i];

         /* Check for pad sequences and ignore

@@ -533,9 +642,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
 /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
 {
     const unsigned char *bin_data;
-    int leftbits = 0;
-    unsigned char this_ch;
-    unsigned int leftchar = 0;
     Py_ssize_t bin_len;
     binascii_state *state;

@@ -566,26 +672,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
     }
     unsigned char *ascii_data = PyBytesWriter_GetData(writer);

-    for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
-        /* Shift the data into our buffer */
-        leftchar = (leftchar << 8) | *bin_data;
-        leftbits += 8;
-
-        /* See if there are 6-bit groups ready */
-        while ( leftbits >= 6 ) {
-            this_ch = (leftchar >> (leftbits-6)) & 0x3f;
-            leftbits -= 6;
-            *ascii_data++ = table_b2a_base64[this_ch];
-        }
-    }
-    if ( leftbits == 2 ) {
-        *ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
+    /* Use the optimized fast path for complete 3-byte groups */
+    Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
+                                               table_b2a_base64);
+    bin_data += fast_bytes;
+    ascii_data += (fast_bytes / 3) * 4;

Member:
I wonder, would it not be more efficient to return the number of groups, so the division could be avoided? Although the difference may be below the noise.

+    bin_len -= fast_bytes;

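A sketch of that suggestion (hypothetical; base64_encode_fast_groups is a made-up name, not part of the PR): returning the group count lets the caller multiply instead of divide:

    Py_ssize_t n_trios = base64_encode_fast_groups(bin_data, bin_len,
                                                   ascii_data, table_b2a_base64);
    bin_data += n_trios * 3;
    ascii_data += n_trios * 4;
    bin_len -= n_trios * 3;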
+    /* Handle remaining 0-2 bytes */
+    if (bin_len == 1) {
+        /* 1 byte remaining: produces 2 base64 chars + 2 padding */
+        unsigned int val = bin_data[0];
+        *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
+        *ascii_data++ = BASE64_PAD;
+        *ascii_data++ = BASE64_PAD;
-    } else if ( leftbits == 4 ) {
-        *ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
     }
+    else if (bin_len == 2) {
+        /* 2 bytes remaining: produces 3 base64 chars + 1 padding */
+        unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
+        *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
+        *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
+        *ascii_data++ = BASE64_PAD;
+    }

     if (newline)
         *ascii_data++ = '\n';   /* Append a courtesy newline */
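A worked example of the tail handling (not in the diff): one leftover byte 'M' (0x4D) emits 'T', 'Q', '=', '=' ("TQ=="); two leftover bytes "Ma" (0x4D 0x61) emit 'T', 'W', 'E', '=' ("TWE=").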
Comment:
Contributed by ...