From 8235c1af411cc5dc8df0e61703a99c7ead34e065 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 31 May 2026 09:15:35 +0000 Subject: [PATCH] fix(bit_hash): off-by-one in sha1_compute padding; restore one-shot FFI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The native one-shot `sha1_compute` C entry had `if (remainder < 55)` for the "padding fits in one block" branch. At `remainder == 55` — exactly the case where the 0x80 marker plus the 8-byte big-endian length still fit in the current 64-byte block — the comparison rejected the input and routed it through the two-block path, producing a different digest than the incremental `Sha1State` walk. Commit c25986d disabled `sha1_compute_ffi` entirely as a workaround, masking the bug and making `sha1_raw` pay one C FFI call per 64-byte block. Fix the boundary (`< 56`) and restore `sha1_raw` to the one-shot path; add a sweep test that walks both padding cliffs (55-byte and 119-byte). `moon bench -p mizchi/bit_hash --target native --release` on a scalar (non-SHA-NI) CPU: sha1_raw 64 bytes 1.18 µs -> 842 ns (-29%) sha1_raw 1 KiB 9.46 µs -> 6.72 µs (-29%) sha1_raw 8 KiB 69.51 µs -> 50.65 µs (-27%) sha1_raw 64 KiB 546.63 µs -> 403.00 µs (-26%) bit_hash, bit_object, bit_lib, bitx_hub, bit_pack tests all pass (15 + 27 + 285 + 113 + 72 = 512). HubStore's `put_record` / `get_record` round-trip — the original c25986d regression — exercises the fixed path via `array_to_bytes` -> `sha1`. https://claude.ai/code/session_01FgY6EMujwhzucSQkBVcodR --- docs/benchmarks.md | 55 +++++++++++++++++++++++ modules/bit_hash/src/sha1_native_impl.mbt | 11 ++--- modules/bit_hash/src/sha1_ni.c | 6 +-- modules/bit_hash/src/sha1_ni_ffi.mbt | 8 ++++ modules/bit_hash/src/sha1_test.mbt | 24 ++++++++++ 5 files changed, 96 insertions(+), 8 deletions(-) diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a833b659..1938b838 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -184,6 +184,61 @@ moon bench -p mizchi/bit/pack --target native -f bench_test.mbt --release ## Targeted fixes after profiling +### `sha1_compute` padding off-by-one (native one-shot path) + +A regression introduced by commit `c25986d` routed `sha1_raw` through +`Sha1State` (one C FFI call per 64-byte block) instead of the one-shot +`sha1_compute` C entry, because the latter was suspected of giving +wrong results for `Bytes::from_iter` inputs. The real cause was a +padding off-by-one in `sha1_compute`: the condition `if (remainder < +55)` rejected the boundary case `remainder == 55`, where `0x80` plus +the 8-byte big-endian length still fit in one 64-byte block. The +mis-padded second block produced a different digest than `Sha1State`'s +correct one-block path, so HubStore writes and reads computed +inconsistent hashes. Fixed by changing the condition to `< 56` and +restoring `sha1_raw` to the one-shot FFI; added a sweep test that +walks both padding cliffs. + +`moon bench -p mizchi/bit_hash --target native --release` on a non- +SHA-NI CPU (scalar fallback): + +| Benchmark | Sha1State (regressed) | One-shot (fixed) | Δ | +| ----------------- | --------------------- | ---------------- | ------ | +| sha1_raw 64 bytes | 1.18 µs | 842 ns | −29% | +| sha1_raw 1 KiB | 9.46 µs | 6.72 µs | −29% | +| sha1_raw 8 KiB | 69.51 µs | 50.65 µs | −27% | +| sha1_raw 64 KiB | 546.63 µs | 403.00 µs | −26% | + +(The pre-`c25986d` numbers in the table at the top of this document +were taken on a SHA-NI-capable host and remain the ceiling; on this +non-SHA-NI VM we cap out at the C scalar throughput. The win above +comes from collapsing 16-1024 FFI hops into one, not from SHA-NI.) + +### Profiling target: `mizchi/simd::simdhash` rotates (wasm-gc) + +The existing `modules/bit_hash/bench/cmd/sha_hash/main.mbt` was wired +for `moon-pprof`. Built with `-g` for source-mapped symbols and +profiled via `moon-pprof profile … --out` + `summary`: + +``` +Top user functions by self time (mem-mgmt frames hidden) + 1825.83 ms 29.7% mizchi::simd::src::simdhash::rotr + 1440.03 ms 23.4% mizchi::simd::src::simdhash::sha256__compress + 1012.83 ms 16.5% mizchi::simd::src::simdhash::sha1__scalar + 963.74 ms 15.7% mizchi::simd::src::simdhash::rotl + 565.27 ms 9.2% mizchi::simd::src::simdhash::pad__message +``` + +`rotr` + `rotl` together account for 45.4% of self time on the +wasm-gc target. They're defined as plain expressions (`(x >> n) | (x +<< (32 - n))`) without `#inline`, so each round-constant invocation +inside the SHA loops eats a real call. Marking them `#inline` in +`mizchi/simd` would benefit every wasm-gc / js consumer of +`@simdhash.sha1` / `sha256`; filed upstream-side, not patched here +because `mizchi/simd` is an external dependency. + + + ### `refs.list_refs_with_ids` loose subtree pruning `collect_loose_ref_ids` used to recurse into every subdir under diff --git a/modules/bit_hash/src/sha1_native_impl.mbt b/modules/bit_hash/src/sha1_native_impl.mbt index fd2b61c2..47423ac0 100644 --- a/modules/bit_hash/src/sha1_native_impl.mbt +++ b/modules/bit_hash/src/sha1_native_impl.mbt @@ -7,13 +7,14 @@ fn Sha1State::process_block(self : Sha1State) -> Unit { ///| pub fn sha1_bytes(data : Bytes) -> Bytes { - let raw = sha1_raw(data) - Bytes::makei(20, fn(i) { raw[i] }) + let out : FixedArray[Byte] = FixedArray::make(20, b'\x00') + sha1_compute_ffi(data, data.length(), out) + Bytes::makei(20, fn(i) { out[i] }) } ///| pub fn sha1_raw(data : Bytes) -> FixedArray[Byte] { - let state = Sha1State::new() - state.update(data) - state.finish_raw() + let out : FixedArray[Byte] = FixedArray::make(20, b'\x00') + sha1_compute_ffi(data, data.length(), out) + out } diff --git a/modules/bit_hash/src/sha1_ni.c b/modules/bit_hash/src/sha1_ni.c index 7d4e0618..71ee4c23 100644 --- a/modules/bit_hash/src/sha1_ni.c +++ b/modules/bit_hash/src/sha1_ni.c @@ -321,12 +321,12 @@ void sha1_compute(const uint8_t* data, int32_t len, uint8_t* out) { pad[remainder] = 0x80; int32_t pad_len; - if (remainder < 55) { - /* One padding block. */ + if (remainder < 56) { + /* One padding block: 0x80 at pad[remainder], length at pad[56..63]. */ memset(pad + remainder + 1, 0, (size_t)(55 - remainder)); pad_len = 64; } else { - /* Two padding blocks. */ + /* Two padding blocks: 0x80 collides with length in this block. */ memset(pad + remainder + 1, 0, (size_t)(119 - remainder)); pad_len = 128; } diff --git a/modules/bit_hash/src/sha1_ni_ffi.mbt b/modules/bit_hash/src/sha1_ni_ffi.mbt index 8042e486..a4f75e49 100644 --- a/modules/bit_hash/src/sha1_ni_ffi.mbt +++ b/modules/bit_hash/src/sha1_ni_ffi.mbt @@ -1,5 +1,13 @@ // FFI declarations for SHA-NI / C SHA-1 and SHA-256 (native target only). +///| +#borrow(data, out) +extern "C" fn sha1_compute_ffi( + data : Bytes, + len : Int, + out : FixedArray[Byte], +) -> Unit = "sha1_compute" + ///| #borrow(h, data) extern "C" fn sha1_process_blocks_ffi( diff --git a/modules/bit_hash/src/sha1_test.mbt b/modules/bit_hash/src/sha1_test.mbt index 85827206..c8f35f74 100644 --- a/modules/bit_hash/src/sha1_test.mbt +++ b/modules/bit_hash/src/sha1_test.mbt @@ -42,6 +42,30 @@ test "Sha1State: large input matches sha1_raw" { inspect(got == expected, content="true") } +///| +test "sha1_raw: one-shot matches Sha1State across padding boundaries" { + // Regression guard for a padding off-by-one in `sha1_compute` that + // disagreed with `Sha1State` exactly at `remainder == 55` — the + // tight case where 0x80 plus the 8-byte length still fit in one + // 64-byte block. Sweep both the 56-byte and 120-byte padding + // cliffs. + for len in [ + 0, 1, 31, 32, 33, 54, 55, 56, 57, 63, 64, 65, 100, 118, 119, 120, 121, 127, + 128, 129, 200, 256, 511, 512, 513, 1024, 4096, + ] { + let data : Array[Byte] = [] + for i in 0.. raw_to_hex + let state = Sha1State::new() + state.update(input) + let incremental = state.finish_raw() |> raw_to_hex + inspect(one_shot == incremental, content="true") + } +} + ///| test "sha1_raw: 35-byte input" { // printf "blob 29 hub/proposal/pr/abc12345/meta" | sha1sum