From 8235c1af411cc5dc8df0e61703a99c7ead34e065 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 31 May 2026 09:15:35 +0000
Subject: [PATCH] fix(bit_hash): off-by-one in sha1_compute padding; restore
 one-shot FFI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The native one-shot `sha1_compute` C entry had `if (remainder < 55)` for
the "padding fits in one block" branch. At `remainder == 55` — exactly
the case where the 0x80 marker plus the 8-byte big-endian length still
fit in the current 64-byte block — the comparison rejected the input
and routed it through the two-block path, producing a different digest
than the incremental `Sha1State` walk. Commit c25986d disabled
`sha1_compute_ffi` entirely as a workaround, masking the bug and
making `sha1_raw` pay one C FFI call per 64-byte block. Fix the
boundary (`< 56`) and restore `sha1_raw` to the one-shot path; add a
sweep test that walks both padding cliffs (55-byte and 119-byte).

`moon bench -p mizchi/bit_hash --target native --release` on a scalar
(non-SHA-NI) CPU:

  sha1_raw 64 bytes   1.18 µs ->  842 ns  (-29%)
  sha1_raw 1 KiB      9.46 µs -> 6.72 µs  (-29%)
  sha1_raw 8 KiB     69.51 µs -> 50.65 µs (-27%)
  sha1_raw 64 KiB   546.63 µs -> 403.00 µs (-26%)

bit_hash, bit_object, bit_lib, bitx_hub, bit_pack tests all pass (15 +
27 + 285 + 113 + 72 = 512). HubStore's `put_record` / `get_record`
round-trip — the original c25986d regression — exercises the fixed
path via `array_to_bytes` -> `sha1`.

https://claude.ai/code/session_01FgY6EMujwhzucSQkBVcodR
---
 docs/benchmarks.md                        | 55 +++++++++++++++++++++++
 modules/bit_hash/src/sha1_native_impl.mbt | 11 ++---
 modules/bit_hash/src/sha1_ni.c            |  6 +--
 modules/bit_hash/src/sha1_ni_ffi.mbt      |  8 ++++
 modules/bit_hash/src/sha1_test.mbt        | 24 ++++++++++
 5 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index a833b659..1938b838 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -184,6 +184,61 @@ moon bench -p mizchi/bit/pack --target native -f bench_test.mbt --release
 
 ## Targeted fixes after profiling
 
+### `sha1_compute` padding off-by-one (native one-shot path)
+
+A regression introduced by commit `c25986d` routed `sha1_raw` through
+`Sha1State` (one C FFI call per 64-byte block) instead of the one-shot
+`sha1_compute` C entry, because the latter was suspected of giving
+wrong results for `Bytes::from_iter` inputs. The real cause was a
+padding off-by-one in `sha1_compute`: the condition `if (remainder <
+55)` rejected the boundary case `remainder == 55`, where `0x80` plus
+the 8-byte big-endian length still fit in one 64-byte block. The
+mis-padded second block produced a different digest than `Sha1State`'s
+correct one-block path, so HubStore writes and reads computed
+inconsistent hashes. Fixed by changing the condition to `< 56` and
+restoring `sha1_raw` to the one-shot FFI; added a sweep test that
+walks both padding cliffs.
+
+`moon bench -p mizchi/bit_hash --target native --release` on a non-
+SHA-NI CPU (scalar fallback):
+
+| Benchmark         | Sha1State (regressed) | One-shot (fixed) | Δ      |
+| ----------------- | --------------------- | ---------------- | ------ |
+| sha1_raw 64 bytes |   1.18 µs             |   842 ns         | −29%   |
+| sha1_raw 1 KiB    |   9.46 µs             |  6.72 µs         | −29%   |
+| sha1_raw 8 KiB    |  69.51 µs             | 50.65 µs         | −27%   |
+| sha1_raw 64 KiB   | 546.63 µs             | 403.00 µs        | −26%   |
+
+(The pre-`c25986d` numbers in the table at the top of this document
+were taken on a SHA-NI-capable host and remain the ceiling; on this
+non-SHA-NI VM we cap out at the C scalar throughput. The win above
+comes from collapsing 16-1024 FFI hops into one, not from SHA-NI.)
+
+### Profiling target: `mizchi/simd::simdhash` rotates (wasm-gc)
+
+The existing `modules/bit_hash/bench/cmd/sha_hash/main.mbt` was wired
+for `moon-pprof`. Built with `-g` for source-mapped symbols and
+profiled via `moon-pprof profile … --out` + `summary`:
+
+```
+Top user functions by self time (mem-mgmt frames hidden)
+    1825.83 ms   29.7%  mizchi::simd::src::simdhash::rotr
+    1440.03 ms   23.4%  mizchi::simd::src::simdhash::sha256__compress
+    1012.83 ms   16.5%  mizchi::simd::src::simdhash::sha1__scalar
+     963.74 ms   15.7%  mizchi::simd::src::simdhash::rotl
+     565.27 ms    9.2%  mizchi::simd::src::simdhash::pad__message
+```
+
+`rotr` + `rotl` together account for 45.4% of self time on the
+wasm-gc target. They're defined as plain expressions (`(x >> n) | (x
+<< (32 - n))`) without `#inline`, so each round-constant invocation
+inside the SHA loops eats a real call. Marking them `#inline` in
+`mizchi/simd` would benefit every wasm-gc / js consumer of
+`@simdhash.sha1` / `sha256`; filed upstream-side, not patched here
+because `mizchi/simd` is an external dependency.
+
+
+
 ### `refs.list_refs_with_ids` loose subtree pruning
 
 `collect_loose_ref_ids` used to recurse into every subdir under
diff --git a/modules/bit_hash/src/sha1_native_impl.mbt b/modules/bit_hash/src/sha1_native_impl.mbt
index fd2b61c2..47423ac0 100644
--- a/modules/bit_hash/src/sha1_native_impl.mbt
+++ b/modules/bit_hash/src/sha1_native_impl.mbt
@@ -7,13 +7,14 @@ fn Sha1State::process_block(self : Sha1State) -> Unit {
 
 ///|
 pub fn sha1_bytes(data : Bytes) -> Bytes {
-  let raw = sha1_raw(data)
-  Bytes::makei(20, fn(i) { raw[i] })
+  let out : FixedArray[Byte] = FixedArray::make(20, b'\x00')
+  sha1_compute_ffi(data, data.length(), out)
+  Bytes::makei(20, fn(i) { out[i] })
 }
 
 ///|
 pub fn sha1_raw(data : Bytes) -> FixedArray[Byte] {
-  let state = Sha1State::new()
-  state.update(data)
-  state.finish_raw()
+  let out : FixedArray[Byte] = FixedArray::make(20, b'\x00')
+  sha1_compute_ffi(data, data.length(), out)
+  out
 }
diff --git a/modules/bit_hash/src/sha1_ni.c b/modules/bit_hash/src/sha1_ni.c
index 7d4e0618..71ee4c23 100644
--- a/modules/bit_hash/src/sha1_ni.c
+++ b/modules/bit_hash/src/sha1_ni.c
@@ -321,12 +321,12 @@ void sha1_compute(const uint8_t* data, int32_t len, uint8_t* out) {
   pad[remainder] = 0x80;
 
   int32_t pad_len;
-  if (remainder < 55) {
-    /* One padding block. */
+  if (remainder < 56) {
+    /* One padding block: 0x80 at pad[remainder], length at pad[56..63]. */
     memset(pad + remainder + 1, 0, (size_t)(55 - remainder));
     pad_len = 64;
   } else {
-    /* Two padding blocks. */
+    /* Two padding blocks: 0x80 collides with length in this block. */
     memset(pad + remainder + 1, 0, (size_t)(119 - remainder));
     pad_len = 128;
   }
diff --git a/modules/bit_hash/src/sha1_ni_ffi.mbt b/modules/bit_hash/src/sha1_ni_ffi.mbt
index 8042e486..a4f75e49 100644
--- a/modules/bit_hash/src/sha1_ni_ffi.mbt
+++ b/modules/bit_hash/src/sha1_ni_ffi.mbt
@@ -1,5 +1,13 @@
 // FFI declarations for SHA-NI / C SHA-1 and SHA-256 (native target only).
 
+///|
+#borrow(data, out)
+extern "C" fn sha1_compute_ffi(
+  data : Bytes,
+  len : Int,
+  out : FixedArray[Byte],
+) -> Unit = "sha1_compute"
+
 ///|
 #borrow(h, data)
 extern "C" fn sha1_process_blocks_ffi(
diff --git a/modules/bit_hash/src/sha1_test.mbt b/modules/bit_hash/src/sha1_test.mbt
index 85827206..c8f35f74 100644
--- a/modules/bit_hash/src/sha1_test.mbt
+++ b/modules/bit_hash/src/sha1_test.mbt
@@ -42,6 +42,30 @@ test "Sha1State: large input matches sha1_raw" {
   inspect(got == expected, content="true")
 }
 
+///|
+test "sha1_raw: one-shot matches Sha1State across padding boundaries" {
+  // Regression guard for a padding off-by-one in `sha1_compute` that
+  // disagreed with `Sha1State` exactly at `remainder == 55` — the
+  // tight case where 0x80 plus the 8-byte length still fit in one
+  // 64-byte block. Sweep both the 56-byte and 120-byte padding
+  // cliffs.
+  for len in [
+    0, 1, 31, 32, 33, 54, 55, 56, 57, 63, 64, 65, 100, 118, 119, 120, 121, 127,
+    128, 129, 200, 256, 511, 512, 513, 1024, 4096,
+  ] {
+    let data : Array[Byte] = []
+    for i in 0..<len {
+      data.push(((i * 31 + 7) % 251).to_byte())
+    }
+    let input = Bytes::from_array(data)
+    let one_shot = sha1_raw(input) |> raw_to_hex
+    let state = Sha1State::new()
+    state.update(input)
+    let incremental = state.finish_raw() |> raw_to_hex
+    inspect(one_shot == incremental, content="true")
+  }
+}
+
 ///|
 test "sha1_raw: 35-byte input" {
   // printf "blob 29 hub/proposal/pr/abc12345/meta" | sha1sum