From 2866b118957255d8ce8effcdc0232a58973a4471 Mon Sep 17 00:00:00 2001 From: satyakwok <119509589+satyakwok@users.noreply.github.com> Date: Mon, 25 May 2026 13:03:58 +0200 Subject: [PATCH] fix(core): self-heal stale total_minted in B3b reconcile via block sum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `load_blockchain` runs Patch B3 to overwrite stale `accounts` from the canonical trie. But `total_minted` lives only in the bincode blob — no trie key, no per-block index. So when B3 fires (trie ahead of blob), `accounts` snap to canonical but `total_minted` stays at whatever the last save_blockchain wrote. That alone wouldn't matter if save_blockchain were always called in lock-step with apply, but there are edges where it isn't: an offline chain.db moved between hosts via cp without first halting, a save that raced a halt window, or a B2 replay path where the per-block apply updates `total_minted` but the corrupted-from-the-start blob doesn't get rewritten until after B3. In the 2026-05-24 mainnet incident, treasury and beacon ended up with identical `accounts` (B3 reconciled both from the same trie) but differing `total_minted`. `STATE-FP fp = SHA256(acc + total_minted)` then disagreed across two otherwise-identical validators. Same block content, same state-root in the trie, but the fingerprint trace fired. This commit adds B3b right after B3: * `recompute_total_minted_from_blocks(bc)` walks every persisted block 1..=tip, sums `block.coinbase().amount`, and adds `TOTAL_PREMINE`. C-04 (`block_executor.rs:336`) enforces `coinbase.amount == reward`, so the chain has no other source of newly-minted supply — this sum is the canonical value of `total_minted` at the tip. * `load_blockchain` compares the blob value against the recomputed value and warns + overwrites on mismatch. * Save-back happens via the existing atomic B1 path when either B3 repaired an account OR B3b repaired `total_minted`. 
Cost: O(N) MDBX block loads at boot. At mainnet h~2.2M this is ~30-60s on warm SSD. Acceptable for a once-per-boot sanity pass that only writes back when divergence is detected. Missing blocks are skipped with a warning, matching the B3 fail-soft posture earned during the 2026-05-20 testnet trie-gap incident. Three unit tests added: - `test_recompute_total_minted_matches_block_sum` — ground-truth check that the helper matches `TOTAL_PREMINE + N * BLOCK_REWARD` for a freshly-mined chain. - `test_b3b_repairs_stale_total_minted_on_load` — end-to-end via the production `load_blockchain` path: persist a blob with intentionally-stale `total_minted`, reload, assert the loaded value is canonical. Fails on main; passes after this commit. - `test_recompute_total_minted_skips_missing_blocks` — fail-soft check: delete one persisted block from MDBX and assert the helper skips it, returning `TOTAL_PREMINE + 2 * BLOCK_REWARD` for the two surviving blocks. --- crates/sentrix-core/src/storage.rs | 220 ++++++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 4 deletions(-) diff --git a/crates/sentrix-core/src/storage.rs b/crates/sentrix-core/src/storage.rs index 09d28dc3..69b14501 100644 --- a/crates/sentrix-core/src/storage.rs +++ b/crates/sentrix-core/src/storage.rs @@ -218,12 +218,49 @@ impl Storage { // absent — refuse to start so an operator surfaces it instead // of silently running on inconsistent state. let (checked, repaired) = Self::reconcile_accounts_from_trie(&mut bc)?; - if repaired > 0 { + + // B3b — `total_minted` self-heal. + // + // The blob carries `total_minted` as a plain `u64` snapshot. The + // bincode blob save is atomic with `accounts`, so under the normal + // crash path B2 replay catches the lag for both. But there is a + // class of stale-blob scenarios — most concretely an offline + // chain.db moved between hosts via partial copy, or a save that + // raced a halt — where the trie is at height N and the blob's + // `total_minted` is at an earlier value. B3 overwrites `accounts` + // from trie (so `acc` in STATE-FP agrees across nodes), but + // `total_minted` lives nowhere in the trie and quietly stays + // diverged. 
`fp = SHA256(acc + total_minted.to_be_bytes())` then + // disagrees across two otherwise-identical nodes, exactly the + // 2026-05-24 symptom (treasury and beacon agreed on `acc` but + // not `fp`). + // + // The recompute uses a hard invariant: every block's coinbase + // amount is bounded by `coinbase.amount == reward` (C-04 at + // `block_executor.rs:336`), so summing coinbase amounts across + // the stored block range plus the genesis premine yields the + // canonical `total_minted` at the current height. There is no + // other source of newly-minted supply. + let recomputed = self.recompute_total_minted_from_blocks(&bc)?; + let total_minted_was_stale = recomputed != bc.total_minted; + if total_minted_was_stale { + tracing::warn!( + "load_blockchain B3b: total_minted blob={} != recomputed-from-blocks={} \ + at height {} — overwriting blob (block-sum is canonical)", + bc.total_minted, + recomputed, + bc.height() + ); + bc.total_minted = recomputed; + } + + if repaired > 0 || total_minted_was_stale { tracing::warn!( - "load_blockchain B3: reconciled {}/{} accounts from trie at height {} \ - (blob was stale; trie is canonical)", + "load_blockchain B3: reconciled {}/{} accounts + total_minted_stale={} \ + from trie/blocks at height {} (blob was stale; trie+blocks are canonical)", repaired, checked, + total_minted_was_stale, bc.height() ); // Persist the repaired state via the atomic B1 path so the @@ -233,7 +270,8 @@ impl Storage { .map_err(|e| SentrixError::StorageError(e.to_string()))?; } else { tracing::debug!( - "load_blockchain B3: {}/{} accounts checked, none required reconcile at height {}", + "load_blockchain B3: {}/{} accounts checked, total_minted matches; \ + nothing to reconcile at height {}", repaired, checked, bc.height() @@ -494,6 +532,52 @@ impl Storage { Ok((checked, repaired)) } + /// Recompute `total_minted` by summing every persisted block's coinbase + /// amount and adding the genesis premine. 
Used by B3b on load to detect + /// + repair a stale blob-snapshot value (see `load_blockchain` above). + /// + /// This is the canonical derivation: `block_executor.rs:336` enforces + /// `coinbase.amount == reward` (C-04), so the chain has no other source + /// of newly-minted supply. Premine is fixed at genesis (`TOTAL_PREMINE` + /// in `address.rs`), block 0 carries no coinbase (genesis), and every + /// subsequent block contributes exactly its coinbase value. + /// + /// Cost: O(N) block loads (N = current height). At mainnet h≈2.2M this + /// is ~30-60s of MDBX reads on warm SSD — acceptable for a once-per-boot + /// sanity pass that only writes back when divergence is detected. + /// Blocks absent from storage (`load_block` returns `Ok(None)`) are skipped + /// with a warning rather than aborting boot — same fail-soft posture as B3 + /// reconcile (2026-05-20 trie-gap incident); `load_block` errors still propagate. + fn recompute_total_minted_from_blocks(&self, bc: &Blockchain) -> SentrixResult<u64> { + use crate::address::TOTAL_PREMINE; + + let tip = bc.height(); + let mut total: u64 = TOTAL_PREMINE; + let mut missing: u64 = 0; + // Block 0 = genesis (no coinbase). Block 1 is the first reward. + for h in 1..=tip { + let block = match self.load_block(h)? { + Some(b) => b, + None => { + missing += 1; + continue; + } + }; + if let Some(cb) = block.coinbase() { + total = total.saturating_add(cb.amount); + } + } + if missing > 0 { + tracing::warn!( + "recompute_total_minted_from_blocks: {} blocks missing from MDBX in range \ + 1..={tip} — sum may underestimate true total_minted; surfacing partial \ + value rather than aborting boot", + missing + ); + } + Ok(total) + } + // ── Utility ────────────────────────────────────────── pub fn has_blockchain(&self) -> bool { @@ -697,4 +781,132 @@ mod tests { let _ = std::fs::remove_dir_all(&path); } + + /// `recompute_total_minted_from_blocks` is the canonical derivation + /// used by B3b. 
Build a chain with N blocks, then verify the helper + /// returns `TOTAL_PREMINE + sum(coinbase.amount over 1..=tip)`. + #[test] + fn test_recompute_total_minted_matches_block_sum() { + use crate::address::TOTAL_PREMINE; + use crate::tokenomics::BLOCK_REWARD; + + let path = temp_db_path(); + let storage = Storage::open(&path).unwrap(); + + let mut bc = Blockchain::new("admin".to_string()); + bc.authority.add_validator_unchecked( + "val1".to_string(), + "V1".to_string(), + "pk1".to_string(), + ); + + // Mine 3 blocks. `add_block` updates `bc.total_minted` via the + // production apply path so it serves as the "ground truth" the + // helper must match. + for _ in 0..3 { + let block = bc.create_block("val1").unwrap(); + bc.add_block(block).unwrap(); + } + storage.save_blockchain(&bc).unwrap(); + + let expected = TOTAL_PREMINE + 3 * BLOCK_REWARD; + assert_eq!(bc.total_minted, expected, "bc.total_minted ground truth"); + + let recomputed = storage.recompute_total_minted_from_blocks(&bc).unwrap(); + assert_eq!(recomputed, expected, "helper must match block-sum ground truth"); + + let _ = std::fs::remove_dir_all(&path); + } + + /// B3b repair on load: when the persisted blob's `total_minted` is + /// stale (e.g. lagged save_blockchain after a halt-with-trie-ahead + /// scenario), `load_blockchain` must detect the divergence via the + /// block-sum invariant and overwrite the blob value before handing + /// the Blockchain back to the caller. Without this, two validators + /// can converge on identical `accounts` (via B3 trie reconcile) but + /// keep divergent `total_minted` forever — exactly the 2026-05-24 + /// STATE-FP `fp`-divergence-with-matching-`acc` symptom. 
+ #[test] + fn test_b3b_repairs_stale_total_minted_on_load() { + let path = temp_db_path(); + let storage = Storage::open(&path).unwrap(); + + let mut bc = Blockchain::new("admin".to_string()); + bc.authority.add_validator_unchecked( + "val1".to_string(), + "V1".to_string(), + "pk1".to_string(), + ); + for _ in 0..3 { + let block = bc.create_block("val1").unwrap(); + bc.add_block(block).unwrap(); + } + let canonical_total = bc.total_minted; + + // Persist a corrupted view: blocks remain canonical, but the + // blob's total_minted is off by one unit (stand-in for a save + // that lagged behind apply, or a partial copy from a healthy + // host that shipped stale state; any mismatch must be repaired). + bc.total_minted = canonical_total - 1; + storage.save_blockchain(&bc).unwrap(); + + // Load via the production path — B3b must catch + repair. + let loaded = storage.load_blockchain().unwrap().unwrap(); + assert_eq!( + loaded.total_minted, canonical_total, + "B3b must repair stale total_minted from block sum" + ); + + let _ = std::fs::remove_dir_all(&path); + } + + /// `recompute_total_minted_from_blocks` must skip + count blocks that + /// are missing from storage instead of aborting. Forge that case by deleting a + /// block's entry from MDBX after a save, then re-running the + /// recompute. The result must equal `TOTAL_PREMINE + sum(coinbase + /// for the surviving blocks)` — i.e., the missing block's reward is + /// omitted (with a warning). The B3 fail-soft pattern (2026-05-20 testnet + /// incident) is the precedent for this posture: never refuse to + /// boot just because one block went missing on disk. 
+ #[test] + fn test_recompute_total_minted_skips_missing_blocks() { + use crate::address::TOTAL_PREMINE; + use crate::tokenomics::BLOCK_REWARD; + + let path = temp_db_path(); + let storage = Storage::open(&path).unwrap(); + + let mut bc = Blockchain::new("admin".to_string()); + bc.authority.add_validator_unchecked( + "val1".to_string(), + "V1".to_string(), + "pk1".to_string(), + ); + for _ in 0..3 { + let block = bc.create_block("val1").unwrap(); + bc.add_block(block).unwrap(); + } + storage.save_blockchain(&bc).unwrap(); + + // Yank block #2 out of MDBX. The block stays in `bc.chain` + // (in-memory window) but the persisted entry is gone, so the + // recompute loop hits `load_block(2) -> Ok(None)` and counts + // the gap. + let mdbx = storage.mdbx_arc(); + mdbx.delete(sentrix_storage::tables::TABLE_META, b"block:2") + .unwrap(); + + // Sum should be `TOTAL_PREMINE + 2 * BLOCK_REWARD` (block 1 + + // block 3; block 2 was deleted). + let recomputed = storage + .recompute_total_minted_from_blocks(&bc) + .unwrap(); + assert_eq!( + recomputed, + TOTAL_PREMINE + 2 * BLOCK_REWARD, + "missing block must be silently skipped (fail-soft per B3 precedent)" + ); + + let _ = std::fs::remove_dir_all(&path); + } }