From a74d53f0a2204156280b0f86edfbd4ddf90c7c52 Mon Sep 17 00:00:00 2001 From: satyakwok <119509589+satyakwok@users.noreply.github.com> Date: Wed, 20 May 2026 06:45:29 +0200 Subject: [PATCH] fix(core): b3 reconcile skips missing trie nodes instead of crashing boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The trie-reconcile pass at boot used to propagate any `trie.get` error all the way out of `load_blockchain`, which turned a single dangling trie node into an unbootable validator — even when the other 99.99% of the trie was healthy and the chain was actively producing blocks before the restart. Hit live on testnet 2026-05-20 during a binary swap. One address (0x044daee895f2bb2ffecd4dd2e771f5ce405ab0ea at h=5003961) had a dangling reference to node 314e57bd...; the validator had been up for 5 hours serving that exact state without complaint, but the next boot crashed on phase-1 trie scan. All four testnet validators were stuck because they shared the same trie shape. The reconcile already has well-defined behavior for "no trie leaf for this address" — phase-2 keeps the blob view as canonical. Treat a failed `trie.get` the same way: log the gap, count it, surface a summary warning if any were skipped, then continue. Touched accounts will rewrite the trie node on next apply, so the gap closes on its own without operator intervention. This is strictly a fail-soft on a previously fatal path. Healthy chains never hit it; the only behavioral change is that broken state loads instead of refusing to load. Also updates the `reconcile_accounts_from_trie` docstring so it matches the new catch-and-continue semantics (was still claiming hard-fail). --- crates/sentrix-core/src/storage.rs | 43 ++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/crates/sentrix-core/src/storage.rs b/crates/sentrix-core/src/storage.rs index d1186dd1..09d28dc3 100644 --- a/crates/sentrix-core/src/storage.rs +++ b/crates/sentrix-core/src/storage.rs @@ -365,8 +365,11 @@ impl Storage { /// premine / genesis accounts that pre-date the first trie touch. /// We must not zero those out. /// - /// Trie-lookup errors (`Err(_)`) propagate: a corrupted trie is a - /// hard-fail, not silent fallback. + /// Trie-lookup errors (`Err(_)`) are logged and skipped: a missing + /// or corrupted trie node for one address no longer aborts boot. + /// The address is treated as having no trie leaf (blob value + /// preserved); the next block touching that account rewrites the + /// trie entry and closes the gap. fn reconcile_accounts_from_trie(bc: &mut Blockchain) -> SentrixResult<(usize, usize)> { // Build the candidate address set first — sorted + deduped so // the reconcile order is deterministic across runs (helps debug @@ -409,18 +412,42 @@ impl Storage { // Phase 1: read all trie leaves into a buffer. This avoids // holding the trie borrow while we mutate accounts in phase 2. + // + // 2026-05-20: a missing trie node here used to crash boot. Testnet + // hit this with one address having a dangling reference to node + // 314e57bd... at h=5003961; the other 99.99% of the trie was + // healthy and the chain had been producing for 5 hours. Refusing + // to boot turned one stale leaf into an unrecoverable validator. + // Fail-soft now: log the gap, skip the address (existing phase-2 + // logic treats a `None` leaf as "trie has no opinion, keep the + // blob"), and let the next block apply rewrite the entry. let mut trie_values: Vec<(String, Option<(u64, u64)>)> = Vec::with_capacity(addrs.len()); + let mut trie_gaps: usize = 0; for addr in &addrs { let key = address_to_key(addr); - let leaf = trie.get(&key).map_err(|e| { - SentrixError::Internal(format!( - "B3 reconcile: trie lookup for {addr} failed at h={}: {e}", - bc.chain.last().map(|b| b.index).unwrap_or(0) - )) - })?; + let leaf = match trie.get(&key) { + Ok(leaf) => leaf, + Err(e) => { + tracing::warn!( + "B3 reconcile: trie lookup for {addr} failed at h={}: {e} — \ + skipping reconcile for this address (will rewrite on next touch)", + bc.chain.last().map(|b| b.index).unwrap_or(0) + ); + trie_gaps += 1; + None + } + }; let decoded = leaf.and_then(|bytes| account_value_decode(&bytes)); trie_values.push((addr.clone(), decoded)); } + if trie_gaps > 0 { + tracing::warn!( + "B3 reconcile: skipped {trie_gaps}/{} addresses due to missing trie nodes \ + at h={}; chain will continue producing — touched accounts repair themselves", + addrs.len(), + bc.chain.last().map(|b| b.index).unwrap_or(0) + ); + } // Phase 2: apply repairs. let height = bc.chain.last().map(|b| b.index).unwrap_or(0);