From fff979b180b42b4f945c6fb38a54685afe49a3cc Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:47:35 -0500 Subject: [PATCH 1/4] fix(gl): make /encrypted-blobs recovery parsing schema-strict Deserialize the /encrypted-blobs 200 body into a typed EncryptedBlobsResponse instead of poking at a serde_json::Value and defaulting blobs to [] via unwrap_or_default(). A server schema regression now fails closed (a hard parse error) rather than silently yielding nothing to recover, mirroring the existing WithheldPathsResponse convention. The sole caller still does not abort the clone but now surfaces the error instead of swallowing it into an empty result. Part of #49. --- crates/gl/src/clone.rs | 64 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 93e998d..f427cb5 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -234,6 +234,22 @@ struct WithheldPathsResponse { reinclude: Vec, } +/// The node's `/encrypted-blobs` 200 body. Same rationale as `WithheldPathsResponse`: +/// deserializing into a typed struct makes a missing or mistyped `blobs` field (or a +/// blob entry missing its `oid`) a hard error instead of silently becoming "nothing +/// to recover", which would mask a server schema regression behind a clone that +/// quietly omits authorized files. Unknown server fields (e.g. `size`) are ignored +/// by serde, so the response may carry extra keys without breaking. +#[derive(Deserialize)] +struct EncryptedBlobsResponse { + blobs: Vec, +} + +#[derive(Deserialize)] +struct EncryptedBlobEntry { + oid: String, +} + /// After the base clone, recover encrypted blobs the caller is authorized for /// that are missing locally: fetch the envelope, decrypt with the caller's key, /// install as a loose object. Returns the repo-relative paths recovered. @@ -259,13 +275,8 @@ async fn recover_encrypted_blobs( Ok(r) if r.status().is_success() => r, _ => return Ok(vec![]), }; - let body: serde_json::Value = resp.json().await.context("parsing encrypted-blobs")?; - let blobs = body - .get("blobs") - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default(); - if blobs.is_empty() { + let body: EncryptedBlobsResponse = resp.json().await.context("parsing encrypted-blobs")?; + if body.blobs.is_empty() { return Ok(vec![]); } @@ -283,10 +294,8 @@ async fn recover_encrypted_blobs( } let mut recovered = Vec::new(); - for entry in blobs { - let Some(oid) = entry.get("oid").and_then(|o| o.as_str()) else { - continue; - }; + for entry in body.blobs { + let oid = entry.oid.as_str(); // Skip if already present locally. let present = Command::new("git") .args(["-C", dest_str, "cat-file", "-e", oid]) @@ -547,7 +556,14 @@ pub async fn run(args: CloneArgs) -> Result<()> { // fallback for any authorized blobs the node could not supply. let mut paths = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair) .await - .unwrap_or_default(); + .unwrap_or_else(|e| { + // A node-recovery error must not abort the clone (the Arweave + // fallback still runs), but the strict /encrypted-blobs parse now + // fails closed on schema drift, so surface it rather than letting + // `.unwrap_or_default()` silently swallow it into "no paths". + eprintln!("warning: encrypted-blobs recovery failed: {e}"); + Vec::new() + }); let from_arweave = recover_from_arweave( &args.arweave_gateway, &args.ipfs_gateway, @@ -787,6 +803,30 @@ mod tests { .is_err()); } + #[test] + fn encrypted_blobs_response_is_schema_strict() { + // Happy path: a well-formed body parses and exposes the oids. + let ok: EncryptedBlobsResponse = + serde_json::from_str(r#"{"blobs":[{"oid":"abc"}]}"#).unwrap(); + assert_eq!(ok.blobs.len(), 1); + assert_eq!(ok.blobs[0].oid, "abc"); + + // Unknown server fields on an entry are tolerated (no deny_unknown_fields). + let extra: EncryptedBlobsResponse = + serde_json::from_str(r#"{"blobs":[{"oid":"abc","size":42}]}"#).unwrap(); + assert_eq!(extra.blobs[0].oid, "abc"); + + // Empty list is valid and distinct from a missing field. + let empty: EncryptedBlobsResponse = serde_json::from_str(r#"{"blobs":[]}"#).unwrap(); + assert!(empty.blobs.is_empty()); + + // Schema drift is a hard error, not a silent "nothing to recover": + // missing `blobs`, wrong-typed `blobs`, and an entry missing `oid`. + assert!(serde_json::from_str::(r#"{"items":[]}"#).is_err()); + assert!(serde_json::from_str::(r#"{"blobs":"nope"}"#).is_err()); + assert!(serde_json::from_str::(r#"{"blobs":[{"size":1}]}"#).is_err()); + } + #[test] fn parse_tx_ids_extracts_node_ids() { let v: serde_json::Value = serde_json::from_str( From ea82a9a36bcd05d280548948faf83f78630ac40c Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:59:02 -0500 Subject: [PATCH 2/4] fix(gl): paginate Arweave manifest discovery and rank ties by block height Discovery hard-coded a single first:100 GraphQL query with no pagination, so a repo with more than 100 encrypted-manifest anchors (one per push) silently lost recovery coverage for any blob whose most recent seal fell outside that window. Follow cursor pagination (after + pageInfo) until coverage is complete, deduping transaction ids and bounding the loop (1000 pages / 10000 ids, degenerate-cursor guard) so a hostile or buggy gateway cannot grow the downstream per-id fetch loop without limit. A full page returned without pageInfo, a cap hit, or a failure after discovery began now warns; a page-1 failure (unconfigured/unreachable gateway) stays silent as before. Pagination concatenates pages in gateway order, which made merge_manifests' equal-timestamp tie-break (previously insertion-order, i.e. accidentally newest on single-page HEIGHT_DESC) gateway-dependent. Rank candidates by the composite key (timestamp, Arweave block height, cid): timestamp primary keeps the existing distinct-timestamp latest-wins; block height breaks equal-timestamp ties by anchor recency (a not-yet-mined anchor counts as newest), preserving the pre-pagination behavior; cid is a final deterministic tiebreak for same-block anchors. Height is parsed defensively so a present-but-unreadable height (e.g. a gateway stringifying large ints) cannot masquerade as a pending/newest anchor. Part of #49. --- crates/gl/src/clone.rs | 617 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 564 insertions(+), 53 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index f427cb5..dfb2b95 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -363,43 +363,131 @@ struct Manifest { blobs: Vec, } -/// Extract transaction ids from an Arweave GraphQL `transactions` response. -fn parse_tx_ids(v: &serde_json::Value) -> Vec { - v.get("data") - .and_then(|d| d.get("transactions")) +/// One discovered manifest transaction: its Arweave id and block height. The +/// height is `None` only when the anchor is not yet mined (no `block`), in which +/// case it is the most recent and sorts as newest in the merge tie-break. A mined +/// anchor is always `Some(_)` (0 if the gateway reported an unreadable height) so +/// a malformed height can never masquerade as pending/newest. +struct TxRef { + id: String, + height: Option, +} + +/// A parsed page of an Arweave GraphQL `transactions` response: the edge refs +/// plus pagination state. `has_next` is `None` when the response omitted +/// `pageInfo` entirely (an older or partial gateway), which the discovery loop +/// treats as terminal but flags when the page was full (possible truncation). +struct TxPage { + refs: Vec, + has_next: Option, + end_cursor: Option, +} + +/// Parse one Arweave GraphQL `transactions` page: each edge's tx id and block +/// height, plus `pageInfo` for cursor pagination. Hand-walks the `Value` (the +/// envelope nests `data.transactions.{edges,pageInfo}`) rather than deriving a +/// wrapper struct, matching the surrounding style. +fn parse_tx_page(v: &serde_json::Value) -> TxPage { + let txns = v.get("data").and_then(|d| d.get("transactions")); + let refs: Vec = txns .and_then(|t| t.get("edges")) .and_then(|e| e.as_array()) .map(|edges| { edges .iter() .filter_map(|edge| { - edge.get("node") - .and_then(|n| n.get("id")) - .and_then(|i| i.as_str()) - .map(String::from) + let node = edge.get("node")?; + let id = node.get("id").and_then(|i| i.as_str())?.to_string(); + // Distinguish "not yet mined" from "mined but height unreadable". + // Only a genuinely absent/null `block` is pending (`None`, + // ranked newest). A present `block` means the anchor IS mined, so + // it must not be promoted to newest: parse the height permissively + // (number or numeric string, guarding a gateway that stringifies + // large ints), and an unreadable height ranks lowest (0) rather + // than collapsing to `None`/newest. + let height = match node.get("block") { + None | Some(serde_json::Value::Null) => None, + Some(b) => Some( + b.get("height") + .and_then(|h| { + h.as_u64() + .or_else(|| h.as_str().and_then(|s| s.parse::().ok())) + }) + .unwrap_or(0), + ), + }; + Some(TxRef { id, height }) }) .collect() }) - .unwrap_or_default() + .unwrap_or_default(); + let page_info = txns.and_then(|t| t.get("pageInfo")); + let has_next = page_info + .and_then(|p| p.get("hasNextPage")) + .and_then(|h| h.as_bool()); + let end_cursor = page_info + .and_then(|p| p.get("endCursor")) + .and_then(|c| c.as_str()) + .map(String::from); + TxPage { + refs, + has_next, + end_cursor, + } } -/// Merge per-push manifests into a single `oid -> cid` map, latest-wins by the -/// manifest `timestamp` (RFC3339, compared lexicographically; a later push that -/// re-sealed a blob overrides the earlier entry). -fn merge_manifests(manifests: Vec) -> std::collections::HashMap { - let mut best: std::collections::HashMap = - std::collections::HashMap::new(); // oid -> (cid, timestamp) - for m in manifests { +/// Order a block height so that a not-yet-mined anchor (`None`) sorts above any +/// mined one, and mined anchors order by ascending height. Higher rank == newer. +fn height_rank(h: Option) -> (u8, u64) { + match h { + None => (1, 0), // pending anchor: newest + Some(v) => (0, v), // mined: by block height + } +} + +/// Merge per-push manifests into a single `oid -> cid` map. A later seal of an +/// oid overrides an earlier one, ranked by the composite key +/// `(timestamp, block height, cid)`: +/// - manifest `timestamp` (RFC3339, lexicographic) is primary — the unchanged +/// latest-wins behavior; +/// - Arweave block height breaks equal-timestamp ties by anchor recency (a +/// pending anchor counts as newest), preserving the "newest wins" that +/// single-page `HEIGHT_DESC` discovery used to provide by insertion order. +/// The height is gateway-reported (chain-derived but not independently +/// verified here), so it removes order-dependence and honest-node clock skew, +/// not trust in the gateway — discovery already assumes an honest gateway; +/// - cid is a final lexicographic tiebreak so two anchors in the *same* block +/// still resolve deterministically (arbitrary, not recency-ordered, within a +/// block), independent of discovery order. +fn merge_manifests( + manifests: Vec<(Manifest, Option)>, +) -> std::collections::HashMap { + // oid -> (cid, timestamp, height_rank) + let mut best: std::collections::HashMap = + std::collections::HashMap::new(); + for (m, height) in manifests { + let cand_rank = height_rank(height); for b in m.blobs { - match best.get(&b.oid) { - Some((_, ts)) if ts.as_str() >= m.timestamp.as_str() => {} - _ => { - best.insert(b.oid, (b.cid, m.timestamp.clone())); + let keep_incumbent = match best.get(&b.oid) { + // Incumbent wins iff its composite key is >= the candidate's. The + // `>=` (not `>`) keeps the incumbent only on a full tie, where the + // entries are identical anyway. A strictly-greater candidate key + // (newer timestamp, or equal timestamp + higher block, or equal + // both + larger cid) replaces it. + Some((cur_cid, cur_ts, cur_rank)) => { + (cur_ts.as_str(), cur_rank, cur_cid.as_str()) + >= (m.timestamp.as_str(), &cand_rank, b.cid.as_str()) } + None => false, + }; + if !keep_incumbent { + best.insert(b.oid, (b.cid, m.timestamp.clone(), cand_rank)); } } } - best.into_iter().map(|(oid, (cid, _))| (oid, cid)).collect() + best.into_iter() + .map(|(oid, (cid, _, _))| (oid, cid)) + .collect() } /// Option B3 fallback recovery, with no dependency on a gitlawb node API. Query @@ -433,36 +521,117 @@ async fn recover_from_arweave( .build() .unwrap_or_else(|_| reqwest::Client::new()); - // 1. Discover manifest transaction ids via Arweave GraphQL. - let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; - let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug } }); - let resp = match client - .post(format!("{ag}/graphql")) - .json(&gql_body) - .send() - .await - { - Ok(r) if r.status().is_success() => r, - _ => return Ok(vec![]), - }; - let gql: serde_json::Value = match resp.json().await { - Ok(v) => v, - Err(_) => return Ok(vec![]), - }; - let tx_ids = parse_tx_ids(&gql); - if tx_ids.is_empty() { + // 1. Discover manifest transactions via paginated Arweave GraphQL. Collect + // (tx id, block height) refs across every page, deduping by id and bounding + // the loop so a hostile or buggy gateway cannot grow the set, the page + // count, or the downstream per-id fetch loop without limit. Best-effort: a + // page-1 failure (gateway down/unconfigured) stays silent as before; a + // failure or cap reached *after* discovery began warns, since the recovery + // is then knowingly partial. + const MAX_PAGES: usize = 1000; + const MAX_TX_IDS: usize = 10_000; + const PAGE_SIZE: usize = 100; + let query = r#"query($repo:String!,$cursor:String){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100,after:$cursor){pageInfo{hasNextPage endCursor}edges{cursor node{id block{height}}}}}"#; + let mut after: Option = None; + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + let mut refs: Vec = Vec::new(); + let mut prev_cursor: Option = None; + let mut page_num = 0usize; + loop { + if page_num >= MAX_PAGES { + eprintln!( + "warning: Arweave manifest discovery hit the {MAX_PAGES}-page cap; \ + some authorized files may not be recovered" + ); + break; + } + page_num += 1; + let gql_body = + serde_json::json!({ "query": query, "variables": { "repo": slug, "cursor": after } }); + let value: serde_json::Value = match client + .post(format!("{ag}/graphql")) + .json(&gql_body) + .send() + .await + { + Ok(r) if r.status().is_success() => match r.json().await { + Ok(v) => v, + Err(_) => { + if !refs.is_empty() { + eprintln!( + "warning: Arweave manifest discovery was interrupted by a gateway \ + error; some authorized files may not be recovered" + ); + } + break; + } + }, + // Whole-gateway failure on the first page is the benign unconfigured/ + // unreachable case: stay silent, exactly as before. + _ => { + if !refs.is_empty() { + eprintln!( + "warning: Arweave manifest discovery was interrupted by a gateway \ + error; some authorized files may not be recovered" + ); + } + break; + } + }; + let page = parse_tx_page(&value); + let full = page.refs.len() >= PAGE_SIZE; + for r in page.refs { + if seen.insert(r.id.clone()) { + refs.push(r); + } + } + // Cap checked at the page boundary so a partial page is never half-merged. + if refs.len() >= MAX_TX_IDS { + eprintln!( + "warning: Arweave manifest discovery hit the {MAX_TX_IDS}-transaction cap; \ + some authorized files may not be recovered" + ); + break; + } + match page.has_next { + Some(true) => match page.end_cursor { + // Advance only on a fresh cursor; a missing or repeating cursor is + // a degenerate gateway response and must not loop forever. + Some(c) if Some(&c) != prev_cursor.as_ref() => { + prev_cursor = Some(c.clone()); + after = Some(c); + } + _ => break, + }, + Some(false) => break, + // No pageInfo: terminal. A *full* page without it may be a silently + // truncated gateway, so flag that; a short page is genuinely the end. + None => { + if full { + eprintln!( + "warning: Arweave gateway returned a full page without pagination \ + metadata; discovery may be incomplete and some authorized files may \ + not be recovered" + ); + } + break; + } + } + } + if refs.is_empty() { return Ok(vec![]); } - // 2. Fetch and parse each manifest body, then merge latest-wins per oid. - let mut manifests = Vec::new(); - for tx in tx_ids { - let m = match client.get(format!("{ag}/{tx}")).send().await { - Ok(r) if r.status().is_success() => r, + // 2. Fetch each manifest body, pair it with its anchor's block height, then + // merge (latest-wins by timestamp, tie-broken by block height then cid). + let mut manifests: Vec<(Manifest, Option)> = Vec::new(); + for r in refs { + let m = match client.get(format!("{ag}/{}", r.id)).send().await { + Ok(resp) if resp.status().is_success() => resp, _ => continue, }; if let Ok(parsed) = m.json::().await { - manifests.push(parsed); + manifests.push((parsed, r.height)); } } let oid_cid = merge_manifests(manifests); @@ -824,23 +993,66 @@ mod tests { // missing `blobs`, wrong-typed `blobs`, and an entry missing `oid`. assert!(serde_json::from_str::(r#"{"items":[]}"#).is_err()); assert!(serde_json::from_str::(r#"{"blobs":"nope"}"#).is_err()); - assert!(serde_json::from_str::(r#"{"blobs":[{"size":1}]}"#).is_err()); + assert!( + serde_json::from_str::(r#"{"blobs":[{"size":1}]}"#).is_err() + ); } #[test] - fn parse_tx_ids_extracts_node_ids() { + fn parse_tx_page_extracts_node_ids_and_heights() { let v: serde_json::Value = serde_json::from_str( - r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}},{"node":{"id":"TX2"}}]}}}"#, + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":100}}},{"node":{"id":"TX2","block":null}}]}}}"#, ) .unwrap(); - assert_eq!(parse_tx_ids(&v), vec!["TX1".to_string(), "TX2".to_string()]); + let page = parse_tx_page(&v); + let ids: Vec<&str> = page.refs.iter().map(|r| r.id.as_str()).collect(); + assert_eq!(ids, vec!["TX1", "TX2"]); + assert_eq!(page.refs[0].height, Some(100)); + assert_eq!(page.refs[1].height, None); // null block -> pending -> None + assert_eq!(page.has_next, Some(true)); + assert_eq!(page.end_cursor.as_deref(), Some("C1")); } #[test] - fn parse_tx_ids_empty_on_no_edges() { + fn parse_tx_page_empty_on_no_edges() { let v: serde_json::Value = serde_json::from_str(r#"{"data":{"transactions":{"edges":[]}}}"#).unwrap(); - assert!(parse_tx_ids(&v).is_empty()); + let page = parse_tx_page(&v); + assert!(page.refs.is_empty()); + // No pageInfo present -> has_next None (the loop treats it as terminal). + assert_eq!(page.has_next, None); + assert_eq!(page.end_cursor, None); + } + + #[test] + fn parse_tx_page_missing_block_is_none() { + let v: serde_json::Value = + serde_json::from_str(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .unwrap(); + let page = parse_tx_page(&v); + assert_eq!(page.refs[0].height, None); + } + + #[test] + fn parse_tx_page_height_parsing_is_robust() { + // A gateway that serializes height as a JSON string (a habit to dodge + // 53-bit float precision) must still parse to the real height, not + // collapse to None and masquerade as a pending/newest anchor. + let v: serde_json::Value = serde_json::from_str( + r#"{"data":{"transactions":{"edges":[ + {"node":{"id":"TXSTR","block":{"height":"12345"}}}, + {"node":{"id":"TXBAD","block":{"height":"not-a-number"}}}, + {"node":{"id":"TXNOH","block":{}}} + ]}}}"#, + ) + .unwrap(); + let page = parse_tx_page(&v); + // Numeric string parses to the real height. + assert_eq!(page.refs[0].height, Some(12345)); + // A present block with an unreadable/absent height is mined-but-unknown: + // it ranks lowest (Some(0)), NOT None/newest, so it can't steal a tie. + assert_eq!(page.refs[1].height, Some(0)); + assert_eq!(page.refs[2].height, Some(0)); } #[test] @@ -877,7 +1089,7 @@ mod tests { }, ], }; - let merged = merge_manifests(vec![older, newer]); + let merged = merge_manifests(vec![(older, Some(1)), (newer, Some(2))]); assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); assert_eq!(merged.get("o2").map(String::as_str), Some("cid2")); } @@ -899,10 +1111,109 @@ mod tests { }], }; // Newer first, older second: newer must still win. - let merged = merge_manifests(vec![newer, older]); + let merged = merge_manifests(vec![(newer, Some(2)), (older, Some(1))]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + } + + // Helper: a single-blob manifest for the merge tie-break tests. + fn manifest_with(ts: &str, oid: &str, cid: &str) -> Manifest { + Manifest { + timestamp: ts.to_string(), + blobs: vec![ManifestBlob { + oid: oid.to_string(), + cid: cid.to_string(), + }], + } + } + + #[test] + fn merge_equal_timestamp_breaks_by_block_height() { + // Same timestamp, different block heights: the higher block (newer + // anchor) must win, regardless of input order. + let lo = manifest_with("2026-06-11T00:00:00Z", "o1", "cidLO"); + let hi = manifest_with("2026-06-11T00:00:00Z", "o1", "cidHI"); + let a = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidLO"), + Some(10), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidHI"), + Some(20), + ), + ]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidHI")); + // Reversed order, same result. + let b = merge_manifests(vec![(hi, Some(20)), (lo, Some(10))]); + assert_eq!(b.get("o1").map(String::as_str), Some("cidHI")); + } + + #[test] + fn merge_pending_anchor_is_newest_on_tie() { + // Same timestamp; one mined anchor, one pending (None height). The + // pending one is the most recent and must win, order-independently. + let mined = manifest_with("2026-06-11T00:00:00Z", "o1", "cidMINED"); + let pending = manifest_with("2026-06-11T00:00:00Z", "o1", "cidPENDING"); + let a = merge_manifests(vec![(mined, Some(99)), (pending, None)]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidPENDING")); + let mined2 = manifest_with("2026-06-11T00:00:00Z", "o1", "cidMINED"); + let pending2 = manifest_with("2026-06-11T00:00:00Z", "o1", "cidPENDING"); + let b = merge_manifests(vec![(pending2, None), (mined2, Some(99))]); + assert_eq!(b.get("o1").map(String::as_str), Some("cidPENDING")); + } + + #[test] + fn merge_same_block_breaks_by_cid_deterministically() { + // Equal timestamp AND equal height: the cid tiebreak makes the result + // deterministic and order-independent (lexicographically-larger cid wins). + let a = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidAAA"), + Some(5), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidBBB"), + Some(5), + ), + ]); + let b = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidBBB"), + Some(5), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidAAA"), + Some(5), + ), + ]); + assert_eq!(a.get("o1"), b.get("o1")); + assert_eq!(a.get("o1").map(String::as_str), Some("cidBBB")); + } + + #[test] + fn merge_newer_timestamp_wins_regardless_of_height() { + // Timestamp is primary: a newer timestamp wins even with a lower block + // height than the older-timestamp manifest. + let older_hi_block = manifest_with("2026-06-10T00:00:00Z", "o1", "cidOLD"); + let newer_lo_block = manifest_with("2026-06-11T00:00:00Z", "o1", "cidNEW"); + let merged = merge_manifests(vec![ + (older_hi_block, Some(9999)), + (newer_lo_block, Some(1)), + ]); assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); } + #[test] + fn merge_empty_timestamp_loses_to_timestamped() { + // A manifest with a missing/empty timestamp (serde default) sorts lowest + // and loses to any timestamped manifest for the same oid. Pins the + // empty-sorts-lowest precondition as a deliberate decision. + let untimestamped = manifest_with("", "o1", "cidEMPTY"); + let timestamped = manifest_with("2026-06-11T00:00:00Z", "o1", "cidTS"); + let a = merge_manifests(vec![(untimestamped, Some(50)), (timestamped, Some(1))]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidTS")); + } + /// Read-path end-to-end over a mocked Arweave + IPFS gateway: discover the /// manifest via GraphQL, fetch it, fetch the envelope, decrypt with the /// caller's key, and install the previously-withheld blob. @@ -1001,6 +1312,206 @@ mod tests { ); } + /// Discovery must follow cursor pagination: a blob whose manifest is anchored + /// only on the SECOND page is still recovered. If the loop stopped at page 1 + /// (the old `first:100` behavior), this blob would be lost. + #[tokio::test] + async fn recover_from_arweave_paginates_to_later_page() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 (cursor null): hasNextPage=true, endCursor C1, an empty manifest + // anchor that does NOT carry the withheld blob. + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TXP1","block":{"height":10}}}]}}}"#, + ) + .create_async() + .await; + // Page 2 (cursor "C1"): hasNextPage=false, the anchor with the blob. + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":false,"endCursor":"C2"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _tx_p1 = server + .mock("GET", "/TXP1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"timestamp":"2026-06-10T00:00:00Z","blobs":[]}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!( + paths, + vec!["secret/b.txt".to_string()], + "blob anchored only on page 2 must be recovered via pagination" + ); + } + + /// Best-effort on mid-pagination failure: page 1 already yielded a usable + /// anchor; page 2 fails. Recovery still installs page 1's blob (partial Ok), + /// never aborts the clone. + #[tokio::test] + async fn recover_from_arweave_partial_on_midpagination_failure() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 (cursor null): the blob anchor, hasNextPage=true. + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + // Page 2 (cursor "C1"): gateway error. + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(500) + .create_async() + .await; + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .expect("mid-pagination failure must not abort recovery"); + assert_eq!( + paths, + vec!["secret/b.txt".to_string()], + "page-1 blob must still be recovered when page 2 fails" + ); + } + /// A caller who is not a recipient cannot decrypt the envelope, so nothing is /// recovered even though the manifest and envelope are reachable. #[tokio::test] From 82eeea1900a17c9f845296e3468d978118771881 Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 20 Jun 2026 15:08:05 -0500 Subject: [PATCH 3/4] fix(review): bound edges per Arweave page and cover degenerate-cursor/dedup Code-review follow-ups on the #49 recovery change: - Cap edges parsed per GraphQL page at ARWEAVE_PAGE_SIZE so a hostile gateway returning one oversized page cannot defeat the MAX_TX_IDS budget or flood the downstream per-id manifest fetch loop (adversarial review). - Collapse the two identical gateway-failure arms in the discovery loop into a single warn site, and hoist PAGE_SIZE to a named const tied to the query's first:100 (maintainability). - Add tests: per-page edge bound, non-advancing-cursor termination (infinite-loop guard), and cross-page tx-id dedup fetched exactly once (testing). Part of #49. --- crates/gl/src/clone.rs | 238 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 220 insertions(+), 18 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index dfb2b95..a7e679f 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -373,6 +373,11 @@ struct TxRef { height: Option, } +/// Edges requested per Arweave GraphQL page (`first:` in the discovery query) and +/// the per-page bound enforced when parsing a response. The `first:N` literal in +/// the query string MUST equal this constant. +const ARWEAVE_PAGE_SIZE: usize = 100; + /// A parsed page of an Arweave GraphQL `transactions` response: the edge refs /// plus pagination state. `has_next` is `None` when the response omitted /// `pageInfo` entirely (an older or partial gateway), which the discovery loop @@ -387,6 +392,11 @@ struct TxPage { /// height, plus `pageInfo` for cursor pagination. Hand-walks the `Value` (the /// envelope nests `data.transactions.{edges,pageInfo}`) rather than deriving a /// wrapper struct, matching the surrounding style. +/// +/// At most `ARWEAVE_PAGE_SIZE` edges are taken from a single page: the query asks +/// for `first:100`, so a response with more edges is a misbehaving (or hostile) +/// gateway. Bounding here caps per-page allocation and the downstream per-id fetch +/// loop so the `MAX_TX_IDS` budget can't be defeated by one oversized page. fn parse_tx_page(v: &serde_json::Value) -> TxPage { let txns = v.get("data").and_then(|d| d.get("transactions")); let refs: Vec = txns @@ -395,6 +405,7 @@ fn parse_tx_page(v: &serde_json::Value) -> TxPage { .map(|edges| { edges .iter() + .take(ARWEAVE_PAGE_SIZE) .filter_map(|edge| { let node = edge.get("node")?; let id = node.get("id").and_then(|i| i.as_str())?.to_string(); @@ -530,7 +541,9 @@ async fn recover_from_arweave( // is then knowingly partial. const MAX_PAGES: usize = 1000; const MAX_TX_IDS: usize = 10_000; - const PAGE_SIZE: usize = 100; + // `first:100` below must equal ARWEAVE_PAGE_SIZE (the per-page bound enforced + // in `parse_tx_page`); the query string keeps the literal because formatting + // it would mean escaping the brace-heavy GraphQL body. let query = r#"query($repo:String!,$cursor:String){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100,after:$cursor){pageInfo{hasNextPage endCursor}edges{cursor node{id block{height}}}}}"#; let mut after: Option = None; let mut seen: std::collections::HashSet = std::collections::HashSet::new(); @@ -548,27 +561,22 @@ async fn recover_from_arweave( page_num += 1; let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug, "cursor": after } }); - let value: serde_json::Value = match client + // Any gateway failure (non-2xx, send error, unparseable body) ends the loop. + let response = match client .post(format!("{ag}/graphql")) .json(&gql_body) .send() .await { - Ok(r) if r.status().is_success() => match r.json().await { - Ok(v) => v, - Err(_) => { - if !refs.is_empty() { - eprintln!( - "warning: Arweave manifest discovery was interrupted by a gateway \ - error; some authorized files may not be recovered" - ); - } - break; - } - }, - // Whole-gateway failure on the first page is the benign unconfigured/ - // unreachable case: stay silent, exactly as before. - _ => { + Ok(r) if r.status().is_success() => r.json::().await.ok(), + _ => None, + }; + let value = match response { + Some(v) => v, + None => { + // A failure *after* discovery began means recovery is knowingly + // partial, so warn. A page-1 failure is the benign unconfigured/ + // unreachable-gateway case and stays silent, exactly as before. if !refs.is_empty() { eprintln!( "warning: Arweave manifest discovery was interrupted by a gateway \ @@ -579,7 +587,7 @@ async fn recover_from_arweave( } }; let page = parse_tx_page(&value); - let full = page.refs.len() >= PAGE_SIZE; + let full = page.refs.len() >= ARWEAVE_PAGE_SIZE; for r in page.refs { if seen.insert(r.id.clone()) { refs.push(r); @@ -1055,6 +1063,23 @@ mod tests { assert_eq!(page.refs[2].height, Some(0)); } + #[test] + fn parse_tx_page_bounds_edges_per_page() { + // A hostile gateway returning more than ARWEAVE_PAGE_SIZE edges on one + // page must not defeat the MAX_TX_IDS budget: parse caps per-page refs so + // the downstream fetch loop stays bounded. + let edges: String = (0..ARWEAVE_PAGE_SIZE + 50) + .map(|i| format!(r#"{{"node":{{"id":"TX{i}"}}}}"#)) + .collect::>() + .join(","); + let v: serde_json::Value = serde_json::from_str(&format!( + r#"{{"data":{{"transactions":{{"edges":[{edges}]}}}}}}"# + )) + .unwrap(); + let page = parse_tx_page(&v); + assert_eq!(page.refs.len(), ARWEAVE_PAGE_SIZE); + } + #[test] fn manifest_parses_and_ignores_recipients() { let m: Manifest = serde_json::from_str( @@ -1512,6 +1537,183 @@ mod tests { ); } + /// A gateway that reports `hasNextPage=true` but never advances the cursor + /// (returns the same `endCursor` every page) must not loop forever: the + /// degenerate-cursor guard terminates the loop and recovery returns based on + /// what was seen. The test completing at all proves no hang. + #[tokio::test] + async fn recover_from_arweave_terminates_on_nonadvancing_cursor() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Every GraphQL POST returns hasNextPage=true with the SAME endCursor. + // Without the guard this would request pages until MAX_PAGES; with it the + // loop breaks after the first non-advancing cursor. + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _gql = server + .mock("POST", "/graphql") + .expect_at_most(3) // a couple at most; certainly not MAX_PAGES + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"STUCK"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .expect("non-advancing cursor must terminate, not hang or error"); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + } + + /// A tx id repeated across pages must be fetched only once: the cross-page + /// dedup (R8) bounds the downstream per-id manifest fetch loop. + #[tokio::test] + async fn recover_from_arweave_dedups_tx_ids_across_pages() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 and page 2 both list TX1 (cursor advances); the dedup must fetch + // the /TX1 manifest exactly once. + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":false,"endCursor":"C2"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let tx1 = server + .mock("GET", "/TX1") + .expect(1) // dedup: fetched once despite appearing on both pages + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + tx1.assert_async().await; // exactly one /TX1 fetch + } + /// A caller who is not a recipient cannot decrypt the envelope, so nothing is /// recovered even though the manifest and envelope are reachable. #[tokio::test] From 3a06b27d9061c3666b5dfdb7a53f77a24b94bfdc Mon Sep 17 00:00:00 2001 From: beardthelion <56458543+beardthelion@users.noreply.github.com> Date: Sat, 20 Jun 2026 15:16:48 -0500 Subject: [PATCH 4/4] fix(review): make the degenerate-cursor test assert, and pin the page-size invariant Round-2 review follow-ups: - The non-advancing-cursor test bound its mock to a _-prefixed handle with expect_at_most(3) and never asserted it; mockito defaults assert_on_drop=false, so the test passed even if the guard were removed. Bind it, tighten to expect(2) (the guard breaks on the second page, so discovery makes exactly two POSTs), and assert_async. - Correct the parse_tx_page docstring: the per-page take() bounds the TxRef vec and fetch loop, not the response-body deserialization (an unbounded-body concern shared by every gateway read here). - Hoist the discovery query to a const and add query_first_matches_page_size so the query first:N literal and ARWEAVE_PAGE_SIZE are kept equal by a test rather than a comment. Part of #49. --- crates/gl/src/clone.rs | 43 +++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index a7e679f..6754c5e 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -375,9 +375,15 @@ struct TxRef { /// Edges requested per Arweave GraphQL page (`first:` in the discovery query) and /// the per-page bound enforced when parsing a response. The `first:N` literal in -/// the query string MUST equal this constant. +/// `ARWEAVE_DISCOVERY_QUERY` MUST equal this constant; `query_first_matches_page_size` +/// pins that invariant at test time (the query keeps a literal because `format!`-ing +/// the brace-heavy GraphQL body would be error-prone). const ARWEAVE_PAGE_SIZE: usize = 100; +/// Paginated discovery query: manifests tagged for this repo, newest first, with +/// `pageInfo` for cursor pagination and `block{height}` for the recency tie-break. +const ARWEAVE_DISCOVERY_QUERY: &str = r#"query($repo:String!,$cursor:String){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100,after:$cursor){pageInfo{hasNextPage endCursor}edges{cursor node{id block{height}}}}}"#; + /// A parsed page of an Arweave GraphQL `transactions` response: the edge refs /// plus pagination state. `has_next` is `None` when the response omitted /// `pageInfo` entirely (an older or partial gateway), which the discovery loop @@ -395,8 +401,11 @@ struct TxPage { /// /// At most `ARWEAVE_PAGE_SIZE` edges are taken from a single page: the query asks /// for `first:100`, so a response with more edges is a misbehaving (or hostile) -/// gateway. Bounding here caps per-page allocation and the downstream per-id fetch -/// loop so the `MAX_TX_IDS` budget can't be defeated by one oversized page. +/// gateway. Bounding here caps the per-page `TxRef` vec and the downstream per-id +/// fetch loop so the `MAX_TX_IDS` budget can't be defeated by one oversized page. +/// (It does not bound the response body itself: `reqwest` buffers and deserializes +/// the whole body before this runs — an unbounded-body concern shared by every +/// gateway read in this file, addressed separately if it ever matters.) fn parse_tx_page(v: &serde_json::Value) -> TxPage { let txns = v.get("data").and_then(|d| d.get("transactions")); let refs: Vec = txns @@ -541,10 +550,7 @@ async fn recover_from_arweave( // is then knowingly partial. const MAX_PAGES: usize = 1000; const MAX_TX_IDS: usize = 10_000; - // `first:100` below must equal ARWEAVE_PAGE_SIZE (the per-page bound enforced - // in `parse_tx_page`); the query string keeps the literal because formatting - // it would mean escaping the brace-heavy GraphQL body. - let query = r#"query($repo:String!,$cursor:String){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100,after:$cursor){pageInfo{hasNextPage endCursor}edges{cursor node{id block{height}}}}}"#; + let query = ARWEAVE_DISCOVERY_QUERY; let mut after: Option = None; let mut seen: std::collections::HashSet = std::collections::HashSet::new(); let mut refs: Vec = Vec::new(); @@ -1080,6 +1086,16 @@ mod tests { assert_eq!(page.refs.len(), ARWEAVE_PAGE_SIZE); } + #[test] + fn query_first_matches_page_size() { + // The `first:N` the discovery query requests must equal the per-page bound + // enforced in parse_tx_page, or full-page detection and the cap drift. + assert!( + ARWEAVE_DISCOVERY_QUERY.contains(&format!("first:{ARWEAVE_PAGE_SIZE}")), + "discovery query `first:` must equal ARWEAVE_PAGE_SIZE ({ARWEAVE_PAGE_SIZE})" + ); + } + #[test] fn manifest_parses_and_ignores_recipients() { let m: Manifest = serde_json::from_str( @@ -1575,16 +1591,20 @@ mod tests { let mut server = mockito::Server::new_async().await; // Every GraphQL POST returns hasNextPage=true with the SAME endCursor. - // Without the guard this would request pages until MAX_PAGES; with it the - // loop breaks after the first non-advancing cursor. + // The guard fires on the second page (cursor repeats prev_cursor) and + // breaks, so discovery makes exactly two POSTs: page 1 (cursor null) records + // STUCK, page 2 (cursor STUCK) sees the repeat and stops. We assert that + // exact count below — a regression in the advance logic that loops extra + // times in the happy path fails the expect(2). (MAX_PAGES is the hard + // infinite-loop backstop independent of this guard.) let manifest_body = serde_json::json!({ "timestamp": "2026-06-11T00:00:00Z", "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], }) .to_string(); - let _gql = server + let gql = server .mock("POST", "/graphql") - .expect_at_most(3) // a couple at most; certainly not MAX_PAGES + .expect(2) .with_status(200) .with_header("content-type", "application/json") .with_body( @@ -1617,6 +1637,7 @@ mod tests { .await .expect("non-advancing cursor must terminate, not hang or error"); assert_eq!(paths, vec!["secret/b.txt".to_string()]); + gql.assert_async().await; // exactly two discovery POSTs, then the guard breaks } /// A tx id repeated across pages must be fetched only once: the cross-page