diff --git a/crates/gl/src/clone.rs b/crates/gl/src/clone.rs index 93e998d..6754c5e 100644 --- a/crates/gl/src/clone.rs +++ b/crates/gl/src/clone.rs @@ -234,6 +234,22 @@ struct WithheldPathsResponse { reinclude: Vec, } +/// The node's `/encrypted-blobs` 200 body. Same rationale as `WithheldPathsResponse`: +/// deserializing into a typed struct makes a missing or mistyped `blobs` field (or a +/// blob entry missing its `oid`) a hard error instead of silently becoming "nothing +/// to recover", which would mask a server schema regression behind a clone that +/// quietly omits authorized files. Unknown server fields (e.g. `size`) are ignored +/// by serde, so the response may carry extra keys without breaking. +#[derive(Deserialize)] +struct EncryptedBlobsResponse { + blobs: Vec, +} + +#[derive(Deserialize)] +struct EncryptedBlobEntry { + oid: String, +} + /// After the base clone, recover encrypted blobs the caller is authorized for /// that are missing locally: fetch the envelope, decrypt with the caller's key, /// install as a loose object. Returns the repo-relative paths recovered. @@ -259,13 +275,8 @@ async fn recover_encrypted_blobs( Ok(r) if r.status().is_success() => r, _ => return Ok(vec![]), }; - let body: serde_json::Value = resp.json().await.context("parsing encrypted-blobs")?; - let blobs = body - .get("blobs") - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default(); - if blobs.is_empty() { + let body: EncryptedBlobsResponse = resp.json().await.context("parsing encrypted-blobs")?; + if body.blobs.is_empty() { return Ok(vec![]); } @@ -283,10 +294,8 @@ async fn recover_encrypted_blobs( } let mut recovered = Vec::new(); - for entry in blobs { - let Some(oid) = entry.get("oid").and_then(|o| o.as_str()) else { - continue; - }; + for entry in body.blobs { + let oid = entry.oid.as_str(); // Skip if already present locally. let present = Command::new("git") .args(["-C", dest_str, "cat-file", "-e", oid]) @@ -354,43 +363,151 @@ struct Manifest { blobs: Vec, } -/// Extract transaction ids from an Arweave GraphQL `transactions` response. -fn parse_tx_ids(v: &serde_json::Value) -> Vec { - v.get("data") - .and_then(|d| d.get("transactions")) +/// One discovered manifest transaction: its Arweave id and block height. The +/// height is `None` only when the anchor is not yet mined (no `block`), in which +/// case it is the most recent and sorts as newest in the merge tie-break. A mined +/// anchor is always `Some(_)` (0 if the gateway reported an unreadable height) so +/// a malformed height can never masquerade as pending/newest. +struct TxRef { + id: String, + height: Option, +} + +/// Edges requested per Arweave GraphQL page (`first:` in the discovery query) and +/// the per-page bound enforced when parsing a response. The `first:N` literal in +/// `ARWEAVE_DISCOVERY_QUERY` MUST equal this constant; `query_first_matches_page_size` +/// pins that invariant at test time (the query keeps a literal because `format!`-ing +/// the brace-heavy GraphQL body would be error-prone). +const ARWEAVE_PAGE_SIZE: usize = 100; + +/// Paginated discovery query: manifests tagged for this repo, newest first, with +/// `pageInfo` for cursor pagination and `block{height}` for the recency tie-break. +const ARWEAVE_DISCOVERY_QUERY: &str = r#"query($repo:String!,$cursor:String){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100,after:$cursor){pageInfo{hasNextPage endCursor}edges{cursor node{id block{height}}}}}"#; + +/// A parsed page of an Arweave GraphQL `transactions` response: the edge refs +/// plus pagination state. `has_next` is `None` when the response omitted +/// `pageInfo` entirely (an older or partial gateway), which the discovery loop +/// treats as terminal but flags when the page was full (possible truncation). +struct TxPage { + refs: Vec, + has_next: Option, + end_cursor: Option, +} + +/// Parse one Arweave GraphQL `transactions` page: each edge's tx id and block +/// height, plus `pageInfo` for cursor pagination. Hand-walks the `Value` (the +/// envelope nests `data.transactions.{edges,pageInfo}`) rather than deriving a +/// wrapper struct, matching the surrounding style. +/// +/// At most `ARWEAVE_PAGE_SIZE` edges are taken from a single page: the query asks +/// for `first:100`, so a response with more edges is a misbehaving (or hostile) +/// gateway. Bounding here caps the per-page `TxRef` vec and the downstream per-id +/// fetch loop so the `MAX_TX_IDS` budget can't be defeated by one oversized page. +/// (It does not bound the response body itself: `reqwest` buffers and deserializes +/// the whole body before this runs — an unbounded-body concern shared by every +/// gateway read in this file, addressed separately if it ever matters.) +fn parse_tx_page(v: &serde_json::Value) -> TxPage { + let txns = v.get("data").and_then(|d| d.get("transactions")); + let refs: Vec = txns .and_then(|t| t.get("edges")) .and_then(|e| e.as_array()) .map(|edges| { edges .iter() + .take(ARWEAVE_PAGE_SIZE) .filter_map(|edge| { - edge.get("node") - .and_then(|n| n.get("id")) - .and_then(|i| i.as_str()) - .map(String::from) + let node = edge.get("node")?; + let id = node.get("id").and_then(|i| i.as_str())?.to_string(); + // Distinguish "not yet mined" from "mined but height unreadable". + // Only a genuinely absent/null `block` is pending (`None`, + // ranked newest). A present `block` means the anchor IS mined, so + // it must not be promoted to newest: parse the height permissively + // (number or numeric string, guarding a gateway that stringifies + // large ints), and an unreadable height ranks lowest (0) rather + // than collapsing to `None`/newest. + let height = match node.get("block") { + None | Some(serde_json::Value::Null) => None, + Some(b) => Some( + b.get("height") + .and_then(|h| { + h.as_u64() + .or_else(|| h.as_str().and_then(|s| s.parse::().ok())) + }) + .unwrap_or(0), + ), + }; + Some(TxRef { id, height }) }) .collect() }) - .unwrap_or_default() + .unwrap_or_default(); + let page_info = txns.and_then(|t| t.get("pageInfo")); + let has_next = page_info + .and_then(|p| p.get("hasNextPage")) + .and_then(|h| h.as_bool()); + let end_cursor = page_info + .and_then(|p| p.get("endCursor")) + .and_then(|c| c.as_str()) + .map(String::from); + TxPage { + refs, + has_next, + end_cursor, + } +} + +/// Order a block height so that a not-yet-mined anchor (`None`) sorts above any +/// mined one, and mined anchors order by ascending height. Higher rank == newer. +fn height_rank(h: Option) -> (u8, u64) { + match h { + None => (1, 0), // pending anchor: newest + Some(v) => (0, v), // mined: by block height + } } -/// Merge per-push manifests into a single `oid -> cid` map, latest-wins by the -/// manifest `timestamp` (RFC3339, compared lexicographically; a later push that -/// re-sealed a blob overrides the earlier entry). -fn merge_manifests(manifests: Vec) -> std::collections::HashMap { - let mut best: std::collections::HashMap = - std::collections::HashMap::new(); // oid -> (cid, timestamp) - for m in manifests { +/// Merge per-push manifests into a single `oid -> cid` map. A later seal of an +/// oid overrides an earlier one, ranked by the composite key +/// `(timestamp, block height, cid)`: +/// - manifest `timestamp` (RFC3339, lexicographic) is primary — the unchanged +/// latest-wins behavior; +/// - Arweave block height breaks equal-timestamp ties by anchor recency (a +/// pending anchor counts as newest), preserving the "newest wins" that +/// single-page `HEIGHT_DESC` discovery used to provide by insertion order. +/// The height is gateway-reported (chain-derived but not independently +/// verified here), so it removes order-dependence and honest-node clock skew, +/// not trust in the gateway — discovery already assumes an honest gateway; +/// - cid is a final lexicographic tiebreak so two anchors in the *same* block +/// still resolve deterministically (arbitrary, not recency-ordered, within a +/// block), independent of discovery order. +fn merge_manifests( + manifests: Vec<(Manifest, Option)>, +) -> std::collections::HashMap { + // oid -> (cid, timestamp, height_rank) + let mut best: std::collections::HashMap = + std::collections::HashMap::new(); + for (m, height) in manifests { + let cand_rank = height_rank(height); for b in m.blobs { - match best.get(&b.oid) { - Some((_, ts)) if ts.as_str() >= m.timestamp.as_str() => {} - _ => { - best.insert(b.oid, (b.cid, m.timestamp.clone())); + let keep_incumbent = match best.get(&b.oid) { + // Incumbent wins iff its composite key is >= the candidate's. The + // `>=` (not `>`) keeps the incumbent only on a full tie, where the + // entries are identical anyway. A strictly-greater candidate key + // (newer timestamp, or equal timestamp + higher block, or equal + // both + larger cid) replaces it. + Some((cur_cid, cur_ts, cur_rank)) => { + (cur_ts.as_str(), cur_rank, cur_cid.as_str()) + >= (m.timestamp.as_str(), &cand_rank, b.cid.as_str()) } + None => false, + }; + if !keep_incumbent { + best.insert(b.oid, (b.cid, m.timestamp.clone(), cand_rank)); } } } - best.into_iter().map(|(oid, (cid, _))| (oid, cid)).collect() + best.into_iter() + .map(|(oid, (cid, _, _))| (oid, cid)) + .collect() } /// Option B3 fallback recovery, with no dependency on a gitlawb node API. Query @@ -424,36 +541,111 @@ async fn recover_from_arweave( .build() .unwrap_or_else(|_| reqwest::Client::new()); - // 1. Discover manifest transaction ids via Arweave GraphQL. - let query = r#"query($repo:String!){transactions(tags:[{name:"App-Name",values:["gitlawb"]},{name:"Schema",values:["gitlawb/encrypted-manifest/v1"]},{name:"Repo",values:[$repo]}],first:100){edges{node{id}}}}"#; - let gql_body = serde_json::json!({ "query": query, "variables": { "repo": slug } }); - let resp = match client - .post(format!("{ag}/graphql")) - .json(&gql_body) - .send() - .await - { - Ok(r) if r.status().is_success() => r, - _ => return Ok(vec![]), - }; - let gql: serde_json::Value = match resp.json().await { - Ok(v) => v, - Err(_) => return Ok(vec![]), - }; - let tx_ids = parse_tx_ids(&gql); - if tx_ids.is_empty() { + // 1. Discover manifest transactions via paginated Arweave GraphQL. Collect + // (tx id, block height) refs across every page, deduping by id and bounding + // the loop so a hostile or buggy gateway cannot grow the set, the page + // count, or the downstream per-id fetch loop without limit. Best-effort: a + // page-1 failure (gateway down/unconfigured) stays silent as before; a + // failure or cap reached *after* discovery began warns, since the recovery + // is then knowingly partial. + const MAX_PAGES: usize = 1000; + const MAX_TX_IDS: usize = 10_000; + let query = ARWEAVE_DISCOVERY_QUERY; + let mut after: Option = None; + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + let mut refs: Vec = Vec::new(); + let mut prev_cursor: Option = None; + let mut page_num = 0usize; + loop { + if page_num >= MAX_PAGES { + eprintln!( + "warning: Arweave manifest discovery hit the {MAX_PAGES}-page cap; \ + some authorized files may not be recovered" + ); + break; + } + page_num += 1; + let gql_body = + serde_json::json!({ "query": query, "variables": { "repo": slug, "cursor": after } }); + // Any gateway failure (non-2xx, send error, unparseable body) ends the loop. + let response = match client + .post(format!("{ag}/graphql")) + .json(&gql_body) + .send() + .await + { + Ok(r) if r.status().is_success() => r.json::().await.ok(), + _ => None, + }; + let value = match response { + Some(v) => v, + None => { + // A failure *after* discovery began means recovery is knowingly + // partial, so warn. A page-1 failure is the benign unconfigured/ + // unreachable-gateway case and stays silent, exactly as before. + if !refs.is_empty() { + eprintln!( + "warning: Arweave manifest discovery was interrupted by a gateway \ + error; some authorized files may not be recovered" + ); + } + break; + } + }; + let page = parse_tx_page(&value); + let full = page.refs.len() >= ARWEAVE_PAGE_SIZE; + for r in page.refs { + if seen.insert(r.id.clone()) { + refs.push(r); + } + } + // Cap checked at the page boundary so a partial page is never half-merged. + if refs.len() >= MAX_TX_IDS { + eprintln!( + "warning: Arweave manifest discovery hit the {MAX_TX_IDS}-transaction cap; \ + some authorized files may not be recovered" + ); + break; + } + match page.has_next { + Some(true) => match page.end_cursor { + // Advance only on a fresh cursor; a missing or repeating cursor is + // a degenerate gateway response and must not loop forever. + Some(c) if Some(&c) != prev_cursor.as_ref() => { + prev_cursor = Some(c.clone()); + after = Some(c); + } + _ => break, + }, + Some(false) => break, + // No pageInfo: terminal. A *full* page without it may be a silently + // truncated gateway, so flag that; a short page is genuinely the end. + None => { + if full { + eprintln!( + "warning: Arweave gateway returned a full page without pagination \ + metadata; discovery may be incomplete and some authorized files may \ + not be recovered" + ); + } + break; + } + } + } + if refs.is_empty() { return Ok(vec![]); } - // 2. Fetch and parse each manifest body, then merge latest-wins per oid. - let mut manifests = Vec::new(); - for tx in tx_ids { - let m = match client.get(format!("{ag}/{tx}")).send().await { - Ok(r) if r.status().is_success() => r, + // 2. Fetch each manifest body, pair it with its anchor's block height, then + // merge (latest-wins by timestamp, tie-broken by block height then cid). + let mut manifests: Vec<(Manifest, Option)> = Vec::new(); + for r in refs { + let m = match client.get(format!("{ag}/{}", r.id)).send().await { + Ok(resp) if resp.status().is_success() => resp, _ => continue, }; if let Ok(parsed) = m.json::().await { - manifests.push(parsed); + manifests.push((parsed, r.height)); } } let oid_cid = merge_manifests(manifests); @@ -547,7 +739,14 @@ pub async fn run(args: CloneArgs) -> Result<()> { // fallback for any authorized blobs the node could not supply. let mut paths = recover_encrypted_blobs(&args.node, &owner, &name, &dest, &keypair) .await - .unwrap_or_default(); + .unwrap_or_else(|e| { + // A node-recovery error must not abort the clone (the Arweave + // fallback still runs), but the strict /encrypted-blobs parse now + // fails closed on schema drift, so surface it rather than letting + // `.unwrap_or_default()` silently swallow it into "no paths". + eprintln!("warning: encrypted-blobs recovery failed: {e}"); + Vec::new() + }); let from_arweave = recover_from_arweave( &args.arweave_gateway, &args.ipfs_gateway, @@ -788,19 +987,113 @@ mod tests { } #[test] - fn parse_tx_ids_extracts_node_ids() { + fn encrypted_blobs_response_is_schema_strict() { + // Happy path: a well-formed body parses and exposes the oids. + let ok: EncryptedBlobsResponse = + serde_json::from_str(r#"{"blobs":[{"oid":"abc"}]}"#).unwrap(); + assert_eq!(ok.blobs.len(), 1); + assert_eq!(ok.blobs[0].oid, "abc"); + + // Unknown server fields on an entry are tolerated (no deny_unknown_fields). + let extra: EncryptedBlobsResponse = + serde_json::from_str(r#"{"blobs":[{"oid":"abc","size":42}]}"#).unwrap(); + assert_eq!(extra.blobs[0].oid, "abc"); + + // Empty list is valid and distinct from a missing field. + let empty: EncryptedBlobsResponse = serde_json::from_str(r#"{"blobs":[]}"#).unwrap(); + assert!(empty.blobs.is_empty()); + + // Schema drift is a hard error, not a silent "nothing to recover": + // missing `blobs`, wrong-typed `blobs`, and an entry missing `oid`. + assert!(serde_json::from_str::(r#"{"items":[]}"#).is_err()); + assert!(serde_json::from_str::(r#"{"blobs":"nope"}"#).is_err()); + assert!( + serde_json::from_str::(r#"{"blobs":[{"size":1}]}"#).is_err() + ); + } + + #[test] + fn parse_tx_page_extracts_node_ids_and_heights() { let v: serde_json::Value = serde_json::from_str( - r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}},{"node":{"id":"TX2"}}]}}}"#, + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":100}}},{"node":{"id":"TX2","block":null}}]}}}"#, ) .unwrap(); - assert_eq!(parse_tx_ids(&v), vec!["TX1".to_string(), "TX2".to_string()]); + let page = parse_tx_page(&v); + let ids: Vec<&str> = page.refs.iter().map(|r| r.id.as_str()).collect(); + assert_eq!(ids, vec!["TX1", "TX2"]); + assert_eq!(page.refs[0].height, Some(100)); + assert_eq!(page.refs[1].height, None); // null block -> pending -> None + assert_eq!(page.has_next, Some(true)); + assert_eq!(page.end_cursor.as_deref(), Some("C1")); } #[test] - fn parse_tx_ids_empty_on_no_edges() { + fn parse_tx_page_empty_on_no_edges() { let v: serde_json::Value = serde_json::from_str(r#"{"data":{"transactions":{"edges":[]}}}"#).unwrap(); - assert!(parse_tx_ids(&v).is_empty()); + let page = parse_tx_page(&v); + assert!(page.refs.is_empty()); + // No pageInfo present -> has_next None (the loop treats it as terminal). + assert_eq!(page.has_next, None); + assert_eq!(page.end_cursor, None); + } + + #[test] + fn parse_tx_page_missing_block_is_none() { + let v: serde_json::Value = + serde_json::from_str(r#"{"data":{"transactions":{"edges":[{"node":{"id":"TX1"}}]}}}"#) + .unwrap(); + let page = parse_tx_page(&v); + assert_eq!(page.refs[0].height, None); + } + + #[test] + fn parse_tx_page_height_parsing_is_robust() { + // A gateway that serializes height as a JSON string (a habit to dodge + // 53-bit float precision) must still parse to the real height, not + // collapse to None and masquerade as a pending/newest anchor. + let v: serde_json::Value = serde_json::from_str( + r#"{"data":{"transactions":{"edges":[ + {"node":{"id":"TXSTR","block":{"height":"12345"}}}, + {"node":{"id":"TXBAD","block":{"height":"not-a-number"}}}, + {"node":{"id":"TXNOH","block":{}}} + ]}}}"#, + ) + .unwrap(); + let page = parse_tx_page(&v); + // Numeric string parses to the real height. + assert_eq!(page.refs[0].height, Some(12345)); + // A present block with an unreadable/absent height is mined-but-unknown: + // it ranks lowest (Some(0)), NOT None/newest, so it can't steal a tie. + assert_eq!(page.refs[1].height, Some(0)); + assert_eq!(page.refs[2].height, Some(0)); + } + + #[test] + fn parse_tx_page_bounds_edges_per_page() { + // A hostile gateway returning more than ARWEAVE_PAGE_SIZE edges on one + // page must not defeat the MAX_TX_IDS budget: parse caps per-page refs so + // the downstream fetch loop stays bounded. + let edges: String = (0..ARWEAVE_PAGE_SIZE + 50) + .map(|i| format!(r#"{{"node":{{"id":"TX{i}"}}}}"#)) + .collect::>() + .join(","); + let v: serde_json::Value = serde_json::from_str(&format!( + r#"{{"data":{{"transactions":{{"edges":[{edges}]}}}}}}"# + )) + .unwrap(); + let page = parse_tx_page(&v); + assert_eq!(page.refs.len(), ARWEAVE_PAGE_SIZE); + } + + #[test] + fn query_first_matches_page_size() { + // The `first:N` the discovery query requests must equal the per-page bound + // enforced in parse_tx_page, or full-page detection and the cap drift. + assert!( + ARWEAVE_DISCOVERY_QUERY.contains(&format!("first:{ARWEAVE_PAGE_SIZE}")), + "discovery query `first:` must equal ARWEAVE_PAGE_SIZE ({ARWEAVE_PAGE_SIZE})" + ); } #[test] @@ -837,7 +1130,7 @@ mod tests { }, ], }; - let merged = merge_manifests(vec![older, newer]); + let merged = merge_manifests(vec![(older, Some(1)), (newer, Some(2))]); assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); assert_eq!(merged.get("o2").map(String::as_str), Some("cid2")); } @@ -859,10 +1152,109 @@ mod tests { }], }; // Newer first, older second: newer must still win. - let merged = merge_manifests(vec![newer, older]); + let merged = merge_manifests(vec![(newer, Some(2)), (older, Some(1))]); assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); } + // Helper: a single-blob manifest for the merge tie-break tests. + fn manifest_with(ts: &str, oid: &str, cid: &str) -> Manifest { + Manifest { + timestamp: ts.to_string(), + blobs: vec![ManifestBlob { + oid: oid.to_string(), + cid: cid.to_string(), + }], + } + } + + #[test] + fn merge_equal_timestamp_breaks_by_block_height() { + // Same timestamp, different block heights: the higher block (newer + // anchor) must win, regardless of input order. + let lo = manifest_with("2026-06-11T00:00:00Z", "o1", "cidLO"); + let hi = manifest_with("2026-06-11T00:00:00Z", "o1", "cidHI"); + let a = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidLO"), + Some(10), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidHI"), + Some(20), + ), + ]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidHI")); + // Reversed order, same result. + let b = merge_manifests(vec![(hi, Some(20)), (lo, Some(10))]); + assert_eq!(b.get("o1").map(String::as_str), Some("cidHI")); + } + + #[test] + fn merge_pending_anchor_is_newest_on_tie() { + // Same timestamp; one mined anchor, one pending (None height). The + // pending one is the most recent and must win, order-independently. + let mined = manifest_with("2026-06-11T00:00:00Z", "o1", "cidMINED"); + let pending = manifest_with("2026-06-11T00:00:00Z", "o1", "cidPENDING"); + let a = merge_manifests(vec![(mined, Some(99)), (pending, None)]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidPENDING")); + let mined2 = manifest_with("2026-06-11T00:00:00Z", "o1", "cidMINED"); + let pending2 = manifest_with("2026-06-11T00:00:00Z", "o1", "cidPENDING"); + let b = merge_manifests(vec![(pending2, None), (mined2, Some(99))]); + assert_eq!(b.get("o1").map(String::as_str), Some("cidPENDING")); + } + + #[test] + fn merge_same_block_breaks_by_cid_deterministically() { + // Equal timestamp AND equal height: the cid tiebreak makes the result + // deterministic and order-independent (lexicographically-larger cid wins). + let a = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidAAA"), + Some(5), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidBBB"), + Some(5), + ), + ]); + let b = merge_manifests(vec![ + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidBBB"), + Some(5), + ), + ( + manifest_with("2026-06-11T00:00:00Z", "o1", "cidAAA"), + Some(5), + ), + ]); + assert_eq!(a.get("o1"), b.get("o1")); + assert_eq!(a.get("o1").map(String::as_str), Some("cidBBB")); + } + + #[test] + fn merge_newer_timestamp_wins_regardless_of_height() { + // Timestamp is primary: a newer timestamp wins even with a lower block + // height than the older-timestamp manifest. + let older_hi_block = manifest_with("2026-06-10T00:00:00Z", "o1", "cidOLD"); + let newer_lo_block = manifest_with("2026-06-11T00:00:00Z", "o1", "cidNEW"); + let merged = merge_manifests(vec![ + (older_hi_block, Some(9999)), + (newer_lo_block, Some(1)), + ]); + assert_eq!(merged.get("o1").map(String::as_str), Some("cidNEW")); + } + + #[test] + fn merge_empty_timestamp_loses_to_timestamped() { + // A manifest with a missing/empty timestamp (serde default) sorts lowest + // and loses to any timestamped manifest for the same oid. Pins the + // empty-sorts-lowest precondition as a deliberate decision. + let untimestamped = manifest_with("", "o1", "cidEMPTY"); + let timestamped = manifest_with("2026-06-11T00:00:00Z", "o1", "cidTS"); + let a = merge_manifests(vec![(untimestamped, Some(50)), (timestamped, Some(1))]); + assert_eq!(a.get("o1").map(String::as_str), Some("cidTS")); + } + /// Read-path end-to-end over a mocked Arweave + IPFS gateway: discover the /// manifest via GraphQL, fetch it, fetch the envelope, decrypt with the /// caller's key, and install the previously-withheld blob. @@ -961,6 +1353,388 @@ mod tests { ); } + /// Discovery must follow cursor pagination: a blob whose manifest is anchored + /// only on the SECOND page is still recovered. If the loop stopped at page 1 + /// (the old `first:100` behavior), this blob would be lost. + #[tokio::test] + async fn recover_from_arweave_paginates_to_later_page() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 (cursor null): hasNextPage=true, endCursor C1, an empty manifest + // anchor that does NOT carry the withheld blob. + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TXP1","block":{"height":10}}}]}}}"#, + ) + .create_async() + .await; + // Page 2 (cursor "C1"): hasNextPage=false, the anchor with the blob. + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":false,"endCursor":"C2"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _tx_p1 = server + .mock("GET", "/TXP1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(r#"{"timestamp":"2026-06-10T00:00:00Z","blobs":[]}"#) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!( + paths, + vec!["secret/b.txt".to_string()], + "blob anchored only on page 2 must be recovered via pagination" + ); + } + + /// Best-effort on mid-pagination failure: page 1 already yielded a usable + /// anchor; page 2 fails. Recovery still installs page 1's blob (partial Ok), + /// never aborts the clone. + #[tokio::test] + async fn recover_from_arweave_partial_on_midpagination_failure() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 (cursor null): the blob anchor, hasNextPage=true. + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + // Page 2 (cursor "C1"): gateway error. + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(500) + .create_async() + .await; + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .expect("mid-pagination failure must not abort recovery"); + assert_eq!( + paths, + vec!["secret/b.txt".to_string()], + "page-1 blob must still be recovered when page 2 fails" + ); + } + + /// A gateway that reports `hasNextPage=true` but never advances the cursor + /// (returns the same `endCursor` every page) must not loop forever: the + /// degenerate-cursor guard terminates the loop and recovery returns based on + /// what was seen. The test completing at all proves no hang. + #[tokio::test] + async fn recover_from_arweave_terminates_on_nonadvancing_cursor() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Every GraphQL POST returns hasNextPage=true with the SAME endCursor. + // The guard fires on the second page (cursor repeats prev_cursor) and + // breaks, so discovery makes exactly two POSTs: page 1 (cursor null) records + // STUCK, page 2 (cursor STUCK) sees the repeat and stops. We assert that + // exact count below — a regression in the advance logic that loops extra + // times in the happy path fails the expect(2). (MAX_PAGES is the hard + // infinite-loop backstop independent of this guard.) + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let gql = server + .mock("POST", "/graphql") + .expect(2) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"STUCK"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _tx1 = server + .mock("GET", "/TX1") + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .expect("non-advancing cursor must terminate, not hang or error"); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + gql.assert_async().await; // exactly two discovery POSTs, then the guard breaks + } + + /// A tx id repeated across pages must be fetched only once: the cross-page + /// dedup (R8) bounds the downstream per-id manifest fetch loop. + #[tokio::test] + async fn recover_from_arweave_dedups_tx_ids_across_pages() { + use gitlawb_core::encrypt::seal_blob; + use gitlawb_core::identity::Keypair; + + let (td, url) = bare_remote(&[("public/a.txt", b"pub\n"), ("secret/b.txt", b"SECRET\n")]); + let dest = td.path().join("dest"); + let bare = url.strip_prefix("file://").unwrap(); + assert!(Command::new("git") + .args(["-C", bare, "config", "uploadpack.allowFilter", "true"]) + .status() + .unwrap() + .success()); + setup_partial_clone(&dest, &url, &["/secret/**".to_string()], &[], None).unwrap(); + let oid = { + let out = Command::new("git") + .args([ + "-C", + dest.to_str().unwrap(), + "rev-parse", + "HEAD:secret/b.txt", + ]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + std::fs::remove_dir_all(url.strip_prefix("file://").unwrap()).unwrap(); + + let reader = Keypair::generate(); + let envelope = seal_blob(b"SECRET\n", &[reader.verifying_key()]).unwrap(); + let cid = "testcid123"; + + let mut server = mockito::Server::new_async().await; + // Page 1 and page 2 both list TX1 (cursor advances); the dedup must fetch + // the /TX1 manifest exactly once. + let _gql_p1 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":null}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":true,"endCursor":"C1"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let _gql_p2 = server + .mock("POST", "/graphql") + .match_body(mockito::Matcher::PartialJsonString( + r#"{"variables":{"cursor":"C1"}}"#.into(), + )) + .with_status(200) + .with_header("content-type", "application/json") + .with_body( + r#"{"data":{"transactions":{"pageInfo":{"hasNextPage":false,"endCursor":"C2"},"edges":[{"node":{"id":"TX1","block":{"height":20}}}]}}}"#, + ) + .create_async() + .await; + let manifest_body = serde_json::json!({ + "timestamp": "2026-06-11T00:00:00Z", + "blobs": [{ "oid": oid, "cid": cid, "recipients": [] }], + }) + .to_string(); + let tx1 = server + .mock("GET", "/TX1") + .expect(1) // dedup: fetched once despite appearing on both pages + .with_status(200) + .with_header("content-type", "application/json") + .with_body(manifest_body) + .create_async() + .await; + let _blob = server + .mock("GET", format!("/ipfs/{cid}").as_str()) + .with_status(200) + .with_body(envelope) + .create_async() + .await; + + let paths = recover_from_arweave( + &server.url(), + &server.url(), + "alice", + "myrepo", + &dest, + &reader, + ) + .await + .unwrap(); + assert_eq!(paths, vec!["secret/b.txt".to_string()]); + tx1.assert_async().await; // exactly one /TX1 fetch + } + /// A caller who is not a recipient cannot decrypt the envelope, so nothing is /// recovered even though the manifest and envelope are reachable. #[tokio::test]