diff --git a/cmd/elastickv-snapshot-decode/main.go b/cmd/elastickv-snapshot-decode/main.go index 912f0b2ce..3359e2eaf 100644 --- a/cmd/elastickv-snapshot-decode/main.go +++ b/cmd/elastickv-snapshot-decode/main.go @@ -275,6 +275,7 @@ func emitManifest(cfg *config, res backup.DecodeResult) error { IncludeOrphans: cfg.includeOrphans, PreserveSQSVisibility: cfg.preserveSQSVisibility, IncludeSQSSideRecords: cfg.includeSQSSideRecords, + RenameS3Collisions: cfg.renameCollisions, } if cfg.bundleJSONL { m.DynamoDBLayout = backup.DynamoDBLayoutJSONL diff --git a/cmd/elastickv-snapshot-encode/main.go b/cmd/elastickv-snapshot-encode/main.go new file mode 100644 index 000000000..d3905d769 --- /dev/null +++ b/cmd/elastickv-snapshot-encode/main.go @@ -0,0 +1,842 @@ +// Command elastickv-snapshot-encode is the Phase 0b M6 snapshot encoder +// described in docs/design/2026_06_01_proposed_snapshot_encode_cli.md +// (parent: docs/design/2026_05_25_partial_snapshot_logical_encoder.md). +// +// It reads a vendor-independent per-adapter directory tree (produced by +// elastickv-snapshot-decode or by a future Phase 1 live extractor) and +// writes a native EKVPBBL1 .fsm a stopped node can load via the +// stop-replace-restart restore runbook (parent §"Restore via +// stop-replace-restart"). +// +// The CLI is offline-only. It does not talk to a running cluster; the +// receiving cluster loads the output .fsm via its existing snapshot +// loader on next restart. +// +// Atomic publish: the .fsm is written to <output>.tmp-<rand> first, +// fsync+close, then renamed to <output> only after the optional +// self-test matches. A self-test failure removes the temp file, so a +// known-bad .fsm never reaches the restore path (codex P2 v2 #896). +// +// version is stamped at build time via -ldflags "-X main.version=$(git rev-parse HEAD)". +// Test builds keep the literal "dev" so CLI-level tests can assert the +// field is present without depending on a release tag. 
+package main + +import ( + "crypto/rand" + "encoding/hex" + "flag" + "io" + "log/slog" + "os" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/bootjp/elastickv/internal/backup" + "github.com/cockroachdb/errors" +) + +var version = "dev" + +const ( + exitSuccess = 0 + exitUserErr = 1 + exitDataErr = 2 + // tempSuffixHexLen is the hex-character length of the random + // suffix appended to .tmp-; 16 hex chars = 8 bytes of + // entropy = 2^64 collision space per --output path. The earlier + // 8-hex/4-byte form was flagged as collision-prone in highly + // concurrent CI environments (gemini medium #904); 8 bytes is the + // same width crypto/rand.Read pads cryptographic nonces to. + tempSuffixHexLen = 16 + tempSuffixByteLen = tempSuffixHexLen / 2 + // mismatchTxtPerm + sidecar perm constants were removed in v25: + // both writes now go through backup.OpenSidecarFile which fixes + // the mode at 0o600 internally (codex P2 v25 #904 — no-follow + // open required different syscall semantics than os.OpenFile + + // O_TRUNC, so the perm now lives in the helper). + encodeInfoFilePerm = 0o600 +) + +type config struct { + inputPath string + outputPath string + adapters backup.AdapterSet + lastCommitTSPresent bool + lastCommitTS uint64 + selfTest bool + scratchRoot string +} + +func main() { + logger := slog.New(slog.NewTextHandler(os.Stderr, nil)) + exitCode, err := run(os.Args[1:], logger) + if err != nil { + logger.Error("elastickv-snapshot-encode", "err", err) + } + os.Exit(exitCode) +} + +func run(argv []string, logger *slog.Logger) (int, error) { + cfg, err := parseFlags(argv) + if err != nil { + return exitUserErr, err + } + if err := encodeOne(cfg, logger); err != nil { + return classifyEncodeError(err), err + } + return exitSuccess, nil +} + +// classifyEncodeError maps the encodeOne return value to a CLI exit +// code. 
Data-correctness sentinels (HLC ceiling regression, JSONL +// layout, unsupported manifest exclusion flags, adapter scope +// mismatch with manifest, adapter rejecting input-tree contents, +// self-test mismatch, corrupt manifest) → exit 2; everything else +// → exit 1. Runbooks branch on exit status to triage bad-dump-data +// vs operator typos, so this mapping is part of the CLI contract. +// +// Sources of each sentinel: +// - ErrSelfTestLowerLastCommitTS: CLI resolveLastCommitTS + library +// validateEncodeOptionsData (codex P2 v2 #904) +// - ErrEncodeUnsupportedDynamoDBLayout: validateEncodeOptionsData +// (codex P2 v7 #904) +// - ErrEncodeUnsupportedS3IncompleteUploads: validateEncodeOptionsUnsupportedFeatures +// (codex P2 v21 #904) +// - ErrEncodeUnsupportedS3Orphans: validateEncodeOptionsUnsupportedFeatures +// (codex P2 v21 #904) +// - ErrEncodeUnsupportedSQSPreserveVisibility: validateEncodeOptionsUnsupportedFeatures +// (codex P2 v21 #904) +// - ErrEncodeAdapterData: runAdapterEncoders mark on adapter +// rejection (codex P2 v9 #904) +// - errSelfTestMismatch: writeAndPublish self-test branch +// - ErrInvalidManifest / ErrUnsupportedFormatVersion: readInputManifest +// surfacing backup.ReadManifest sentinels (codex P2 v14 #904) +// - errAdapterNotInManifest: validateAdaptersAgainstManifest when +// a selected adapter has a nil manifest scope pointer (codex P2 +// v26 #904 scenario B; retracted v27 scenario A in v30 per codex +// P1 v29 #904) +func classifyEncodeError(err error) int { + switch { + case errors.Is(err, backup.ErrSelfTestLowerLastCommitTS), + errors.Is(err, backup.ErrEncodeUnsupportedDynamoDBLayout), + errors.Is(err, backup.ErrEncodeUnsupportedS3IncompleteUploads), + errors.Is(err, backup.ErrEncodeUnsupportedS3Orphans), + errors.Is(err, backup.ErrEncodeUnsupportedSQSPreserveVisibility), + errors.Is(err, backup.ErrEncodeAdapterData), + errors.Is(err, errSelfTestMismatch), + errors.Is(err, backup.ErrInvalidManifest), + errors.Is(err, 
backup.ErrUnsupportedFormatVersion), + errors.Is(err, errAdapterNotInManifest): + return exitDataErr + default: + return exitUserErr + } +} + +// validateAdaptersAgainstManifest checks each enabled adapter +// against the nil/non-nil manifest scope pointer (manifest.Adapters.). +// One failure mode, routed to exit 2: +// +// - Manifest lists no scope (nil pointer) for an enabled adapter +// (codex P2 v26 #904 scenario B). A truncated/wrong manifest +// combined with the default `--adapter dynamodb,s3,redis,sqs` +// would otherwise pick up a stale on-disk subdir for an adapter +// the producer did not dump. +// +// Scenario A (non-nil scope but on-disk subdir missing) cannot be +// detected from the manifest alone because the decoder's +// populateAdapterScopes defers scope enumeration and always writes +// an empty &Adapter{} regardless of record count (codex P1 v29 #904 +// pulled the v27/v28/v29 stat-and-readdir checks). Future work: +// add a SHA / record-count manifest field so scenario A becomes +// detectable. See checkOneAdapterScope's doc for the full per-shape +// decision matrix. +// +// A nil manifest.Adapters is treated as "manifest has no scopes for +// any adapter" — every enabled adapter trips the guard. Older +// manifests that omit the Adapters block deliberately are expected +// to pass `--adapter` set to only what they DO contain; that case +// is operator-driven and surfaces the same fail-closed error here. 
+func validateAdaptersAgainstManifest(selected backup.AdapterSet, m backup.Manifest, inputPath string) error { + checks := []struct { + name string + selected bool + scope *backup.Adapter + subdir string + }{ + {"dynamodb", selected.DynamoDB, manifestAdapterField(m.Adapters, "dynamodb"), "dynamodb"}, + {"s3", selected.S3, manifestAdapterField(m.Adapters, "s3"), "s3"}, + {"redis", selected.Redis, manifestAdapterField(m.Adapters, "redis"), "redis"}, + {"sqs", selected.SQS, manifestAdapterField(m.Adapters, "sqs"), "sqs"}, + } + for _, c := range checks { + if err := checkOneAdapterScope(c.name, c.selected, c.scope, filepath.Join(inputPath, c.subdir)); err != nil { + return err + } + } + return nil +} + +// manifestAdapterField returns the *Adapter for one adapter name from +// m.Adapters, or nil if m.Adapters or the specific adapter pointer is +// nil. Centralized so validateAdaptersAgainstManifest's table stays +// readable. +func manifestAdapterField(a *backup.Adapters, name string) *backup.Adapter { + if a == nil { + return nil + } + switch name { + case "dynamodb": + return a.DynamoDB + case "s3": + return a.S3 + case "redis": + return a.Redis + case "sqs": + return a.SQS + default: + return nil + } +} + +// checkOneAdapterScope is the per-adapter half of +// validateAdaptersAgainstManifest. The decoder's populateAdapterScopes +// explicitly defers scope enumeration and writes `&Adapter{}` (empty) +// for every enabled adapter regardless of whether records were +// dumped, so the manifest's scope CONTENT cannot distinguish +// "this adapter had no records" from "this adapter had records but +// the decoder didn't enumerate them" (codex P1 v29 #904 corrected +// v27/v28/v29's over-eager subdir stat-and-readdir checks). 
+// +// The only sound nil/non-nil signal is the per-adapter POINTER +// (`manifest.Adapters.`): +// +// - scope == nil → fail (producer did NOT enable this adapter; any +// on-disk subdir is stale and would otherwise be encoded under +// the default `--adapter all`, codex P2 v26 #904 scenario B). +// - scope != nil → pass (producer enabled the adapter; trust the +// manifest contract regardless of the on-disk subdir's +// presence/contents). +// +// Detecting truncated dumps (codex P2 v26 #904 scenario A: scope +// non-nil but on-disk subdir lost) needs SHA / record-count +// verification at the producer side; the manifest alone cannot +// surface it. Tracked as future work. +// +// subdirPath is intentionally unused now but kept in the signature so +// a future check that pairs the manifest with a SHA index doesn't +// need a call-site refactor. +func checkOneAdapterScope(name string, selected bool, scope *backup.Adapter, _ string) error { + if !selected { + return nil + } + if scope == nil { + return errors.Wrapf(errAdapterNotInManifest, + "adapter %q selected but MANIFEST.json has no scope for it (use --adapter to restrict, or re-dump including this adapter)", + name) + } + return nil +} + +func parseFlags(argv []string) (*config, error) { + fs := flag.NewFlagSet("elastickv-snapshot-encode", flag.ContinueOnError) + fs.SetOutput(io.Discard) + + var ( + inputPath string + outputPath string + adapterCSV string + ltsRaw string + selfTest bool + scratchRoot string + ) + fs.StringVar(&inputPath, "input", "", "Directory tree root produced by elastickv-snapshot-decode (required, must contain MANIFEST.json)") + fs.StringVar(&outputPath, "output", "", "Destination .fsm file path (required)") + fs.StringVar(&adapterCSV, "adapter", "dynamodb,s3,redis,sqs", "Comma-separated subset of adapters to encode") + fs.StringVar(<sRaw, "last-commit-ts", "", "Override the manifest's last_commit_ts; must be >= manifest value (HLC ceiling can only rise)") + fs.BoolVar(&selfTest, 
"self-test", false, "After encode, decode the produced .fsm and assert it structurally matches --input") + fs.StringVar(&scratchRoot, "scratch-root", "", "Base directory for self-test scratch subdir (default os.TempDir); a unique encode-self-test- subdir is always created underneath") + + if err := fs.Parse(argv); err != nil { + return nil, errors.WithStack(err) + } + if inputPath == "" { + return nil, errors.New("--input is required") + } + if outputPath == "" { + return nil, errors.New("--output is required") + } + adapters, err := parseAdapterSet(adapterCSV) + if err != nil { + return nil, err + } + cfg := &config{ + inputPath: inputPath, + outputPath: outputPath, + adapters: adapters, + selfTest: selfTest, + scratchRoot: scratchRoot, + } + if ltsRaw != "" { + ts, perr := parseLastCommitTS(ltsRaw) + if perr != nil { + return nil, perr + } + cfg.lastCommitTSPresent = true + cfg.lastCommitTS = ts + } + return cfg, nil +} + +// parseLastCommitTS parses --last-commit-ts as a uint64. Hex (0x prefix) +// or decimal accepted. Uses strconv.ParseUint strict parsing so trailing +// garbage is rejected — fmt.Sscanf would silently accept "0xffZZ" as +// 0xff and "100oops" as 100, which becomes a snapshot HLC ceiling that +// silently disagrees with what the operator typed (claude high #904, +// codex P2 #904). Negative or out-of-range surfaces as exit-1; the +// semantic check (T >= manifest) is exit-2. 
+func parseLastCommitTS(raw string) (uint64, error) { + s := strings.TrimSpace(raw) + if s == "" { + return 0, errors.New("--last-commit-ts is empty") + } + const ( + base16 = 16 + base10 = 10 + uint64Bits = 64 + ) + if strings.HasPrefix(s, "0x") || strings.HasPrefix(s, "0X") { + ts, err := strconv.ParseUint(s[2:], base16, uint64Bits) + if err != nil { + return 0, errors.Wrap(err, "--last-commit-ts hex parse") + } + return ts, nil + } + ts, err := strconv.ParseUint(s, base10, uint64Bits) + if err != nil { + return 0, errors.Wrap(err, "--last-commit-ts decimal parse") + } + return ts, nil +} + +// parseAdapterSet decodes a comma-separated adapter list (or "all"). +// Mirrors the decoder's parser so a typo cannot silently disable an +// adapter. Unknown name → exit-1. A CSV that contains only separators +// or whitespace (e.g. `--adapter ' ,'`) is also rejected — without this +// guard a templated argument that expands to spaces would yield a +// zero AdapterSet and the encoder would publish a valid header-only +// .fsm (no adapters invoked), turning a bad argument into a silent +// empty restore artifact (codex P2 #904). +func parseAdapterSet(csv string) (backup.AdapterSet, error) { + if csv == "" || csv == "all" { + return backup.AdapterSet{DynamoDB: true, S3: true, Redis: true, SQS: true}, nil + } + var set backup.AdapterSet + for _, raw := range strings.Split(csv, ",") { + name := strings.TrimSpace(strings.ToLower(raw)) + if name == "" { + continue + } + if err := applyAdapterName(name, &set); err != nil { + return backup.AdapterSet{}, err + } + } + if !set.DynamoDB && !set.S3 && !set.Redis && !set.SQS { + return backup.AdapterSet{}, errors.Errorf("--adapter %q selects no adapters; use \"all\" or a comma-separated subset", csv) + } + return set, nil +} + +// applyAdapterName sets the bit on s for one normalized adapter name, +// or returns an unknown-name error. Split out so parseAdapterSet stays +// under the cyclop threshold. 
+func applyAdapterName(name string, s *backup.AdapterSet) error { + switch name { + case "dynamodb": + s.DynamoDB = true + case "s3": + s.S3 = true + case "redis": + s.Redis = true + case "sqs": + s.SQS = true + default: + return errors.Errorf("unknown adapter %q", name) + } + return nil +} + +// errSelfTestMismatch is a typed sentinel so run() can map self-test diffs +// to exit-2 without coupling to the encoder's mismatch.txt format. +var errSelfTestMismatch = errors.New("backup: --self-test diff against --input") + +// errAdapterNotInManifest is returned by validateAdaptersAgainstManifest +// when the user has enabled an adapter that the manifest doesn't list +// (nil pointer in manifest.Adapters.). This is the codex P2 v26 +// #904 scenario B: a stale on-disk subdir for an adapter the producer +// did not dump would otherwise be encoded under the default +// `--adapter dynamodb,s3,redis,sqs`. classifyEncodeError routes the +// sentinel to exit 2 (data-correctness). +// +// The earlier v27/v28/v29 attempts to also detect a missing on-disk +// subdir under a non-nil scope (codex P2 v26 scenario A) were retracted +// in v30 once codex P1 v29 #904 clarified that the decoder defers +// scope enumeration; the manifest can no longer distinguish "no +// records dumped" from "records dumped but scope not enumerated." 
var errAdapterNotInManifest = errors.New("encode: adapter scope mismatch between MANIFEST.json and --adapter / on-disk tree")

// encodeOne performs one complete encode for cfg:
//
//  1. read MANIFEST.json from the input tree and check the selected
//     adapters against it;
//  2. resolve the effective last-commit timestamp and build the
//     library EncodeOptions;
//  3. delete any stale mismatch report, then encode + publish via
//     writeAndPublish;
//  4. write the encode_info sidecar when the publish succeeded or
//     failed only with errSelfTestMismatch.
//
// It returns nil on success. A sidecar-write failure after a
// successful publish rolls the published .fsm back and is returned; a
// sidecar-write failure on the mismatch path is only logged, and the
// mismatch error is returned instead.
func encodeOne(cfg *config, logger *slog.Logger) error {
	manifest, err := readInputManifest(cfg.inputPath)
	if err != nil {
		return err
	}
	if err := validateAdaptersAgainstManifest(cfg.adapters, manifest, cfg.inputPath); err != nil {
		return err
	}
	effectiveTS, overridden, err := resolveLastCommitTS(cfg, manifest.LastCommitTS)
	if err != nil {
		return err
	}
	encodeOpts := buildEncodeOptions(cfg, effectiveTS, manifest)

	mismatchPath := cfg.outputPath + ".mismatch.txt"
	_ = os.Remove(mismatchPath) // stale-mismatch cleanup (gemini medium v6 #896)
	// Do NOT pre-clean the sidecar here. The sidecar describes the
	// .fsm at cfg.outputPath; the .fsm is preserved when a run fails
	// in the adapter encoders (non-self-test exit-2 path), so wiping
	// its sidecar would leave the prior restore artifact without its
	// matching provenance metadata (codex P2 v17 #904). writeSidecar
	// opens the sidecar through backup.OpenSidecarFile and rewrites
	// it on success and on self-test mismatch (where the .fsm is also
	// replaced or removed in lock-step). On adapter-encoder errors
	// neither writeSidecar nor removeStaleOutputFSM runs; the prior
	// .fsm + prior sidecar therefore stay paired.

	result, publishErr := writeAndPublish(cfg, encodeOpts, mismatchPath, logger)
	// Sidecar is written even on self-test mismatch so an operator
	// has both <output>.mismatch.txt AND <output>.encode_info.json
	// (with self_test.matched=false) for diagnostics. Only skipped
	// when the encode itself errored before any result was populated
	// (publishErr != nil && !errSelfTestMismatch) (codex P2 v6 #904).
	if publishErr == nil || errors.Is(publishErr, errSelfTestMismatch) {
		sidecarTruncated, serr := writeSidecar(cfg, manifest, effectiveTS, overridden, result)
		if serr != nil {
			// Surface the sidecar-write failure only if encode itself
			// succeeded; on mismatch the mismatch error takes priority.
			if publishErr == nil {
				// .fsm was just renamed into place by writeAndPublish
				// but the sidecar write failed → we have an orphan
				// .fsm without its matching provenance metadata.
				// Roll back to a consistent absent state. The sidecar
				// rollback is gated on sidecarTruncated so we don't
				// remove an operator-owned pre-existing entry that
				// OpenSidecarFile refused to clobber (claude/codex
				// P2 v31 added the rollback; codex P2 v33 #904 added
				// the truncation gate for hard-linked sidecars).
				rollbackOrphanFSMAndSidecar(cfg.outputPath, sidecarTruncated, logger)
				return errors.Wrap(serr, "write encode_info sidecar")
			}
			logger.Warn("write encode_info sidecar on mismatch", "err", serr)
		}
	}
	if publishErr != nil {
		return publishErr
	}
	logger.Info("encode complete",
		"output", cfg.outputPath,
		"bytes", result.BytesWritten,
		"self_test", cfg.selfTest,
		"adapters", strings.Join(result.AdaptersEnabled, ","),
	)
	return nil
}

// readInputManifest opens + decodes <input>/MANIFEST.json.
+func readInputManifest(inputPath string) (backup.Manifest, error) { + manifestPath := filepath.Join(inputPath, "MANIFEST.json") + manifestFile, err := os.Open(manifestPath) //nolint:gosec // operator-supplied path + if err != nil { + return backup.Manifest{}, errors.Wrapf(err, "open %s", manifestPath) + } + defer func() { _ = manifestFile.Close() }() + m, err := backup.ReadManifest(manifestFile) + if err != nil { + return backup.Manifest{}, errors.Wrap(err, "read manifest") + } + return m, nil +} + +func buildEncodeOptions(cfg *config, effectiveTS uint64, manifest backup.Manifest) backup.EncodeOptions { + encodeOpts := backup.EncodeOptions{ + InputRoot: cfg.inputPath, + Adapters: cfg.adapters, + LastCommitTS: effectiveTS, + ManifestLastCommitTS: manifest.LastCommitTS, + DynamoDBBundleJSONL: manifest.DynamoDBLayout == backup.DynamoDBLayoutJSONL, + SelfTest: cfg.selfTest, + } + // Thread manifest exclusions into the library guards (codex P2 v21 + // #904): the S3/SQS reverse encoders can't honor these today, so + // failing closed here surfaces the unsupported-feature errors + // before any bytes are written. The CLI's existing + // buildSelfTestDecodeOptions also threads the same fields into + // the scratch decode path so self-test sees a coherent picture. + if manifest.Exclusions != nil { + encodeOpts.S3IncludeIncompleteUploads = manifest.Exclusions.IncludeIncompleteUploads + encodeOpts.S3IncludeOrphans = manifest.Exclusions.IncludeOrphans + encodeOpts.PreserveSQSVisibility = manifest.Exclusions.PreserveSQSVisibility + } + if cfg.selfTest { + encodeOpts.SelfTestDecodeOptions = buildSelfTestDecodeOptions(cfg, manifest) + } + return encodeOpts +} + +// rollbackOrphanFSMAndSidecar removes both the just-published +// .fsm and the partial .encode_info.json after a +// sidecar-write failure on the encode success path. 
The pair was +// supposed to move together (the .fsm describes the data the sidecar +// records the provenance for); if the sidecar didn't land, the +// operator must not see a "successful" .fsm without its matching +// provenance metadata (claude / codex P2 v31 observation on PR #904). +// +// A prior successful encode at the same output path is unrecoverable +// — writeAndPublish's os.Rename already overwrote it before +// writeSidecar ran. The rollback brings the state to "no .fsm, no +// sidecar at this path", which is the same end state as "encode +// never ran." That's the cleanest consistent outcome the CLI can +// produce without filesystem transactions. +// +// Both os.Remove calls log-and-continue on non-ErrNotExist failures +// so the caller's primary sidecar-write error remains the dominant +// signal. +// rollbackOrphanFSMAndSidecar reverts an encode that succeeded in +// publishing the .fsm but failed in writing the sidecar. Always +// removes the just-renamed .fsm at outputPath. The sidecar at +// EncodeInfoSidecarPath(outputPath) is removed ONLY when +// sidecarTruncated is true — i.e., backup.OpenSidecarFile succeeded +// and either created a fresh file or truncated an existing +// single-link regular file. When sidecarTruncated is false the +// existing entry is operator-owned (symlink, hard link with +// Nlink > 1, FIFO, directory, etc.) that OpenSidecarFile refused to +// clobber, so this rollback must NOT destroy it either (codex P2 +// v32 #904 / codex P2 v33 #904). 
+func rollbackOrphanFSMAndSidecar(outputPath string, sidecarTruncated bool, logger *slog.Logger) { + if rerr := os.Remove(outputPath); rerr != nil && !errors.Is(rerr, os.ErrNotExist) { + logger.Warn("rollback orphan .fsm after sidecar failure", "err", rerr) + } + if !sidecarTruncated { + return + } + sidecarPath := backup.EncodeInfoSidecarPath(outputPath) + if srerr := os.Remove(sidecarPath); srerr != nil && !errors.Is(srerr, os.ErrNotExist) { + logger.Warn("rollback partial sidecar after write failure", "err", srerr) + } +} + +// writeMismatchTxt writes the self-test mismatch report to mismatchPath +// using the same no-follow/no-clobber discipline as the sidecar +// writer: an attacker pre-placing a symlink at +// .mismatch.txt could otherwise redirect the +// truncate-and-write into a target of their choosing (codex P2 v25 +// #904 — extending the sidecar guard to the sibling deterministic +// write path). On open failure the caller (writeAndPublish) logs at +// warn level and continues; the failure does NOT block the +// errSelfTestMismatch return so the mismatch error remains the +// dominant signal. +func writeMismatchTxt(mismatchPath string, body []byte) error { + f, err := backup.OpenSidecarFile(mismatchPath) + if err != nil { + return errors.Wrap(err, "open mismatch.txt") + } + if _, werr := f.Write(body); werr != nil { + _ = f.Close() + return errors.Wrap(werr, "write mismatch.txt body") + } + if cerr := f.Close(); cerr != nil { + return errors.Wrap(cerr, "close mismatch.txt") + } + return nil +} + +// removeStaleOutputFSM removes outputPath ONLY when it exists as a +// regular file or a symlink. Both shapes satisfy the "no +// restore-visible FSM after self-test mismatch" contract: removing a +// regular file empties --output; removing a symlink unlinks the +// name (the target is preserved as a side effect — os.Remove on a +// symlink operates on the link, not the resolved target). 
A directory, +// device, FIFO, or socket at --output is left alone — those shapes +// were never valid restore targets, and os.Remove on a non-empty +// directory or device would be destructive in ways the mismatch +// contract does not require (codex P2 v14 #904 caught the directory +// case; codex P2 v19 #904 caught the symlink case where the prior +// IsRegular()-only check silently left the symlink resolving to the +// stale snapshot). +// +// Errors other than ErrNotExist are downgraded to warn-and-continue +// so the caller's primary mismatch error remains the dominant signal. +func removeStaleOutputFSM(outputPath string, logger *slog.Logger) { + info, err := os.Lstat(outputPath) + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + logger.Warn("stat stale .fsm on self-test mismatch", "err", err) + } + return + } + mode := info.Mode() + isSymlink := mode&os.ModeSymlink != 0 + if !mode.IsRegular() && !isSymlink { + logger.Warn("skip stale .fsm cleanup: --output is not a regular file or symlink", + "path", outputPath, "mode", mode) + return + } + if rerr := os.Remove(outputPath); rerr != nil && !errors.Is(rerr, os.ErrNotExist) { + logger.Warn("remove stale .fsm on self-test mismatch", "err", rerr) + } +} + +// writeAndPublish writes the .fsm to a temp path, runs the optional +// self-test via EncodeSnapshot, and renames temp → output on success. +// On self-test failure: writes mismatch.txt, removes any stale +// .fsm or symlink at left by a prior successful +// run (codex P2 v10 #904 covered regular files, codex P2 v19 #904 +// extended to symlinks; directories and special files are left +// alone per v14 L347), removes the temp file via the deferred +// cleanup, returns errSelfTestMismatch. See removeStaleOutputFSM +// for the per-shape decision matrix. 
func writeAndPublish(cfg *config, encodeOpts backup.EncodeOptions, mismatchPath string, logger *slog.Logger) (backup.EncodeResult, error) {
	tempPath, err := tempOutputPath(cfg.outputPath)
	if err != nil {
		return backup.EncodeResult{}, err
	}
	result, err := encodeToTempFile(tempPath, encodeOpts)
	// publishedTempPath is the path the deferred cleanup removes. It is
	// cleared only after a successful publish, so every early return
	// below (encode error, self-test mismatch, publish failure) deletes
	// the temp file; the Remove error is ignored because the file may
	// legitimately be gone already.
	publishedTempPath := tempPath
	defer func() {
		if publishedTempPath != "" {
			_ = os.Remove(publishedTempPath)
		}
	}()
	if err != nil {
		return result, err
	}
	if cfg.selfTest && !result.SelfTestMatched {
		// A failed mismatch.txt write is logged, not returned: the
		// mismatch sentinel below must stay the reported error.
		if werr := writeMismatchTxt(mismatchPath, result.SelfTestMismatchTxt); werr != nil {
			logger.Warn("write mismatch.txt", "err", werr)
		}
		// Remove the stale .fsm at <output> if one exists from a prior
		// successful run AND is a regular file or symlink. encodeOne is
		// about to write a fresh <output>.encode_info.json with
		// self_test.matched=false and a NEW SHA pointing to the
		// unpublished temp snapshot; leaving old bytes on disk would
		// make the sidecar describe an FSM that does not exist and
		// violate the "self-test failure leaves no restore-visible
		// FSM" contract (codex P2 v10 #904).
		//
		// The mode-check guards against an --output that names a
		// directory (or any non-regular file): the normal publish
		// path would fail at os.Rename anyway, but the mismatch
		// cleanup must not destructively delete a directory the
		// operator passed in error (codex P2 v14 #904).
		removeStaleOutputFSM(cfg.outputPath, logger)
		return result, errors.Wrap(errSelfTestMismatch, "self-test diff (see "+mismatchPath+")")
	}
	if perr := publishAndFsync(tempPath, cfg.outputPath, logger); perr != nil {
		return result, perr
	}
	publishedTempPath = "" // rename succeeded; defer no-ops
	return result, nil
}

// publishAndFsync renames tempPath → outputPath and then fsyncs the
// parent directory. If the fsync fails, the just-renamed .fsm is
// removed so the operator does not see a non-durable "successful"
// .fsm (codex P2 v24 #904 added the fsync; codex P2 v32 #904 added
// the rollback). Split out of writeAndPublish to keep that function
// under the cyclop bound.
//
// Returns a wrapped error when either the rename or the directory
// fsync fails; a failure of the rollback Remove itself is only logged.
func publishAndFsync(tempPath, outputPath string, logger *slog.Logger) error {
	if err := os.Rename(tempPath, outputPath); err != nil {
		return errors.Wrap(err, "rename tmp -> output")
	}
	// fsync the parent dir so the rename's new directory entry is
	// durable. Without this, a power loss / host crash immediately
	// after a successful encode can lose the new entry (or
	// resurrect the old one) on filesystems where rename durability
	// requires syncing the containing directory. Mirrors the repo
	// pattern used by internal/encryption/sidecar.go +
	// internal/raftengine/etcd/persistence.go.
	if err := fsyncParentDir(outputPath); err != nil {
		// Roll back so the operator doesn't see a non-durable
		// "successful" .fsm; restoring the consistent absent state
		// is the same outcome encodeOne enforces on sidecar-write
		// failures (codex P2 v32 #904).
		if rerr := os.Remove(outputPath); rerr != nil && !errors.Is(rerr, os.ErrNotExist) {
			logger.Warn("rollback orphan .fsm after parent-dir fsync failure", "err", rerr)
		}
		return errors.Wrap(err, "fsync output dir after rename")
	}
	return nil
}

// fsyncParentDir opens the parent directory of path read-only and
// calls fsync on its file descriptor. On most POSIX filesystems this
// is what makes os.Rename durable. Both the open error and the Sync
// error are returned wrapped with the directory name so the caller
// can surface them (an empty --output is already rejected upstream by
// parseFlags).
//
// NOTE(review): directory fsync is not supported on every platform.
// Confirm the behavior on Windows, where Sync on a directory handle
// may fail and would make publishAndFsync roll back every publish.
//
// Mirrors syncDir in internal/encryption/sidecar.go and the etcd
// raftengine persistence helper; kept local here so the CLI binary
// doesn't depend on internal/encryption for a small helper.
+func fsyncParentDir(path string) error { + dir := filepath.Dir(path) + f, err := os.Open(dir) //nolint:gosec // dir is derived from operator-supplied --output path + if err != nil { + return errors.Wrapf(err, "open parent dir %q", dir) + } + defer func() { _ = f.Close() }() + if err := f.Sync(); err != nil { + return errors.Wrapf(err, "fsync parent dir %q", dir) + } + return nil +} + +// encodeToTempFile creates tempPath, runs EncodeSnapshot into it, +// fsync+close. Caller is responsible for the os.Remove cleanup on error. +// The temp file is created mode 0600 so the on-disk .fsm is not +// world-readable while the encode is in flight (claude v4 #904). +func encodeToTempFile(tempPath string, encodeOpts backup.EncodeOptions) (backup.EncodeResult, error) { + tempFile, err := os.OpenFile(tempPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, encodeInfoFilePerm) //nolint:gosec // operator-supplied path + if err != nil { + return backup.EncodeResult{}, errors.Wrapf(err, "create %s", tempPath) + } + result, err := backup.EncodeSnapshot(encodeOpts, tempFile) + if err != nil { + _ = tempFile.Close() + return result, errors.Wrap(err, "EncodeSnapshot") + } + if err := tempFile.Sync(); err != nil { + _ = tempFile.Close() + return result, errors.Wrap(err, "fsync tmp") + } + if err := tempFile.Close(); err != nil { + return result, errors.Wrap(err, "close tmp") + } + return result, nil +} + +// resolveLastCommitTS applies the parent doc's HLC-ceiling-only-rises +// rule. Returns the effective T, whether an override was applied, and a +// typed error on regression. 
+func resolveLastCommitTS(cfg *config, manifestTS uint64) (uint64, bool, error) { + if !cfg.lastCommitTSPresent { + return manifestTS, false, nil + } + if cfg.lastCommitTS < manifestTS { + return 0, false, errors.Wrapf(backup.ErrSelfTestLowerLastCommitTS, + "--last-commit-ts %d < manifest %d", cfg.lastCommitTS, manifestTS) + } + return cfg.lastCommitTS, true, nil +} + +// buildSelfTestDecodeOptions translates manifest fields into the +// DecodeOptions the self-test feeds into DecodeSnapshot, so the scratch +// tree matches what the original decoder would have produced (codex P2 +// v3 #896). +func buildSelfTestDecodeOptions(cfg *config, m backup.Manifest) backup.DecodeOptions { + opts := backup.DecodeOptions{ + OutRoot: cfg.scratchRoot, + Adapters: cfg.adapters, + } + if m.Exclusions != nil { + opts.IncludeIncompleteUploads = m.Exclusions.IncludeIncompleteUploads + opts.IncludeOrphans = m.Exclusions.IncludeOrphans + opts.PreserveSQSVisibility = m.Exclusions.PreserveSQSVisibility + opts.IncludeSQSSideRecords = m.Exclusions.IncludeSQSSideRecords + opts.RenameS3Collisions = m.Exclusions.RenameS3Collisions + } + if m.DynamoDBLayout == backup.DynamoDBLayoutJSONL { + opts.DynamoDBBundleJSONL = true + } + return opts +} + +// tempOutputPath returns .tmp- for the write-then-rename +// atomic publish. crypto/rand provides the suffix so concurrent encodes +// against the same --output cannot collide. +func tempOutputPath(output string) (string, error) { + buf := make([]byte, tempSuffixByteLen) + if _, err := rand.Read(buf); err != nil { + return "", errors.Wrap(err, "rand suffix") + } + return output + ".tmp-" + hex.EncodeToString(buf), nil +} + +// writeSidecar emits ENCODE_INFO.json next to the published .fsm +// (path-derived per gemini medium v2 #896). Returns (truncated, err): +// truncated is true iff backup.OpenSidecarFile succeeded — i.e., the +// existing path was truncated by THIS run (or a fresh file was +// created). 
When truncated is false, the caller MUST NOT roll back +// the sidecar path: any pre-existing entry there is operator-owned +// and OpenSidecarFile correctly refused to clobber it (codex P2 v33 +// #904 — hard-linked sidecars in particular pass IsRegular but were +// refused via Nlink>1; v32's IsRegular-only rollback gate would +// have destroyed those). +func writeSidecar(cfg *config, m backup.Manifest, effectiveTS uint64, overridden bool, result backup.EncodeResult) (bool, error) { + info := backup.NewEncodeInfo(time.Now()) + info.EncoderVersion = version + info.InputRoot = cfg.inputPath + info.OutputFSMPath = cfg.outputPath + info.OutputFSMSHA256 = hex.EncodeToString(result.SHA256[:]) + info.LastCommitTS = effectiveTS + info.LastCommitTSOverridden = overridden + info.ManifestLastCommitTS = m.LastCommitTS + info.ManifestClusterID = m.ClusterID + info.AdaptersEnabled = result.AdaptersEnabled + info.SelfTest = backup.EncodeInfoSelfTest{ + Ran: result.SelfTestRan, + Matched: result.SelfTestMatched, + } + sidecarPath := backup.EncodeInfoSidecarPath(cfg.outputPath) + // 0o600 keeps ENCODE_INFO.json (which includes the source path, + // cluster_id, and SHA-256 of the .fsm) from leaking to non-owner + // users on multi-user backup hosts (claude v4 #904). + // + // backup.OpenSidecarFile refuses to follow a symlink at the + // sidecar path, refuses to truncate a hard-linked or + // non-regular file there, and (on unix) refuses to block on a + // reader-less FIFO — all the clobber-attack vectors the adapter + // dump writers already defend against. Without these guards an + // attacker pre-placing a symlink at .encode_info.json + // could redirect the truncate-and-write into a target of their + // choosing (codex P2 v25 #904). + f, err := backup.OpenSidecarFile(sidecarPath) + if err != nil { + // Pre-existing entry refused (symlink/hard-link/non-regular). 
+ // Caller must NOT rollback the sidecar path; the file there + // is operator-owned and was never touched by this run. + return false, errors.Wrap(err, "open sidecar") + } + // From this point, f points at a truncated (zero-length) regular + // file owned by this run. Any subsequent failure leaves partial + // bytes (or empty) on disk — the caller's rollback removes them. + if err := backup.WriteEncodeInfo(f, info); err != nil { + _ = f.Close() + return true, errors.Wrap(err, "WriteEncodeInfo") + } + if err := f.Sync(); err != nil { + _ = f.Close() + return true, errors.WithStack(err) + } + if err := f.Close(); err != nil { + return true, errors.WithStack(err) + } + // fsync the parent dir so the new sidecar's directory entry is + // durable alongside its bytes. Mirrors the rename path + // (codex P2 v24 #904). + if err := fsyncParentDir(sidecarPath); err != nil { + return true, errors.Wrap(err, "fsync sidecar parent dir") + } + return true, nil +} diff --git a/cmd/elastickv-snapshot-encode/main_test.go b/cmd/elastickv-snapshot-encode/main_test.go new file mode 100644 index 000000000..b97cb5761 --- /dev/null +++ b/cmd/elastickv-snapshot-encode/main_test.go @@ -0,0 +1,1190 @@ +package main + +import ( + "bytes" + "encoding/json" + "io" + "log/slog" + "os" + "path/filepath" + "runtime" + "testing" + "time" + + "github.com/bootjp/elastickv/internal/backup" + "github.com/cockroachdb/errors" +) + +// isWindows is true on Windows builds; perm-bit tests skip on Windows +// where Unix-style modes are not meaningful. +var isWindows = runtime.GOOS == "windows" + +// emitMinimalManifest writes a minimal valid MANIFEST.json under outRoot +// with the given lastCommitTS. Used by every CLI test as the producer- +// side artifact the encoder will consume. 
+// +// Mirrors what elastickv-snapshot-decode's populateAdapterScopes +// produces: a non-nil &Adapter{} per enabled adapter, with all scope +// arrays empty (scope enumeration is explicitly deferred — codex P1 +// v29 #904 corrected the earlier placeholder-scope fixture that +// fabricated entries the real decoder never writes). The v30 +// checkOneAdapterScope reads only the per-adapter pointer, never the +// scope content, so the empty &Adapter{} is sufficient. +// +// Tests that need a specific adapter-scope mismatch (e.g. a nil +// pointer for one adapter to exercise the v27 nil-scope rejection) +// bypass this helper and write a custom manifest. +func emitMinimalManifest(t *testing.T, outRoot string, lastCommitTS uint64) { + t.Helper() + m := backup.NewPhase0SnapshotManifest(time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC)) + m.LastCommitTS = lastCommitTS + m.Adapters = &backup.Adapters{ + DynamoDB: &backup.Adapter{}, + S3: &backup.Adapter{}, + Redis: &backup.Adapter{}, + SQS: &backup.Adapter{}, + } + m.Exclusions = &backup.Exclusions{} + f, err := os.Create(filepath.Join(outRoot, "MANIFEST.json")) + if err != nil { + t.Fatalf("create MANIFEST.json: %v", err) + } + if err := backup.WriteManifest(f, m); err != nil { + t.Fatalf("WriteManifest: %v", err) + } + if err := f.Close(); err != nil { + t.Fatalf("close: %v", err) + } +} + +func quietLogger() *slog.Logger { + return slog.New(slog.NewTextHandler(io.Discard, nil)) +} + +// TestCLIRejectsMissingManifest pins the user-input-error path: --input +// directory without MANIFEST.json → exit 1, no .fsm written. 
+func TestCLIRejectsMissingManifest(t *testing.T) { + t.Parallel() + in := t.TempDir() + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want error") + } + if code != exitUserErr { + t.Errorf("exit code = %d, want %d", code, exitUserErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm exists at %s; should not be written on missing manifest", out) + } +} + +// TestCLIRejectsUnknownAdapter pins the decoder-parity adapter CSV +// parser: unknown adapter → exit 1. +func TestCLIRejectsUnknownAdapter(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out, "--adapter", "foo"}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want error") + } + if code != exitUserErr { + t.Errorf("exit code = %d, want %d", code, exitUserErr) + } +} + +// TestCLIAdapterDataErrorExitsTwo pins codex P2 v9 #904: when an +// adapter encoder rejects the input tree's contents (e.g. a malformed +// DynamoDB _schema.json with an empty table_name), the CLI exits 2 +// (data-correctness) rather than 1 (operator/flag error) so runbooks +// can branch on exit status to quarantine bad dump data. Pinned via +// the same ErrDDBEncodeInvalidSchema fixture pattern used by the +// in-package DynamoDB encoder tests. +func TestCLIAdapterDataErrorExitsTwo(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + // Empty table_name triggers ErrDDBEncodeInvalidSchema inside the + // DynamoDB encoder; runAdapterEncoders marks it with + // ErrEncodeAdapterData; run() maps that to exitDataErr. 
+ schemaDir := filepath.Join(in, "dynamodb", "tbl") + if err := os.MkdirAll(schemaDir, 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + schemaPath := filepath.Join(schemaDir, "_schema.json") + body := []byte(`{"format_version":1,"table_name":"","primary_key":{"hash_key":{"name":"id","type":"S"}}}`) + if err := os.WriteFile(schemaPath, body, 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", in, + "--output", out, + "--adapter", "dynamodb", + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want adapter rejection error") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (data error from adapter rejection, not flag-parse error)", code, exitDataErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm exists at %s; should not be published on adapter rejection", out) + } +} + +// TestCLIRejectsUnsupportedManifestExclusions pins codex P2 v21 #904 +// end-to-end: when MANIFEST.json sets one of the three exclusion +// flags the encoder cannot honor (include_incomplete_uploads, +// include_orphans, preserve_sqs_visibility) AND the corresponding +// adapter is enabled, the CLI must exit 2 (data-correctness) before +// any bytes are written. Mirrors the DynamoDB JSONL guard. 
+func TestCLIRejectsUnsupportedManifestExclusions(t *testing.T) { + t.Parallel() + cases := []struct { + name string + mutate func(*backup.Exclusions) + adapters string + }{ + { + name: "include_incomplete_uploads with --adapter=s3", + mutate: func(e *backup.Exclusions) { e.IncludeIncompleteUploads = true }, + adapters: "s3", + }, + { + name: "include_orphans with --adapter=s3", + mutate: func(e *backup.Exclusions) { e.IncludeOrphans = true }, + adapters: "s3", + }, + { + name: "preserve_sqs_visibility with --adapter=sqs", + mutate: func(e *backup.Exclusions) { e.PreserveSQSVisibility = true }, + adapters: "sqs", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + in := t.TempDir() + m := backup.NewPhase0SnapshotManifest(time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC)) + m.LastCommitTS = 100 + m.Adapters = &backup.Adapters{} + m.Exclusions = &backup.Exclusions{} + tc.mutate(m.Exclusions) + f, ferr := os.Create(filepath.Join(in, "MANIFEST.json")) + if ferr != nil { + t.Fatalf("create MANIFEST.json: %v", ferr) + } + if werr := backup.WriteManifest(f, m); werr != nil { + t.Fatalf("WriteManifest: %v", werr) + } + if cerr := f.Close(); cerr != nil { + t.Fatalf("close: %v", cerr) + } + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", in, + "--output", out, + "--adapter", tc.adapters, + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want exit-2 from unsupported manifest exclusion") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (data-correctness for unsupported exclusion)", code, exitDataErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm exists at %s; should not be published on unsupported-feature rejection", out) + } + }) + } +} + +// TestCLIRejectsAdapterNotInManifest pins codex P2 v26 #904 scenario B: +// the user enables an adapter (default `--adapter` is all four) but +// MANIFEST.json doesn't list a scope for it. 
The prior code would +// pick up a stale on-disk subdir for an unlisted adapter; the new +// guard rejects with exit 2. +func TestCLIRejectsAdapterNotInManifest(t *testing.T) { + t.Parallel() + in := t.TempDir() + // Manifest lists ONLY SQS (no DynamoDB / S3 / Redis), but the + // default `--adapter dynamodb,s3,redis,sqs` enables all four. + m := backup.NewPhase0SnapshotManifest(time.Date(2026, 6, 1, 0, 0, 0, 0, time.UTC)) + m.LastCommitTS = 100 + m.Adapters = &backup.Adapters{SQS: &backup.Adapter{}} + m.Exclusions = &backup.Exclusions{} + f, ferr := os.Create(filepath.Join(in, "MANIFEST.json")) + if ferr != nil { + t.Fatalf("create MANIFEST.json: %v", ferr) + } + if werr := backup.WriteManifest(f, m); werr != nil { + t.Fatalf("WriteManifest: %v", werr) + } + if cerr := f.Close(); cerr != nil { + t.Fatalf("close: %v", cerr) + } + // Operator forgot to restrict --adapter; encoder must NOT silently + // pick up the (absent) dynamodb/s3/redis subtrees just because + // the CLI default enables them. + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want adapter-scope rejection") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (data-correctness)", code, exitDataErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm should not be published on adapter-scope rejection") + } +} + +// TestCLIAcceptsEmptyAdapterScopeRoundTrip pins codex P1 v29 #904 +// reality: elastickv-snapshot-decode's populateAdapterScopes always +// writes an empty &Adapter{} for every enabled adapter (scope +// enumeration is deferred). A real decoded dump with files under +// dynamodb/ / s3/ / redis/ / sqs/ MUST be re-encodable through the +// CLI even though the manifest's per-adapter scope arrays are empty +// — v28/v29's stat-and-readdir checks were rejecting these. 
+func TestCLIAcceptsEmptyAdapterScopeRoundTrip(t *testing.T) { + t.Parallel() + in := t.TempDir() + // Pre-populate sqs/ with a real fixture (simulates a normal + // decoder run that emitted SQS records). + writeSQSFixture(t, in) + // Manifest mirrors the decoder: every adapter scope is an empty + // &Adapter{} regardless of on-disk content. + emitMinimalManifest(t, in, 100) + + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out, "--adapter", "sqs"}, quietLogger()) + if err != nil { + t.Fatalf("run failed: code=%d err=%v (codex P1 v29 regression — empty scope must coexist with populated subdir)", code, err) + } + if code != exitSuccess { + t.Errorf("exit code = %d, want %d", code, exitSuccess) + } + if _, statErr := os.Stat(out); statErr != nil { + t.Errorf(".fsm not published at %s: %v", out, statErr) + } +} + +// TestCLIRejectsLowerLastCommitTSOverride is the fail-closed pin per +// parent §"MVCC re-encoding": T < manifest.last_commit_ts → exit 2 +// (data-correctness failure, not flag-parse error). +func TestCLIRejectsLowerLastCommitTSOverride(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", in, + "--output", out, + "--last-commit-ts", "500", // below manifest + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want HLC ceiling regression error") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (data error, not flag-parse error)", code, exitDataErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm exists at %s; should not be published on regression", out) + } +} + +// TestCLIAcceptsEqualAndHigherLastCommitTSOverride pins T == manifest +// (default) and T > manifest both succeed with the effective T stamped +// into the .fsm header and sidecar. 
+func TestCLIAcceptsEqualAndHigherLastCommitTSOverride(t *testing.T) { + t.Parallel() + for _, tc := range []struct { + name string + argTS string + want uint64 + }{ + {"equal", "1000", 1000}, + {"higher", "5000", 5000}, + } { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", in, + "--output", out, + "--last-commit-ts", tc.argTS, + }, quietLogger()) + if err != nil { + t.Fatalf("run: %v", err) + } + if code != exitSuccess { + t.Errorf("exit code = %d, want %d", code, exitSuccess) + } + // Inspect sidecar. + sidecar := backup.EncodeInfoSidecarPath(out) + body, err := os.ReadFile(sidecar) + if err != nil { + t.Fatalf("read sidecar: %v", err) + } + var info backup.EncodeInfo + if err := json.Unmarshal(body, &info); err != nil { + t.Fatalf("unmarshal sidecar: %v", err) + } + if info.LastCommitTS != tc.want { + t.Errorf("sidecar LastCommitTS = %d, want %d", info.LastCommitTS, tc.want) + } + }) + } +} + +// TestCLIEncodeInfoPathDerivedFromOutput pins gemini medium v2 #896: +// the sidecar is named .encode_info.json, not a static name. +func TestCLIEncodeInfoPathDerivedFromOutput(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + outDir := t.TempDir() + out := filepath.Join(outDir, "node1.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err != nil || code != exitSuccess { + t.Fatalf("run failed: code=%d err=%v", code, err) + } + if _, err := os.Stat(out); err != nil { + t.Fatalf(".fsm not found: %v", err) + } + want := filepath.Join(outDir, "node1.fsm.encode_info.json") + if _, err := os.Stat(want); err != nil { + t.Errorf("sidecar not at %s: %v", want, err) + } + // Ensure the static-named version was NOT created. 
+ if _, err := os.Stat(filepath.Join(outDir, "ENCODE_INFO.json")); err == nil { + t.Errorf("static ENCODE_INFO.json exists; expected only path-derived sidecar") + } +} + +// TestCLIEncodeInfoTwoFilesNoCollision pins the no-collision property: +// two --output paths in the same dir produce two distinct sidecars. +func TestCLIEncodeInfoTwoFilesNoCollision(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + outDir := t.TempDir() + for _, name := range []string{"a.fsm", "b.fsm"} { + out := filepath.Join(outDir, name) + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err != nil || code != exitSuccess { + t.Fatalf("run for %s failed: code=%d err=%v", name, code, err) + } + } + for _, name := range []string{"a.fsm", "b.fsm"} { + want := filepath.Join(outDir, name+".encode_info.json") + if _, err := os.Stat(want); err != nil { + t.Errorf("sidecar %s missing: %v", want, err) + } + } + // a.fsm.encode_info.json and b.fsm.encode_info.json must have + // different output_fsm_path values. + aBody, _ := os.ReadFile(filepath.Join(outDir, "a.fsm.encode_info.json")) + bBody, _ := os.ReadFile(filepath.Join(outDir, "b.fsm.encode_info.json")) + if bytes.Equal(aBody, bBody) { + t.Errorf("sidecars are byte-equal; should differ by output_fsm_path") + } +} + +// writeSQSFixture writes a minimal sqs//{queue.json, +// messages.jsonl} fixture under root. Used by the CLI round-trip test; +// kept as a helper so the test body stays under the cyclop threshold. 
+func writeSQSFixture(t *testing.T, root string) { + t.Helper() + dir := filepath.Join(root, "sqs", "cnQ") // base64url("rt") + if err := os.MkdirAll(dir, 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "_queue.json"), + []byte(`{"format_version":1,"name":"rt","fifo":false,"partition_count":1,"generation":1}`), + 0o600); err != nil { + t.Fatalf("WriteFile _queue.json: %v", err) + } + if err := os.WriteFile(filepath.Join(dir, "messages.jsonl"), + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + 0o600); err != nil { + t.Fatalf("WriteFile messages.jsonl: %v", err) + } +} + +// canonicalizeInput runs encode → decode once so the input matches the +// encoder's output shape. Subsequent self-tests against the canonical +// tree are byte-equal (any non-canonical formatting differences are +// flattened by this first pass). +// canonicalRoundTripTS is the fixed last_commit_ts used by every +// canonicalizeInput call site. Kept as a const so a future test that +// wants a different value can lift it back into a parameter. 
+const canonicalRoundTripTS uint64 = 7000 + +func canonicalizeInput(t *testing.T, rawIn string) string { + t.Helper() + canonicalIn := t.TempDir() + tmpOut := filepath.Join(t.TempDir(), "canonical.fsm") + code, err := run([]string{"--input", rawIn, "--output", tmpOut, "--adapter", "sqs"}, quietLogger()) + if err != nil || code != exitSuccess { + t.Fatalf("canonical encode: code=%d err=%v", code, err) + } + f, oerr := os.Open(tmpOut) + if oerr != nil { + t.Fatalf("open canonical output: %v", oerr) + } + defer func() { _ = f.Close() }() + if _, err := backup.DecodeSnapshot(f, backup.DecodeOptions{ + OutRoot: canonicalIn, + Adapters: backup.AdapterSet{SQS: true}, + }); err != nil { + t.Fatalf("canonical decode: %v", err) + } + emitMinimalManifest(t, canonicalIn, canonicalRoundTripTS) + return canonicalIn +} + +// flipBytesPastHeaderInTempCorruptHook returns a corrupt-buffer hook +// that flips one byte every 13 starting at offset 200 in the on-disk +// self-test buffer — the same pattern as the library's +// flipBytesPastHeaderHelper. Extracted so the three CLI mismatch +// tests share one body rather than each open-coding the same loop. +func flipBytesPastHeaderInTempCorruptHook(t *testing.T) func(*os.File) { + t.Helper() + return func(f *os.File) { + info, ferr := f.Stat() + if ferr != nil { + t.Fatalf("temp Stat: %v", ferr) + } + const headerSkip = 200 + if info.Size() <= headerSkip { + t.Fatalf("temp file too small to corrupt past header: %d bytes", info.Size()) + } + buf := make([]byte, info.Size()-headerSkip) + if _, rerr := f.ReadAt(buf, headerSkip); rerr != nil { + t.Fatalf("ReadAt: %v", rerr) + } + for i := 0; i < len(buf); i += 13 { + buf[i] ^= 0xFF + } + if _, werr := f.WriteAt(buf, headerSkip); werr != nil { + t.Fatalf("WriteAt: %v", werr) + } + } +} + +// readSidecar reads .encode_info.json into an EncodeInfo struct. 
+func readSidecar(t *testing.T, output string) backup.EncodeInfo { + t.Helper() + body, err := os.ReadFile(output + ".encode_info.json") + if err != nil { + t.Fatalf("read sidecar: %v", err) + } + var info backup.EncodeInfo + if err := json.Unmarshal(body, &info); err != nil { + t.Fatalf("unmarshal sidecar: %v", err) + } + return info +} + +// TestCLIAdapterErrorPreservesPriorSidecar pins codex P2 v17 #904: when +// a run gets past manifest/TS validation and then fails inside an +// adapter encoder (non-self-test exit-2), the prior .fsm is +// preserved (only the self-test mismatch path removes it), so the +// prior .encode_info.json must ALSO be preserved — wiping it +// while leaving the .fsm would orphan the restore artifact from its +// provenance metadata. The v17 fix drops the pre-encode sidecar +// cleanup; this test pins the resulting invariant end-to-end. +func TestCLIAdapterErrorPreservesPriorSidecar(t *testing.T) { + t.Parallel() + in, out, priorFSM, priorSidecar := setupAdapterErrorFixture(t) + code, err := run([]string{ + "--input", in, + "--output", out, + "--adapter", "dynamodb", + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want adapter-data rejection") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d", code, exitDataErr) + } + assertFilePreserved(t, out, priorFSM, "prior .fsm") + // Prior sidecar unchanged (codex P2 v17: the v17 fix drops the + // pre-encode sidecar cleanup so the sidecar+.fsm stay paired). + assertFilePreserved(t, out+".encode_info.json", priorSidecar, "prior sidecar") +} + +// setupAdapterErrorFixture builds a fixture for +// TestCLIAdapterErrorPreservesPriorSidecar: an InputRoot with a valid +// MANIFEST.json plus a malformed dynamodb _schema.json (empty +// table_name → ErrDDBEncodeInvalidSchema), and a pre-placed FSM + +// sidecar at the output path representing a hypothetical earlier +// successful run. Returns (inputRoot, outputPath, priorFSMBytes, +// priorSidecarBytes). 
+func setupAdapterErrorFixture(t *testing.T) (string, string, []byte, []byte) { + t.Helper() + in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + priorFSM := []byte("PRIOR FSM BYTES") + priorSidecar := []byte(`{"format_version":1,"encoder_version":"prior","input_root":"x","output_fsm_path":"x"}`) + if err := os.WriteFile(out, priorFSM, 0o600); err != nil { + t.Fatalf("WriteFile prior fsm: %v", err) + } + if err := os.WriteFile(out+".encode_info.json", priorSidecar, 0o600); err != nil { + t.Fatalf("WriteFile prior sidecar: %v", err) + } + schemaDir := filepath.Join(in, "dynamodb", "tbl") + if err := os.MkdirAll(schemaDir, 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + body := []byte(`{"format_version":1,"table_name":"","primary_key":{"hash_key":{"name":"id","type":"S"}}}`) + if err := os.WriteFile(filepath.Join(schemaDir, "_schema.json"), body, 0o600); err != nil { + t.Fatalf("WriteFile bad schema: %v", err) + } + return in, out, priorFSM, priorSidecar +} + +// assertFilePreserved asserts the named file is still present and its +// contents exactly match wantBody. label appears in error messages. +func assertFilePreserved(t *testing.T, path string, wantBody []byte, label string) { + t.Helper() + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read %s at %s: %v", label, path, err) + } + if !bytes.Equal(got, wantBody) { + t.Errorf("%s mutated; codex P2 v17 expected adapter-error to preserve", label) + } +} + +// TestCLISelfTestMismatchSkipsDirectoryAtOutputPath pins codex P2 v14 +// #904: the self-test-mismatch cleanup must NOT delete an --output +// path that resolves to a directory (or any non-regular file). The +// prior unconditional os.Remove(cfg.outputPath) would have wiped an +// empty directory the operator passed in error. 
+// +// The fixture pre-creates an empty directory at the --output path, +// drives a self-test mismatch, asserts publishErr == errSelfTestMismatch, +// and asserts the directory is STILL PRESENT (the destructive +// cleanup did not fire). The normal publish path would have failed +// at os.Rename — this test pins the mismatch-cleanup-specific guard. +func TestCLISelfTestMismatchSkipsDirectoryAtOutputPath(t *testing.T) { + t.Parallel() + rawIn := t.TempDir() + writeSQSFixture(t, rawIn) + emitMinimalManifest(t, rawIn, 7000) + canonicalIn := canonicalizeInput(t, rawIn) + + out := filepath.Join(t.TempDir(), "out.fsm") + // Pre-create a directory at the --output path — an operator + // typo, but the cleanup MUST NOT destructively remove it. + if err := os.Mkdir(out, 0o755); err != nil { + t.Fatalf("Mkdir: %v", err) + } + + scratchBase := t.TempDir() + encodeOpts := backup.EncodeOptions{ + InputRoot: canonicalIn, + Adapters: backup.AdapterSet{SQS: true}, + LastCommitTS: 7000, + ManifestLastCommitTS: 7000, + SelfTest: true, + SelfTestDecodeOptions: backup.DecodeOptions{ + OutRoot: scratchBase, + Adapters: backup.AdapterSet{SQS: true}, + }, + } + encodeOpts.SetSelfTestCorruptHookForTest(flipBytesPastHeaderInTempCorruptHook(t)) + + cfg := &config{ + inputPath: canonicalIn, + outputPath: out, + adapters: backup.AdapterSet{SQS: true}, + selfTest: true, + } + mismatchPath := out + ".mismatch.txt" + + _, publishErr := writeAndPublish(cfg, encodeOpts, mismatchPath, quietLogger()) + if !errors.Is(publishErr, errSelfTestMismatch) { + t.Fatalf("publishErr = %v, want errSelfTestMismatch", publishErr) + } + info, statErr := os.Stat(out) + if statErr != nil { + t.Fatalf("output path missing after mismatch (codex P2 v14 destructive cleanup regression): %v", statErr) + } + if !info.IsDir() { + t.Errorf("output mode = %s; expected the pre-placed directory to be preserved", info.Mode()) + } +} + +// TestCLIInvalidManifestExitsTwo pins codex P2 v14 #904: a malformed +// MANIFEST.json 
(invalid JSON or unsupported format_version) surfaces +// backup.ErrInvalidManifest / backup.ErrUnsupportedFormatVersion from +// readInputManifest, and the CLI MUST map both to exit 2 +// (data-correctness). Treating a broken manifest as exit 1 misroutes +// runbook recovery for corrupt-dump scenarios. +func TestCLIInvalidManifestExitsTwo(t *testing.T) { + t.Parallel() + t.Run("invalid JSON body", func(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.WriteFile(filepath.Join(in, "MANIFEST.json"), []byte("{not json"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want manifest parse error") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (invalid manifest is data-correctness)", code, exitDataErr) + } + }) + t.Run("unsupported format_version", func(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.WriteFile(filepath.Join(in, "MANIFEST.json"), + []byte(`{"format_version":99,"last_commit_ts":1}`), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want unsupported format_version") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d (unsupported manifest format_version is data-correctness)", code, exitDataErr) + } + }) +} + +// TestCLISelfTestMismatchRemovesSymlinkOutputButPreservesTarget pins +// codex P2 v19 #904: when --output is a symlink to a prior .fsm file +// and the new --self-test invocation mismatches, the cleanup must +// unlink the symlink (so the restore-visible --output path now +// resolves to ENOENT, matching the mismatch contract) while leaving +// the underlying target file intact (os.Remove on a symlink operates +// on the link, not the 
resolved target). +// +// Before v21 the IsRegular()-only check silently skipped the symlink +// cleanup; the new sidecar (matched=false) then described a fresh +// failed encode while --output still resolved to a prior valid .fsm, +// breaking the "no restore-visible FSM after self-test mismatch" +// invariant. Linux-only because Windows symlink semantics differ. +func TestCLISelfTestMismatchRemovesSymlinkOutputButPreservesTarget(t *testing.T) { + if isWindows { + t.Skip("symlink semantics differ on Windows") + } + t.Parallel() + rawIn := t.TempDir() + writeSQSFixture(t, rawIn) + emitMinimalManifest(t, rawIn, 7000) + canonicalIn := canonicalizeInput(t, rawIn) + + targetDir := t.TempDir() + target := filepath.Join(targetDir, "real.fsm") + const targetBody = "TARGET FSM BYTES (must survive symlink removal)" + if err := os.WriteFile(target, []byte(targetBody), 0o600); err != nil { + t.Fatalf("WriteFile target: %v", err) + } + out := filepath.Join(t.TempDir(), "out.fsm") + if err := os.Symlink(target, out); err != nil { + t.Fatalf("Symlink: %v", err) + } + + scratchBase := t.TempDir() + encodeOpts := backup.EncodeOptions{ + InputRoot: canonicalIn, + Adapters: backup.AdapterSet{SQS: true}, + LastCommitTS: 7000, + ManifestLastCommitTS: 7000, + SelfTest: true, + SelfTestDecodeOptions: backup.DecodeOptions{ + OutRoot: scratchBase, + Adapters: backup.AdapterSet{SQS: true}, + }, + } + encodeOpts.SetSelfTestCorruptHookForTest(flipBytesPastHeaderInTempCorruptHook(t)) + + cfg := &config{ + inputPath: canonicalIn, + outputPath: out, + adapters: backup.AdapterSet{SQS: true}, + selfTest: true, + } + mismatchPath := out + ".mismatch.txt" + + _, publishErr := writeAndPublish(cfg, encodeOpts, mismatchPath, quietLogger()) + if !errors.Is(publishErr, errSelfTestMismatch) { + t.Fatalf("publishErr = %v, want errSelfTestMismatch", publishErr) + } + // --output (the symlink) must now resolve to ENOENT. 
+ if _, statErr := os.Lstat(out); !os.IsNotExist(statErr) { + t.Errorf("symlink at --output not removed after mismatch (codex P2 v19 regression)") + } + // The target file (which the symlink pointed to) must survive. + gotTarget, rerr := os.ReadFile(target) + if rerr != nil { + t.Fatalf("target file vanished (os.Remove operated on resolved target instead of symlink): %v", rerr) + } + if string(gotTarget) != targetBody { + t.Errorf("target body mutated; want preserved") + } +} + +// TestCLIWriteAndPublishRemovesStaleFSMOnSelfTestMismatch pins codex +// P2 v10 #904: when a prior successful run left an .fsm on +// disk and a new --self-test invocation produces a mismatch, +// writeAndPublish must remove that stale .fsm. Otherwise encodeOne +// writes a fresh sidecar (matched=false, NEW SHA) alongside the OLD +// bytes — violating the CLI contract that a self-test failure leaves +// no restore-visible FSM, and making the sidecar describe an FSM that +// is not on disk. +// +// To drive a deterministic self-test mismatch end-to-end through the +// CLI's writeAndPublish, the test uses the backup package's exported +// test seam (SetSelfTestCorruptHookForTest) to flip bytes in the +// disk-backed self-test buffer between WriteTo and the re-decode. +func TestCLIWriteAndPublishRemovesStaleFSMOnSelfTestMismatch(t *testing.T) { + t.Parallel() + rawIn := t.TempDir() + writeSQSFixture(t, rawIn) + emitMinimalManifest(t, rawIn, 7000) + canonicalIn := canonicalizeInput(t, rawIn) + + out := filepath.Join(t.TempDir(), "out.fsm") + // Pre-place a stale .fsm — what a prior successful run would have + // left behind. The codex P2 v10 contract is that a subsequent + // self-test mismatch invalidates this file. 
+ stalePayload := []byte("STALE FSM FROM PRIOR SUCCESSFUL RUN") + if err := os.WriteFile(out, stalePayload, 0o600); err != nil { + t.Fatalf("WriteFile stale: %v", err) + } + + scratchBase := t.TempDir() + encodeOpts := backup.EncodeOptions{ + InputRoot: canonicalIn, + Adapters: backup.AdapterSet{SQS: true}, + LastCommitTS: 7000, + ManifestLastCommitTS: 7000, + SelfTest: true, + SelfTestDecodeOptions: backup.DecodeOptions{ + OutRoot: scratchBase, + Adapters: backup.AdapterSet{SQS: true}, + }, + } + // Flip bytes past the EKVPBBL1 header so the re-decode trips on + // a malformed entry length and the self-test returns matched=false. + encodeOpts.SetSelfTestCorruptHookForTest(flipBytesPastHeaderInTempCorruptHook(t)) + + cfg := &config{ + inputPath: canonicalIn, + outputPath: out, + adapters: backup.AdapterSet{SQS: true}, + selfTest: true, + } + mismatchPath := out + ".mismatch.txt" + + _, publishErr := writeAndPublish(cfg, encodeOpts, mismatchPath, quietLogger()) + if !errors.Is(publishErr, errSelfTestMismatch) { + t.Fatalf("publishErr = %v, want errSelfTestMismatch", publishErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf("stale .fsm at %s not removed after self-test mismatch (codex P2 v10)", out) + } + // The mismatch.txt should be present as the operator-visible + // record of the failed encode attempt. + if _, statErr := os.Stat(mismatchPath); statErr != nil { + t.Errorf("mismatch.txt missing after self-test mismatch: %v", statErr) + } +} + +// TestCLINonSelfTestExitTwoPreservesPriorFSM pins the surgical scope +// of the codex P2 v10 fix: non-self-test exit-2 paths (e.g. the +// manifest-floor HLC regression that fails BEFORE writeAndPublish) +// must NOT remove a prior .fsm. Only self-test mismatch +// triggers the cleanup; a runbook recovering from a typo'd +// --last-commit-ts still has its last good FSM on disk. 
+func TestCLINonSelfTestExitTwoPreservesPriorFSM(t *testing.T) { + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + stalePayload := []byte("STALE FSM FROM PRIOR SUCCESSFUL RUN") + if err := os.WriteFile(out, stalePayload, 0o600); err != nil { + t.Fatalf("WriteFile stale: %v", err) + } + // Manifest-floor regression → exit-2 from resolveLastCommitTS, + // before writeAndPublish runs. Stale .fsm should be preserved. + code, err := run([]string{ + "--input", in, + "--output", out, + "--last-commit-ts", "500", // below manifest 1000 + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want manifest-floor regression") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d", code, exitDataErr) + } + body, rerr := os.ReadFile(out) + if rerr != nil { + t.Fatalf("read stale .fsm: %v (must be preserved on non-self-test exit-2)", rerr) + } + if !bytes.Equal(body, stalePayload) { + t.Errorf("stale .fsm mutated; want preserved on manifest-floor regression") + } +} + +// TestCLIRoundTripSelfTestAllAdapters is the gold-standard CLI-level +// end-to-end test: a real adapter fixture, encoder runs with +// --self-test, exit 0, matched:true in the sidecar. 
+func TestCLIRoundTripSelfTestAllAdapters(t *testing.T) { + t.Parallel() + rawIn := t.TempDir() + writeSQSFixture(t, rawIn) + emitMinimalManifest(t, rawIn, 7000) + canonicalIn := canonicalizeInput(t, rawIn) + + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", canonicalIn, + "--output", out, + "--adapter", "sqs", + "--self-test", + }, quietLogger()) + if err != nil { + t.Fatalf("run with self-test: %v", err) + } + if code != exitSuccess { + t.Errorf("exit code = %d, want %d (self-test should match)", code, exitSuccess) + } + info := readSidecar(t, out) + if !info.SelfTest.Ran || !info.SelfTest.Matched { + t.Errorf("self_test Ran=%v Matched=%v, want both true", info.SelfTest.Ran, info.SelfTest.Matched) + } + if _, err := os.Stat(out + ".mismatch.txt"); err == nil { + t.Errorf("mismatch.txt exists on a successful self-test") + } +} + +// TestCLIManifestFloorLeavesNoStaleSidecar pins that the +// manifest-floor preflight failure (--last-commit-ts T < manifest; +// fails in resolveLastCommitTS BEFORE writeAndPublish) leaves NO +// .encode_info.json on disk — neither a fresh one nor a +// stale one from a prior run (the pre-encode cleanup at the start +// of encodeOne removes it). +// +// Note: the test name was previously TestCLISelfTestMismatchWritesSidecarWithMatchedFalse, +// which contradicted the assertion (the encode does NOT run on this +// path, so no sidecar is written). The actual sidecar-on-mismatch +// behavior is now pinned end-to-end by +// TestCLIWriteAndPublishRemovesStaleFSMOnSelfTestMismatch using the +// CLI-level corruption seam (codex P2 v6/v10 #904; claude v12 rename). +func TestCLIManifestFloorLeavesNoStaleSidecar(t *testing.T) { + t.Parallel() + // The pre-encode cleanup at the top of encodeOne removes any + // stale .encode_info.json before writeAndPublish runs. 
+ // On the manifest-floor path, resolveLastCommitTS exits with + // exit-2 BEFORE that cleanup even runs (it's the second step in + // encodeOne after readInputManifest). So the assertion is: a + // fresh TempDir produces no sidecar at all. + in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + // Force a data error via a too-low override. Exit code 2. + code, err := run([]string{"--input", in, "--output", out, "--last-commit-ts", "500"}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want data error") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d", code, exitDataErr) + } + // On the manifest-floor path the encode does not actually run + // (it fails in resolveLastCommitTS before writeAndPublish), so + // no sidecar should exist. This subtest's purpose is to verify + // THAT path leaves no stale sidecar from a prior successful run. + if _, statErr := os.Stat(out + ".encode_info.json"); !os.IsNotExist(statErr) { + t.Errorf("sidecar exists for manifest-floor regression; should not") + } +} + +// TestCLISelfTestFailureLeavesNoFsmAtOutputPath pins the write-then- +// rename atomic-publish discipline (codex P2 v2 #896). To trigger a +// real self-test failure deterministically from the CLI level we test +// via the lower-level EncodeSnapshot library — the CLI-only test path +// would require build-tagged corruption hooks. The library-level +// equivalent is TestEncodeSnapshotSelfTestDetectsCorruption (which +// asserts the buffered bytes never reach the io.Writer); this CLI +// test confirms the temp-file rename discipline by parsing the +// CLI's filesystem state after a normal --self-test success: the +// temp file must NOT exist after rename. +func TestCLISelfTestFailureLeavesNoFsmAtOutputPath(t *testing.T) { + t.Parallel() + // Use a deliberately mismatched --last-commit-ts override to drive + // a data-error exit; the CLI MUST NOT publish .fsm on data-error. 
+ in := t.TempDir() + emitMinimalManifest(t, in, 1000) + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{ + "--input", in, + "--output", out, + "--last-commit-ts", "500", + }, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want data error") + } + if code != exitDataErr { + t.Errorf("exit code = %d, want %d", code, exitDataErr) + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm at %s; data error must not publish", out) + } + // No temp file should linger either. + matches, _ := filepath.Glob(out + ".tmp-*") + if len(matches) > 0 { + t.Errorf("temp file lingered: %v", matches) + } +} + +// TestParseLastCommitTSDecimal + Hex pin both representations the +// --last-commit-ts flag accepts, and verify strict-parse rejection of +// trailing junk (claude high #904, codex P2 #904). +func TestParseLastCommitTS(t *testing.T) { + t.Parallel() + for _, tc := range []struct { + in string + want uint64 + }{ + {"0", 0}, + {"1234567890", 1234567890}, + {"0xff", 0xff}, + {"0X10", 0x10}, + } { + got, err := parseLastCommitTS(tc.in) + if err != nil { + t.Errorf("%q: %v", tc.in, err) + continue + } + if got != tc.want { + t.Errorf("%q: got %d want %d", tc.in, got, tc.want) + } + } + // Reject empty, malformed, and trailing junk. + for _, bad := range []string{ + "", + "abc", + "0xZZ", + "0xffZZ", // trailing hex garbage — fmt.Sscanf would accept as 0xff + "100oops", // trailing decimal garbage — fmt.Sscanf would accept as 100 + "-1", // negative + " 100 ext", // whitespace + extra + } { + if _, err := parseLastCommitTS(bad); err == nil { + t.Errorf("%q parsed successfully; want error", bad) + } + } +} + +// TestCLISidecarWriteRefusesSymlinkTarget pins codex P2 v25 #904: the +// CLI's encode_info sidecar writer must not follow a pre-existing +// symlink at .encode_info.json. 
An attacker (or a confused +// shared-host config) could plant a symlink pointing at a sensitive +// file the encoding user can write; the prior os.OpenFile + O_TRUNC +// path would have truncated that target and then written the JSON +// blob to it. backup.OpenSidecarFile refuses the open with ELOOP on +// unix. Skipped on Windows where symlink semantics differ and the +// Windows variant of OpenSidecarFile uses the Lstat-then-OpenFile +// guard. +func TestCLISidecarWriteRefusesSymlinkTarget(t *testing.T) { + if isWindows { + t.Skip("symlink-refusal semantics differ on Windows") + } + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + + outDir := t.TempDir() + out := filepath.Join(outDir, "out.fsm") + sidecarPath := backup.EncodeInfoSidecarPath(out) + // Pre-plant the attacker's victim file and the symlink. + victimDir := t.TempDir() + victim := filepath.Join(victimDir, "victim.json") + const victimBody = "VICTIM CONTENTS — must survive sidecar write" + if err := os.WriteFile(victim, []byte(victimBody), 0o600); err != nil { + t.Fatalf("WriteFile victim: %v", err) + } + if err := os.Symlink(victim, sidecarPath); err != nil { + t.Fatalf("Symlink at sidecar path: %v", err) + } + + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want sidecar open to fail on symlinked path") + } + if code != exitUserErr { + t.Errorf("exit code = %d, want %d (operator-env error, not data-correctness)", code, exitUserErr) + } + assertSymlinkSidecarRollbackInvariants(t, victim, victimBody, out, sidecarPath) +} + +// assertSymlinkSidecarRollbackInvariants checks all three post- +// failure invariants for TestCLISidecarWriteRefusesSymlinkTarget: +// +// (1) the symlink target file is unchanged (no-follow open never +// resolved the link), +// (2) the .fsm at --output is removed (v32 rollback fired after +// the .fsm was renamed into place but before encodeOne +// returned), and +// (3) the 
operator-placed symlink at the sidecar path is +// preserved (v33 rollback only removes regular files; +// non-regular sidecar entries are operator-owned). +// +// Extracted into a helper to keep the test body under the cyclop +// bound. +func assertSymlinkSidecarRollbackInvariants(t *testing.T, victim string, victimBody, out, sidecarPath string) { + t.Helper() + got, rerr := os.ReadFile(victim) + if rerr != nil { + t.Fatalf("read victim: %v", rerr) + } + if string(got) != victimBody { + t.Errorf("victim mutated; OpenSidecarFile followed the symlink (codex P2 v25 regression)") + } + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm at %s should be removed by v32 rollback after sidecar failure", out) + } + linkInfo, lerr := os.Lstat(sidecarPath) + if lerr != nil { + t.Errorf("operator-placed symlink at sidecar path was removed by rollback (codex P2 v32 regression): %v", lerr) + return + } + if linkInfo.Mode()&os.ModeSymlink == 0 { + t.Errorf("sidecar path mode = %s; expected symlink preserved by v33 rollback", linkInfo.Mode()) + } +} + +// TestCLISidecarWriteRefusesHardLinkTarget pins codex P2 v33 #904: +// when the sidecar path is a hard link to an operator-owned file, +// OpenSidecarFile refuses to truncate (Nlink > 1 guard); the v32 +// rollback's IsRegular()-only gate would have unlinked the hard +// link anyway, destroying operator state. v34 routes the rollback +// through a sidecarTruncated bool so a refused OpenSidecarFile +// keeps the operator's entry intact. Unix-only (Windows hard-link +// semantics differ). +func TestCLISidecarWriteRefusesHardLinkTarget(t *testing.T) { + if isWindows { + t.Skip("hard-link refusal semantics differ on Windows") + } + t.Parallel() + in := t.TempDir() + emitMinimalManifest(t, in, 100) + + outDir := t.TempDir() + out := filepath.Join(outDir, "out.fsm") + sidecarPath := backup.EncodeInfoSidecarPath(out) + // Pre-plant a victim file and hard-link the sidecar path to it. 
+ victimDir := t.TempDir() + victim := filepath.Join(victimDir, "victim.json") + const victimBody = "VICTIM CONTENTS — must survive hard-link rejection" + if err := os.WriteFile(victim, []byte(victimBody), 0o600); err != nil { + t.Fatalf("WriteFile victim: %v", err) + } + if err := os.Link(victim, sidecarPath); err != nil { + t.Fatalf("Link victim → sidecar path: %v", err) + } + + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err == nil { + t.Fatalf("run succeeded; want sidecar open to fail on hard-linked path") + } + if code != exitUserErr { + t.Errorf("exit code = %d, want %d (operator-env error)", code, exitUserErr) + } + // Victim contents MUST be preserved (OpenSidecarFile's Nlink + // guard refused to truncate; v34 rollback's sidecarTruncated=false + // branch then skipped the os.Remove). + got, rerr := os.ReadFile(victim) + if rerr != nil { + t.Fatalf("read victim: %v", rerr) + } + if string(got) != victimBody { + t.Errorf("victim mutated; OpenSidecarFile truncated through hard link (codex P2 v33 regression)") + } + // The hard link at the sidecar path MUST also survive — the + // rollback no longer os.Removes it (codex P2 v33 fix). + if _, statErr := os.Lstat(sidecarPath); statErr != nil { + t.Errorf("hard link at sidecar path was removed by rollback (codex P2 v33 regression): %v", statErr) + } + // .fsm at outputPath MUST be removed (rollback's FSM branch + // always fires when the encode succeeded but sidecar failed). + if _, statErr := os.Stat(out); !os.IsNotExist(statErr) { + t.Errorf(".fsm at %s should be removed by rollback after sidecar failure", out) + } +} + +// TestCLIPublishesFsmAndSidecarMode0600 pins claude v4 #904: the +// produced .fsm and ENCODE_INFO.json are created with mode 0o600 so a +// multi-user backup host does not get a world-readable dataset. The +// earlier os.Create-based path relied on umask (typically 0644). +// +// Skips on Windows where Unix-style perm bits are not meaningful. 
+func TestCLIPublishesFsmAndSidecarMode0600(t *testing.T) { + t.Parallel() + if isWindows { + t.Skip("perm bits not meaningful on Windows") + } + in := t.TempDir() + emitMinimalManifest(t, in, 100) + out := filepath.Join(t.TempDir(), "out.fsm") + code, err := run([]string{"--input", in, "--output", out}, quietLogger()) + if err != nil || code != exitSuccess { + t.Fatalf("run failed: code=%d err=%v", code, err) + } + for _, p := range []string{out, out + ".encode_info.json"} { + info, err := os.Stat(p) + if err != nil { + t.Fatalf("stat %s: %v", p, err) + } + // Only check the owner bits (rwx); umask cannot widen beyond + // what OpenFile requested but a misconfigured fs.ModeSticky + // or similar could theoretically narrow. We just assert no + // group/other access bits are set. + if perm := info.Mode().Perm(); perm&0o077 != 0 { + t.Errorf("%s mode = %o, want no group/other bits (0o600 or stricter)", p, perm) + } + } +} + +// TestParseAdapterSetRejectsEmptySelection pins codex P2 #904: a CSV +// of only separators/whitespace MUST surface as a flag-parse error, not +// silently produce a zero AdapterSet that would publish a header-only +// .fsm. +func TestParseAdapterSetRejectsEmptySelection(t *testing.T) { + t.Parallel() + for _, bad := range []string{ + " ,", + ",,,", + " ", + ",", + } { + if _, err := parseAdapterSet(bad); err == nil { + t.Errorf("--adapter %q parsed to a non-empty set; want error", bad) + } + } + // Single-adapter selection still works. 
+ set, err := parseAdapterSet("s3") + if err != nil { + t.Fatalf("--adapter s3: %v", err) + } + if !set.S3 || set.Redis || set.DynamoDB || set.SQS { + t.Errorf("--adapter s3 produced %+v, want only S3", set) + } +} diff --git a/internal/backup/encode_info.go b/internal/backup/encode_info.go new file mode 100644 index 000000000..5ae2773c4 --- /dev/null +++ b/internal/backup/encode_info.go @@ -0,0 +1,114 @@ +package backup + +import ( + "encoding/json" + "io" + "time" + + "github.com/cockroachdb/errors" +) + +// EncodeInfoFormatVersion is the on-disk schema version for ENCODE_INFO.json. +// Bumped on incompatible schema changes; ReadEncodeInfo rejects unknown +// versions with ErrUnsupportedEncodeInfoFormatVersion so a future encoder +// release cannot silently drop fields a current operator relies on. +const EncodeInfoFormatVersion uint32 = 1 + +// ErrUnsupportedEncodeInfoFormatVersion is returned by ReadEncodeInfo when +// the sidecar's format_version is not EncodeInfoFormatVersion. Mirrors +// the decoder's ErrUnsupportedFormatVersion contract so callers can branch +// on errors.Is. +var ErrUnsupportedEncodeInfoFormatVersion = errors.New("backup: unsupported ENCODE_INFO format_version") + +// EncodeInfoSelfTest captures the self-test outcome (parent §"Round-trip +// self-test"). Ran=false when --self-test was off; Matched is only +// meaningful when Ran=true. +type EncodeInfoSelfTest struct { + Ran bool `json:"ran"` + Matched bool `json:"matched"` +} + +// EncodeInfo is the on-disk shape of .encode_info.json. Schema +// pinned by docs/design/2026_06_01_proposed_snapshot_encode_cli.md +// §"ENCODE_INFO.json". Restore operators rely on this for "encoded for +// the right cluster, by the right encoder version, against this exact +// file" confirmation; tag changes are a breaking schema bump. 
type EncodeInfo struct {
	// FormatVersion is the sidecar schema version. NewEncodeInfo stamps
	// it with EncodeInfoFormatVersion.
	FormatVersion  uint32 `json:"format_version"`
	EncoderVersion string `json:"encoder_version"`
	// EncoderKeyFormatVersion is stamped by NewEncodeInfo from
	// CurrentFormatVersion.
	EncoderKeyFormatVersion uint32 `json:"encoder_key_format_version"`
	// WallTimeISO is the UTC creation time in RFC3339Nano form, as
	// written by NewEncodeInfo.
	WallTimeISO            string `json:"wall_time_iso"`
	InputRoot              string `json:"input_root"`
	OutputFSMPath          string `json:"output_fsm_path"`
	OutputFSMSHA256        string `json:"output_fsm_sha256"`
	LastCommitTS           uint64 `json:"last_commit_ts"`
	LastCommitTSOverridden bool   `json:"last_commit_ts_overridden"`
	ManifestLastCommitTS   uint64 `json:"manifest_last_commit_ts"`
	// ManifestClusterID is left out of the JSON entirely when empty
	// (omitempty).
	ManifestClusterID string             `json:"manifest_cluster_id,omitempty"`
	AdaptersEnabled   []string           `json:"adapters_enabled"`
	SelfTest          EncodeInfoSelfTest `json:"self_test"`
}

// NewEncodeInfo stamps the current format version + wall time so callers
// only fill in the encode-specific fields. Mirrors NewPhase0SnapshotManifest.
// EncoderKeyFormatVersion is the on-disk key format the encoder produces;
// today it tracks CurrentFormatVersion (no separate key-format version
// has been declared), which is conservative: future encoder bumps that
// change MVCC layout MUST bump both manifest and encoder-key formats so
// restore operators can correlate.
//
// now is converted to UTC before formatting, so the caller's time zone
// does not leak into WallTimeISO. Every field other than FormatVersion,
// EncoderKeyFormatVersion and WallTimeISO is returned at its zero value.
func NewEncodeInfo(now time.Time) EncodeInfo {
	return EncodeInfo{
		FormatVersion:           EncodeInfoFormatVersion,
		EncoderKeyFormatVersion: CurrentFormatVersion,
		WallTimeISO:             now.UTC().Format(time.RFC3339Nano),
	}
}

// WriteEncodeInfo serializes info to w. Caller is responsible for the
// fsync+close discipline (the cmd wrapper uses os.File.Sync then Close
// to surface late writeback errors — gemini r1 medium on #810).
//
// The output is indented JSON followed by the newline json.Encoder
// appends. Any encode or write failure is returned wrapped with the
// message "encode ENCODE_INFO.json".
func WriteEncodeInfo(w io.Writer, info EncodeInfo) error {
	enc := json.NewEncoder(w)
	// NOTE(review): confirm the indent width below against the
	// checked-in file before relying on the exact byte layout.
	enc.SetIndent("", " ")
	if err := enc.Encode(info); err != nil {
		return errors.Wrap(err, "encode ENCODE_INFO.json")
	}
	return nil
}

// ReadEncodeInfo parses an ENCODE_INFO.json payload from r.
Rejects +// unknown format_version values with ErrUnsupportedEncodeInfoFormatVersion +// so a future schema bump surfaces as a typed error rather than a silent +// field drop. Unknown JSON fields are tolerated to allow forward-compat +// additions within the same format_version. +func ReadEncodeInfo(r io.Reader) (EncodeInfo, error) { + body, err := io.ReadAll(r) + if err != nil { + return EncodeInfo{}, errors.Wrap(err, "read ENCODE_INFO.json") + } + var probe struct { + FormatVersion uint32 `json:"format_version"` + } + if err := json.Unmarshal(body, &probe); err != nil { + return EncodeInfo{}, errors.Wrap(err, "decode ENCODE_INFO.json format_version") + } + if probe.FormatVersion != EncodeInfoFormatVersion { + return EncodeInfo{}, errors.Wrapf(ErrUnsupportedEncodeInfoFormatVersion, "got %d, want %d", probe.FormatVersion, EncodeInfoFormatVersion) + } + var info EncodeInfo + if err := json.Unmarshal(body, &info); err != nil { + return EncodeInfo{}, errors.Wrap(err, "decode ENCODE_INFO.json") + } + return info, nil +} + +// EncodeInfoSidecarPath returns the path-derived sidecar location for a +// given .fsm output path. Multiple .fsm files can share a directory +// (e.g., per-node dumps under /backups/); a static "ENCODE_INFO.json" +// name would silently overwrite siblings (gemini medium #896). +// +// Convention: append ".encode_info.json" to the full output path. The +// same scheme gpg and sha256sum follow when their input is path-addressable. +func EncodeInfoSidecarPath(fsmPath string) string { + return fsmPath + ".encode_info.json" +} diff --git a/internal/backup/encode_info_test.go b/internal/backup/encode_info_test.go new file mode 100644 index 000000000..beb42a756 --- /dev/null +++ b/internal/backup/encode_info_test.go @@ -0,0 +1,95 @@ +package backup + +import ( + "bytes" + "strings" + "testing" + "time" + + "github.com/cockroachdb/errors" +) + +// TestEncodeInfoRoundTrip pins WriteEncodeInfo -> ReadEncodeInfo for a +// populated struct. 
Forward-compat: an ENCODE_INFO.json with unknown +// extra fields at the same format_version must decode cleanly. +func TestEncodeInfoRoundTrip(t *testing.T) { + t.Parallel() + info := NewEncodeInfo(time.Date(2026, 6, 1, 12, 0, 0, 0, time.UTC)) + info.EncoderVersion = "test-rev" + info.InputRoot = "/in" + info.OutputFSMPath = "/out.fsm" + info.OutputFSMSHA256 = "deadbeef" + info.LastCommitTS = 18446744073709551610 + info.LastCommitTSOverridden = false + info.ManifestLastCommitTS = 18446744073709551610 + info.ManifestClusterID = "cluster-1" + info.AdaptersEnabled = []string{"redis", "dynamodb", "s3", "sqs"} + info.SelfTest = EncodeInfoSelfTest{Ran: true, Matched: true} + + var buf bytes.Buffer + if err := WriteEncodeInfo(&buf, info); err != nil { + t.Fatalf("WriteEncodeInfo: %v", err) + } + got, err := ReadEncodeInfo(&buf) + if err != nil { + t.Fatalf("ReadEncodeInfo: %v", err) + } + if got.EncoderVersion != "test-rev" || got.OutputFSMSHA256 != "deadbeef" || got.LastCommitTS != 18446744073709551610 { + t.Errorf("round-trip mismatch: %+v", got) + } + if got.SelfTest.Ran != true || got.SelfTest.Matched != true { + t.Errorf("self_test field round-trip: %+v", got.SelfTest) + } + + // Forward-compat: extra field at same version decodes cleanly. + withExtra := `{"format_version":1,"encoder_version":"x","wall_time_iso":"2026-06-01T12:00:00Z","input_root":"/in","output_fsm_path":"/out.fsm","output_fsm_sha256":"d","last_commit_ts":1,"last_commit_ts_overridden":false,"manifest_last_commit_ts":1,"adapters_enabled":[],"self_test":{"ran":false,"matched":false},"future_field":"ignored"}` + if _, err := ReadEncodeInfo(strings.NewReader(withExtra)); err != nil { + t.Errorf("forward-compat unknown field rejected: %v", err) + } +} + +// TestEncodeInfoRejectsUnknownFormatVersion mirrors the decoder's +// TestManifestVersionGate: a future schema bump surfaces as a typed +// error rather than a silent field drop. 
+func TestEncodeInfoRejectsUnknownFormatVersion(t *testing.T) { + t.Parallel() + bad := `{"format_version":99,"encoder_version":"x","wall_time_iso":"2026-06-01T12:00:00Z","input_root":"/in","output_fsm_path":"/out.fsm","output_fsm_sha256":"d","last_commit_ts":1,"last_commit_ts_overridden":false,"manifest_last_commit_ts":1,"adapters_enabled":[],"self_test":{"ran":false,"matched":false}}` + _, err := ReadEncodeInfo(strings.NewReader(bad)) + if !errors.Is(err, ErrUnsupportedEncodeInfoFormatVersion) { + t.Fatalf("err = %v, want ErrUnsupportedEncodeInfoFormatVersion", err) + } +} + +// TestExclusionsLegacyManifestOmitsRenameS3Collisions pins forward-compat +// on the new rename_s3_collisions field. Older manifests written before +// M6 do not include the field; ReadManifest must decode them with the +// zero value (false) — NOT reject as ErrInvalidManifest (gemini medium +// v5 #896). +func TestExclusionsLegacyManifestOmitsRenameS3Collisions(t *testing.T) { + t.Parallel() + // Build a known-valid manifest via the public constructor, then + // rewrite the JSON to omit the rename_s3_collisions field — this + // is exactly the on-disk shape a pre-M6 decoder run would produce. + m := NewPhase0SnapshotManifest(time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)) + m.Exclusions = &Exclusions{} + m.Adapters = &Adapters{} + var buf bytes.Buffer + if err := WriteManifest(&buf, m); err != nil { + t.Fatalf("WriteManifest: %v", err) + } + // Strip the new field to simulate a legacy producer. 
+ legacy := strings.ReplaceAll(buf.String(), `"rename_s3_collisions":false,`, ``) + legacy = strings.ReplaceAll(legacy, `,"rename_s3_collisions":false`, ``) + legacy = strings.ReplaceAll(legacy, `"rename_s3_collisions":false`, ``) + + got, err := ReadManifest(strings.NewReader(legacy)) + if err != nil { + t.Fatalf("legacy manifest must decode without error, got: %v", err) + } + if got.Exclusions == nil { + t.Fatalf("Exclusions = nil") + } + if got.Exclusions.RenameS3Collisions != false { + t.Errorf("RenameS3Collisions = %v, want false (zero value for missing field)", got.Exclusions.RenameS3Collisions) + } +} diff --git a/internal/backup/encode_s3.go b/internal/backup/encode_s3.go index b0b77dce4..4e537963b 100644 --- a/internal/backup/encode_s3.go +++ b/internal/backup/encode_s3.go @@ -5,6 +5,7 @@ import ( "encoding/json" "os" "path/filepath" + "strings" "github.com/bootjp/elastickv/internal/s3keys" "github.com/cockroachdb/errors" @@ -83,10 +84,34 @@ func (e *S3RecordEncoder) Encode(b *snapshotBuilder) error { return err } for _, ent := range entries { + name := ent.Name() if !ent.IsDir() { - continue + // Top-level entries under s3/ must be bucket directories. + // A regular file or symlink here means the dump is + // malformed or partially truncated — silently skipping + // would let the encoder publish a partial .fsm with + // the affected bucket omitted (codex P2 v31 #904; the + // manifest's empty S3 scope from populateAdapterScopes + // cannot otherwise distinguish missing bucket from + // dumped-empty bucket). + // + // Reserved-prefix entries that start with "_" (e.g. + // _incomplete_uploads, _orphans) are handled by their + // own dedicated paths and are NOT top-level buckets; + // the fail-closed should not catch them. 
Today the + // reverse encoder doesn't emit those subtrees at all + // (covered by ErrEncodeUnsupportedS3IncompleteUploads / + // ErrEncodeUnsupportedS3Orphans) so any "_*" entry here + // would have been rejected upstream — but skip them + // here too for forward compat. + if strings.HasPrefix(name, "_") { + continue + } + return errors.Wrapf(ErrS3EncodeNotRegular, + "s3/%s is not a directory (mode=%s); top-level entries under s3/ must be bucket directories", + name, ent.Type()) } - if err := e.encodeBucket(b, root, ent.Name()); err != nil { + if err := e.encodeBucket(b, root, name); err != nil { return err } } diff --git a/internal/backup/encode_s3_test.go b/internal/backup/encode_s3_test.go index 840bd66b0..13487d389 100644 --- a/internal/backup/encode_s3_test.go +++ b/internal/backup/encode_s3_test.go @@ -137,6 +137,54 @@ func TestS3EncodeMissingDirIsNoop(t *testing.T) { } } +// TestS3EncodeRejectsNonDirectoryBucketEntry pins codex P2 v32 #904: +// when an entry directly under s3/ is a regular file or symlink +// rather than a bucket directory, the encoder must fail closed with +// ErrS3EncodeNotRegular rather than silently skipping (which would +// publish a partial .fsm with the affected bucket omitted; the +// manifest's deferred-enumeration empty S3 scope cannot otherwise +// flag the missing data). Reserved-prefix `_*` entries (e.g. +// `_incomplete_uploads`) are explicitly tolerated because they're +// handled by dedicated paths. +func TestS3EncodeRejectsNonDirectoryBucketEntry(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.MkdirAll(filepath.Join(in, "s3"), 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + // Plant a regular file where a bucket directory should be. 
+ if err := os.WriteFile(filepath.Join(in, "s3", "stray.txt"), []byte("oops"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + b := newSnapshotBuilder(s3EncTS) + err := NewS3RecordEncoder(in).Encode(b) + if !errors.Is(err, ErrS3EncodeNotRegular) { + t.Fatalf("Encode err = %v, want errors.Is ErrS3EncodeNotRegular", err) + } +} + +// TestS3EncodeIgnoresReservedPrefixEntry pins that codex P2 v32's +// fail-closed for non-directory top-level entries does NOT fire on +// reserved-prefix entries (those starting with "_"). The reverse +// encoder's unsupported-features guard handles those subtrees +// separately via ErrEncodeUnsupportedS3IncompleteUploads / +// ErrEncodeUnsupportedS3Orphans. +func TestS3EncodeIgnoresReservedPrefixEntry(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.MkdirAll(filepath.Join(in, "s3"), 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + // Reserved-prefix file (e.g., a marker the operator left). + if err := os.WriteFile(filepath.Join(in, "s3", "_marker"), []byte("x"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + b := newSnapshotBuilder(s3EncTS) + if err := NewS3RecordEncoder(in).Encode(b); err != nil { + t.Errorf("Encode err = %v, want nil (reserved-prefix entries should be skipped)", err) + } +} + // TestS3EncodeRejectsNonRegularBucketMeta pins the pre-open guard: a // _bucket.json that is a directory is refused with ErrS3EncodeNotRegular. func TestS3EncodeRejectsNonRegularBucketMeta(t *testing.T) { diff --git a/internal/backup/encode_snapshot.go b/internal/backup/encode_snapshot.go new file mode 100644 index 000000000..dfc38c436 --- /dev/null +++ b/internal/backup/encode_snapshot.go @@ -0,0 +1,931 @@ +package backup + +import ( + "bytes" + "crypto/sha256" + "hash" + "io" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + + "github.com/cockroachdb/errors" +) + +// ErrSelfTestLowerLastCommitTS is returned when the operator-supplied +// T is below the manifest's last_commit_ts. 
// The HLC ceiling invariant (CLAUDE.md "Timestamp Oracle") forbids
// lowering the ceiling on restore: a lower T would let a post-restart
// leader issue a read ts ≤ a restored row's commit ts.
//
// Enforced at two layers:
//   - CLI (`resolveLastCommitTS`) rejects --last-commit-ts T < manifest
//     before EncodeSnapshot is called (exit code 2).
//   - Library (`validateEncodeOptions`, through its
//     validateEncodeOptionsData helper) rejects when the caller threads
//     `opts.ManifestLastCommitTS > 0` and `opts.LastCommitTS` is below
//     it — defense-in-depth for in-process callers (Phase 1 live
//     extractor, integration tests) that bypass the CLI.
//
// Callers can errors.Is on this sentinel to map to the right exit code
// (claude v3 doc bug #904 + claude v7 doc bug #904 + codex P2 v2 #904).
var ErrSelfTestLowerLastCommitTS = errors.New("backup: --last-commit-ts T < manifest.last_commit_ts (HLC ceiling regression)")

// ErrEncodeUnsupportedDynamoDBLayout is returned when an input dump
// declares `dynamodb_layout: "jsonl"` in MANIFEST.json. The DynamoDB
// reverse encoder only walks per-item files (items/*.json,
// items/*/*.json) and would silently skip every items/data-*.jsonl
// file, producing an .fsm with only table metadata and no items —
// a silent-data-loss restore artifact (codex P2 v7 #904). Fail closed
// until the encoder learns the JSONL layout (M7 / future milestone).
var ErrEncodeUnsupportedDynamoDBLayout = errors.New("backup: DynamoDB JSONL layout not supported by encoder")

// ErrEncodeUnsupportedS3IncompleteUploads is returned when a caller
// (the CLI threading manifest.exclusions.include_incomplete_uploads,
// or a library caller setting EncodeOptions.S3IncludeIncompleteUploads)
// asks for S3 in-flight multipart uploads to round-trip, but the
// reverse encoder cannot rebuild that subtree. The S3 reverse encoder
// silently skips `_incomplete_uploads/` payload directories
// (internal/backup/encode_s3_objects.go), so a dump that included
// those records would publish an .fsm missing them. Fail closed until
// the encoder learns the subtree (codex P2 v21 #904).
var ErrEncodeUnsupportedS3IncompleteUploads = errors.New("backup: S3 include_incomplete_uploads not supported by encoder")

// ErrEncodeUnsupportedS3Orphans is returned when the manifest (or a
// library caller) requests round-tripping S3 pre-generation orphan
// blob chunks but the reverse encoder cannot rebuild that subtree.
// Same pattern as ErrEncodeUnsupportedS3IncompleteUploads — the S3
// reverse encoder silently skips `_orphans/` payload directories;
// fail closed until the encoder learns them (codex P2 v21 #904).
var ErrEncodeUnsupportedS3Orphans = errors.New("backup: S3 include_orphans not supported by encoder")

// ErrEncodeUnsupportedSQSPreserveVisibility is returned when the
// manifest requests preserving in-flight SQS message visibility state
// (`preserve_sqs_visibility=true`), but the reverse encoder
// unconditionally resets VisibleAtMillis / receive count / first
// receive / receipt token to zero on every restored message
// (internal/backup/encode_sqs.go). A dump that intentionally
// preserved visibility would silently restore as "every message
// visible/reset" without this guard (codex P2 v21 #904).
var ErrEncodeUnsupportedSQSPreserveVisibility = errors.New("backup: preserve_sqs_visibility not supported by encoder")

// ErrRedisEncodeMultiDBUnsupported is returned when the input tree
// contains a Redis db_<N>/ for any N != 0, or contains multiple
// db_<N> directories. The current Redis MVCC key prefixes
// (!redis|str|, !redis|hll|, !redis|ttl|, …) carry NO database
// component, so feeding two distinct DBs into the same snapshot
// builder would either collide on same-named keys or silently merge
// both DBs under db_0 on restore (DecodeOptions.RedisDBIndex
// defaults to 0). Failing closed preserves correctness until Phase 1
// makes the native keys DB-aware (codex P2 v14 #904).
//
// v14 originally fanned out across db_<N> to address codex P1 v13's
// silent-data-loss concern; codex's v14 follow-up clarified that
// fan-out under the current key format produces mis-scoped output.
// The corrected fix replaces fan-out with fail-closed.
var ErrRedisEncodeMultiDBUnsupported = errors.New("backup: redis encoder requires single db_0 (multi-DB or non-zero DB not yet supported)")

// ErrEncodeAdapterData marks every error returned by an adapter
// encoder (Redis / DynamoDB / S3 / SQS) so callers can distinguish
// "the input tree contained content the encoder cannot translate"
// from "operator passed a bad flag". The encoder is offline-only —
// every adapter error originates from rejecting the content under
// opts.InputRoot (a malformed DynamoDB _schema.json, an S3 collision
// artifact the encoder cannot reverse, a SQS side-record with an
// unknown kind, …). These are data-correctness failures, not user
// errors; the CLI maps this sentinel to exit 2 so runbooks can branch
// on exit status to quarantine bad dump data (codex P2 v9 #904).
//
// Wrapped via errors.Mark inside runAdapterEncoders so the original
// adapter sentinel chain (ErrDDBEncodeInvalidSchema, …) is preserved
// for callers that errors.Is on the more specific type.
+var ErrEncodeAdapterData = errors.New("backup: adapter encoder rejected input tree") + +// The encoder dispatch order (redis → dynamodb → s3 → sqs) is encoded +// inside adapterRunners() and is intentionally distinct from decode.go's +// finalize order (dynamodb → s3 → redis → sqs). The final .fsm byte +// sequence is determined by encoded-key sort (snapshotBuilder.WriteTo), +// not by adapter fan-out order, so either ordering is correct as long +// as it is fixed. The encoder follows the parent design doc's +// enumeration order so ENCODE_INFO.json adapters_enabled is bytewise +// reproducible across runs that pass --adapter in different sequences +// (claude review v7 #896). + +// EncodeOptions configures EncodeSnapshot. Mirrors the decoder's +// DecodeOptions in shape: required InputRoot, AdapterSet, then per-adapter +// option flags read back from the input MANIFEST.json by the CLI. +type EncodeOptions struct { + // InputRoot is the directory tree root produced by the decoder. + // Must contain MANIFEST.json; per-adapter encoders read their + // subtrees (redis/, dynamodb/, s3/, sqs/) directly off this root. + InputRoot string + // Adapters selects which adapter encoders to invoke; disabled + // adapters are skipped without error. Mirrors DecodeOptions.Adapters. + Adapters AdapterSet + // LastCommitTS is the EFFECTIVE T used for both the EKVPBBL1 + // header and every key's invTS = ^T. Callers pass manifest.last_commit_ts + // by default and the --last-commit-ts override otherwise. + LastCommitTS uint64 + // DynamoDBBundleJSONL is true when the input dump's MANIFEST.json + // has `dynamodb_layout: "jsonl"`. The reverse encoder does not + // support that layout — it would silently skip every + // items/data-*.jsonl file and publish an .fsm with only table + // metadata. Fail-closed via ErrEncodeUnsupportedDynamoDBLayout + // when true (codex P2 v7 #904). When the encoder gains JSONL + // support, this field will switch from a guard to a control. 
+ DynamoDBBundleJSONL bool + + // S3IncludeIncompleteUploads is true when the input dump's + // MANIFEST.json has `exclusions.include_incomplete_uploads=true` + // (the producer dumped in-flight multipart uploads under + // _incomplete_uploads/). The reverse encoder cannot rebuild that + // subtree today; fail-closed via ErrEncodeUnsupportedS3IncompleteUploads + // when true AND Adapters.S3 is enabled (codex P2 v21 #904). + S3IncludeIncompleteUploads bool + + // S3IncludeOrphans is true when the input dump's MANIFEST.json + // has `exclusions.include_orphans=true` (the producer dumped + // pre-generation orphan blob chunks under _orphans/). The reverse + // encoder cannot rebuild that subtree today; fail-closed via + // ErrEncodeUnsupportedS3Orphans when true AND Adapters.S3 is + // enabled (codex P2 v21 #904). + S3IncludeOrphans bool + + // PreserveSQSVisibility is true when the input dump's MANIFEST.json + // has `exclusions.preserve_sqs_visibility=true` (the producer + // preserved in-flight message visibility state — VisibleAtMillis, + // receive count, first receive, receipt token). The reverse + // encoder unconditionally zeros those fields on every restored + // message; fail-closed via ErrEncodeUnsupportedSQSPreserveVisibility + // when true AND Adapters.SQS is enabled (codex P2 v21 #904). + PreserveSQSVisibility bool + + // ManifestLastCommitTS is the floor LastCommitTS must not fall + // below. When > 0, EncodeSnapshot fails-closed with + // ErrSelfTestLowerLastCommitTS if LastCommitTS < ManifestLastCommitTS. + // This is defense-in-depth for the CLI's pre-check (which already + // rejects --last-commit-ts T < manifest), and it's the load-bearing + // guard for future in-process library callers (Phase 1 live extractor, + // integration tests) that bypass the CLI: a library caller that + // forgets to compare against the manifest can no longer silently + // publish a low-TS .fsm (codex P2 v2 #904). 
Callers that genuinely + // have no manifest reference (synthetic test fixtures) leave this + // at 0 to opt out of the check. + ManifestLastCommitTS uint64 + // SelfTest enables the round-trip self-test. When true, + // EncodeSnapshot writes the FSM to an on-disk temp file under + // SelfTestDecodeOptions.OutRoot (encode-self-test-fsm-*), streams + // it through DecodeSnapshot, and copies to the caller's io.Writer + // ONLY if the decode survives — i.e. the bytes the encoder + // produced are loadable. When false, the FSM streams straight to + // the writer with no extra buffering. Memory cost in self-test + // mode is O(1) on top of the sort working set (the temp file + // holds the snapshot; only a small streaming buffer is in RAM). + SelfTest bool + // SelfTestDecodeOptions are threaded into the scratch DecodeSnapshot + // call. The CLI reads MANIFEST.json's Exclusions + DynamoDBLayout + // and populates this so the self-test's scratch tree matches what + // the original decoder would have produced. + SelfTestDecodeOptions DecodeOptions + + // AllowMissingManifest opts out of the MANIFEST.json presence + // check in validateEncodeOptions. When false (default), + // EncodeSnapshot requires /MANIFEST.json to exist — + // the contract on InputRoot has always claimed this, but until + // codex P2 v17 #904 the library only checked the path was a + // directory, so a real library caller pointing at the wrong + // directory would silently emit a header-only .fsm (each enabled + // adapter no-ops when its top-level subdir is missing). + // + // Set to true for synthetic test fixtures that don't have a + // MANIFEST.json on disk. Production callers (CLI, Phase 1 + // in-process extractor) MUST leave this at false so a bad + // InputRoot surfaces an explicit error rather than a + // silent-empty .fsm. 
+ AllowMissingManifest bool + + // corruptBufferForTest is an unexported test-only hook that fires + // against the on-disk self-test buffer AFTER snapshotBuilder.WriteTo + // returns but BEFORE the self-test DecodeSnapshot call (when + // SelfTest=true). Same-package tests use it to inject corruption + // reachable by the self-test but never reaching the io.Writer + // passed to EncodeSnapshot (the write-then-rename invariant: a + // self-test failure must not publish corrupt bytes — codex P2 v6 + // #896). External callers cannot set it (lowercase identifier). + // + // The hook receives the *os.File handle (positioned at offset 0) + // of the disk-backed self-test buffer; tests typically WriteAt + // a byte flip and rely on Seek-back-to-0 before returning so + // the encoder's subsequent Read sees the corrupted bytes. + corruptBufferForTest func(*os.File) +} + +// SetSelfTestCorruptHookForTest installs a same-process hook that +// fires against the on-disk self-test buffer between WriteTo and the +// re-decode call. The hook can WriteAt into the file to inject +// corruption so the subsequent self-test mismatches deterministically. +// +// Production code MUST NOT call this; it is exclusively a test seam +// for callers OUTSIDE package backup (specifically the +// cmd/elastickv-snapshot-encode CLI tests, which need to drive a real +// end-to-end self-test mismatch to verify the stale-.fsm cleanup +// path — codex P2 v10 #904). In-package tests should set +// EncodeOptions.corruptBufferForTest directly. +func (o *EncodeOptions) SetSelfTestCorruptHookForTest(hook func(*os.File)) { + o.corruptBufferForTest = hook +} + +// EncodeResult is the public return value from EncodeSnapshot. Mirrors +// the decoder's DecodeResult shape. +type EncodeResult struct { + // Header is what ReadSnapshotWithHeader returned when the encoder + // decoded its own output for the self-test. 
Header.LastCommitTS + // equals the effective T (uniform-stamping rule per parent doc + // §"MVCC re-encoding"). + Header SnapshotHeader + // BytesWritten is the number of bytes written to the caller's + // io.Writer (the SHA256-anchored payload). + BytesWritten int64 + // SHA256 of the produced .fsm bytes (raw 32-byte digest; the CLI + // hex-encodes it via encoding/hex when writing ENCODE_INFO.json). + SHA256 [32]byte + // SelfTestRan is true iff opts.SelfTest was true AND the encoder + // ran (i.e. no earlier per-adapter error short-circuited). + SelfTestRan bool + // SelfTestMatched is meaningful only when SelfTestRan; reports + // whether the re-decode produced no diff against InputRoot. + SelfTestMatched bool + // SelfTestMismatchTxt is non-nil when SelfTestRan && !SelfTestMatched. + // The CLI writes it as .mismatch.txt at exit 2. + SelfTestMismatchTxt []byte + // AdaptersEnabled is the canonical fan-out order of adapters that + // were actually invoked; ENCODE_INFO.json embeds this verbatim. + AdaptersEnabled []string +} + +// validateEncodeOptions enforces the four pre-encode invariants: +// InputRoot non-empty + exists-as-directory, out non-nil, non-empty +// adapter selection, optional manifest-TS floor, and DDB JSONL guard. +// Split out so EncodeSnapshot stays under the cyclop threshold; the +// data-correctness checks live in validateEncodeOptionsData. +func validateEncodeOptions(opts EncodeOptions, out io.Writer) error { + if err := checkInputRoot(opts); err != nil { + return err + } + if out == nil { + return errors.New("backup: EncodeSnapshot out writer is nil") + } + if !opts.Adapters.DynamoDB && !opts.Adapters.S3 && !opts.Adapters.Redis && !opts.Adapters.SQS { + // Zero AdapterSet would silently produce a header-only .fsm — + // a "successful" empty restore artifact (codex v5 + claude v5 #904). 
+ return errors.New("backup: EncodeOptions.Adapters has no enabled adapter") + } + return validateEncodeOptionsData(opts) +} + +// checkInputRoot validates InputRoot's path-level invariants: present, +// existing as a directory, and (unless AllowMissingManifest) contains +// a MANIFEST.json. Split out of validateEncodeOptions to keep cyclop +// happy and to make the three failure modes inspectable in isolation. +func checkInputRoot(opts EncodeOptions) error { + if opts.InputRoot == "" { + return errors.New("backup: EncodeOptions.InputRoot is required") + } + // Stat the path so a typo'd or deleted directory surfaces here + // rather than fan-out-no-op'ing every adapter and producing a + // header-only .fsm (codex P2 v8 #904). + info, statErr := os.Stat(opts.InputRoot) + if statErr != nil { + return errors.Wrapf(statErr, "stat InputRoot %q", opts.InputRoot) + } + if !info.IsDir() { + return errors.Errorf("backup: InputRoot %q is not a directory", opts.InputRoot) + } + // Require MANIFEST.json at InputRoot unless the caller has + // explicitly opted out (codex P2 v17 #904). The doc on InputRoot + // has always required it; this guard catches a library caller + // pointing at an existing-but-wrong directory whose adapter + // subdirs are all absent — without this check the call would + // silently succeed and publish a header-only .fsm. + if !opts.AllowMissingManifest { + manifestPath := filepath.Join(opts.InputRoot, "MANIFEST.json") + if _, mstat := os.Stat(manifestPath); mstat != nil { + return errors.Wrapf(mstat, "stat MANIFEST.json under InputRoot %q (set EncodeOptions.AllowMissingManifest=true for synthetic fixtures)", opts.InputRoot) + } + } + return nil +} + +// validateEncodeOptionsData covers the data-correctness pre-conditions: +// HLC ceiling floor, DynamoDB JSONL guard, and (via +// validateEncodeOptionsUnsupportedFeatures) the three manifest +// exclusion guards added by codex P2 v21 #904 (S3 incomplete uploads, +// S3 orphans, SQS preserve-visibility). 
Kept separate from the +// nil/empty-args checks so each function stays cyclop-clean. +func validateEncodeOptionsData(opts EncodeOptions) error { + if opts.ManifestLastCommitTS > 0 && opts.LastCommitTS < opts.ManifestLastCommitTS { + // Defense-in-depth HLC ceiling floor (codex P2 v2 #904). + return errors.Wrapf(ErrSelfTestLowerLastCommitTS, + "EncodeSnapshot opts.LastCommitTS %d < opts.ManifestLastCommitTS %d", + opts.LastCommitTS, opts.ManifestLastCommitTS) + } + if opts.DynamoDBBundleJSONL && opts.Adapters.DynamoDB { + // The DynamoDB reverse encoder only walks per-item files; + // JSONL items would be silently skipped (codex P2 v7 #904). + return errors.WithStack(ErrEncodeUnsupportedDynamoDBLayout) + } + return validateEncodeOptionsUnsupportedFeatures(opts) +} + +// validateEncodeOptionsUnsupportedFeatures rejects manifest-derived +// flags that the per-adapter encoders cannot honor today. Each guard +// fires only when the corresponding adapter is enabled — a caller +// encoding only Redis + SQS with `include_incomplete_uploads=true` +// inherited from the manifest is unaffected (no S3 → no concern). +// Split out from validateEncodeOptionsData to stay under cyclop; +// codex P2 v21 #904 added the three S3/SQS exclusion guards. +func validateEncodeOptionsUnsupportedFeatures(opts EncodeOptions) error { + if opts.S3IncludeIncompleteUploads && opts.Adapters.S3 { + // The S3 reverse encoder skips _incomplete_uploads/ payload + // directories; a dump that included them would silently lose + // those records (codex P2 v21 #904 L326). + return errors.WithStack(ErrEncodeUnsupportedS3IncompleteUploads) + } + if opts.S3IncludeOrphans && opts.Adapters.S3 { + // The S3 reverse encoder skips _orphans/ payload directories; + // same silent-data-loss pattern (codex P2 v21 #904 L326). 
+ return errors.WithStack(ErrEncodeUnsupportedS3Orphans) + } + if opts.PreserveSQSVisibility && opts.Adapters.SQS { + // The SQS reverse encoder unconditionally zeroes the + // visibility fields; a dump that preserved them would lose + // the state on restore (codex P2 v21 #904 L473). + return errors.WithStack(ErrEncodeUnsupportedSQSPreserveVisibility) + } + return nil +} + +// EncodeSnapshot reads the directory tree at opts.InputRoot, invokes the +// enabled per-adapter encoders in canonical fan-out order, optionally +// runs the round-trip self-test, and writes the .fsm bytes to out. +// The .fsm bytes are NOT returned; they go to out. +// +// When opts.SelfTest=false the FSM streams straight to out with a +// sha256 tee and no extra buffering. When opts.SelfTest=true the FSM +// is written to an on-disk temp file (encode-self-test-fsm-*) under +// opts.SelfTestDecodeOptions.OutRoot, the file is streamed through +// DecodeSnapshot, and bytes are copied to out ONLY if the decode +// survives. Memory cost in self-test mode is O(1) on top of the +// sort working set (gemini high #904 — the earlier *bytes.Buffer +// version would OOM on multi-GB snapshots). +// +// Self-test failure returns (result, nil) with result.SelfTestMatched +// == false and result.SelfTestMismatchTxt populated. Callers MUST +// check result.SelfTestMatched before treating a nil error as success. +// The CLI relies on this contract to write mismatch.txt + exit 2; +// library callers should follow the same pattern. +// +// EncodeSnapshot does NOT read MANIFEST.json itself, but it WILL +// enforce a floor on opts.LastCommitTS when the caller threads the +// manifest value through opts.ManifestLastCommitTS — a low +// LastCommitTS returns ErrSelfTestLowerLastCommitTS BEFORE any bytes +// are written. The CLI's resolveLastCommitTS sets both fields to the +// reconciled values, and library callers SHOULD do the same. 
The +// check is opt-in (ManifestLastCommitTS=0 disables it) so synthetic +// test fixtures without a manifest reference can still call this +// directly (codex P2 v2 #904). +func EncodeSnapshot(opts EncodeOptions, out io.Writer) (EncodeResult, error) { + if err := validateEncodeOptions(opts, out); err != nil { + return EncodeResult{}, err + } + + b := newSnapshotBuilder(opts.LastCommitTS) + enabled, err := runAdapterEncoders(b, opts) + if err != nil { + return EncodeResult{}, err + } + + if !opts.SelfTest { + return encodeStream(b, opts, enabled, out) + } + return encodeBuffered(b, opts, enabled, out) +} + +// encodeStream is the no-self-test path: SHA256 + writer tee with no +// extra buffering. FSM bytes go straight to out. +func encodeStream(b *snapshotBuilder, opts EncodeOptions, enabled []string, out io.Writer) (EncodeResult, error) { + hashWriter := newSHA256Writer(out) + bytesWritten, err := b.WriteTo(hashWriter) + if err != nil { + return EncodeResult{}, errors.WithStack(err) + } + return EncodeResult{ + Header: SnapshotHeader{LastCommitTS: opts.LastCommitTS}, + BytesWritten: bytesWritten, + SHA256: hashWriter.Sum(), + SelfTestRan: false, + AdaptersEnabled: enabled, + }, nil +} + +// encodeBuffered is the SelfTest=true path: write the FSM to a temp +// file on disk (NOT in memory — gemini high #904, OOM risk on large +// snapshots), self-test by streaming the temp file through DecodeSnapshot, +// copy to out only on match. The temp file is os.Remove'd via defer on +// every exit path. +// +// Memory cost: O(1) — only the sha256 running state + read buffer for +// the final io.Copy. Replaces the prior in-memory bytes.Buffer. +// +// Corruption hook (if set) fires against the temp file between WriteTo +// and self-test so the self-test sees the corruption but out never does +// (codex P2 v6 #896, codex P2 v7 #896). 
+func encodeBuffered(b *snapshotBuilder, opts EncodeOptions, enabled []string, out io.Writer) (EncodeResult, error) { + tempFile, err := os.CreateTemp(opts.SelfTestDecodeOptions.OutRoot, "encode-self-test-fsm-") + if err != nil { + return EncodeResult{}, errors.Wrap(err, "create self-test temp file") + } + tempPath := tempFile.Name() + defer func() { + _ = tempFile.Close() + _ = os.Remove(tempPath) + }() + + hashTee := newSHA256Writer(tempFile) + bytesWritten, err := b.WriteTo(hashTee) + if err != nil { + return EncodeResult{}, errors.WithStack(err) + } + if err := tempFile.Sync(); err != nil { + return EncodeResult{}, errors.Wrap(err, "fsync self-test temp file") + } + if opts.corruptBufferForTest != nil { + opts.corruptBufferForTest(tempFile) + } + if _, err := tempFile.Seek(0, io.SeekStart); err != nil { + return EncodeResult{}, errors.Wrap(err, "seek self-test temp file") + } + + header, mismatchTxt, matched, stErr := runSelfTest(tempFile, opts) + sha := hashTee.Sum() + result := EncodeResult{ + Header: header, + BytesWritten: bytesWritten, + SHA256: sha, + SelfTestRan: true, + SelfTestMatched: matched, + SelfTestMismatchTxt: mismatchTxt, + AdaptersEnabled: enabled, + } + if stErr != nil { + return result, stErr + } + if !matched { + return result, nil + } + if _, err := tempFile.Seek(0, io.SeekStart); err != nil { + return result, errors.Wrap(err, "rewind self-test temp file for copy") + } + if _, err := io.Copy(out, tempFile); err != nil { + return result, errors.Wrap(err, "copy buffered fsm to out") + } + return result, nil +} + +// redisDBDirPrefix is the canonical "db_" prefix produced by the +// decoder for redis/db_/ directories. Mirrored by encoder +// enumeration (encodeAllRedisDBs) so a multi-DB dump round-trips. +const redisDBDirPrefix = "db_" + +// enumerateRedisDBs returns the sorted dbIndex values for which +// /redis/db_/ exists as a directory. 
A missing redis/ +// directory returns nil; the caller treats it as no-op (same convention +// as the per-DB encoder, which is a no-op when its db_ subdir is +// absent). Non-db_ entries (regular files, symlinks at the redis/ +// level, non-numeric or non-canonical suffixes like "db_-1" or +// "db_01") are silently skipped — they cannot have been produced by +// the canonical decoder and are not the encoder's concern. +// +// Codex P1 v13 #904: replaces the prior hardcoded NewRedisEncoder(_, 0) +// in adapterRunners that silently dropped non-default DBs from any +// future Phase 1 multi-DB dump. +func enumerateRedisDBs(inRoot string) ([]int, error) { + redisDir := filepath.Join(inRoot, "redis") + if err := checkRedisRoot(redisDir); err != nil { + return nil, err + } + entries, err := os.ReadDir(redisDir) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, nil + } + return nil, errors.WithStack(err) + } + var indices []int + for _, ent := range entries { + idx, ok := parseRedisDBName(ent.Name()) + if !ok { + continue + } + // Canonical db_ name; entry MUST be a directory. + // Silently skipping a regular file or symlink at + // redis/db_ would let a malformed dump publish a + // header-only/partial FSM (codex P2 v14 #904 L427). + if !ent.IsDir() { + return nil, errors.Wrapf(ErrRedisEncodeNotDir, + "redis/%s exists but is not a directory (mode=%s)", + ent.Name(), ent.Type()) + } + indices = append(indices, idx) + } + sort.Ints(indices) + return indices, nil +} + +// checkRedisRoot stats /redis/ and rejects symlink / non-dir +// shapes. Missing is allowed (caller returns nil indices). Split out +// of enumerateRedisDBs to keep that function under the cyclop bound. 
+func checkRedisRoot(redisDir string) error { + info, err := os.Lstat(redisDir) + switch { + case errors.Is(err, os.ErrNotExist): + return nil + case err != nil: + return errors.WithStack(err) + case info.Mode()&os.ModeSymlink != 0: + // Symlinked redis/ would let os.OpenRoot in the per-DB encoder + // resolve outside the dump tree (mirrors the per-DB encoder's + // symlink refusal on redis/db_). + return errors.Wrapf(ErrRedisEncodeNotDir, "redis path %q is a symlink", redisDir) + case !info.IsDir(): + return errors.Wrapf(ErrRedisEncodeNotDir, "redis path %q is not a directory", redisDir) + } + return nil +} + +// parseRedisDBName returns (dbIndex, true) when name matches the +// canonical db_ pattern (N is a non-negative decimal with no +// leading zeros). Non-matching names return (0, false) so the caller +// can skip them without erroring — they cannot have been produced by +// the canonical decoder. Reject non-canonical decimals so a +// hypothetical Phase 1 dumper cannot double-emit the same db under +// two distinct directory names. +// +// This is a pure name parser; the caller is responsible for +// validating the directory-entry shape (codex P2 v14 #904 L427 +// shifted the IsDir check to enumerateRedisDBs so a regular file +// at redis/db_ fails closed instead of being silently skipped). +func parseRedisDBName(name string) (int, bool) { + if !strings.HasPrefix(name, redisDBDirPrefix) { + return 0, false + } + suffix := name[len(redisDBDirPrefix):] + idx, err := strconv.Atoi(suffix) + if err != nil || idx < 0 || strconv.Itoa(idx) != suffix { + return 0, false + } + return idx, true +} + +// encodeAllRedisDBs invokes NewRedisEncoder for redis/db_0/ when the +// input tree has exactly that DB (or no Redis content at all). A +// missing redis/ directory is a no-op. Any non-zero DB or the +// presence of multiple db_ directories fails closed with +// ErrRedisEncodeMultiDBUnsupported. 
+// +// Codex P1 v13 #904 originally asked for a per-DB fan-out to address +// the prior hardcoded db_0 dispatch silently dropping non-default +// DBs. Codex P2 v14 #904 (L452) clarified that fan-out under the +// current MVCC key prefixes (!redis|str|, !redis|hll|, !redis|ttl|, +// …, none of which carry a database component) would either collide +// on same-named keys across DBs or merge everything under db_0 at +// decode time. The corrected fix replaces the silent drop and the +// incorrect fan-out with a fail-closed sentinel until Phase 1 +// makes the native keys DB-aware. +func encodeAllRedisDBs(b *snapshotBuilder, inRoot string) error { + indices, err := enumerateRedisDBs(inRoot) + if err != nil { + return errors.Wrap(err, "redis encoder enumerate") + } + if len(indices) == 0 { + return nil + } + if len(indices) > 1 || indices[0] != 0 { + return errors.Wrapf(ErrRedisEncodeMultiDBUnsupported, + "redis encoder enumerated db indices %v", indices) + } + if err := NewRedisEncoder(inRoot, 0).Encode(b); err != nil { + return errors.Wrap(err, "redis encoder db_0") + } + return nil +} + +// adapterRunner pairs an enabled-check with an Encode call, keeping +// runAdapterEncoders's per-iteration body to two branches (cyclop). 
+type adapterRunner struct { + name string + enabled func(AdapterSet) bool + encode func(*snapshotBuilder, string) error +} + +func adapterRunners() []adapterRunner { + return []adapterRunner{ + {"redis", func(s AdapterSet) bool { return s.Redis }, encodeAllRedisDBs}, + {"dynamodb", func(s AdapterSet) bool { return s.DynamoDB }, func(b *snapshotBuilder, root string) error { + return errors.Wrap(NewDynamoDBEncoder(root).Encode(b), "dynamodb encoder") + }}, + {"s3", func(s AdapterSet) bool { return s.S3 }, func(b *snapshotBuilder, root string) error { + return errors.Wrap(NewS3RecordEncoder(root).Encode(b), "s3 encoder") + }}, + {"sqs", func(s AdapterSet) bool { return s.SQS }, func(b *snapshotBuilder, root string) error { + return errors.Wrap(NewSQSRecordEncoder(root).Encode(b), "sqs encoder") + }}, + } +} + +// runAdapterEncoders invokes each enabled adapter encoder in +// canonicalAdapterFanOutOrder, returning the list of adapter names +// actually invoked (for ENCODE_INFO.json adapters_enabled). +// +// Adapter errors are marked with ErrEncodeAdapterData so the CLI can +// route them to exit-2 (data-correctness) rather than exit-1 (user +// error). The original adapter sentinel chain is preserved — callers +// that errors.Is on ErrDDBEncodeInvalidSchema, +// ErrS3EncodeUnsupportedCollision, etc. still see those (codex P2 v9 +// #904; phantom-sentinel doc fix from claude v10 #904). +func runAdapterEncoders(b *snapshotBuilder, opts EncodeOptions) ([]string, error) { + var enabled []string + for _, r := range adapterRunners() { + if !r.enabled(opts.Adapters) { + continue + } + if err := r.encode(b, opts.InputRoot); err != nil { + return nil, errors.WithStack(errors.Mark(err, ErrEncodeAdapterData)) + } + enabled = append(enabled, r.name) + } + return enabled, nil +} + +// runSelfTest streams fsmFile through DecodeSnapshot into a unique +// scratch subdir, structurally diffs against opts.InputRoot, and returns +// (header, mismatchTxt, matched, err). 
matched=false with err=nil +// indicates a structural diff; matched=true with err=nil indicates +// success. err is non-nil only on infrastructure failure (mkdir, decoder +// error, walk error). +// +// fsmFile is read from its current position (caller must Seek(0) before +// calling). The scratch subdir is removed via defer regardless of +// outcome. The caller cleans up .mismatch.txt at the start of +// each run. +func runSelfTest(fsmFile io.Reader, opts EncodeOptions) (SnapshotHeader, []byte, bool, error) { + scratchBase := opts.SelfTestDecodeOptions.OutRoot + scratchDir, err := os.MkdirTemp(scratchBase, "encode-self-test-") + if err != nil { + return SnapshotHeader{}, nil, false, errors.Wrap(err, "mkdir scratch") + } + defer func() { + _ = os.RemoveAll(scratchDir) + }() + + decOpts := opts.SelfTestDecodeOptions + decOpts.OutRoot = scratchDir + + result, derr := DecodeSnapshot(fsmFile, decOpts) + if derr != nil { + // Decoder errored on our own output — that IS a self-test + // failure (the .fsm we produced isn't loadable). Surface as + // a mismatch with the decoder error embedded in the txt. + mismatchTxt := []byte("self-test failed: DecodeSnapshot rejected the produced .fsm: " + derr.Error()) + return SnapshotHeader{}, mismatchTxt, false, nil + } + + if result.Header.LastCommitTS != opts.LastCommitTS { + mismatchTxt := []byte(formatHeaderMismatch(opts.LastCommitTS, result.Header.LastCommitTS)) + return result.Header, mismatchTxt, false, nil + } + + diff, derr := diffAdapterTrees(opts.InputRoot, scratchDir, opts.Adapters) + if derr != nil { + return result.Header, nil, false, errors.Wrap(derr, "diff scratch tree") + } + if len(diff) > 0 { + return result.Header, []byte(strings.Join(diff, "\n") + "\n"), false, nil + } + return result.Header, nil, true, nil +} + +// diffAdapterTrees returns a list of paths (relative to input/scratch +// root) where the two trees differ, restricted to the adapter subtrees +// enabled in adapters. 
MANIFEST.json itself is NOT compared — the scratch +// doesn't have one (DecodeSnapshot library doesn't emit it; the CLI +// wrapper does, codex P2 v1 #896 — header check above is the +// last_commit_ts substitute). Bounded to selfTestMaxMismatchPaths. +func diffAdapterTrees(inputRoot, scratchRoot string, adapters AdapterSet) ([]string, error) { + subdirs := enabledAdapterSubdirs(adapters) + var diffs []string + for _, sub := range subdirs { + paths, err := diffOneSubdir(filepath.Join(inputRoot, sub), filepath.Join(scratchRoot, sub), sub) + if err != nil { + return nil, err + } + diffs = append(diffs, paths...) + if len(diffs) >= selfTestMaxMismatchPaths { + diffs = diffs[:selfTestMaxMismatchPaths] + diffs = append(diffs, "... (truncated; first "+strconv.Itoa(selfTestMaxMismatchPaths)+" paths shown)") + return diffs, nil + } + } + return diffs, nil +} + +const selfTestMaxMismatchPaths = 64 + +// enabledAdapterSubdirs returns the top-level adapter subdir names for +// the enabled adapters, in canonical order for stable mismatch.txt output. +func enabledAdapterSubdirs(adapters AdapterSet) []string { + var out []string + for _, r := range adapterRunners() { + if r.enabled(adapters) { + out = append(out, r.name) + } + } + return out +} + +// diffOneSubdir walks aDir + bDir in parallel, returning paths (prefixed +// by relPrefix) that differ in presence, size, or bytes. Files are +// compared by streaming reads (NOT by loading whole bytes into memory) +// so a multi-GB S3 blob does not OOM the encoder (gemini high #904). +// Missing-on-one-side is a mismatch. The returned diffs are sorted +// alphabetically so mismatch.txt is deterministic across runs with +// identical inputs (claude v2 carry-over observation #904). 
+func diffOneSubdir(aDir, bDir, relPrefix string) ([]string, error) { + aPaths, aErr := walkRegularFilePaths(aDir) + if aErr != nil && !errors.Is(aErr, os.ErrNotExist) { + return nil, errors.Wrapf(aErr, "walk input %s", aDir) + } + bPaths, bErr := walkRegularFilePaths(bDir) + if bErr != nil && !errors.Is(bErr, os.ErrNotExist) { + return nil, errors.Wrapf(bErr, "walk scratch %s", bDir) + } + + var diffs []string + for relPath, bFull := range bPaths { + aFull, ok := aPaths[relPath] + if !ok { + diffs = append(diffs, relPrefix+"/"+relPath+" (missing in input)") + continue + } + eq, derr := streamFilesEqual(aFull, bFull) + if derr != nil { + return nil, errors.Wrapf(derr, "compare %s vs %s", aFull, bFull) + } + if !eq { + diffs = append(diffs, relPrefix+"/"+relPath+" (bytes differ)") + } + delete(aPaths, relPath) + } + for relPath := range aPaths { + diffs = append(diffs, relPrefix+"/"+relPath+" (missing in scratch)") + } + sort.Strings(diffs) + return diffs, nil +} + +// walkRegularFilePaths returns a map of relative path → absolute path +// for every regular file under root. Replaces walkRegularFiles which +// eagerly read file bytes; this version only records paths so the diff +// can stream-compare per file (gemini high #904). 
+func walkRegularFilePaths(root string) (map[string]string, error) { + out := map[string]string{} + rootInfo, err := os.Stat(root) + if err != nil { + return nil, errors.WithStack(err) + } + if !rootInfo.IsDir() { + return nil, errors.Errorf("not a directory: %s", root) + } + if err := filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() || !d.Type().IsRegular() { + return nil + } + rel, rerr := filepath.Rel(root, path) + if rerr != nil { + return errors.WithStack(rerr) + } + out[filepath.ToSlash(rel)] = path + return nil + }); err != nil { + return nil, errors.WithStack(err) + } + return out, nil +} + +// streamCmpBufSize is the per-file read buffer for the streaming +// compare. 64 KiB matches Go's default bufio buffer and keeps the +// allocation small relative to the modal adapter file size. +const streamCmpBufSize = 64 * 1024 + +// streamFilesEqual reports whether the contents at aPath and bPath are +// byte-equal without loading either file fully into memory. A size +// mismatch short-circuits. Used by diffOneSubdir to bound the +// self-test's memory at O(streamCmpBufSize) per concurrent compare +// (gemini high #904). 
+func streamFilesEqual(aPath, bPath string) (bool, error) { + aSize, err := fileSize(aPath) + if err != nil { + return false, err + } + bSize, err := fileSize(bPath) + if err != nil { + return false, err + } + if aSize != bSize { + return false, nil + } + aFile, err := os.Open(aPath) //nolint:gosec // walking caller-provided dirs + if err != nil { + return false, errors.WithStack(err) + } + defer func() { _ = aFile.Close() }() + bFile, err := os.Open(bPath) //nolint:gosec // walking caller-provided dirs + if err != nil { + return false, errors.WithStack(err) + } + defer func() { _ = bFile.Close() }() + return streamReadersEqual(aFile, bFile) +} + +func fileSize(path string) (int64, error) { + info, err := os.Stat(path) + if err != nil { + return 0, errors.WithStack(err) + } + return info.Size(), nil +} + +// streamReadersEqual compares two readers of equal length chunk-by-chunk +// and returns false on any difference, true on full match. +func streamReadersEqual(a, b io.Reader) (bool, error) { + aBuf := make([]byte, streamCmpBufSize) + bBuf := make([]byte, streamCmpBufSize) + for { + an, aErr := io.ReadFull(a, aBuf) + bn, bErr := io.ReadFull(b, bBuf) + if an != bn || !bytes.Equal(aBuf[:an], bBuf[:bn]) { + return false, nil + } + if aErr == io.EOF || aErr == io.ErrUnexpectedEOF { + return true, nil + } + if aErr != nil { + return false, errors.WithStack(aErr) + } + if bErr != nil { + return false, errors.WithStack(bErr) + } + } +} + +func formatHeaderMismatch(want, got uint64) string { + return "self-test failed: header.LastCommitTS mismatch (want " + + strconv.FormatUint(want, 10) + + ", got " + + strconv.FormatUint(got, 10) + + ")\n" +} + +// sha256Writer wraps an io.Writer and tees every byte into a SHA-256 +// hasher so the encoder gets a single-pass SHA256 of the produced .fsm +// without an extra buffer-pass. Used in the no-self-test streaming path. 
+type sha256Writer struct { + w io.Writer + h hash.Hash +} + +func newSHA256Writer(w io.Writer) *sha256Writer { + return &sha256Writer{w: w, h: sha256.New()} +} + +func (s *sha256Writer) Write(p []byte) (int, error) { + if _, err := s.h.Write(p); err != nil { + // crypto/sha256 never errors on Write per stdlib contract. + return 0, errors.WithStack(err) + } + n, err := s.w.Write(p) + if err != nil { + return n, errors.WithStack(err) + } + return n, nil +} + +func (s *sha256Writer) Sum() [32]byte { + var out [32]byte + copy(out[:], s.h.Sum(nil)) + return out +} diff --git a/internal/backup/encode_snapshot_test.go b/internal/backup/encode_snapshot_test.go new file mode 100644 index 000000000..59e157087 --- /dev/null +++ b/internal/backup/encode_snapshot_test.go @@ -0,0 +1,802 @@ +package backup + +import ( + "bytes" + "os" + "path/filepath" + "testing" + + "github.com/cockroachdb/errors" +) + +// TestEncodeSnapshotLibraryRoundTrip pins the public library entrypoint: +// EncodeSnapshot writes a .fsm to the supplied io.Writer; running +// DecodeSnapshot on those bytes into a scratch dir produces an +// equivalent adapter tree. No CLI involved. Codex P2 v2 #896 — encoder +// entrypoint exposure. +func TestEncodeSnapshotLibraryRoundTrip(t *testing.T) { + t.Parallel() + in := t.TempDir() + // One tiny SQS queue fixture is enough to exercise the SQS slice + // end-to-end via the new library wrapper; the per-adapter tree + // shape is already covered by the M5-1/M5-2 tests. 
+ const queue = "lib-rt" + writeSQSQueue(t, in, queue, + []byte(`{"format_version":1,"name":"lib-rt","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + + var buf bytes.Buffer + result, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 0xDEADBEEF, + AllowMissingManifest: true, + }, &buf) + if err != nil { + t.Fatalf("EncodeSnapshot: %v", err) + } + if result.SelfTestRan { + t.Errorf("SelfTestRan = true, want false (SelfTest opt was false)") + } + if result.BytesWritten == 0 { + t.Errorf("BytesWritten = 0") + } + if len(result.AdaptersEnabled) != 1 || result.AdaptersEnabled[0] != "sqs" { + t.Errorf("AdaptersEnabled = %v, want [sqs]", result.AdaptersEnabled) + } + + // Decode the produced bytes into a scratch tree. + scratch := t.TempDir() + decResult, err := DecodeSnapshot(bytes.NewReader(buf.Bytes()), DecodeOptions{ + OutRoot: scratch, + Adapters: AdapterSet{SQS: true}, + }) + if err != nil { + t.Fatalf("DecodeSnapshot of EncodeSnapshot output failed: %v", err) + } + if decResult.Header.LastCommitTS != 0xDEADBEEF { + t.Errorf("decoded header.LastCommitTS = %x, want 0xDEADBEEF", decResult.Header.LastCommitTS) + } +} + +// TestEncodeSnapshotSelfTestMatchesInput pins the happy-path self-test +// against a tree that has already been canonicalized by one decode pass +// (so the input matches what DecodeSnapshot would write back, modulo +// the encoder's idempotency). The full encode -> decode -> encode chain +// is the gold-standard round trip the parent design mandates. 
+func TestEncodeSnapshotSelfTestMatchesInput(t *testing.T) { + t.Parallel() + rawIn := t.TempDir() + const queue = "selftest-match" + writeSQSQueue(t, rawIn, queue, + []byte(`{"format_version":1,"name":"selftest-match","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + + // Canonicalize: encode rawIn, decode it back to canonicalIn. The + // resulting tree is what the encoder's self-test will produce in + // the scratch dir, so a second encode against it must match. + canonicalIn := t.TempDir() + var canonicalBuf bytes.Buffer + if _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: rawIn, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 0xCAFE, + AllowMissingManifest: true, + }, &canonicalBuf); err != nil { + t.Fatalf("canonical encode: %v", err) + } + if _, err := DecodeSnapshot(bytes.NewReader(canonicalBuf.Bytes()), DecodeOptions{ + OutRoot: canonicalIn, + Adapters: AdapterSet{SQS: true}, + }); err != nil { + t.Fatalf("canonical decode: %v", err) + } + + scratchBase := t.TempDir() + var buf bytes.Buffer + result, err := EncodeSnapshot(EncodeOptions{ + InputRoot: canonicalIn, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 0xCAFE, + SelfTest: true, + SelfTestDecodeOptions: DecodeOptions{ + OutRoot: scratchBase, + Adapters: AdapterSet{SQS: true}, + }, + AllowMissingManifest: true, + }, &buf) + if err != nil { + t.Fatalf("EncodeSnapshot: %v", err) + } + if !result.SelfTestRan || !result.SelfTestMatched { + t.Errorf("SelfTestRan=%v Matched=%v, want both true; mismatch=%s", result.SelfTestRan, result.SelfTestMatched, string(result.SelfTestMismatchTxt)) + } + if buf.Len() == 0 { + t.Errorf("bytes were not copied to out after successful self-test") + } + if result.Header.LastCommitTS != 0xCAFE { + t.Errorf("Header.LastCommitTS = %x, want 0xCAFE", result.Header.LastCommitTS) + } +} + +// 
flipBytesPastHeaderHelper returns a corruption hook that flips bytes +// every 13 bytes starting at offset 200 in the buffered self-test temp +// file — far enough past the EKVPBBL1 header + lastCommitTS that the +// decoder trips on a malformed entry length. Extracted from the test +// body so the test body itself stays under the cyclop threshold. +func flipBytesPastHeaderHelper(t *testing.T) func(*os.File) { + t.Helper() + return func(f *os.File) { + info, err := f.Stat() + if err != nil { + t.Fatalf("temp Stat: %v", err) + } + const headerSkip = 200 + if info.Size() <= headerSkip { + t.Fatalf("temp file too small to corrupt past header: %d bytes", info.Size()) + } + buf := make([]byte, info.Size()-headerSkip) + if _, err := f.ReadAt(buf, headerSkip); err != nil { + t.Fatalf("ReadAt: %v", err) + } + for i := 0; i < len(buf); i += 13 { + buf[i] ^= 0xFF + } + if _, err := f.WriteAt(buf, headerSkip); err != nil { + t.Fatalf("WriteAt: %v", err) + } + } +} + +// TestEncodeSnapshotSelfTestDetectsCorruption pins that the unexported +// corruptBufferForTest hook lets the self-test catch corruption in the +// internal buffer. The corruption must be reachable by the self-test +// decode but MUST NOT reach the supplied io.Writer (the write-then- +// rename invariant — codex P2 v6 #896). 
+func TestEncodeSnapshotSelfTestDetectsCorruption(t *testing.T) { + t.Parallel() + in := t.TempDir() + const queue = "selftest-corrupt" + writeSQSQueue(t, in, queue, + []byte(`{"format_version":1,"name":"selftest-corrupt","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + + scratchBase := t.TempDir() + var out bytes.Buffer + corrupt := flipBytesPastHeaderHelper(t) + result, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 0xCAFE, + SelfTest: true, + SelfTestDecodeOptions: DecodeOptions{ + OutRoot: scratchBase, + Adapters: AdapterSet{SQS: true}, + }, + corruptBufferForTest: corrupt, + AllowMissingManifest: true, + }, &out) + if err != nil { + t.Fatalf("EncodeSnapshot: %v", err) + } + if !result.SelfTestRan { + t.Fatalf("SelfTestRan = false") + } + if result.SelfTestMatched { + t.Errorf("SelfTestMatched = true with corruption injected; want false") + } + if len(result.SelfTestMismatchTxt) == 0 { + t.Errorf("SelfTestMismatchTxt is empty; expected a mismatch report") + } + // CRITICAL: the corrupt bytes must NEVER reach out. The + // write-then-rename atomic-publish discipline requires that a + // self-test failure publishes nothing. + if out.Len() != 0 { + t.Errorf("out.Len = %d, want 0 (no bytes should reach out on self-test failure)", out.Len()) + } +} + +// TestEncodeSnapshotRequiresInputRoot rejects EncodeOptions with no +// InputRoot — a simple guard so the constructor errors surface early. 
+func TestEncodeSnapshotRequiresInputRoot(t *testing.T) { + t.Parallel() + var buf bytes.Buffer + if _, err := EncodeSnapshot(EncodeOptions{}, &buf); err == nil { + t.Fatalf("EncodeSnapshot with empty InputRoot succeeded; want error") + } +} + +// TestEncodeSnapshotRejectsMissingInputRoot pins codex P2 v8 #904: a +// non-existent or non-directory InputRoot must be rejected before any +// adapter runs. Otherwise each enabled adapter treats its missing +// top-level subdirectory as a no-op, the call "succeeds", and the +// caller gets a header-only .fsm — a silent empty-restore artifact. +// CLI callers don't hit this path (they open MANIFEST.json first), +// but library callers can pass a stale path, so the guard belongs in +// EncodeSnapshot itself. +func TestEncodeSnapshotRejectsMissingInputRoot(t *testing.T) { + t.Parallel() + t.Run("non-existent path", func(t *testing.T) { + t.Parallel() + missing := filepath.Join(t.TempDir(), "does-not-exist") + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: missing, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 1, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with non-existent InputRoot succeeded; want error") + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written for missing InputRoot)", buf.Len()) + } + }) + t.Run("regular file", func(t *testing.T) { + t.Parallel() + dir := t.TempDir() + filePath := filepath.Join(dir, "not-a-dir") + if err := os.WriteFile(filePath, []byte("x"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: filePath, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 1, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with file-as-InputRoot succeeded; want error") + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written for non-directory InputRoot)", buf.Len()) + } + }) +} + +// 
TestEncodeSnapshotRequiresManifest pins codex P2 v17 #904: a library +// caller pointing at an existing-but-wrong directory (no +// MANIFEST.json) must fail closed with an error referencing +// MANIFEST.json, NOT silently emit a header-only .fsm. The CLI hits +// this path naturally by opening MANIFEST.json first; the library +// validation layer needs the equivalent guard. +func TestEncodeSnapshotRequiresManifest(t *testing.T) { + t.Parallel() + in := t.TempDir() // exists, is a directory, but contains no MANIFEST.json + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 1, + // AllowMissingManifest: false (default) — must require manifest + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with missing MANIFEST.json succeeded; want error") + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written when MANIFEST.json is missing)", buf.Len()) + } +} + +// TestEncodeSnapshotAllowMissingManifestOptOut pins that the +// AllowMissingManifest opt-out works for synthetic test fixtures. +// Mirrors the ManifestLastCommitTS=0 opt-out pattern from codex P2 v2. 
+func TestEncodeSnapshotAllowMissingManifestOptOut(t *testing.T) { + t.Parallel() + in := t.TempDir() + const queue = "manifest-opt-out" + writeSQSQueue(t, in, queue, + []byte(`{"format_version":1,"name":"manifest-opt-out","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 1, + AllowMissingManifest: true, + }, &buf) + if err != nil { + t.Fatalf("EncodeSnapshot with AllowMissingManifest=true failed: %v", err) + } +} + +// TestEncodeSnapshotRejectsLowManifestFloor pins codex P2 v2: the +// library-level HLC floor check fails-closed when opts.LastCommitTS +// is below opts.ManifestLastCommitTS. Defense-in-depth for the CLI's +// resolveLastCommitTS — a future in-process caller (Phase 1 live +// extractor) cannot silently publish a low-TS .fsm. +func TestEncodeSnapshotRejectsLowManifestFloor(t *testing.T) { + t.Parallel() + in := t.TempDir() + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 500, + ManifestLastCommitTS: 1000, // floor; LastCommitTS is below + AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with LastCommitTS < ManifestLastCommitTS succeeded; want error") + } + if !errors.Is(err, ErrSelfTestLowerLastCommitTS) { + t.Errorf("err = %v, want errors.Is ErrSelfTestLowerLastCommitTS", err) + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written on floor regression)", buf.Len()) + } +} + +// TestEncodeSnapshotManifestFloorOptOut pins that ManifestLastCommitTS=0 +// disables the check (synthetic test fixtures, library callers without a +// manifest reference). 
The existing TestEncodeSnapshotLibraryRoundTrip +// implicitly relies on this opt-out. +func TestEncodeSnapshotManifestFloorOptOut(t *testing.T) { + t.Parallel() + in := t.TempDir() + const queue = "floor-opt-out" + writeSQSQueue(t, in, queue, + []byte(`{"format_version":1,"name":"floor-opt-out","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, + LastCommitTS: 500, + ManifestLastCommitTS: 0, // opt-out + AllowMissingManifest: true, + }, &buf) + if err != nil { + t.Fatalf("EncodeSnapshot with opt-out floor failed: %v", err) + } +} + +// TestEncodeSnapshotRejectsUnsupportedFeatures pins codex P2 v21 #904: +// three manifest-derived flags that the per-adapter encoders cannot +// honor today must fail-closed before any bytes are written. Each +// guard fires only when the corresponding adapter is enabled — +// orthogonal callers (e.g., S3IncludeIncompleteUploads=true while +// encoding only Redis) are unaffected. Pattern matches the +// DynamoDBBundleJSONL guard from v8. 
+func TestEncodeSnapshotRejectsUnsupportedFeatures(t *testing.T) { + t.Parallel() + cases := []struct { + name string + opts EncodeOptions + wantErr error + }{ + { + name: "S3 include_incomplete_uploads", + opts: EncodeOptions{ + Adapters: AdapterSet{S3: true}, + S3IncludeIncompleteUploads: true, + }, + wantErr: ErrEncodeUnsupportedS3IncompleteUploads, + }, + { + name: "S3 include_orphans", + opts: EncodeOptions{ + Adapters: AdapterSet{S3: true}, + S3IncludeOrphans: true, + }, + wantErr: ErrEncodeUnsupportedS3Orphans, + }, + { + name: "SQS preserve_visibility", + opts: EncodeOptions{ + Adapters: AdapterSet{SQS: true}, + PreserveSQSVisibility: true, + }, + wantErr: ErrEncodeUnsupportedSQSPreserveVisibility, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + in := t.TempDir() + opts := tc.opts + opts.InputRoot = in + opts.LastCommitTS = 1 + opts.AllowMissingManifest = true + var buf bytes.Buffer + _, err := EncodeSnapshot(opts, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot accepted unsupported feature; want %v", tc.wantErr) + } + if !errors.Is(err, tc.wantErr) { + t.Errorf("err = %v, want errors.Is %v", err, tc.wantErr) + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written on rejection)", buf.Len()) + } + }) + } +} + +// TestEncodeSnapshotUnsupportedFeaturesGatedByAdapter pins that each +// of the three v21 guards fires ONLY when its corresponding adapter +// is enabled. A library caller that inherits the manifest flag but +// disables the affected adapter is unaffected — mirrors the JSONL +// guard's "DDB not in scope" exemption. 
+func TestEncodeSnapshotUnsupportedFeaturesGatedByAdapter(t *testing.T) { + t.Parallel() + cases := []struct { + name string + opts EncodeOptions + }{ + { + name: "S3IncludeIncompleteUploads with S3 disabled", + opts: EncodeOptions{ + Adapters: AdapterSet{SQS: true}, // not S3 + S3IncludeIncompleteUploads: true, + }, + }, + { + name: "S3IncludeOrphans with S3 disabled", + opts: EncodeOptions{ + Adapters: AdapterSet{SQS: true}, // not S3 + S3IncludeOrphans: true, + }, + }, + { + name: "PreserveSQSVisibility with SQS disabled", + opts: EncodeOptions{ + Adapters: AdapterSet{Redis: true}, // not SQS + PreserveSQSVisibility: true, + }, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + in := t.TempDir() + opts := tc.opts + opts.InputRoot = in + opts.LastCommitTS = 1 + opts.AllowMissingManifest = true + var buf bytes.Buffer + if _, err := EncodeSnapshot(opts, &buf); err != nil { + t.Errorf("EncodeSnapshot rejected unsupported flag when its adapter was out of scope: %v", err) + } + }) + } +} + +// TestEncodeSnapshotRejectsDynamoDBJSONLLayout pins codex P2 v7 #904: +// the DynamoDB reverse encoder does not support the JSONL bundle +// layout, so a caller that threads DynamoDBBundleJSONL=true must be +// rejected with ErrEncodeUnsupportedDynamoDBLayout before any bytes +// are written. The CLI hits this path automatically when MANIFEST.json +// has `dynamodb_layout: "jsonl"`; library callers that mirror that +// thread the field themselves. 
+func TestEncodeSnapshotRejectsDynamoDBJSONLLayout(t *testing.T) { + t.Parallel() + in := t.TempDir() + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{DynamoDB: true}, + LastCommitTS: 1, + DynamoDBBundleJSONL: true, + AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with DynamoDBBundleJSONL accepted; want error") + } + if !errors.Is(err, ErrEncodeUnsupportedDynamoDBLayout) { + t.Errorf("err = %v, want errors.Is ErrEncodeUnsupportedDynamoDBLayout", err) + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written when JSONL is rejected)", buf.Len()) + } +} + +// TestEncodeSnapshotJSONLOnlyRejectedWhenDDBEnabled pins that the JSONL +// guard fires only when DynamoDB is in the adapter set — a caller that +// happens to set DynamoDBBundleJSONL=true while encoding ONLY Redis (or +// any other adapter) is unaffected. Prevents the guard from becoming +// over-zealous for callers who simply mirror the manifest field. 
+func TestEncodeSnapshotJSONLOnlyRejectedWhenDDBEnabled(t *testing.T) { + t.Parallel() + in := t.TempDir() + const queue = "no-ddb" + writeSQSQueue(t, in, queue, + []byte(`{"format_version":1,"name":"no-ddb","fifo":false,"partition_count":1,"generation":1}`), + [][]byte{ + []byte(`{"format_version":1,"message_id":"m1","body":"a","send_timestamp_millis":1700000000000,"available_at_millis":1700000000000,"sequence_number":0}`), + }, + ) + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{SQS: true}, // DDB NOT in scope + LastCommitTS: 1, + DynamoDBBundleJSONL: true, // would be rejected if DDB were enabled + AllowMissingManifest: true, + }, &buf) + if err != nil { + t.Fatalf("EncodeSnapshot rejected JSONL flag when DDB not in scope: %v", err) + } +} + +// TestEncodeSnapshotRejectsZeroAdapterSet pins claude v5 + codex v5 +// carry-over: a library caller that forgets to thread Adapters into +// EncodeOptions gets a fail-closed error rather than a silently empty +// header-only .fsm. The CLI's parseAdapterSet already rejects this for +// flag-driven entry; this test pins the library-level guard. +func TestEncodeSnapshotRejectsZeroAdapterSet(t *testing.T) { + t.Parallel() + in := t.TempDir() + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{}, // explicit zero + LastCommitTS: 1, + // AllowMissingManifest: true bypasses the v19 MANIFEST.json + // guard so this test actually exercises the zero-adapter + // guard further down in validateEncodeOptions — without this, + // checkInputRoot would error on the missing MANIFEST.json + // first and the assertion would pin the wrong invariant + // (claude v19 #904). 
+ AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with empty AdapterSet succeeded; want error") + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written on guard rejection)", buf.Len()) + } +} + +// TestEnumerateRedisDBsMissingDir pins codex P1 v13 #904: a missing +// redis/ directory returns nil indices (no-op), matching the per-DB +// encoder's "missing db_ = nothing to encode" convention. +func TestEnumerateRedisDBsMissingDir(t *testing.T) { + t.Parallel() + indices, err := enumerateRedisDBs(t.TempDir()) + if err != nil { + t.Fatalf("err = %v, want nil", err) + } + if indices != nil { + t.Errorf("indices = %v, want nil for missing redis/", indices) + } +} + +// TestEnumerateRedisDBsMixedEntries pins codex P1 v13 #904: only +// canonical db_ entries are kept; non-numeric, negative, leading- +// zero, empty-suffix, wrong-prefix, and non-directory entries are +// silently skipped. The returned slice is sorted ascending. +func TestEnumerateRedisDBsMixedEntries(t *testing.T) { + t.Parallel() + in := t.TempDir() + for _, name := range []string{"db_0", "db_1", "db_5"} { + if err := os.MkdirAll(filepath.Join(in, "redis", name), 0o755); err != nil { + t.Fatalf("MkdirAll %s: %v", name, err) + } + } + // Entries that MUST be skipped: + // db_garbage — non-numeric suffix + // db_-1 — negative + // db_01 — non-canonical leading zero + // db_ — empty suffix + // notdb_2 — wrong prefix + for _, name := range []string{"db_garbage", "db_-1", "db_01", "db_", "notdb_2"} { + if err := os.MkdirAll(filepath.Join(in, "redis", name), 0o755); err != nil { + t.Fatalf("MkdirAll %s: %v", name, err) + } + } + // A regular file under redis/ must be skipped (not enumerable). 
+ if err := os.WriteFile(filepath.Join(in, "redis", "README"), []byte("x"), 0o600); err != nil { + t.Fatalf("WriteFile README: %v", err) + } + indices, err := enumerateRedisDBs(in) + if err != nil { + t.Fatalf("enumerateRedisDBs: %v", err) + } + want := []int{0, 1, 5} + if len(indices) != len(want) { + t.Fatalf("indices = %v, want %v", indices, want) + } + for i, v := range want { + if indices[i] != v { + t.Errorf("indices[%d] = %d, want %d", i, indices[i], v) + } + } +} + +// TestEnumerateRedisDBsRedisIsRegularFile pins fail-closed when the +// "redis" path inside the dump is a regular file rather than a +// directory — distinct from the missing case. +func TestEnumerateRedisDBsRedisIsRegularFile(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.WriteFile(filepath.Join(in, "redis"), []byte("not a dir"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + _, err := enumerateRedisDBs(in) + if !errors.Is(err, ErrRedisEncodeNotDir) { + t.Errorf("err = %v, want errors.Is ErrRedisEncodeNotDir", err) + } +} + +// TestEncodeSnapshotRedisRejectsNonZeroDB pins codex P2 v14 #904 +// L452: the Redis MVCC key prefixes (!redis|str|, !redis|hll|, +// !redis|ttl|, …) carry no database component, so feeding a +// non-zero DB through the encoder would mis-scope the produced +// .fsm — same-named keys collide and a db_3-only self-test would +// decode under db_0. Until Phase 1 makes native keys DB-aware, +// non-zero-DB inputs MUST fail closed. +// +// The fixture places a single string under redis/db_3/ ONLY. +// EncodeSnapshot must reject with ErrRedisEncodeMultiDBUnsupported +// and write no bytes. (v14 originally attempted to fan out per DB; +// codex's L452 follow-up established the correct fix is fail-closed.) 
+func TestEncodeSnapshotRedisRejectsNonZeroDB(t *testing.T) { + t.Parallel() + in := t.TempDir() + encKey := EncodeSegment([]byte("k3")) + db3Strings := filepath.Join(in, "redis", "db_3", "strings") + if err := os.MkdirAll(db3Strings, 0o755); err != nil { + t.Fatalf("MkdirAll db_3/strings: %v", err) + } + if err := os.WriteFile(filepath.Join(db3Strings, encKey+".bin"), []byte("v3"), 0o600); err != nil { + t.Fatalf("WriteFile db_3 string: %v", err) + } + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{Redis: true}, + LastCommitTS: 1, + AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot accepted db_3-only Redis input; want ErrRedisEncodeMultiDBUnsupported") + } + if !errors.Is(err, ErrRedisEncodeMultiDBUnsupported) { + t.Errorf("err = %v, want errors.Is ErrRedisEncodeMultiDBUnsupported", err) + } + // Marked as adapter-data so the CLI routes it to exit-2. + if !errors.Is(err, ErrEncodeAdapterData) { + t.Errorf("err = %v, want errors.Is ErrEncodeAdapterData (mark from runAdapterEncoders)", err) + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written on multi-DB rejection)", buf.Len()) + } +} + +// TestEncodeSnapshotRedisRejectsMultipleDBs pins the multi-DB case: +// redis/db_0 + redis/db_3 → ErrRedisEncodeMultiDBUnsupported (the +// fan-out would collide on same-named keys or merge both DBs under +// db_0 on restore; codex P2 v14 #904 L452). 
+func TestEncodeSnapshotRedisRejectsMultipleDBs(t *testing.T) { + t.Parallel() + in := t.TempDir() + for _, name := range []string{"db_0", "db_3"} { + if err := os.MkdirAll(filepath.Join(in, "redis", name, "strings"), 0o755); err != nil { + t.Fatalf("MkdirAll %s: %v", name, err) + } + } + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{Redis: true}, + LastCommitTS: 1, + AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot accepted db_0 + db_3; want ErrRedisEncodeMultiDBUnsupported") + } + if !errors.Is(err, ErrRedisEncodeMultiDBUnsupported) { + t.Errorf("err = %v, want errors.Is ErrRedisEncodeMultiDBUnsupported", err) + } + // Marked as adapter-data so the CLI routes it to exit-2 (mirrors + // TestEncodeSnapshotRedisRejectsNonZeroDB; claude v17 parity). + if !errors.Is(err, ErrEncodeAdapterData) { + t.Errorf("err = %v, want errors.Is ErrEncodeAdapterData (mark from runAdapterEncoders)", err) + } + if buf.Len() != 0 { + t.Errorf("buf.Len = %d, want 0 (no bytes should be written on multi-DB rejection)", buf.Len()) + } +} + +// TestEnumerateRedisDBsRejectsNonDirDBEntry pins codex P2 v14 #904 +// L427: when a canonical db_<n> name resolves to a regular file +// (or symlink) instead of a directory, enumerateRedisDBs must fail +// closed with ErrRedisEncodeNotDir — silently skipping would let a +// malformed dump publish a header-only/partial FSM. +func TestEnumerateRedisDBsRejectsNonDirDBEntry(t *testing.T) { + t.Parallel() + in := t.TempDir() + if err := os.MkdirAll(filepath.Join(in, "redis"), 0o755); err != nil { + t.Fatalf("MkdirAll: %v", err) + } + // redis/db_2 is a regular file — name matches the canonical + // pattern but the entry shape is wrong. 
+ if err := os.WriteFile(filepath.Join(in, "redis", "db_2"), []byte("not a dir"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + _, err := enumerateRedisDBs(in) + if !errors.Is(err, ErrRedisEncodeNotDir) { + t.Errorf("err = %v, want errors.Is ErrRedisEncodeNotDir", err) + } +} + +// TestEncodeSnapshotMarksAdapterDataErrors pins codex P2 v9 #904: when +// an adapter encoder rejects the input tree's contents (e.g. a +// malformed DynamoDB _schema.json), EncodeSnapshot must surface the +// failure as ErrEncodeAdapterData so the CLI can route it to exit-2 +// (data-correctness) rather than exit-1 (operator/flag error). +// Crucially, errors.Mark preserves the original sentinel chain, so a +// caller that errors.Is on the per-adapter sentinel +// (ErrDDBEncodeInvalidSchema here) still gets a match — the marking +// is additive. +func TestEncodeSnapshotMarksAdapterDataErrors(t *testing.T) { + t.Parallel() + in := t.TempDir() + // Empty table_name triggers ErrDDBEncodeInvalidSchema inside the + // DynamoDB encoder (encode_dynamodb.go:120). + writeDDBSchema(t, in, "tbl", + []byte(`{"format_version":1,"table_name":"","primary_key":{"hash_key":{"name":"id","type":"S"}}}`)) + var buf bytes.Buffer + _, err := EncodeSnapshot(EncodeOptions{ + InputRoot: in, + Adapters: AdapterSet{DynamoDB: true}, + LastCommitTS: 1, + AllowMissingManifest: true, + }, &buf) + if err == nil { + t.Fatalf("EncodeSnapshot with malformed schema succeeded; want error") + } + if !errors.Is(err, ErrEncodeAdapterData) { + t.Errorf("err = %v, want errors.Is ErrEncodeAdapterData", err) + } + // Inner sentinel must still be reachable so existing per-adapter + // errors.Is callers are unaffected by the additional mark. 
+ if !errors.Is(err, ErrDDBEncodeInvalidSchema) { + t.Errorf("err = %v, want errors.Is ErrDDBEncodeInvalidSchema (mark must preserve inner chain)", err) + } +} + +// TestEncodeInfoSidecarPath pins the path-derivation rule for the +// sidecar (gemini medium v2 #896): one .fsm path produces one distinct +// sidecar path; two .fsm files in the same dir produce two distinct +// sidecars (no collision). +func TestEncodeInfoSidecarPath(t *testing.T) { + t.Parallel() + dir := t.TempDir() + a := filepath.Join(dir, "a.fsm") + b := filepath.Join(dir, "b.fsm") + sa := EncodeInfoSidecarPath(a) + sb := EncodeInfoSidecarPath(b) + if sa == sb { + t.Fatalf("sidecar paths collided: %s == %s", sa, sb) + } + // Verify each ends with the expected suffix. + if got, want := filepath.Base(sa), "a.fsm.encode_info.json"; got != want { + t.Errorf("sidecar(a) basename = %q, want %q", got, want) + } + if got, want := filepath.Base(sb), "b.fsm.encode_info.json"; got != want { + t.Errorf("sidecar(b) basename = %q, want %q", got, want) + } + // Both writable next to their .fsm (no OS-level collision). + for _, p := range []string{sa, sb} { + if err := os.WriteFile(p, []byte("{}"), 0o600); err != nil { + t.Fatalf("write %s: %v", p, err) + } + } +} diff --git a/internal/backup/manifest.go b/internal/backup/manifest.go index da73c5ef3..84409eac7 100644 --- a/internal/backup/manifest.go +++ b/internal/backup/manifest.go @@ -128,6 +128,14 @@ type Exclusions struct { IncludeOrphans bool `json:"include_orphans"` PreserveSQSVisibility bool `json:"preserve_sqs_visibility"` IncludeSQSSideRecords bool `json:"include_sqs_side_records"` + // RenameS3Collisions records whether the producer ran with + // --rename-collisions (DecodeOptions.RenameS3Collisions), so the + // M6 encoder's self-test can thread the same option back through + // DecodeSnapshot. Older manifests that omit this field decode as + // false (no-rename), matching the decoder default. 
Intentionally + // NOT added to exclusionsRequiredFields below so legacy manifests + // continue to validate (#896 v5 — claude review on M6 design). + RenameS3Collisions bool `json:"rename_s3_collisions,omitempty"` } // Manifest is the on-disk MANIFEST.json structure. Field tags match the diff --git a/internal/backup/open_nofollow_unix.go b/internal/backup/open_nofollow_unix.go index bdfb1db75..9a0d06279 100644 --- a/internal/backup/open_nofollow_unix.go +++ b/internal/backup/open_nofollow_unix.go @@ -58,7 +58,7 @@ func refuseHardLink(info os.FileInfo, path string) error { func openSidecarFile(path string) (*os.File, error) { // Note: NO O_TRUNC here — we truncate after the link-count check. const flag = os.O_WRONLY | os.O_CREATE | syscall.O_NOFOLLOW | syscall.O_NONBLOCK - f, err := os.OpenFile(path, flag, 0o600) //nolint:gosec,mnd // path is composed from output-root + fixed file name; 0600 is the standard owner-only mode + f, err := os.OpenFile(path, flag, sidecarFileMode) //nolint:gosec // path is composed from output-root + fixed file name; sidecarFileMode is the standard owner-only mode used here and by the post-Truncate Chmod below if err != nil { if errors.Is(err, syscall.ELOOP) { return nil, cockroachdberr.WithStack(cockroachdberr.Wrapf(err, @@ -98,5 +98,21 @@ func openSidecarFile(path string) (*os.File, error) { _ = f.Close() return nil, cockroachdberr.WithStack(err) } + // Enforce sidecarFileMode on the descriptor. The perm argument to + // OpenFile above is applied by the kernel ONLY on file creation; if + // path already existed, its pre-existing perms are kept. An + // older encoder writing 0o644 would otherwise leave the + // sidecar's source path / cluster ID / SHA256 world-readable + // after re-encode (claude / codex P2 v31 observation on PR #904). 
+ if err := f.Chmod(sidecarFileMode); err != nil { + _ = f.Close() + return nil, cockroachdberr.WithStack(err) + } return f, nil } + +// sidecarFileMode is the file mode openSidecarFile enforces — owner +// read/write only. Pulled into a named const so the post-Truncate +// Chmod and the OpenFile perm argument above share one value; a +// future edit to the mode therefore changes both at once. +const sidecarFileMode os.FileMode = 0o600 diff --git a/internal/backup/open_nofollow_unix_test.go b/internal/backup/open_nofollow_unix_test.go new file mode 100644 index 000000000..cc1efde08 --- /dev/null +++ b/internal/backup/open_nofollow_unix_test.go @@ -0,0 +1,48 @@ +//go:build unix + +package backup + +import ( + "os" + "path/filepath" + "testing" +) + +// TestOpenSidecarFileEnforcesOwnerOnlyMode pins claude / codex P2 v31 +// observation on PR #904: an older encoder may have written the +// sidecar at 0o644; OpenFile's mode arg only applies on CREATE, so +// re-opening for re-encode would preserve the wider perms. The +// post-Truncate Chmod restores 0o600 on every successful open. +func TestOpenSidecarFileEnforcesOwnerOnlyMode(t *testing.T) { + t.Parallel() + dir := t.TempDir() + path := filepath.Join(dir, "sidecar.json") + // Pre-existing sidecar with wider perms (simulating an older + // encoder). + if err := os.WriteFile(path, []byte("prior"), 0o644); err != nil { //nolint:gosec // test simulates legacy permissive sidecar + t.Fatalf("WriteFile: %v", err) + } + // Verify the environment actually honored the broader seed mode; + // a restrictive umask or stricter FS could silently produce 0o600 + // and the test would pass even if the chmod-enforcement logic + // regressed (CodeRabbit nit on PR #904). 
+ seedInfo, err := os.Stat(path) + if err != nil { + t.Fatalf("Stat seeded file: %v", err) + } + if seedInfo.Mode().Perm()&0o077 == 0 { + t.Skipf("environment refused permissive seed mode (got 0o%o); test cannot exercise chmod-enforcement", seedInfo.Mode().Perm()) + } + f, err := OpenSidecarFile(path) + if err != nil { + t.Fatalf("OpenSidecarFile: %v", err) + } + t.Cleanup(func() { _ = f.Close() }) + info, err := f.Stat() + if err != nil { + t.Fatalf("Stat: %v", err) + } + if got := info.Mode().Perm(); got != 0o600 { + t.Errorf("perm = %o, want 0o600 (Chmod after Truncate must tighten existing-file perms)", got) + } +} diff --git a/internal/backup/open_sidecar_export.go b/internal/backup/open_sidecar_export.go new file mode 100644 index 000000000..b476e87bc --- /dev/null +++ b/internal/backup/open_sidecar_export.go @@ -0,0 +1,21 @@ +package backup + +import "os" + +// OpenSidecarFile is the exported wrapper around the per-platform +// openSidecarFile. It opens path for write while refusing symlink, +// hard-link, FIFO, socket, and other non-regular-file clobber +// attacks via the platform-appropriate primitives (O_NOFOLLOW + +// O_NONBLOCK + Nlink check on unix; Lstat-then-OpenFile on Windows; +// a stricter Lstat-then-OpenFile fallback on other platforms). +// +// Use this whenever a writer creates or replaces a "sidecar" style +// file at a deterministic path inside an operator-supplied +// directory — the path is predictable to an attacker who can pre- +// create the entry, so the open MUST refuse to follow a symlink or +// truncate a hard-linked / non-regular file (codex P2 v25 #904 +// extended this from in-package adapter writers to the +// cmd/elastickv-snapshot-encode CLI's ENCODE_INFO.json writer). +func OpenSidecarFile(path string) (*os.File, error) { + return openSidecarFile(path) +}