diff --git a/README.md b/README.md index ca16fb9..bb66078 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,25 @@ Because the command is exec'd without a shell, the volume path is never string-c > **Don't double-schedule verification.** If your external tool already runs its own verify on a timer (e.g. a cron/systemd job), don't *also* set `interval` for a verify command — two heavy passes will step on each other. Pick one driver: let squirrel schedule it (so the result lands in `squirrel hooks` / the TUI) **or** let the tool schedule it (maximum independence — verification keeps happening even when the agent is down), not both. +### Index snapshots + +The catalog should be as redundant as the data it describes. After every successful sync, squirrel takes one `VACUUM INTO` snapshot of the whole index (a self-contained, `db check`-able `.db` file) to a **local tier** and — for destination (bucket/sftp/…) syncs — rides a copy **along to the destination**, under each synced volume's `.squirrel-index/`. A restore-from-cloud then yields the data *and* the index that explains it. + +This is **on by default, zero-config** — an absent `[backups]` table means it's enabled with the defaults below. Override or disable via: + +```toml +[backups] +enabled = true # local snapshot-on-sync (default true) +dir = "" # local snapshot directory (default: /backups) +keep = 7 # local snapshots kept (rotation; 0 = keep all) +cloud = true # ride a copy along to destination buckets (default true) +cloud_keep = 7 # snapshots kept per //.squirrel-index/ (0 = keep all) +``` + +`enabled = false` disables both halves; `cloud = false` keeps the local snapshot but uploads nothing. Snapshots are named `index--run-.db` — lexically sortable and traceable to the run that produced them. A single snapshot is taken per `squirrel sync` invocation and fanned out to every target; a snapshot or upload failure is surfaced as a warning but never fails the sync. + +> **Privacy.** The ride-along payload is the *full global* `index.db` — paths and BLAKE3 hashes for **all** volumes (never file contents). It lands in the same bucket as your data (same trust boundary). Use a private bucket and server-side encryption. + ## Quickstart Index a configured volume: @@ -173,6 +192,7 @@ Each destination is a tree shaped like the local volumes: pictures/ 2024/cat.jpg .squirrel-history/run-7/2024/cat.jpg # prior content of cat.jpg + .squirrel-index/index-20260604T120000.000Z-run-12.db # global index snapshot (ride-along) docs/ invoice.pdf .squirrel-history/run-9/invoice.pdf @@ -180,6 +200,8 @@ Each destination is a tree shaped like the local volumes: `.squirrel-history/run-/` is rclone's `--backup-dir` target for that sync run. It is filtered out of all subsequent comparisons so it does not grow rclone's listing time or get uploaded back. A directory literally called `.squirrel-history` in your source volume is also filtered (with a warning), to keep the reserved name out of the destination tree by accident. +`.squirrel-index/` holds the index snapshots ridden along after each successful sync (see [Index snapshots](#index-snapshots)). Like `.squirrel-history`, it is filtered out of all sync and restore transfers and from peer-sync, so a snapshot is never mistaken for user content. + ## Notes - Hash: BLAKE3-256 via `github.com/zeebo/blake3`. Stored as a 32-byte `BLOB` in the `blake3` column. The CLI accepts and prints hex. diff --git a/cmd/squirrel/agent.go b/cmd/squirrel/agent.go index 7675787..d4a5ff5 100644 --- a/cmd/squirrel/agent.go +++ b/cmd/squirrel/agent.go @@ -180,7 +180,14 @@ func buildSchedulerSyncRunner(cfg *config.Config, s *store.Store, rcl *sync.Rclo if err != nil { return agent.SyncRunReport{Err: err} } - rep, runErr := sync.RunPair(ctx, s, rcl, pair, sync.Options{}) + // Snapshot-on-sync fires on each node's scheduled syncs too (#75): + // this is the operating cadence the catalog churns on. Each kick is + // a single pair, so a fresh Snapshotter per kick is the right unit. + opts := sync.Options{} + if cfg.Backups.Enabled { + opts.Snapshot = sync.NewSnapshotter(s, rcl, snapshotConfig(cfg, s.Path())) + } + rep, runErr := sync.RunPair(ctx, s, rcl, pair, opts) return agent.SyncRunReport{RunID: rep.RunID, Status: rep.Status, Err: runErr} } } diff --git a/cmd/squirrel/sync.go b/cmd/squirrel/sync.go index a60e63d..bb5c4d1 100644 --- a/cmd/squirrel/sync.go +++ b/cmd/squirrel/sync.go @@ -74,6 +74,13 @@ func runSync(cmd *cobra.Command, volumeName, destinationName string, opts sync.O if err := writeRcloneConfigLogged(out, rcl, cfg); err != nil { return err } + // One snapshotter shared across every pair: the VACUUM INTO snapshot + // is taken once per invocation and fanned out (decision #1). Disabled + // for dry-run (no run rows to snapshot against) and when [backups] is + // turned off. + if !opts.DryRun && cfg.Backups.Enabled { + opts.Snapshot = sync.NewSnapshotter(s, rcl, snapshotConfig(cfg, s.Path())) + } var anyFailed bool for _, p := range pairs { @@ -89,6 +96,23 @@ func runSync(cmd *cobra.Command, volumeName, destinationName string, opts sync.O return nil } +// snapshotConfig resolves the [backups] config into the sync package's +// SnapshotConfig, filling an empty backups dir with the default sibling +// backups/ directory next to the live DB (the same tier `db backup` and +// the pre-migration snapshots use). +func snapshotConfig(cfg *config.Config, dbPath string) sync.SnapshotConfig { + dir := cfg.Backups.Dir + if dir == "" { + dir = defaultBackupsDir(dbPath) + } + return sync.SnapshotConfig{ + Dir: dir, + Keep: cfg.Backups.Keep, + Cloud: cfg.Backups.Cloud, + CloudKeep: cfg.Backups.CloudKeep, + } +} + // shallowSyncWarning is printed at the CLI layer when a sync or restore // runs with --shallow. It spells out the safety trade so the operator // knows a destination whose size and mtime happen to match the source @@ -164,6 +188,12 @@ func printSyncReport(w io.Writer, rep sync.Report, runErr error) { // the runs row is stuck in 'running' until manually reconciled. fmt.Fprintf(w, " warning: failed to record terminal run state: %v\n", rep.FinishErr) } + if rep.SnapshotErr != nil { + // Defense-in-depth only: the sync itself succeeded; the index + // snapshot or its ride-along did not. Surface it without colouring + // the run's status. + fmt.Fprintf(w, " warning: index snapshot: %v\n", rep.SnapshotErr) + } if runErr != nil { fmt.Fprintf(w, " %v\n", runErr) } diff --git a/config/backups.go b/config/backups.go new file mode 100644 index 0000000..060c77c --- /dev/null +++ b/config/backups.go @@ -0,0 +1,89 @@ +package config + +import "fmt" + +// Backups is the resolved `[backups]` configuration governing the +// snapshot-on-sync feature (#75): after a successful sync, squirrel takes +// a VACUUM INTO snapshot of the index to a local tier and — for +// destination syncs — rides a copy along to the destination bucket so the +// catalog inherits the same redundancy as the data it describes. +// +// Defense-in-depth is the default: an absent `[backups]` table means +// "enabled with the defaults below", and individual keys override only +// what they name. Setting Enabled=false disables both halves; Cloud=false +// keeps the local snapshot but skips the ride-along upload. +type Backups struct { + // Enabled gates the whole feature — the local snapshot-on-sync and, + // transitively, the cloud ride-along. + Enabled bool + // Dir is the local snapshot directory. Empty means the consumer + // resolves it to "/backups" (the same sibling directory + // the pre-migration and `db backup` snapshots use); the dependency on + // the resolved DB path is why the default is applied at the call site + // rather than here. + Dir string + // Keep bounds the local snapshot directory: after writing, the oldest + // snapshots are rotated away until at most Keep remain. Zero means no + // rotation. + Keep int + // Cloud gates the destination ride-along. Ignored when Enabled is + // false (no snapshot is taken to upload). + Cloud bool + // CloudKeep bounds each destination's per-volume .squirrel-index/ + // directory. Zero means no rotation. + CloudKeep int +} + +// DefaultBackups returns the zero-config defaults: both halves on, seven +// snapshots kept on each tier, local directory resolved by the consumer. +func DefaultBackups() Backups { + return Backups{Enabled: true, Dir: "", Keep: 7, Cloud: true, CloudKeep: 7} +} + +// rawBackups is the on-disk shape of the `[backups]` table. Every field is +// a pointer (or, for Dir, distinguished by emptiness) so resolveBackups +// can tell "key omitted" from "key set to the zero value" — without that, +// `enabled = false` would be indistinguishable from a missing key. +type rawBackups struct { + Enabled *bool `toml:"enabled"` + Dir string `toml:"dir"` + Keep *int `toml:"keep"` + Cloud *bool `toml:"cloud"` + CloudKeep *int `toml:"cloud_keep"` +} + +// resolveBackups folds an optional `[backups]` table over the defaults. A +// nil raw (no table) yields DefaultBackups unchanged. Present keys +// override; Keep and CloudKeep must be non-negative. +func resolveBackups(raw *rawBackups) (Backups, error) { + b := DefaultBackups() + if raw == nil { + return b, nil + } + if raw.Enabled != nil { + b.Enabled = *raw.Enabled + } + if raw.Dir != "" { + dir, err := expandPath(raw.Dir) + if err != nil { + return Backups{}, fmt.Errorf("dir: %w", err) + } + b.Dir = dir + } + if raw.Keep != nil { + if *raw.Keep < 0 { + return Backups{}, fmt.Errorf("keep must be non-negative, got %d", *raw.Keep) + } + b.Keep = *raw.Keep + } + if raw.Cloud != nil { + b.Cloud = *raw.Cloud + } + if raw.CloudKeep != nil { + if *raw.CloudKeep < 0 { + return Backups{}, fmt.Errorf("cloud_keep must be non-negative, got %d", *raw.CloudKeep) + } + b.CloudKeep = *raw.CloudKeep + } + return b, nil +} diff --git a/config/backups_test.go b/config/backups_test.go new file mode 100644 index 0000000..67846ec --- /dev/null +++ b/config/backups_test.go @@ -0,0 +1,124 @@ +package config + +import ( + "strings" + "testing" +) + +// TestBackupsDefaultWhenAbsent: an absent [backups] table resolves to the +// zero-config defaults — on by default, both halves enabled. +func TestBackupsDefaultWhenAbsent(t *testing.T) { + p := writeConfig(t, ` +[volumes.pics] +path = "/tmp/pics" +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + want := DefaultBackups() + if cfg.Backups != want { + t.Fatalf("Backups = %+v, want defaults %+v", cfg.Backups, want) + } +} + +// TestBackupsOverrides: present keys override the defaults; omitted keys +// keep their default. dir is expanded to an absolute path. +func TestBackupsOverrides(t *testing.T) { + p := writeConfig(t, ` +[backups] +keep = 3 +cloud_keep = 10 +dir = "/var/backups/squirrel" + +[volumes.pics] +path = "/tmp/pics" +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + b := cfg.Backups + if !b.Enabled || !b.Cloud { + t.Fatalf("Enabled=%v Cloud=%v, want both true (omitted → default)", b.Enabled, b.Cloud) + } + if b.Keep != 3 || b.CloudKeep != 10 { + t.Fatalf("Keep=%d CloudKeep=%d, want 3/10", b.Keep, b.CloudKeep) + } + if b.Dir != "/var/backups/squirrel" { + t.Fatalf("Dir = %q, want /var/backups/squirrel", b.Dir) + } +} + +// TestBackupsDisabled: enabled=false is distinguishable from "omitted" +// thanks to the pointer field, and turns the whole feature off. +func TestBackupsDisabled(t *testing.T) { + p := writeConfig(t, ` +[backups] +enabled = false + +[volumes.pics] +path = "/tmp/pics" +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + if cfg.Backups.Enabled { + t.Fatalf("Enabled = true, want false") + } + // Cloud keeps its default; Enabled is the master switch the consumer + // checks first. + if !cfg.Backups.Cloud { + t.Fatalf("Cloud = false, want default true (only enabled was set)") + } +} + +// TestBackupsCloudDisabled: cloud=false keeps the local snapshot on but +// turns off the ride-along. +func TestBackupsCloudDisabled(t *testing.T) { + p := writeConfig(t, ` +[backups] +cloud = false + +[volumes.pics] +path = "/tmp/pics" +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + if !cfg.Backups.Enabled { + t.Fatalf("Enabled = false, want true") + } + if cfg.Backups.Cloud { + t.Fatalf("Cloud = true, want false") + } +} + +func TestBackupsRejectsNegativeKeep(t *testing.T) { + p := writeConfig(t, ` +[backups] +keep = -1 + +[volumes.pics] +path = "/tmp/pics" +`) + _, err := Load(p) + if err == nil || !strings.Contains(err.Error(), "keep must be non-negative") { + t.Fatalf("expected negative-keep error, got %v", err) + } +} + +func TestBackupsRejectsUnknownField(t *testing.T) { + p := writeConfig(t, ` +[backups] +nope = true + +[volumes.pics] +path = "/tmp/pics" +`) + if _, err := Load(p); err == nil { + t.Fatalf("expected unknown-field error for [backups].nope") + } +} diff --git a/config/config.go b/config/config.go index c5eea6c..568a4bb 100644 --- a/config/config.go +++ b/config/config.go @@ -58,6 +58,10 @@ type Config struct { // Agent is non-nil when the config declares an `[agent]` block. The // agent subcommand requires it; other subcommands ignore it. Agent *Agent + // Backups is the resolved `[backups]` configuration. Always populated: + // an absent table resolves to DefaultBackups (snapshot-on-sync on with + // sensible defaults). + Backups Backups } // Volume is one indexable root. @@ -190,6 +194,7 @@ type rawConfig struct { Destinations map[string]map[string]any `toml:"destinations"` Nodes map[string]rawNode `toml:"nodes"` Agent *rawAgent `toml:"agent"` + Backups *rawBackups `toml:"backups"` } type rawVolume struct { @@ -257,6 +262,11 @@ func (r *rawConfig) resolve(path string) (*Config, error) { } cfg.Agent = a } + backups, err := resolveBackups(r.Backups) + if err != nil { + return nil, fmt.Errorf("backups: %w", err) + } + cfg.Backups = backups return cfg, nil } diff --git a/sync/node.go b/sync/node.go index aa34442..75b3527 100644 --- a/sync/node.go +++ b/sync/node.go @@ -44,6 +44,21 @@ func SyncNode(ctx context.Context, s *store.Store, rcl *Rclone, vol *config.Volu if err != nil { return rep, err } + err = runNodeSession(ctx, s, rcl, vol, volID, node, opts, &rep) + // runNodeSession's deferred finishRun has committed the run's + // terminal state by now, so the snapshot reflects this run's own row. + // Peer-sync takes the local snapshot only — there is no ride-along to + // peer nodes (dest=nil), and the Snapshotter no-ops on non-terminal + // states and dry-run. + opts.Snapshot.afterSync(ctx, &rep, vol, nil) + return rep, err +} + +// runNodeSession runs the five-phase driver and owns the deferred +// terminal-state write. It is split out of SyncNode so the deferred +// finishRun commits before the snapshot-on-sync hook runs — the snapshot +// must reflect this run's own committed row. +func runNodeSession(ctx context.Context, s *store.Store, rcl *Rclone, vol *config.Volume, volID int64, node *config.Node, opts Options, rep *Report) (err error) { rep.Status = store.RunStatusFailed defer func() { // Re-derive on the way out: when we never made it past Begin we @@ -74,9 +89,9 @@ func SyncNode(ctx context.Context, s *store.Store, rcl *Rclone, vol *config.Volu node: node, client: client, opts: opts, - report: &rep, + report: rep, } - return rep, driver.run() + return driver.run() } // nodeSyncDriver drives the five-phase initiator-side flow. The @@ -427,7 +442,8 @@ func (d *nodeSyncDriver) collectIndexEntries() ([]syncproto.IndexEntry, error) { func isReservedSyncPath(p string) bool { return strings.HasPrefix(p, HistoryDirName+"/") || strings.HasPrefix(p, ConflictsDirName+"/") || - strings.HasPrefix(p, RestoreHistoryDirName+"/") + strings.HasPrefix(p, RestoreHistoryDirName+"/") || + strings.HasPrefix(p, IndexDirName+"/") } // isReservedFolderPath is the folder-path variant of @@ -439,7 +455,7 @@ func isReservedSyncPath(p string) bool { // rejects, aborting the whole walk. func isReservedFolderPath(p string) bool { return p == HistoryDirName || p == ConflictsDirName || p == RestoreHistoryDirName || - isReservedSyncPath(p) + p == IndexDirName || isReservedSyncPath(p) } // phaseTransfer invokes rclone exactly once over the transfer + diff --git a/sync/rclone.go b/sync/rclone.go index 4644ae2..7f305a6 100644 --- a/sync/rclone.go +++ b/sync/rclone.go @@ -314,6 +314,71 @@ func (r *Rclone) RunWithProgress(ctx context.Context, onProgress func(runevents. return result, nil } +// runPlain executes rclone with the wrapper's --config but without the +// JSON-log and stats flags Run uses, returning captured stdout. It backs +// the auxiliary commands the snapshot ride-along needs (copyto, lsf, +// deletefile), where we want a simple exit-code success/failure and, for +// lsf, a short listing — not the streamed copy stats parseJSONLog builds. +// Stderr is folded into the error on failure for diagnostics. +func (r *Rclone) runPlain(ctx context.Context, args ...string) ([]byte, error) { + if r.Config == "" { + return nil, errors.New("rclone wrapper: Config not set (call WriteRcloneConfig first)") + } + full := append([]string{"--config", r.Config}, args...) + cmd := exec.CommandContext(ctx, r.Binary, full...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + if msg := strings.TrimSpace(stderr.String()); msg != "" { + return stdout.Bytes(), fmt.Errorf("rclone %s: %w: %s", args[0], err, msg) + } + return stdout.Bytes(), fmt.Errorf("rclone %s: %w", args[0], err) + } + return stdout.Bytes(), nil +} + +// copyTo copies a single source file to a single destination path via +// `rclone copyto`, creating intermediate directories as needed. Used by +// the snapshot ride-along to land one .db file at a fixed destination +// name (copy, by contrast, would treat the destination as a directory). +func (r *Rclone) copyTo(ctx context.Context, src, dst string) error { + _, err := r.runPlain(ctx, "copyto", src, dst) + return err +} + +// listSnapshots returns the snapshot filenames directly under dirURI via +// `rclone lsf`. A missing directory yields an empty list, not an error: +// the first ride-along to a volume legitimately finds nothing there yet, +// and rclone reports the absent directory on stderr with a non-zero exit. +// Only index-* .db entries are returned so an unrelated file in the tree +// is never a rotation candidate. +func (r *Rclone) listSnapshots(ctx context.Context, dirURI string) ([]string, error) { + out, err := r.runPlain(ctx, "lsf", "--files-only", dirURI) + if err != nil { + // lsf against a not-yet-created directory exits non-zero; treat an + // empty listing as "nothing to rotate" rather than a hard error. + if len(bytes.TrimSpace(out)) == 0 { + return nil, nil + } + return nil, err + } + var names []string + for _, line := range strings.Split(string(out), "\n") { + name := strings.TrimSpace(line) + if strings.HasPrefix(name, snapshotPrefix) && strings.HasSuffix(name, ".db") { + names = append(names, name) + } + } + return names, nil +} + +// deleteFile removes a single file at fileURI via `rclone deletefile`. +func (r *Rclone) deleteFile(ctx context.Context, fileURI string) error { + _, err := r.runPlain(ctx, "deletefile", fileURI) + return err +} + // rcloneEvent captures the subset of rclone's JSON log we care about: the // level (for error filtering), the per-object message and object name (for // failed-file lists), and the stats object that rclone emits at the end of diff --git a/sync/snapshot.go b/sync/snapshot.go new file mode 100644 index 0000000..d18bdd8 --- /dev/null +++ b/sync/snapshot.go @@ -0,0 +1,254 @@ +package sync + +import ( + "context" + "fmt" + "os" + "path" + "path/filepath" + "sort" + "strings" + "sync" + "time" + + "github.com/mbertschler/squirrel/config" + "github.com/mbertschler/squirrel/store" +) + +// snapshotTimeLayout is the ISO8601-ish, lexically sortable timestamp +// embedded in snapshot filenames. Millisecond precision so two snapshots +// taken in the same second (back-to-back CLI invocations, tests) get +// distinct names without a retry loop. Matches the layout the store and +// `db backup` already use so one backups/ directory stays consistent. +const snapshotTimeLayout = "20060102T150405.000Z" + +// snapshotPrefix is the filename stem for snapshot-on-sync files. The +// run id follows so a snapshot is traceable to the exact runs row that +// produced it, and the leading timestamp keeps the directory lexically +// (and chronologically) sortable for rotation. +const snapshotPrefix = "index-" + +// Snapshotter coordinates the snapshot-on-sync feature (#75) across the +// pairs of one `squirrel sync` invocation. It takes at most one VACUUM +// INTO snapshot — lazily, on the first pair that reaches a terminal +// success/partial state — and reuses that single file for the local tier +// and every destination ride-along. Construct one per CLI invocation with +// NewSnapshotter and pass it via Options.Snapshot; a nil *Snapshotter is +// the disabled state and every method is a safe no-op on it. +type Snapshotter struct { + store *store.Store + rcl *Rclone + dir string // resolved local snapshot directory + keep int // local rotation bound (0 = no rotation) + cloud bool // ride snapshots along to destination buckets + cloudKeep int // per-volume .squirrel-index/ rotation bound + + mu sync.Mutex + taken bool // the single VACUUM has been attempted + localPath string // the snapshot file, "" if the VACUUM failed + takeErr error // memoised local-snapshot/rotation error +} + +// SnapshotConfig is the resolved input to NewSnapshotter. The CLI builds +// it from config.Backups, resolving Dir against the live DB path (an +// empty config.Backups.Dir means "/backups"). +type SnapshotConfig struct { + Dir string + Keep int + Cloud bool + CloudKeep int +} + +// NewSnapshotter returns a Snapshotter ready to be shared across one +// invocation's pairs. The store backs the VACUUM INTO snapshot; the +// rclone wrapper (with its Config already written) backs the ride-along. +func NewSnapshotter(s *store.Store, rcl *Rclone, cfg SnapshotConfig) *Snapshotter { + return &Snapshotter{ + store: s, + rcl: rcl, + dir: cfg.Dir, + keep: cfg.Keep, + cloud: cfg.Cloud, + cloudKeep: cfg.CloudKeep, + } +} + +// afterSync is the post-run hook called by Sync and SyncNode once the +// run's terminal state is committed. It takes (once) the local snapshot +// and, for destination syncs with cloud enabled, rides a copy along to +// the destination. Failures are surfaced on rep.SnapshotErr and never +// mutate rep.Status — the snapshot is defense-in-depth, not part of the +// sync's success contract. A nil receiver (feature disabled) is a no-op. +func (sn *Snapshotter) afterSync(ctx context.Context, rep *Report, vol *config.Volume, dest *config.Destination) { + if sn == nil { + return + } + // Only snapshot when the run actually reached a terminal good state + // and wrote a row. Dry-run never populates RunID (and the CLI leaves + // Snapshot nil for it anyway), so this also guards that path. + if rep.RunID == 0 { + return + } + if rep.Status != store.RunStatusSuccess && rep.Status != store.RunStatusPartial { + return + } + + localPath, err := sn.ensureLocalSnapshot(ctx, rep.RunID) + if err != nil { + rep.SnapshotErr = err + } + if localPath == "" { + // The VACUUM itself failed; there is nothing to ride along. + return + } + if dest == nil || !sn.cloud { + return + } + if rideErr := sn.rideAlong(ctx, localPath, dest, vol.Name); rideErr != nil { + rep.SnapshotErr = rideErr + } +} + +// ensureLocalSnapshot takes the single VACUUM INTO snapshot the first +// time it is called and memoises the result; later pairs reuse the same +// file (decision #1: one snapshot per invocation, fanned out — never one +// VACUUM per pair). The returned path is "" only when the VACUUM failed; +// a non-fatal rotation error is returned alongside a valid path so the +// ride-along still proceeds. +func (sn *Snapshotter) ensureLocalSnapshot(ctx context.Context, runID int64) (string, error) { + sn.mu.Lock() + defer sn.mu.Unlock() + if sn.taken { + return sn.localPath, sn.takeErr + } + sn.taken = true + + name := fmt.Sprintf("%s%s-run-%d.db", snapshotPrefix, time.Now().UTC().Format(snapshotTimeLayout), runID) + dst := filepath.Join(sn.dir, name) + if err := sn.store.Backup(ctx, dst); err != nil { + sn.takeErr = fmt.Errorf("snapshot index to %s: %w", dst, err) + return "", sn.takeErr + } + sn.localPath = dst + if _, err := rotateSnapshots(sn.dir, sn.keep); err != nil { + // The snapshot we just wrote is valid; a rotation hiccup shouldn't + // block the ride-along. Record it but keep the path. + sn.takeErr = fmt.Errorf("rotate local snapshots in %s: %w", sn.dir, err) + } + return sn.localPath, sn.takeErr +} + +// rideAlong uploads localPath to //.squirrel-index/ via the +// rclone wrapper, then rotates that per-volume directory to at most +// cloudKeep snapshots. The uploaded copy keeps the snapshot's filename so +// the catalog is traceable to its producing run on the destination too. +func (sn *Snapshotter) rideAlong(ctx context.Context, localPath string, dest *config.Destination, volumeName string) error { + dirURI := indexDirURI(dest, volumeName) + name := filepath.Base(localPath) + if err := sn.rcl.copyTo(ctx, localPath, dirURI+"/"+name); err != nil { + return fmt.Errorf("ride-along upload to %s: %w", dest.Name, err) + } + if err := sn.rotateCloud(ctx, dirURI); err != nil { + return fmt.Errorf("rotate %s/%s/%s: %w", dest.Name, volumeName, IndexDirName, err) + } + return nil +} + +// rotateCloud lists the destination's .squirrel-index/ directory and +// deletes the oldest snapshots until at most cloudKeep remain. Snapshots +// are lexically sortable (decision #3), so "newest N" is the tail of the +// name-sorted list — no per-file metadata read required. cloudKeep<=0 +// means "no rotation". +func (sn *Snapshotter) rotateCloud(ctx context.Context, dirURI string) error { + if sn.cloudKeep <= 0 { + return nil + } + names, err := sn.rcl.listSnapshots(ctx, dirURI) + if err != nil { + return err + } + if len(names) <= sn.cloudKeep { + return nil + } + sort.Strings(names) + for _, old := range names[:len(names)-sn.cloudKeep] { + if err := sn.rcl.deleteFile(ctx, dirURI+"/"+old); err != nil { + return err + } + } + return nil +} + +// indexDirURI returns the rclone URI of the per-volume .squirrel-index/ +// directory under dest, mirroring backupDirURI: an absolute filesystem +// path for type=local, "://.squirrel-index" otherwise. +func indexDirURI(dest *config.Destination, volumeName string) string { + subpath := path.Join(volumeName, IndexDirName) + switch dest.Type { + case "local": + return filepath.ToSlash(filepath.Join(dest.Root, subpath)) + default: + return dest.Name + ":" + path.Join(dest.Root, subpath) + } +} + +// rotateSnapshots deletes the oldest snapshots in dir until only keep +// remain, mirroring the CLI's `db backup --keep` rotation. Files are +// matched by the index-/pre-migration- prefixes squirrel writes — the +// snapshot-on-sync directory defaults to the same backups/ dir the store +// and CLI use, so both prefixes share the bound. Unknown files are left +// untouched. keep<=0 means "no rotation". +func rotateSnapshots(dir string, keep int) ([]string, error) { + if keep <= 0 { + return nil, nil + } + entries, err := os.ReadDir(dir) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, err + } + type snap struct { + name string + modTime time.Time + } + var snaps []snap + for _, e := range entries { + if e.IsDir() { + continue + } + name := e.Name() + if !strings.HasPrefix(name, snapshotPrefix) && !strings.HasPrefix(name, "pre-migration-") { + continue + } + info, err := e.Info() + if err != nil { + continue + } + snaps = append(snaps, snap{name: name, modTime: info.ModTime()}) + } + if len(snaps) <= keep { + return nil, nil + } + // Order oldest-first. Break modtime ties by name: filenames embed a + // sortable timestamp, so on filesystems with coarse mtime resolution + // (or snapshots written within one tick) the name keeps the order + // deterministic and chronological — without it, equal modtimes could + // rotate away a newer snapshot. + sort.Slice(snaps, func(i, j int) bool { + if snaps[i].modTime.Equal(snaps[j].modTime) { + return snaps[i].name < snaps[j].name + } + return snaps[i].modTime.Before(snaps[j].modTime) + }) + var removed []string + for _, s := range snaps[:len(snaps)-keep] { + p := filepath.Join(dir, s.name) + if err := os.Remove(p); err != nil { + return removed, err + } + removed = append(removed, p) + } + return removed, nil +} diff --git a/sync/snapshot_test.go b/sync/snapshot_test.go new file mode 100644 index 0000000..75d9efd --- /dev/null +++ b/sync/snapshot_test.go @@ -0,0 +1,357 @@ +package sync + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/mbertschler/squirrel/config" + "github.com/mbertschler/squirrel/index" + "github.com/mbertschler/squirrel/store" +) + +// snapshotterFor builds a Snapshotter wired to the fixture's store and +// rclone wrapper, with the local tier pointed at a fresh temp dir. +func (f *syncFixture) snapshotterFor(t *testing.T, cfg SnapshotConfig) *Snapshotter { + t.Helper() + if cfg.Dir == "" { + cfg.Dir = t.TempDir() + } + return NewSnapshotter(f.store, f.rcl, cfg) +} + +func globOne(t *testing.T, pattern string) string { + t.Helper() + matches, err := filepath.Glob(pattern) + if err != nil { + t.Fatalf("glob %s: %v", pattern, err) + } + if len(matches) != 1 { + t.Fatalf("glob %s = %d matches, want exactly 1: %v", pattern, len(matches), matches) + } + return matches[0] +} + +// TestSyncCloudRideAlong is the headline acceptance: after a successful +// destination sync a snapshot lands under +// //.squirrel-index/, is the *global* index.db (carries +// rows for a volume that was never synced to this destination, per +// decision #1), opens cleanly, and a local-tier copy exists alongside. +// The filename carries the producing run id. +func TestSyncCloudRideAlong(t *testing.T) { + f := setupFixture(t) + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + + // A second volume indexed into the *same* store but never synced here. + // Its rows must still appear in the ride-along snapshot, proving the + // payload is the whole catalog rather than a per-volume slice. + otherPath := t.TempDir() + if err := os.WriteFile(filepath.Join(otherPath, "other.txt"), []byte("o"), 0o644); err != nil { + t.Fatal(err) + } + if _, err := index.Index(context.Background(), f.store, otherPath, index.Options{Name: "other"}); err != nil { + t.Fatalf("index other volume: %v", err) + } + + localDir := t.TempDir() + sn := f.snapshotterFor(t, SnapshotConfig{Dir: localDir, Keep: 7, Cloud: true, CloudKeep: 7}) + rep, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{Snapshot: sn}) + if err != nil { + t.Fatalf("Sync: %v (rep=%+v)", err, rep) + } + if rep.Status != store.RunStatusSuccess { + t.Fatalf("Status = %q, want success", rep.Status) + } + if rep.SnapshotErr != nil { + t.Fatalf("SnapshotErr = %v, want nil", rep.SnapshotErr) + } + + // Cloud snapshot present under the per-volume .squirrel-index dir. + cloudSnap := globOne(t, filepath.Join(f.dest.Root, f.vol.Name, IndexDirName, "index-*-run-*.db")) + if want := fmt.Sprintf("run-%d.db", rep.RunID); !strings.HasSuffix(cloudSnap, want) { + t.Fatalf("cloud snapshot %s does not end with %q", cloudSnap, want) + } + // Local tier copy present with the same name. + localSnap := globOne(t, filepath.Join(localDir, "index-*-run-*.db")) + if filepath.Base(localSnap) != filepath.Base(cloudSnap) { + t.Fatalf("local %s and cloud %s have different names", filepath.Base(localSnap), filepath.Base(cloudSnap)) + } + + // Opens cleanly and is the global catalog: the never-synced "other" + // volume's rows are present in the destination-side snapshot. + snap, err := store.Open(cloudSnap) + if err != nil { + t.Fatalf("open cloud snapshot: %v", err) + } + defer snap.Close() + rows, err := snap.IntegrityCheck(context.Background()) + if err != nil { + t.Fatalf("snapshot integrity check: %v", err) + } + if !store.IsIntegrityClean(rows) { + t.Fatalf("snapshot integrity = %v, want ok", rows) + } + other, err := snap.GetVolumeByName(context.Background(), "other") + if err != nil { + t.Fatalf("snapshot missing 'other' volume (not a global catalog?): %v", err) + } + paths, err := snap.ListPresentPathsUnder(context.Background(), other.ID) + if err != nil { + t.Fatalf("list other paths: %v", err) + } + if _, ok := paths["other.txt"]; !ok { + t.Fatalf("snapshot 'other' volume missing other.txt; paths=%v", paths) + } +} + +// TestSyncCloudDisabledKeepsLocalSnapshot: with Cloud=false a local +// snapshot is still taken, but nothing is uploaded to the destination. +func TestSyncCloudDisabledKeepsLocalSnapshot(t *testing.T) { + f := setupFixture(t) + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + + localDir := t.TempDir() + sn := f.snapshotterFor(t, SnapshotConfig{Dir: localDir, Keep: 7, Cloud: false, CloudKeep: 7}) + rep, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{Snapshot: sn}) + if err != nil { + t.Fatalf("Sync: %v", err) + } + if rep.SnapshotErr != nil { + t.Fatalf("SnapshotErr = %v, want nil", rep.SnapshotErr) + } + if _, err := os.Stat(filepath.Join(localDir)); err != nil { + t.Fatalf("local backups dir missing: %v", err) + } + globOne(t, filepath.Join(localDir, "index-*-run-*.db")) + if _, err := os.Stat(filepath.Join(f.dest.Root, f.vol.Name, IndexDirName)); !os.IsNotExist(err) { + t.Fatalf("cloud .squirrel-index exists with cloud=false (err=%v)", err) + } +} + +// TestSyncNoSnapshotterIsNoOp: a nil Options.Snapshot (feature disabled, +// e.g. [backups] enabled=false) takes no snapshot at all. +func TestSyncNoSnapshotterIsNoOp(t *testing.T) { + f := setupFixture(t) + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + + rep, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{}) + if err != nil { + t.Fatalf("Sync: %v", err) + } + if rep.SnapshotErr != nil { + t.Fatalf("SnapshotErr = %v, want nil", rep.SnapshotErr) + } + if _, err := os.Stat(filepath.Join(f.dest.Root, f.vol.Name, IndexDirName)); !os.IsNotExist(err) { + t.Fatalf("snapshot taken with no Snapshotter (err=%v)", err) + } +} + +// TestSnapshotErrorDoesNotFailSync injects a backup failure (the local +// snapshot dir's parent is a regular file, so MkdirAll fails) and asserts +// the sync run's status is unchanged while the failure surfaces on +// SnapshotErr. Defense-in-depth must never flip a good sync to failed. +func TestSnapshotErrorDoesNotFailSync(t *testing.T) { + f := setupFixture(t) + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + + // Make the snapshot dir un-creatable: its parent is a file. + blocker := filepath.Join(t.TempDir(), "not-a-dir") + if err := os.WriteFile(blocker, []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + sn := NewSnapshotter(f.store, f.rcl, SnapshotConfig{Dir: filepath.Join(blocker, "backups"), Keep: 7, Cloud: true, CloudKeep: 7}) + + rep, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{Snapshot: sn}) + if err != nil { + t.Fatalf("Sync returned error for a snapshot-only failure: %v", err) + } + if rep.Status != store.RunStatusSuccess { + t.Fatalf("Status = %q, want success (snapshot failure must not flip status)", rep.Status) + } + if rep.SnapshotErr == nil { + t.Fatalf("SnapshotErr = nil, want the injected backup failure") + } + // The runs row itself stays success. + run, err := f.store.GetRun(context.Background(), rep.RunID) + if err != nil { + t.Fatalf("GetRun: %v", err) + } + if run.Status != store.RunStatusSuccess { + t.Fatalf("run row status = %q, want success", run.Status) + } +} + +// TestCloudRotationBoundsDir runs several syncs and asserts the +// destination .squirrel-index/ dir is bounded to cloud_keep snapshots. +// CloudKeep=2 with three syncs must leave exactly two (the newest). +func TestCloudRotationBoundsDir(t *testing.T) { + f := setupFixture(t) + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + + indexDir := filepath.Join(f.dest.Root, f.vol.Name, IndexDirName) + var names []string + for i := 0; i < 3; i++ { + // A fresh Snapshotter per invocation: that is the real shape (one + // per `squirrel sync`), and each takes its own single snapshot. + sn := f.snapshotterFor(t, SnapshotConfig{Dir: t.TempDir(), Keep: 7, Cloud: true, CloudKeep: 2}) + rep, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{Snapshot: sn}) + if err != nil { + t.Fatalf("Sync %d: %v", i, err) + } + if rep.SnapshotErr != nil { + t.Fatalf("Sync %d SnapshotErr: %v", i, rep.SnapshotErr) + } + names = append(names, filepath.Base(globOneNewest(t, indexDir))) + } + left, err := filepath.Glob(filepath.Join(indexDir, "index-*-run-*.db")) + if err != nil { + t.Fatal(err) + } + if len(left) != 2 { + t.Fatalf("cloud dir has %d snapshots, want 2 (cloud_keep): %v", len(left), left) + } + // The two survivors are the two newest by name (lexically sortable). + if !containsBase(left, names[1]) || !containsBase(left, names[2]) { + t.Fatalf("survivors %v are not the two newest %v", left, names[1:]) + } +} + +// globOneNewest returns the lexically-greatest index snapshot in dir. +func globOneNewest(t *testing.T, dir string) string { + t.Helper() + matches, err := filepath.Glob(filepath.Join(dir, "index-*-run-*.db")) + if err != nil || len(matches) == 0 { + t.Fatalf("glob newest in %s: %d matches, err=%v", dir, len(matches), err) + } + newest := matches[0] + for _, m := range matches[1:] { + if m > newest { + newest = m + } + } + return newest +} + +func containsBase(paths []string, base string) bool { + for _, p := range paths { + if filepath.Base(p) == base { + return true + } + } + return false +} + +// TestLocalRotationBoundsDir exercises rotateSnapshots directly: writing +// more than keep index-* files and rotating leaves the newest keep. +func TestLocalRotationBoundsDir(t *testing.T) { + dir := t.TempDir() + // Distinct, lexically-ordered names; modtime order matches creation. + for i := 0; i < 5; i++ { + name := fmt.Sprintf("index-2026010%dT000000.000Z-run-%d.db", i, i) + if err := os.WriteFile(filepath.Join(dir, name), []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + } + // An unrelated file must be left untouched. + if err := os.WriteFile(filepath.Join(dir, "keep-me.txt"), []byte("x"), 0o644); err != nil { + t.Fatal(err) + } + removed, err := rotateSnapshots(dir, 2) + if err != nil { + t.Fatalf("rotateSnapshots: %v", err) + } + if len(removed) != 3 { + t.Fatalf("removed %d, want 3: %v", len(removed), removed) + } + left, _ := filepath.Glob(filepath.Join(dir, "index-*.db")) + if len(left) != 2 { + t.Fatalf("index files left = %d, want 2: %v", len(left), left) + } + if _, err := os.Stat(filepath.Join(dir, "keep-me.txt")); err != nil { + t.Fatalf("rotation removed an unrelated file: %v", err) + } +} + +// TestSyncFiltersOutIndexDirFromSource is the reserved-path guard for +// sync: a .squirrel-index dir that incidentally exists in the source +// volume must not be uploaded. +func TestSyncFiltersOutIndexDirFromSource(t *testing.T) { + f := setupFixture(t) + if err := os.MkdirAll(filepath.Join(f.vol.Path, IndexDirName), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(f.vol.Path, IndexDirName, "stale.db"), []byte("nope"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(f.vol.Path, "a.txt"), []byte("alpha"), 0o644); err != nil { + t.Fatal(err) + } + f.runIndex(t) + // No snapshotter here: we are asserting the *data sync* filter, not the + // ride-along, so any .squirrel-index at the destination would be from + // the source tree leaking through. + if _, err := Sync(context.Background(), f.store, f.rcl, f.vol, f.dest, Options{}); err != nil { + t.Fatalf("Sync: %v", err) + } + if _, err := os.Stat(filepath.Join(f.dest.Root, f.vol.Name, IndexDirName, "stale.db")); err == nil { + t.Fatalf("source .squirrel-index leaked to destination; should be filtered") + } +} + +// TestReservedSyncPathCoversIndexDir documents that the peer-sync +// reserved-path predicates exclude .squirrel-index, so a node whose index +// carries rows there never re-publishes them. +func TestReservedSyncPathCoversIndexDir(t *testing.T) { + if !isReservedSyncPath(IndexDirName + "/index-x-run-1.db") { + t.Fatalf("isReservedSyncPath does not cover %s/", IndexDirName) + } + if !isReservedFolderPath(IndexDirName) { + t.Fatalf("isReservedFolderPath does not cover bare %s", IndexDirName) + } +} + +// TestBuildArgsFilterIndexDir asserts both the sync and restore arg +// builders carry the .squirrel-index filter. +func TestBuildArgsFilterIndexDir(t *testing.T) { + vol := &config.Volume{Name: "pics", Path: "/tmp/pics"} + dest := &config.Destination{Name: "scratch", Type: "local", Root: "/tmp/dst"} + + syncArgs, err := buildRcloneArgs(vol, dest, 1, Options{}) + if err != nil { + t.Fatalf("buildRcloneArgs: %v", err) + } + if !argsHaveFilter(syncArgs, "- /"+IndexDirName+"/**") { + t.Fatalf("sync args missing .squirrel-index filter: %v", syncArgs) + } + restoreArgs := buildRestoreArgs(vol, dest, 1, RestoreOptions{}) + if !argsHaveFilter(restoreArgs, "- /"+IndexDirName+"/**") { + t.Fatalf("restore args missing .squirrel-index filter: %v", restoreArgs) + } +} + +func argsHaveFilter(args []string, want string) bool { + for i, a := range args { + if a == "--filter" && i+1 < len(args) && args[i+1] == want { + return true + } + } + return false +} diff --git a/sync/sync.go b/sync/sync.go index 7fc9504..47d9ace 100644 --- a/sync/sync.go +++ b/sync/sync.go @@ -42,6 +42,14 @@ const ConflictsDirName = ".squirrel-conflicts" // destroyed atomically by rclone copy. const RestoreHistoryDirName = ".squirrel-restore-history" +// IndexDirName is the per-volume directory at the destination where the +// cloud ride-along (#75) lands index snapshots: one VACUUM INTO copy of +// the global index.db per successful sync, under +// //.squirrel-index/. Like the other reserved +// directories it is filtered out of the data transfer (sync and restore) +// and from peer-sync so a snapshot is never mistaken for user content. +const IndexDirName = ".squirrel-index" + // Options shapes one Sync invocation. type Options struct { // Shallow drops --checksum and --hash blake3 so rclone uses its default @@ -69,6 +77,15 @@ type Options struct { // must keep it cheap (non-blocking channel send is the canonical // shape). nil means no-op. Progress func(runevents.Progress) + // Snapshot, if non-nil, drives the snapshot-on-sync feature (#75): + // after a run reaches a terminal success/partial state, a single + // VACUUM INTO snapshot is taken (lazily, once per CLI invocation) to + // the local tier and, for destination syncs, ridden along to the + // destination bucket. nil disables the feature for this run (dry-run, + // restore, or `[backups] enabled=false`). One Snapshotter is shared + // across every pair of one `squirrel sync` so the VACUUM happens once + // and fans out. + Snapshot *Snapshotter } // Report is the outcome of one Sync invocation. Volume and Destination @@ -107,6 +124,13 @@ type Report struct { // and the new BLAKE3 plus the receiver-relative preserved path so // the CLI can render a meaningful "review at " pointer. NodeConflicts []syncproto.ConflictDetail + // SnapshotErr captures a failure to take the post-sync index + // snapshot or to ride it along to the destination (#75). It is + // strictly defense-in-depth: a snapshot failure must not flip a + // successful sync to failed, so it is surfaced here and logged + // rather than folded into Status. Nil when no snapshot was attempted + // (dry-run, disabled, or a run that didn't reach success/partial). + SnapshotErr error // NodePendingWarnings is the receiver's drift-detection advisory // from the handshake (#17): one line per audit run on the volume // since the last successful sync that flipped content @@ -195,6 +219,11 @@ func Sync(ctx context.Context, s *store.Store, rcl *Rclone, vol *config.Volume, func(runID int64) ([]string, error) { return buildRcloneArgs(vol, dest, runID, opts) }) + // runRcloneOperation's deferred finishRun has committed the run's + // terminal state by now, so the snapshot reflects this run's own row. + // Destination syncs are eligible for the cloud ride-along; the + // Snapshotter no-ops on dry-run and on non-terminal-success states. + opts.Snapshot.afterSync(ctx, &rep, vol, dest) return rep, err } @@ -463,6 +492,12 @@ func buildRcloneArgs(vol *config.Volume, dest *config.Destination, runID int64, // source volume; filtering it out of sync uploads keeps it // from leaking onto destinations. "--filter", "- /" + RestoreHistoryDirName + "/**", + // .squirrel-index holds the cloud ride-along snapshots (#75). + // It is written by a separate copyto *after* the data transfer, + // so filtering it out of the data sync keeps an existing snapshot + // dir from being treated as user content (re-uploaded, or pulled + // back down on restore). + "--filter", "- /" + IndexDirName + "/**", } if !opts.Shallow { args = append(args, "--checksum", "--hash", "blake3") @@ -793,6 +828,7 @@ func buildRestoreArgs(vol *config.Volume, dest *config.Destination, runID int64, args = append(args, "--filter", "- /"+HistoryDirName+"/**") args = append(args, "--filter", "- /"+volmark.MarkerName) args = append(args, "--filter", "- /"+RestoreHistoryDirName+"/**") + args = append(args, "--filter", "- /"+IndexDirName+"/**") } if !opts.Shallow { args = append(args, "--checksum", "--hash", "blake3")