diff --git a/README.md b/README.md index 18a6049..ca16fb9 100644 --- a/README.md +++ b/README.md @@ -65,22 +65,30 @@ A volume can declare a per-volume **hook** — a command the agent runs to nudge ```toml [volumes.pictures.hook] -command = ["kopia", "snapshot", "create", "."] -timeout = "30m" # optional, defaults to 1h +command = ["kopia", "snapshot", "create", "."] +timeout = "30m" # optional, defaults to 1h +interval = "24h" # optional — also fire on this cadence (see below) ``` -The hook fires after a successful index run on the volume (which the agent runs on the `index_every` / `sync_every` cadence). It is **best-effort**: a hook failure or timeout never fails or blocks the run that triggered it, and overlapping invocations for the same volume are skipped rather than stacked. The command receives: +A hook fires on two triggers, both reusing the same command: + +- **on change** — after every successful index run on the volume (which the agent runs on the `index_every` / `sync_every` cadence). This answers *"is the latest content backed up?"*. It keys off content settling, not off a sync to a remote, so a volume needs no `sync_to` destination for the hook to be useful. +- **on interval** — every `interval`, *regardless of whether anything changed*. This answers *"is the existing backup still intact?"*. Verification is orthogonal to change — bitrot happens to static data — so re-checks have to run on a clock. Omit `interval` to fire on-change only. + +The command tells the two apart via `SQUIRREL_TRIGGER` (so a single command can back up on change and verify on interval). It is **best-effort**: a hook failure or timeout never fails or blocks the run that triggered it, and overlapping invocations for the same volume are skipped rather than stacked. The command receives: | Variable | Meaning | |---|---| | `SQUIRREL_VOLUME` | volume name | | `SQUIRREL_PATH` | absolute volume path | -| `SQUIRREL_RUN_ID` | the index run that triggered the hook | -| `SQUIRREL_CHANGED` | `true`/`false` — whether the run observed changes (so the command can cheaply no-op) | -| `SQUIRREL_TRIGGER` | `change` | +| `SQUIRREL_RUN_ID` | the index run that triggered the hook (empty on the interval trigger) | +| `SQUIRREL_CHANGED` | `true`/`false` — whether the run observed changes (so the command can cheaply no-op); always `false` on the interval trigger | +| `SQUIRREL_TRIGGER` | `change` or `interval` | Because the command is exec'd without a shell, the volume path is never string-concatenated into a command line. If you want shell features, make the command `["sh", "-c", "…"]` yourself. Recorded outcomes are visible via `squirrel hooks` and the TUI's Hooks tab. +> **Don't double-schedule verification.** If your external tool already runs its own verify on a timer (e.g. a cron/systemd job), don't *also* set `interval` for a verify command — two heavy passes will step on each other. Pick one driver: let squirrel schedule it (so the result lands in `squirrel hooks` / the TUI) **or** let the tool schedule it (maximum independence — verification keeps happening even when the agent is down), not both. + ## Quickstart Index a configured volume: diff --git a/agent/hooks.go b/agent/hooks.go index 9287a24..9b35830 100644 --- a/agent/hooks.go +++ b/agent/hooks.go @@ -50,11 +50,7 @@ func newHookRunner(s *store.Store, logger *slog.Logger) *hookRunner { // // A nil receiver is a no-op so tests can construct a bare scheduler // without wiring a runner. -// -// trigger is always "change" until the interval caller lands in #86; -// keeping it a parameter keeps the foundation trigger-agnostic, hence the -// nolint until the second caller exercises the other value. -func (h *hookRunner) fire(ctx context.Context, vol *config.Volume, volumeID int64, trigger string, triggeringRunID int64, changed bool) { //nolint:unparam +func (h *hookRunner) fire(ctx context.Context, vol *config.Volume, volumeID int64, trigger string, triggeringRunID int64, changed bool) { if h == nil || vol.Hook == nil { return } diff --git a/agent/hooks_test.go b/agent/hooks_test.go index 9b67f79..616f8b0 100644 --- a/agent/hooks_test.go +++ b/agent/hooks_test.go @@ -208,3 +208,83 @@ func TestSchedulerFiresChangeHook(t *testing.T) { t.Fatalf("hook command did not run (marker absent): %v", err) } } + +// TestSchedulerFiresIntervalHook drives the interval ("check") trigger: +// a volume with only a hook interval (no index/sync cadence) fires the +// command on its cadence regardless of change, and not before. The +// trigger is recorded as 'interval' with no triggering run and +// changed=false. +func TestSchedulerFiresIntervalHook(t *testing.T) { + vol := &config.Volume{ + Name: "v", + Hook: &config.VolumeHook{ + Command: []string{"sh", "-c", "exit 0"}, + Timeout: 5 * time.Second, + Interval: 10 * time.Minute, + }, + } + f := newSchedulerFixture(t, vol) + sched := f.scheduler() + ctx := context.Background() + + // First tick: no prior interval hook, so it is due. + sched.tick(ctx) + sched.hooks.wait() + assertIntervalHookCount(t, f, 1) + + // The interval hook fires independent of indexing — no runs row exists. + runs, err := f.store.ListRuns(ctx, store.ListRunsOpts{}) + if err != nil { + t.Fatalf("ListRuns: %v", err) + } + if len(runs) != 0 { + t.Fatalf("interval hook created %d runs rows; want 0 (it never indexes)", len(runs)) + } + + hooks, err := f.store.ListHookRuns(ctx, store.HookRunListOpts{Descending: true}) + if err != nil { + t.Fatalf("ListHookRuns: %v", err) + } + hr := hooks[0] + if hr.Trigger != store.HookTriggerInterval { + t.Fatalf("trigger = %q, want interval", hr.Trigger) + } + if hr.TriggeringRunID.Valid { + t.Fatalf("TriggeringRunID = %v, want NULL for interval hook", hr.TriggeringRunID) + } + if hr.Changed { + t.Fatalf("Changed = true, want false for interval hook") + } + if hr.Status != store.HookStatusSuccess { + t.Fatalf("status = %q, want success", hr.Status) + } + + // Under the cadence: a second tick must not re-fire. + f.clock.Add(5 * time.Minute) + sched.tick(ctx) + sched.hooks.wait() + assertIntervalHookCount(t, f, 1) + + // Past the cadence: the third tick re-fires. + f.clock.Add(10 * time.Minute) + sched.tick(ctx) + sched.hooks.wait() + assertIntervalHookCount(t, f, 2) +} + +func assertIntervalHookCount(t *testing.T, f *schedulerFixture, want int) { + t.Helper() + hooks, err := f.store.ListHookRuns(context.Background(), store.HookRunListOpts{}) + if err != nil { + t.Fatalf("ListHookRuns: %v", err) + } + n := 0 + for _, h := range hooks { + if h.Trigger == store.HookTriggerInterval { + n++ + } + } + if n != want { + t.Fatalf("interval hook runs = %d, want %d", n, want) + } +} diff --git a/agent/scheduler.go b/agent/scheduler.go index 0c282aa..17d3883 100644 --- a/agent/scheduler.go +++ b/agent/scheduler.go @@ -75,13 +75,24 @@ func newScheduler(srv *Server, tickEvery time.Duration, now func() time.Time) *s // when nothing is scheduled. func (s *scheduler) anyScheduledVolume() bool { for _, v := range s.volumes { - if v.SyncEvery > 0 || v.IndexEvery > 0 { + if volumeHasCadence(v) { return true } } return false } +// volumeHasCadence reports whether a volume opts into any scheduler-driven +// cadence: a sync, a standalone index, or an interval hook. The interval +// hook counts on its own — a volume can want periodic verification of its +// external backup without any squirrel index/sync schedule. +func volumeHasCadence(v *config.Volume) bool { + if v.SyncEvery > 0 || v.IndexEvery > 0 { + return true + } + return v.Hook != nil && v.Hook.Interval > 0 +} + // run is the scheduler's main loop. Returns on ctx cancellation. One // tick body never overlaps with the next: a slow tick simply drops the // queued ticker fire (time.Ticker's 1-buffer channel discards values @@ -112,7 +123,7 @@ func (s *scheduler) tick(ctx context.Context) { return } v := s.volumes[name] - if v.SyncEvery == 0 && v.IndexEvery == 0 { + if !volumeHasCadence(v) { continue } s.evaluateVolume(ctx, v) @@ -133,6 +144,13 @@ func sortedVolumeNames(m map[string]*config.Volume) []string { // indexes immediately before pushing; if that pre-sync index ran, it // satisfies the standalone-index cadence too (both write kind='index' // rows the same way) and the standalone branch naturally skips. +// +// The interval hook is evaluated last, after any index/sync this tick. +// That ordering means a tick that re-indexed the source (verifying HDD1) +// also fires the external check (verifying HDD2) right after — verify the +// source, then verify the backup — without coupling the two cadences: +// each runs on its own clock and the interval hook fires regardless of +// whether an index ran. func (s *scheduler) evaluateVolume(ctx context.Context, vol *config.Volume) { v, err := s.resolveVolume(ctx, vol.Name, vol.Path) if err != nil { @@ -144,6 +162,50 @@ func (s *scheduler) evaluateVolume(ctx context.Context, vol *config.Volume) { if !syncFired && vol.IndexEvery > 0 { s.maybeRunIndex(ctx, vol, v.ID, "standalone", vol.IndexEvery) } + s.maybeFireIntervalHook(ctx, vol, v.ID) +} + +// maybeFireIntervalHook fires the volume's hook on its interval cadence +// (#86), independent of any change. It is the time-driven counterpart to +// the on-change fire in executeIndex: verification is orthogonal to change +// (bitrot hits static data), so it must run on a clock. The fire reuses +// the foundation's exec/env/best-effort/don't-stack/timeout path with the +// trigger set to interval; SQUIRREL_CHANGED is always false here (no run +// observed anything) and there is no triggering run. +func (s *scheduler) maybeFireIntervalHook(ctx context.Context, vol *config.Volume, volumeID int64) { + if vol.Hook == nil || vol.Hook.Interval <= 0 { + return + } + due, err := s.intervalHookDue(ctx, volumeID, vol.Hook.Interval) + if err != nil { + s.logger.Error("scheduler.error", + "kind", "hook", "volume", vol.Name, "err", err.Error()) + return + } + if !due { + return + } + s.hooks.fire(ctx, vol, volumeID, store.HookTriggerInterval, 0, false) +} + +// intervalHookDue reports whether `now - last_interval_hook >= cadence` +// for the volume. A volume with no interval hook run yet is always due. +// The last run's end timestamp anchors the cadence (falling back to its +// start, mirroring elapsedSince); a still-running hook is handled by the +// don't-stack guard inside fire, not here. +func (s *scheduler) intervalHookDue(ctx context.Context, volumeID int64, cadence time.Duration) (bool, error) { + r, err := s.store.LatestHookRun(ctx, volumeID, store.HookTriggerInterval) + if err != nil { + if store.IsNotFound(err) { + return true, nil + } + return false, fmt.Errorf("lookup last interval hook: %w", err) + } + ts := r.StartedAtNs + if r.EndedAtNs.Valid { + ts = r.EndedAtNs.Int64 + } + return s.now().Sub(time.Unix(0, ts)) >= cadence, nil } // resolveVolume looks up (or creates) the volume row by name. Mirrors diff --git a/config/config.go b/config/config.go index e3d512d..c5eea6c 100644 --- a/config/config.go +++ b/config/config.go @@ -100,6 +100,14 @@ type VolumeHook struct { // Timeout bounds one invocation so a hung hook can't wedge the agent's // scheduler. Zero is replaced with DefaultHookTimeout at load time. Timeout time.Duration + // Interval is the cadence for the interval ("check") trigger: the + // agent fires the same command on this period regardless of whether + // content changed (the motivating use is periodic backup + // verification, where bitrot happens to static data and so must be + // re-checked on a clock, not on an event). Zero means "no interval + // firing" — the hook then fires only on-change. The command tells the + // two triggers apart via SQUIRREL_TRIGGER (change vs interval). + Interval time.Duration } // DefaultHookTimeout is the per-invocation timeout applied when a hook @@ -193,8 +201,9 @@ type rawVolume struct { } type rawVolumeHook struct { - Command []string `toml:"command"` - Timeout string `toml:"timeout"` + Command []string `toml:"command"` + Timeout string `toml:"timeout"` + Interval string `toml:"interval"` } func (r *rawConfig) resolve(path string) (*Config, error) { @@ -328,9 +337,14 @@ func resolveVolumeHook(raw *rawVolumeHook) (*VolumeHook, error) { } timeout = dur } + interval, err := parseVolumeCadence("hook.interval", raw.Interval) + if err != nil { + return nil, err + } return &VolumeHook{ - Command: append([]string(nil), raw.Command...), - Timeout: timeout, + Command: append([]string(nil), raw.Command...), + Timeout: timeout, + Interval: interval, }, nil } diff --git a/config/hook_test.go b/config/hook_test.go index 5c51d68..b5c7948 100644 --- a/config/hook_test.go +++ b/config/hook_test.go @@ -49,6 +49,71 @@ command = ["backup.sh"] } } +func TestLoadVolumeHookInterval(t *testing.T) { + p := writeConfig(t, ` +[volumes.photos] +path = "/tmp/photos" + +[volumes.photos.hook] +command = ["kopia", "snapshot", "verify"] +interval = "24h" +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + if got := cfg.Volumes["photos"].Hook.Interval; got != 24*time.Hour { + t.Fatalf("Interval = %s, want 24h", got) + } +} + +func TestLoadVolumeHookIntervalDefaultsZero(t *testing.T) { + p := writeConfig(t, ` +[volumes.photos] +path = "/tmp/photos" + +[volumes.photos.hook] +command = ["backup.sh"] +`) + cfg, err := Load(p) + if err != nil { + t.Fatalf("Load: %v", err) + } + if got := cfg.Volumes["photos"].Hook.Interval; got != 0 { + t.Fatalf("Interval = %s, want 0 (no interval firing)", got) + } +} + +func TestLoadVolumeHookIntervalErrors(t *testing.T) { + cases := []struct { + name string + body string + want string + }{ + { + name: "bad interval", + body: "[volumes.v]\npath=\"/tmp/v\"\n[volumes.v.hook]\ncommand=[\"x\"]\ninterval=\"nope\"\n", + want: "hook.interval", + }, + { + name: "below floor", + body: "[volumes.v]\npath=\"/tmp/v\"\n[volumes.v.hook]\ncommand=[\"x\"]\ninterval=\"10ms\"\n", + want: "hook.interval must be at least", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, err := Load(writeConfig(t, tc.body)) + if err == nil { + t.Fatalf("expected error containing %q, got nil", tc.want) + } + if !strings.Contains(err.Error(), tc.want) { + t.Fatalf("error = %v, want substring %q", err, tc.want) + } + }) + } +} + func TestLoadVolumeHookNoBlock(t *testing.T) { p := writeConfig(t, ` [volumes.photos] diff --git a/store/hookruns.go b/store/hookruns.go index 8ee2693..0289605 100644 --- a/store/hookruns.go +++ b/store/hookruns.go @@ -173,3 +173,25 @@ func (s *Store) ListHookRuns(ctx context.Context, opts HookRunListOpts) ([]HookR } return queryRows(ctx, s.db, query, scanHookRunRow, args...) } + +// LatestHookRun returns the most recent hook run for the given (volume, +// trigger), or sql.ErrNoRows when none exists. The interval-hook scheduler +// computes `now - last_run` from this row to decide whether a +// cadence-driven check is due; a still-running row counts (its +// started_at_ns anchors the cadence, and the don't-stack guard prevents a +// second concurrent invocation regardless). +// +// Ordering is by started_at_ns so the idx_hook_runs_volume_trigger index +// (volume_id, trigger, started_at_ns) satisfies both the filter and the +// sort — no separate sort pass as hook history grows. id breaks ties +// deterministically and is the index's implicit trailing rowid, so it +// stays index-only. +func (s *Store) LatestHookRun(ctx context.Context, volumeID int64, trigger string) (HookRun, error) { + row := s.db.QueryRowContext(ctx, ` + SELECT `+hookRunColumns+` + FROM hook_runs + WHERE volume_id = ? AND trigger = ? + ORDER BY started_at_ns DESC, id DESC LIMIT 1 + `, volumeID, trigger) + return scanHookRun(row.Scan) +} diff --git a/store/hookruns_test.go b/store/hookruns_test.go index b0b6dcb..db3ea9e 100644 --- a/store/hookruns_test.go +++ b/store/hookruns_test.go @@ -172,6 +172,50 @@ func TestListHookRuns(t *testing.T) { _ = id2 } +func TestLatestHookRun(t *testing.T) { + s := openTestStore(t) + ctx := context.Background() + vol, err := s.CreateVolume(ctx, "v", "/tmp/v") + if err != nil { + t.Fatalf("CreateVolume: %v", err) + } + + if _, err := s.LatestHookRun(ctx, vol.ID, HookTriggerInterval); !errors.Is(err, sql.ErrNoRows) { + t.Fatalf("LatestHookRun on empty = %v, want sql.ErrNoRows", err) + } + + // The change hook needs a real triggering run (the trigger↔run coupling + // is enforced); interval hooks carry none. + runID, err := s.BeginIndexRun(ctx, RunKindIndex, vol.ID, true) + if err != nil { + t.Fatalf("BeginIndexRun: %v", err) + } + mustBegin := func(spec HookRunSpec) int64 { + t.Helper() + id, err := s.BeginHookRun(ctx, spec) + if err != nil { + t.Fatalf("BeginHookRun(%s): %v", spec.Trigger, err) + } + return id + } + // Two interval runs and a change run; LatestHookRun must return the + // newest of the requested trigger only. + mustBegin(HookRunSpec{VolumeID: vol.ID, Trigger: HookTriggerInterval}) + mustBegin(HookRunSpec{VolumeID: vol.ID, Trigger: HookTriggerChange, TriggeringRunID: runID}) + latestInterval := mustBegin(HookRunSpec{VolumeID: vol.ID, Trigger: HookTriggerInterval}) + + got, err := s.LatestHookRun(ctx, vol.ID, HookTriggerInterval) + if err != nil { + t.Fatalf("LatestHookRun: %v", err) + } + if got.ID != latestInterval { + t.Fatalf("latest interval id = %d, want %d", got.ID, latestInterval) + } + if got.Trigger != HookTriggerInterval { + t.Fatalf("trigger = %q, want interval", got.Trigger) + } +} + // hookRunByID reads a single hook run by id. It lives in the test // file (not the package surface) because production code never fetches a // hook run by primary key — it lists or, in the follow-up interval work,