Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion services/kiloclaw/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,17 @@ export const RESTARTING_TIMEOUT_MS = 5 * 60 * 1000; // 5 min
export const RESTARTING_MAX_TIMEOUT_MS = 15 * 60 * 1000; // 15 min
/** Maximum time to stay in 'recovering' before surfacing a timeout */
export const RECOVERING_TIMEOUT_MS = 10 * 60 * 1000; // 10 min
/** Destroying: retry pending deletes quickly */
/** Destroying: initial retry interval for pending deletes */
export const ALARM_INTERVAL_DESTROYING_MS = 60 * 1000; // 1 min
/**
 * Volume deletion retry tiers; the last tier repeats until the retry cap.
 *
 * Indexed by failed-attempt count (attempt 1 uses the first tier). These are
 * base delays: `destroyRetryDelay` multiplies them by a jitter factor before
 * they are used to schedule an alarm.
 */
export const DESTROY_VOLUME_RETRY_DELAYS_MS = [
60 * 1000, // 1 min
5 * 60 * 1000, // 5 min
15 * 60 * 1000, // 15 min
60 * 60 * 1000, // 1 h
6 * 60 * 60 * 1000, // 6 h
24 * 60 * 60 * 1000, // 24 h
] as const;
/** Pending destroy age before emitting stuck-destroy telemetry */
export const DESTROY_STUCK_THRESHOLD_MS = 15 * 60 * 1000; // 15 min
/** Minimum interval between repeated stuck-destroy telemetry events */
Expand Down
45 changes: 42 additions & 3 deletions services/kiloclaw/src/durable-objects/kiloclaw-instance.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ vi.mock('../utils/encryption', async () => {

import { KiloClawInstance } from './kiloclaw-instance';
import { buildChannelConfigPatch } from './kiloclaw-instance/channel-config';
import { destroyRetryDelay } from './kiloclaw-instance/log';
import * as flyClient from '../fly/client';
import { FlyApiError } from '../fly/client';
import * as db from '../db';
Expand Down Expand Up @@ -1394,7 +1395,17 @@ describe('destroy error tracking', () => {
});
});

describe('destroy volume: max-retry abandon', () => {
describe('destroy volume: retry backoff and abandon', () => {
it('uses tiered proportional jitter and caps at the daily tier', () => {
  const MINUTE_MS = 60 * 1000;
  const HOUR_MS = 60 * MINUTE_MS;

  // With random = 0.5 the jitter factor is exactly 1, so attempt N must map
  // straight onto the Nth base tier.
  const baseTiers = [MINUTE_MS, 5 * MINUTE_MS, 15 * MINUTE_MS, HOUR_MS, 6 * HOUR_MS, 24 * HOUR_MS];
  baseTiers.forEach((expectedDelay, tierIndex) => {
    expect(destroyRetryDelay(tierIndex + 1, 0.5)).toBe(expectedDelay);
  });

  // Attempts past the end of the table reuse the daily tier; random = 0 and
  // random = 1 scale it by 0.5x and 1.5x respectively.
  expect(destroyRetryDelay(100, 0)).toBe(12 * HOUR_MS);
  expect(destroyRetryDelay(100, 1)).toBe(36 * HOUR_MS);
});
// vi.clearAllMocks() in the global beforeEach clears call history but not
// implementations. Without this reset, a previous test in the file that
// used `.mockResolvedValue([volumes...])` on listVolumes would leak its
Expand Down Expand Up @@ -1442,6 +1453,34 @@ describe('destroy volume: max-retry abandon', () => {
expect(storage._store.get('pendingDestroyVolumeId')).toBe('vol-1');
});

// Scenario: a volume delete that keeps failing crosses the escalation
// threshold (DESTROY_VOLUME_ESCALATION_ATTEMPTS = 6) well before the abandon
// cap, so the early-warning event must fire while the delete stays pending.
it('emits retry escalation telemetry before reaching the cap', async () => {
const env = createFakeEnv();
const { storage } = createInstance(createFakeStorage(), env);
// Seed one failed attempt below the escalation threshold so that a single
// failing alarm lands exactly on it.
await seedProvisioned(storage, {
status: 'destroying',
flyMachineId: null,
flyVolumeId: 'vol-1',
pendingDestroyMachineId: null,
pendingDestroyVolumeId: 'vol-1',
destroyVolumeAttempts: 5,
});
// The volume lookup reports a detached volume with no attached machine.
(flyClient.getVolume as Mock).mockResolvedValue({
id: 'vol-1',
attached_machine_id: null,
state: 'detached',
});
// Every delete call rejects with a FlyApiError (503 is presumably the HTTP
// status), which forces the retry path.
(flyClient.deleteVolume as Mock).mockRejectedValue(
new FlyApiError('persistent failure', 503, '{}')
);

// A second instance is created over the already-seeded storage before the
// alarm is driven.
const { instance } = createInstance(storage, env);
await instance.alarm();

// The counter advanced to the threshold, the pending pointer is still set
// (not abandoned), and exactly one escalation event was recorded.
expect(storage._store.get('destroyVolumeAttempts')).toBe(6);
expect(storage._store.get('pendingDestroyVolumeId')).toBe('vol-1');
expect(analyticsEventsByName(env, 'reconcile.destroy_volume_retry_escalated')).toHaveLength(1);
});

it('emits destroy_volume_abandoned_after_max_retries and clears state at the cap', async () => {
const env = createFakeEnv();
const { storage } = createInstance(createFakeStorage(), env);
Expand All @@ -1452,7 +1491,7 @@ describe('destroy volume: max-retry abandon', () => {
flyVolumeId: 'vol-1',
pendingDestroyMachineId: null,
pendingDestroyVolumeId: 'vol-1',
destroyVolumeAttempts: 49,
destroyVolumeAttempts: 99,
});

(flyClient.getVolume as Mock).mockResolvedValue({
Expand Down Expand Up @@ -1500,7 +1539,7 @@ describe('destroy volume: max-retry abandon', () => {
flyVolumeId: 'vol-1',
pendingDestroyMachineId: null,
pendingDestroyVolumeId: 'vol-1',
destroyVolumeAttempts: 49,
destroyVolumeAttempts: 99,
});

(flyClient.getVolume as Mock).mockResolvedValue({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ export class KiloClawInstance extends DurableObject<KiloClawEnv> {

/**
 * Re-arm the storage alarm for the current status; does nothing while no
 * status is set. The failed volume-delete count is passed to `nextAlarmTime`
 * together with the status.
 */
private async scheduleAlarm(): Promise<void> {
if (!this.s.status) return;
await this.ctx.storage.setAlarm(nextAlarmTime(this.s.status));
await this.ctx.storage.setAlarm(nextAlarmTime(this.s.status, this.s.destroyVolumeAttempts));
}

private recoveryRuntime(): RecoveryRuntime {
Expand Down
12 changes: 11 additions & 1 deletion services/kiloclaw/src/durable-objects/kiloclaw-instance/log.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
ALARM_INTERVAL_DESTROYING_MS,
ALARM_INTERVAL_IDLE_MS,
ALARM_JITTER_MS,
DESTROY_VOLUME_RETRY_DELAYS_MS,
} from '../../config';
import { writeEvent, eventContextFromState } from '../../utils/analytics';

Expand Down Expand Up @@ -233,9 +234,18 @@ export function alarmIntervalForStatus(status: InstanceStatus): number {
}
}

/**
 * Delay before the next volume-delete retry, in milliseconds.
 *
 * Picks a base delay from `DESTROY_VOLUME_RETRY_DELAYS_MS` by attempt number
 * (attempt 1 → first tier; attempts past the end of the table reuse the last
 * tier) and scales it by a proportional jitter factor of `0.5 + random`.
 *
 * @param attempt 1-based count of failed attempts. Values below 1 and `NaN`
 *   are treated as the first attempt; fractional values are floored so the
 *   tier lookup always uses an integer index.
 * @param random Jitter source, expected in the range of `Math.random()`;
 *   injectable so tests can pin the result.
 * @returns The jittered delay in milliseconds.
 */
export function destroyRetryDelay(attempt: number, random = Math.random()): number {
  const lastTier = DESTROY_VOLUME_RETRY_DELAYS_MS.length - 1;
  // A fractional or NaN index would read `undefined` from the tier table and
  // turn the delay (and therefore the alarm time) into NaN.
  const wholeAttempt = Number.isNaN(attempt) ? 1 : Math.floor(attempt);
  const index = Math.min(Math.max(wholeAttempt, 1) - 1, lastTier);
  const baseDelay = DESTROY_VOLUME_RETRY_DELAYS_MS[index];
  return baseDelay * (0.5 + random);
}

/**
 * Next alarm time with jitter, as an epoch-millisecond timestamp.
 *
 * While `destroying` with at least one recorded volume-delete attempt, the
 * delay comes from `destroyRetryDelay`. Every other case uses the fixed
 * per-status interval plus up to `ALARM_JITTER_MS` of random jitter.
 */
export function nextAlarmTime(status: InstanceStatus): number {
export function nextAlarmTime(status: InstanceStatus, destroyVolumeAttempts = 0): number {
if (status === 'destroying' && destroyVolumeAttempts > 0) {
return Date.now() + destroyRetryDelay(destroyVolumeAttempts);
}
return Date.now() + alarmIntervalForStatus(status) + Math.random() * ALARM_JITTER_MS;
}
Original file line number Diff line number Diff line change
Expand Up @@ -1543,8 +1543,9 @@ export async function tryDeleteMachine(

/**
* Cap on retries against a single `pendingDestroyVolumeId` before the DO gives
* up. At the current ~1 retry/minute alarm cadence, 50 attempts is roughly an
* hour of wall-clock retries. Past this point the volume is treated as
* up. Retry alarms back off from one minute to a jittered daily cadence, so
* reaching the cap of 100 attempts takes roughly three months of wall-clock
* retries and represents a long-lived provider failure rather than a short
* outage.
* Past this point the volume is treated as
* permanently stuck — the DO emits `destroy_volume_abandoned_after_max_retries`
* (for alerting), clears the pending pointer so the destroy loop can finalize,
* and the volume will be picked up by the org-wide volume janitor (if any).
Expand All @@ -1563,7 +1564,16 @@ export async function tryDeleteMachine(
* needs human attention" rather than "this volume is leaked," and re-check
* actual Fly state before acting.
*/
const MAX_DESTROY_VOLUME_ATTEMPTS = 50;
const MAX_DESTROY_VOLUME_ATTEMPTS = 100;
/**
 * Attempt at which to emit `destroy_volume_retry_escalated` as an early signal.
 * The retry backoff reaches its daily tier when scheduling after attempt 6
 * (~7h of accumulated retries), which is the point a short outage becomes a
 * long-lived provider failure worth alerting on. Firing later (e.g. at attempt
 * 10, ~4 days in) would just trail the daily-tier transition instead of
 * surfacing it while still actionable.
 *
 * `tryDeleteVolume` compares the incremented attempt count with `===`, so the
 * event is logged only on the attempt that lands exactly on this value.
 */
const DESTROY_VOLUME_ESCALATION_ATTEMPTS = 6;

export async function tryDeleteVolume(
flyConfig: FlyClientConfig,
Expand Down Expand Up @@ -1597,6 +1607,14 @@ export async function tryDeleteVolume(
await persistDestroyError(ctx, state, 'volume', status, message);

const attempts = state.destroyVolumeAttempts + 1;
if (attempts === DESTROY_VOLUME_ESCALATION_ATTEMPTS) {
rctx.log('destroy_volume_retry_escalated', {
volume_id: state.pendingDestroyVolumeId,
attempts,
last_error: message,
last_status: status,
});
}
if (attempts >= MAX_DESTROY_VOLUME_ATTEMPTS) {
rctx.log('destroy_volume_abandoned_after_max_retries', {
volume_id: state.pendingDestroyVolumeId,
Expand Down