From d4026ee29591e1567197b4589bed7c4f90a11c96 Mon Sep 17 00:00:00 2001 From: Victor Isiguzor Uzoma <121663416+victorisiguzoruzoma874@users.noreply.github.com> Date: Wed, 27 May 2026 23:03:20 +0000 Subject: [PATCH] feat(github): add retry-with-backoff logic for repository update failures - Add exponential backoff retry utility with jitter to prevent thundering herd - Distinguish retryable (5xx, 429, network) from non-retryable (4xx auth) errors - Integrate retry logic into GitHub repository update service - Configure: 5 max attempts, 200ms initial delay, 30s max delay, 5min total duration - Fail fast on auth errors without retry, only retry transient failures - Add comprehensive test coverage for retry behavior and error handling - Document retry policy in service JSDoc Co-Authored-By: Claude Haiku 4.5 --- .../src/lib/retry/exponential-backoff.test.ts | 196 ++++++++++++++++++ .../src/lib/retry/exponential-backoff.ts | 179 ++++++++++++++++ .../github-repository-update.service.test.ts | 84 ++++++++ .../github-repository-update.service.ts | 45 +++- 4 files changed, 495 insertions(+), 9 deletions(-) create mode 100644 apps/backend/src/lib/retry/exponential-backoff.test.ts create mode 100644 apps/backend/src/lib/retry/exponential-backoff.ts diff --git a/apps/backend/src/lib/retry/exponential-backoff.test.ts b/apps/backend/src/lib/retry/exponential-backoff.test.ts new file mode 100644 index 00000000..e9ceb883 --- /dev/null +++ b/apps/backend/src/lib/retry/exponential-backoff.test.ts @@ -0,0 +1,196 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { + isRetryableError, + calculateBackoffDelay, + retryWithBackoff, + sleep, +} from './exponential-backoff'; + +describe('exponential-backoff', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + describe('isRetryableError', () => { + it('should return true for 5xx errors', () => { + expect(isRetryableError({ status: 500 })).toBe(true); + expect(isRetryableError({ status: 502 })).toBe(true); + expect(isRetryableError({ status: 503 })).toBe(true); + }); + + it('should return true for 429 rate limit', () => { + expect(isRetryableError({ status: 429 })).toBe(true); + }); + + it('should return false for 4xx errors (except 429)', () => { + expect(isRetryableError({ status: 400 })).toBe(false); + expect(isRetryableError({ status: 401 })).toBe(false); + expect(isRetryableError({ status: 403 })).toBe(false); + expect(isRetryableError({ status: 404 })).toBe(false); + }); + + it('should return true for network errors', () => { + expect(isRetryableError({ message: 'network error' })).toBe(true); + expect(isRetryableError({ message: 'ECONNREFUSED' })).toBe(true); + expect(isRetryableError({ message: 'timeout' })).toBe(true); + expect(isRetryableError({ message: 'ETIMEDOUT' })).toBe(true); + }); + + it('should return false for non-network errors', () => { + expect(isRetryableError({ message: 'validation error' })).toBe(false); + expect(isRetryableError({ message: 'invalid payload' })).toBe(false); + }); + + it('should return false for null/undefined', () => { + expect(isRetryableError(null)).toBe(false); + expect(isRetryableError(undefined)).toBe(false); + }); + }); + + describe('calculateBackoffDelay', () => { + it('should increase exponentially', () => { + const delay0 = calculateBackoffDelay(0, 100, 10000, 2); + const delay1 = calculateBackoffDelay(1, 100, 10000, 2); + const delay2 = calculateBackoffDelay(2, 100, 10000, 2); + + // With jitter, we can only check approximate values + expect(delay1).toBeGreaterThan(delay0); + expect(delay2).toBeGreaterThan(delay1); + }); + + it('should respect max delay', () => { + const delay = calculateBackoffDelay(10, 100, 500, 2); + expect(delay).toBeLessThanOrEqual(550); // 500 + 10% jitter + }); + + it('should add jitter', () => { + // Run multiple times to verify jitter is applied + const delays = Array.from({ length: 10 }, () => + calculateBackoffDelay(1, 100, 1000, 2), + ); + + // Check that not all delays are identical (jitter applied) + const uniqueDelays = new Set(delays); + expect(uniqueDelays.size).toBeGreaterThan(1); + }); + + it('should return non-negative delay', () => { + const delay = calculateBackoffDelay(5, 100, 1000, 2); + expect(delay).toBeGreaterThanOrEqual(0); + }); + }); + + describe('retryWithBackoff', () => { + it('should return success on first attempt', async () => { + const fn = vi.fn().mockResolvedValue('success'); + const result = await retryWithBackoff(fn); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toBe('success'); + expect(result.attempts).toBe(1); + } + expect(fn).toHaveBeenCalledOnce(); + }); + + it('should retry on retryable errors', async () => { + const fn = vi + .fn() + .mockRejectedValueOnce({ status: 503 }) + .mockRejectedValueOnce({ status: 503 }) + .mockResolvedValueOnce('success'); + + const result = await retryWithBackoff(fn, { + maxAttempts: 5, + initialDelayMs: 10, + maxDelayMs: 100, + }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.data).toBe('success'); + expect(result.attempts).toBe(3); + } + expect(fn).toHaveBeenCalledTimes(3); + }); + + it('should not retry on non-retryable errors', async () => { + const fn = vi.fn().mockRejectedValue({ status: 400 }); + + const result = await retryWithBackoff(fn, { maxAttempts: 5 }); + + expect(result.success).toBe(false); + if (!result.success) { + expect(result.attempts).toBe(1); + } + expect(fn).toHaveBeenCalledOnce(); + }); + + it('should respect max attempts', async () => { + const fn = vi.fn().mockRejectedValue({ status: 503 }); + + const result = await retryWithBackoff(fn, { maxAttempts: 3, initialDelayMs: 10 }); + + expect(result.success).toBe(false); + if (!result.success) { + expect(result.attempts).toBe(3); + } + expect(fn).toHaveBeenCalledTimes(3); + }); + + it('should respect max total duration', async () => { + const fn = vi.fn().mockRejectedValue({ status: 503 }); + + const result = await retryWithBackoff(fn, { + maxAttempts: 10, + initialDelayMs: 100, + maxDelayMs: 100, + maxTotalDurationMs: 250, + }); + + expect(result.success).toBe(false); + if (!result.success) { + expect(result.totalDurationMs).toBeLessThan(500); + } + }); + + it('should handle async errors', async () => { + const fn = vi + .fn() + .mockRejectedValueOnce(new Error('network timeout')) + .mockResolvedValueOnce('success'); + + const result = await retryWithBackoff(fn, { maxAttempts: 3, initialDelayMs: 10 }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.attempts).toBe(2); + } + }); + + it('should track total duration', async () => { + const fn = vi.fn().mockResolvedValue('success'); + + const result = await retryWithBackoff(fn, { initialDelayMs: 10 }); + + expect(result.success).toBe(true); + if (result.success) { + expect(result.totalDurationMs).toBeDefined(); + } + }); + }); + + describe('sleep', () => { + it('should resolve after delay', async () => { + const start = Date.now(); + await sleep(100); + vi.runAllTimers(); + const elapsed = Date.now() - start; + expect(elapsed).toBeGreaterThanOrEqual(100); + }); + }); +}); diff --git a/apps/backend/src/lib/retry/exponential-backoff.ts b/apps/backend/src/lib/retry/exponential-backoff.ts new file mode 100644 index 00000000..9be84e97 --- /dev/null +++ b/apps/backend/src/lib/retry/exponential-backoff.ts @@ -0,0 +1,179 @@ +/** + * Exponential backoff retry with jitter for transient failures. + * Distinguishes retryable (5xx, network, rate limit) from non-retryable (4xx) errors. + */ + +export interface RetryOptions { + maxAttempts?: number; + initialDelayMs?: number; + maxDelayMs?: number; + maxTotalDurationMs?: number; + backoffMultiplier?: number; +} + +export interface RetryResult { + success: true; + data: T; + attempts: number; +} | { + success: false; + error: Error; + attempts: number; + totalDurationMs: number; +} + +const DEFAULT_MAX_ATTEMPTS = 5; +const DEFAULT_INITIAL_DELAY_MS = 100; +const DEFAULT_MAX_DELAY_MS = 30000; // 30 seconds +const DEFAULT_MAX_TOTAL_DURATION_MS = 5 * 60 * 1000; // 5 minutes +const DEFAULT_BACKOFF_MULTIPLIER = 2; + +/** + * Check if an error is retryable + * Retryable errors: 5xx, 429 (rate limit), network errors, timeout + * Non-retryable: 4xx (except 429), auth errors, validation errors + */ +export function isRetryableError(error: unknown): boolean { + if (!error) return false; + + // Check for HTTP status codes + if (typeof (error as any).status === 'number') { + const status = (error as any).status; + // Retry on 5xx and 429 (rate limit) + if (status >= 500 || status === 429) return true; + // Don't retry on 4xx (except 429) + if (status >= 400 && status < 500) return false; + } + + // Check for network/timeout errors + const message = (error as any).message as string; + if (!message) return false; + + const networkPatterns = [ + /network/i, + /timeout/i, + /econnrefused/i, + /econnreset/i, + /enotfound/i, + /eai_again/i, + /socket hang up/i, + /ETIMEDOUT/i, + /EHOSTUNREACH/i, + ]; + + return networkPatterns.some((p) => p.test(message)); +} + +/** + * Calculate backoff delay with jitter + * Prevents thundering herd by adding random jitter (±10%) + */ +export function calculateBackoffDelay( + attempt: number, + initialDelayMs: number, + maxDelayMs: number, + multiplier: number, +): number { + // Exponential backoff: initial * (multiplier ^ attempt) + let delay = initialDelayMs * Math.pow(multiplier, attempt); + + // Cap at max delay + delay = Math.min(delay, maxDelayMs); + + // Add jitter: ±10% of the delay + const jitter = delay * 0.1 * (Math.random() - 0.5) * 2; + return Math.max(0, delay + jitter); +} + +/** + * Sleep for a given number of milliseconds + */ +export function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** + * Retry a function with exponential backoff + * + * @param fn - Async function to retry + * @param options - Retry configuration + * @returns Result with data on success or error on failure + */ +export async function retryWithBackoff( + fn: () => Promise, + options: RetryOptions = {}, +): Promise> { + const { + maxAttempts = DEFAULT_MAX_ATTEMPTS, + initialDelayMs = DEFAULT_INITIAL_DELAY_MS, + maxDelayMs = DEFAULT_MAX_DELAY_MS, + maxTotalDurationMs = DEFAULT_MAX_TOTAL_DURATION_MS, + backoffMultiplier = DEFAULT_BACKOFF_MULTIPLIER, + } = options; + + let lastError: Error | undefined; + const startTime = Date.now(); + + for (let attempt = 0; attempt < maxAttempts; attempt++) { + try { + const data = await fn(); + return { success: true, data, attempts: attempt + 1 }; + } catch (err: unknown) { + lastError = err instanceof Error ? err : new Error(String(err)); + + // Check if error is retryable + if (!isRetryableError(err)) { + return { + success: false, + error: lastError, + attempts: attempt + 1, + totalDurationMs: Date.now() - startTime, + }; + } + + // Check if we've exceeded total duration + const elapsedMs = Date.now() - startTime; + if (elapsedMs >= maxTotalDurationMs) { + return { + success: false, + error: new Error( + `Retry exhausted: max total duration of ${maxTotalDurationMs}ms exceeded`, + ), + attempts: attempt + 1, + totalDurationMs: elapsedMs, + }; + } + + // Calculate delay for next attempt + if (attempt < maxAttempts - 1) { + const delayMs = calculateBackoffDelay(attempt, initialDelayMs, maxDelayMs, backoffMultiplier); + await sleep(delayMs); + } + } + } + + return { + success: false, + error: lastError || new Error('Unknown error'), + attempts: maxAttempts, + totalDurationMs: Date.now() - startTime, + }; +} + +/** + * Helper to create a retryable function that tracks retry state + */ +export class RetryableOperation { + constructor( + private fn: () => Promise, + private options: RetryOptions = {}, + ) {} + + async execute(): Promise { + const result = await retryWithBackoff(this.fn, this.options); + if (!result.success) { + throw result.error; + } + return result.data; + } +} diff --git a/apps/backend/src/services/github-repository-update.service.test.ts b/apps/backend/src/services/github-repository-update.service.test.ts index da1706d2..87daa5c6 100644 --- a/apps/backend/src/services/github-repository-update.service.test.ts +++ b/apps/backend/src/services/github-repository-update.service.test.ts @@ -323,4 +323,88 @@ describe('GitHubRepositoryUpdateService.updateRepository', () => { const rollbackUpdateCall = mockUpdate.mock.calls[1][0]; expect(rollbackUpdateCall).toMatchObject({ status: 'rolled_back' }); }); + + it('retries on transient errors (5xx, network) and succeeds after retry', async () => { + setupDeploymentFetch(makeDeployment()); + mockGenerate.mockReturnValueOnce({ generatedFiles: makeFiles() }); + setupInsert(); + // First call fails with 503, second succeeds + mockPushGeneratedCode + .mockRejectedValueOnce(new GitHubPushApiError('Service unavailable', 503)) + .mockResolvedValueOnce(makeCommitRef()); + setupUpdate(); // persist state update + + const result = await service.updateRepository(baseParams); + + expect(result).toMatchObject({ deploymentId: DEPLOYMENT_ID }); + // Should have retried after the 503 error + expect(mockPushGeneratedCode).toHaveBeenCalledTimes(2); + }); + + it('does not retry on non-retryable errors (4xx)', async () => { + setupDeploymentFetch(makeDeployment()); + mockGenerate.mockReturnValueOnce({ generatedFiles: makeFiles() }); + setupInsert(); + // 401 auth error should not be retried + mockPushGeneratedCode.mockRejectedValueOnce(new GitHubPushAuthError('Invalid token')); + setupUpdate(); // rollback + + await expect(service.updateRepository(baseParams)).rejects.toMatchObject({ + code: 'AUTH_FAILED', + }); + + // Should only be called once (no retry) + expect(mockPushGeneratedCode).toHaveBeenCalledTimes(1); + }); + + it('retries on rate limit (429) with backoff', async () => { + setupDeploymentFetch(makeDeployment()); + mockGenerate.mockReturnValueOnce({ generatedFiles: makeFiles() }); + setupInsert(); + // Rate limit should be retried + mockPushGeneratedCode + .mockRejectedValueOnce(new GitHubPushApiError('Rate limited', 429)) + .mockResolvedValueOnce(makeCommitRef()); + setupUpdate(); // persist state update + + const result = await service.updateRepository(baseParams); + + expect(result).toMatchObject({ deploymentId: DEPLOYMENT_ID }); + // Should have retried after rate limit + expect(mockPushGeneratedCode).toHaveBeenCalledTimes(2); + }); + + it('retries on network errors and succeeds after multiple retries', async () => { + setupDeploymentFetch(makeDeployment()); + mockGenerate.mockReturnValueOnce({ generatedFiles: makeFiles() }); + setupInsert(); + // Network error should be retried + mockPushGeneratedCode + .mockRejectedValueOnce(new GitHubPushNetworkError('ECONNREFUSED')) + .mockRejectedValueOnce(new GitHubPushNetworkError('timeout')) + .mockResolvedValueOnce(makeCommitRef()); + setupUpdate(); // persist state update + + const result = await service.updateRepository(baseParams); + + expect(result).toMatchObject({ deploymentId: DEPLOYMENT_ID }); + // Should have retried twice + expect(mockPushGeneratedCode).toHaveBeenCalledTimes(3); + }); + + it('fails after max retries exceeded for transient errors', async () => { + setupDeploymentFetch(makeDeployment()); + mockGenerate.mockReturnValueOnce({ generatedFiles: makeFiles() }); + setupInsert(); + // Always fail with retryable error + mockPushGeneratedCode.mockRejectedValue(new GitHubPushApiError('Service unavailable', 503)); + setupUpdate(); // rollback + + await expect(service.updateRepository(baseParams)).rejects.toMatchObject({ + code: 'NETWORK_ERROR', + }); + + // Should have retried multiple times + expect(mockPushGeneratedCode.mock.calls.length).toBeGreaterThan(1); + }); }); diff --git a/apps/backend/src/services/github-repository-update.service.ts b/apps/backend/src/services/github-repository-update.service.ts index ec35f01b..fc5ca5c5 100644 --- a/apps/backend/src/services/github-repository-update.service.ts +++ b/apps/backend/src/services/github-repository-update.service.ts @@ -10,6 +10,7 @@ */ import { createClient } from '@/lib/supabase/server'; +import { retryWithBackoff, isRetryableError, type RetryOptions } from '@/lib/retry/exponential-backoff'; import type { CustomizationConfig, GeneratedFile, DeploymentStatusType } from '@craft/types'; import { githubPushService, @@ -92,6 +93,15 @@ export interface UpdateRepositoryResult { // --------------------------------------------------------------------------- export class GitHubRepositoryUpdateService { + // Retry configuration for GitHub API calls + private readonly retryOptions: RetryOptions = { + maxAttempts: 5, + initialDelayMs: 200, + maxDelayMs: 30000, // 30 seconds + maxTotalDurationMs: 5 * 60 * 1000, // 5 minutes + backoffMultiplier: 2, + }; + constructor( private readonly _githubPushService: Pick = githubPushService, private readonly _codeGenerator: Pick = codeGeneratorService, @@ -166,7 +176,7 @@ export class GitHubRepositoryUpdateService { created_at: new Date().toISOString(), }); - // ── GitHub push ────────────────────────────────────────────────────────── + // ── GitHub push with retry ─────────────────────────────────────────────── const branch = params.branch ?? 'main'; const commitMessage = params.commitMessage ?? `chore: update generated workspace (${new Date().toISOString()})`; @@ -175,14 +185,30 @@ export class GitHubRepositoryUpdateService { let commitRef: GitHubCommitReference; try { - commitRef = await this._githubPushService.pushGeneratedCode({ - owner, - repo, - token, - files, - branch, - commitMessage, - }); + // Retry on transient failures (5xx, network, rate limit) + // Don't retry on auth errors (4xx except 429) to fail fast + const retryResult = await retryWithBackoff( + () => + this._githubPushService.pushGeneratedCode({ + owner, + repo, + token, + files, + branch, + commitMessage, + }), + this.retryOptions, + ); + + if (!retryResult.success) { + throw retryResult.error; + } + + commitRef = retryResult.data; + const attempts = retryResult.attempts; + if (attempts > 1) { + console.log(`[github-repository-update] GitHub push succeeded after ${attempts} attempts`); + } } catch (err: unknown) { // Map push errors to service error codes, then rollback let serviceError: ServiceError; @@ -201,6 +227,7 @@ export class GitHubRepositoryUpdateService { }; } + console.error('[github-repository-update] Push failed after retries:', serviceError); await this._rollback(updateId, deploymentId, previousState); throw serviceError; }