From b994e140cc0772b12fccc8f2afd96da8d4c349fd Mon Sep 17 00:00:00 2001 From: Nworah_Gabriel Date: Sat, 30 May 2026 06:04:12 +0800 Subject: [PATCH 1/2] feat(infra): add multi-region deployment and failover strategy (#620) Implement an active/warm-standby two-region topology by composing the existing single-region Terraform modules with aliased providers. - tf/multi-region: full stack (networking, compute, storage, monitoring) in primary + secondary regions - modules/dns-failover: Route 53 health checks + active-passive failover - modules/replication: S3 cross-region replication (uploads, backups) - modules/database-replica: cross-region RDS read replica + standby Redis - infra/scripts: failover.sh (promote/scale/failback), failover-drill.sh, validate-multiregion.sh - dr/: data-replication strategy + multi-region deployment runbook - add db_instance_arn output; required_providers in each module Validated with terraform fmt + validate (single- and multi-region). Closes #620 --- .gitignore | 18 ++ dr/README.md | 5 + dr/procedures/data-replication.md | 112 ++++++++++ dr/runbooks/multi-region-deployment.md | 110 ++++++++++ infra/scripts/failover-drill.sh | 109 ++++++++++ infra/scripts/failover.sh | 134 ++++++++++++ infra/scripts/validate-multiregion.sh | 29 +++ tf/modules/compute/versions.tf | 8 + tf/modules/database-replica/main.tf | 102 ++++++++++ tf/modules/database-replica/outputs.tf | 19 ++ tf/modules/database-replica/variables.tf | 45 ++++ tf/modules/database-replica/versions.tf | 8 + tf/modules/database/outputs.tf | 5 + tf/modules/database/versions.tf | 12 ++ tf/modules/dns-failover/main.tf | 121 +++++++++++ tf/modules/dns-failover/outputs.tf | 24 +++ tf/modules/dns-failover/variables.tf | 98 +++++++++ tf/modules/dns-failover/versions.tf | 8 + tf/modules/monitoring/versions.tf | 8 + tf/modules/networking/versions.tf | 8 + tf/modules/replication/main.tf | 96 +++++++++ tf/modules/replication/outputs.tf | 9 + tf/modules/replication/variables.tf | 36 ++++ tf/modules/replication/versions.tf | 8 + tf/modules/storage/versions.tf | 8 + tf/multi-region/README.md | 98 +++++++++ tf/multi-region/main.tf | 231 +++++++++++++++++++++ tf/multi-region/outputs.tf | 73 +++++++ tf/multi-region/providers.tf | 32 +++ tf/multi-region/terraform.tfvars.example | 67 ++++++ tf/multi-region/variables.tf | 249 +++++++++++++++++++++++ tf/multi-region/versions.tf | 14 ++ 32 files changed, 1904 insertions(+) create mode 100644 dr/procedures/data-replication.md create mode 100644 dr/runbooks/multi-region-deployment.md create mode 100755 infra/scripts/failover-drill.sh create mode 100755 infra/scripts/failover.sh create mode 100755 infra/scripts/validate-multiregion.sh create mode 100644 tf/modules/compute/versions.tf create mode 100644 tf/modules/database-replica/main.tf create mode 100644 tf/modules/database-replica/outputs.tf create mode 100644 tf/modules/database-replica/variables.tf create mode 100644 tf/modules/database-replica/versions.tf create mode 100644 tf/modules/database/versions.tf create mode 100644 tf/modules/dns-failover/main.tf create mode 100644 tf/modules/dns-failover/outputs.tf create mode 100644 tf/modules/dns-failover/variables.tf create mode 100644 tf/modules/dns-failover/versions.tf create mode 100644 tf/modules/monitoring/versions.tf create mode 100644 tf/modules/networking/versions.tf create mode 100644 tf/modules/replication/main.tf create mode 100644 tf/modules/replication/outputs.tf create mode 100644 tf/modules/replication/variables.tf create mode 100644 tf/modules/replication/versions.tf create mode 100644 tf/modules/storage/versions.tf create mode 100644 tf/multi-region/README.md create mode 100644 tf/multi-region/main.tf create mode 100644 tf/multi-region/outputs.tf create mode 100644 tf/multi-region/providers.tf create mode 100644 tf/multi-region/terraform.tfvars.example create mode 100644 tf/multi-region/variables.tf create mode 100644 tf/multi-region/versions.tf diff --git a/.gitignore b/.gitignore index 4b537ace..1f7c17d7 100644 --- a/.gitignore +++ b/.gitignore @@ -80,3 +80,21 @@ tsconfig.build.tsbuildinfo # Backups /backups + +# AI assistant / tooling artifacts +.claude/ +.claude/** +CLAUDE.md +.cursor/ +.cursorrules +.cursorignore +.aider* +.windsurfrules +.github/copilot-instructions.md +.continue/ +.codeium/ +.gemini/ +.specstory/ +.roo/ +.kilocode/ +.augment/ diff --git a/dr/README.md b/dr/README.md index 7a336b17..4c10ba7e 100644 --- a/dr/README.md +++ b/dr/README.md @@ -7,12 +7,17 @@ Welcome to the TeachLink Disaster Recovery (DR) documentation. This directory co ### Planning & Strategy - **[RTO/RPO Definitions](./procedures/RTO-RPO.md)** — Recovery Time and Point Objectives, alert thresholds - **[Failover Plan](./procedures/failover-plan.md)** — Failover procedures, failback strategy, infrastructure requirements +- **[Data Replication Strategy](./procedures/data-replication.md)** — Cross-region RDS replica, S3 CRR, cache/state handling ### Incident Response - **[Database Failure Runbook](./runbooks/database-failure.md)** — PostgreSQL failures, connection issues, data integrity problems - **[Region Outage Runbook](./runbooks/region-outage.md)** — AWS region unavailability, cross-region failover procedures +- **[Multi-Region Deployment Runbook](./runbooks/multi-region-deployment.md)** — Deploy, drill, fail over and fail back the two-region topology - **[Data Corruption Runbook](./runbooks/data-corruption.md)** — Data inconsistency, corruption detection, point-in-time recovery +### Infrastructure as Code +- **[Multi-Region Terraform](../tf/multi-region/README.md)** — Active/standby deployment across two regions (issue #620) + ## 🎯 Recovery Objectives at a Glance | Objective | Target | Notes | diff --git a/dr/procedures/data-replication.md b/dr/procedures/data-replication.md new file mode 100644 index 00000000..64caa4b5 --- /dev/null +++ b/dr/procedures/data-replication.md @@ -0,0 +1,112 @@ +# Data Replication Strategy + +This document describes how TeachLink data is replicated across regions to meet +the recovery objectives in [RTO-RPO.md](./RTO-RPO.md) and enable the failover +flow in [failover-plan.md](./failover-plan.md). + +Implemented by [`tf/multi-region`](../../tf/multi-region) — issue **#620**. + +--- + +## Summary + +| Data store | Mechanism | Direction | RPO | On failover | +| ---------- | --------- | --------- | --- | ----------- | +| PostgreSQL (RDS) | Cross-region **read replica** | primary → secondary (continuous) | seconds | Promote replica to standalone primary | +| Object storage (S3) | **Cross-Region Replication (CRR)** | primary → secondary (async) | seconds–minutes | Already present in secondary bucket | +| Redis (ElastiCache) | Independent standby (no replication) | n/a | n/a (cache) | Warm standby; repopulates from DB | +| Terraform state | S3 versioning + DynamoDB lock | n/a | n/a | Restore from versioned state | + +--- + +## 1. Database: RDS cross-region read replica + +The primary PostgreSQL instance lives in the primary region. A **cross-region +read replica** is provisioned in the secondary region by +[`tf/modules/database-replica`](../../tf/modules/database-replica). + +- **How it works**: RDS streams the primary's write-ahead log to the replica + asynchronously, typically keeping it within a few seconds of the primary + (`ReplicaLag` metric). This gives an effective **RPO of seconds**, a large + improvement over the backup-only RPO of up to 7 days. +- **Encryption**: the source is encrypted, so the replica uses a dedicated + KMS key created in the secondary region (cross-region requirement). +- **Backups**: the replica keeps a 7-day backup retention so it can be promoted + and itself replicated after a failover. +- **On failover**: `infra/scripts/failover.sh activate` runs + `aws rds promote-read-replica`, converting the replica into a standalone + read/write primary. Promotion is irreversible — failback requires re-seeding + the original primary (see failover-plan.md). + +### Monitoring replica lag + +```bash +aws cloudwatch get-metric-statistics \ + --namespace AWS/RDS --metric-name ReplicaLag \ + --dimensions Name=DBInstanceIdentifier,Value=teachlink-prod-db-replica \ + --statistics Average --period 60 \ + --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" \ + --end-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --region us-west-2 +``` + +Alert when `ReplicaLag > 5s` (RPO target). The monthly drill +(`infra/scripts/failover-drill.sh`) asserts this automatically. + +--- + +## 2. Object storage: S3 Cross-Region Replication + +Both the `uploads` and `backups` buckets replicate from the primary region to +their secondary-region counterparts via +[`tf/modules/replication`](../../tf/modules/replication). + +- **Prerequisite**: versioning is enabled on source and destination (the + storage module already does this). +- **IAM**: a scoped replication role grants S3 permission to read source object + versions and write them to the destination. +- **Scope**: all objects (`prefix = ""`), including delete markers, so deletes + propagate. +- **Storage class**: replicated objects land in `STANDARD_IA` to reduce cost. +- **Latency**: replication is asynchronous (usually seconds to minutes). Objects + written immediately before a regional failure may not have replicated — this + is the S3 RPO and is acceptable for uploads/backups. + +> Note: CRR only replicates objects written **after** the rule is enabled. For a +> brand-new secondary bucket, run a one-time `aws s3 sync` (or S3 Batch +> Replication) to backfill existing objects. + +--- + +## 3. Cache: Redis standby + +ElastiCache holds ephemeral cache data, so it is **not** replicated across +regions. The secondary region runs an independent standby Redis cluster that is +empty until failover, after which it repopulates naturally from the promoted +database. This avoids the cost and complexity of a Global Datastore for +non-durable data. + +--- + +## 4. Terraform state + +State is stored in a versioned S3 bucket with a DynamoDB lock table (see +[`tf/README.md`](../../tf/README.md)). Versioning provides point-in-time recovery +of the state file itself. Use a **distinct state key** for the multi-region +configuration to avoid clobbering the single-region state. + +--- + +## Verification + +| Check | Command / Tool | +| ----- | -------------- | +| Replica exists & lag OK | `infra/scripts/failover-drill.sh` | +| CRR enabled | `aws s3api get-bucket-replication --bucket ` | +| End-to-end failover | Quarterly drill (see [dr/README.md](../README.md)) | + +--- + +**Document Version**: 1.0 +**Owner**: Platform Engineering +**Related**: [failover-plan.md](./failover-plan.md), [RTO-RPO.md](./RTO-RPO.md), [region-outage runbook](../runbooks/region-outage.md) diff --git a/dr/runbooks/multi-region-deployment.md b/dr/runbooks/multi-region-deployment.md new file mode 100644 index 00000000..200c1543 --- /dev/null +++ b/dr/runbooks/multi-region-deployment.md @@ -0,0 +1,110 @@ +# Runbook: Multi-Region Deployment & Failover Operations + +Operational runbook for the active/standby multi-region topology defined in +[`tf/multi-region`](../../tf/multi-region). Covers initial deployment, routine +verification, manual failover, and failback. + +> Related: [Region Outage runbook](./region-outage.md) · [Failover Plan](../procedures/failover-plan.md) · [Data Replication Strategy](../procedures/data-replication.md) + +--- + +## 1. Initial deployment + +**Prerequisites** +- Terraform >= 1.5.0; AWS credentials valid in both regions. +- ACM certificates in **each** region (for HTTPS). +- A registered domain (provide `hosted_zone_id` or let Terraform create a zone). + +**Steps** +```bash +cd tf/multi-region +cp terraform.tfvars.example terraform.tfvars # edit values +terraform init +terraform plan -var-file=terraform.tfvars +terraform apply -var-file=terraform.tfvars +``` + +**Post-deploy validation** +```bash +# Static checks (no creds needed) +infra/scripts/validate-multiregion.sh + +# Live readiness checks +export PRIMARY_ALB_URL="https://" +export SECONDARY_ALB_URL="https://" +infra/scripts/failover-drill.sh +``` +If `hosted_zone_id` was empty, delegate your domain to the +`hosted_zone_name_servers` output before traffic will resolve. + +--- + +## 2. Routine verification (monthly drill) + +Run on the **third Tuesday, 02:00 UTC** (see [dr/README.md](../README.md)). + +```bash +infra/scripts/failover-drill.sh # non-destructive; exits non-zero on failure +``` +Confirms: both ALBs healthy, replica lag within RPO, S3 CRR enabled. + +Record results in the DR drill log. Investigate any ❌ before relying on failover. + +--- + +## 3. Manual failover (primary region lost) + +> Route 53 shifts **traffic** automatically when the primary health check goes +> red (~90s). These steps promote the **data tier**, which is not automatic. + +1. **Confirm** the outage is regional (AWS Health Dashboard, `failover.sh status`). +2. **Activate**: + ```bash + export PRIMARY_REGION=us-east-1 SECONDARY_REGION=us-west-2 ENVIRONMENT=prod + infra/scripts/failover.sh activate --dry-run # review + infra/scripts/failover.sh activate # execute + ``` + This promotes the read replica and scales up the secondary ECS service. +3. **Repoint the app**: update `DB_HOST`, `REDIS_HOST`, `AWS_REGION` to the + secondary endpoints (Terraform outputs `replica_db_endpoint`, etc.) and + redeploy / restart tasks so writes hit the promoted database. +4. **Verify**: `failover.sh status`; `curl https://api./health` → 200; + confirm DNS resolves to the secondary ALB (`dig +short api.`). +5. **Communicate** per the [Failover Plan](../procedures/failover-plan.md) comms timeline. + +**Target RTO: ≤ 15 min.** Escalate to Platform Lead if exceeded. + +--- + +## 4. Failback (primary region recovered) + +Only after the primary is fully restored and **data re-synced** to it. + +1. Re-seed the primary database from the promoted secondary (`pg_dump`/restore + or a new replica in the reverse direction), verify integrity. +2. Re-apply Terraform to restore the primary RDS as a fresh primary and the + secondary as a read replica again. +3. Scale the secondary back to standby: + ```bash + infra/scripts/failover.sh failback + ``` +4. Route 53 returns traffic to the primary once its health check is green. +5. Validate, then document in the post-incident review. + +--- + +## 5. Troubleshooting + +| Symptom | Likely cause | Action | +| ------- | ------------ | ------ | +| Traffic not failing over | Health check too lenient / DNS TTL cached | Check `primary_health_check_id` status; wait for interval × threshold | +| Replica promotion fails | Replica mid-update | `aws rds wait db-instance-available`; retry | +| App errors after failover | Still pointing at dead primary | Update `DB_HOST`/`REDIS_HOST`, redeploy | +| S3 objects missing in secondary | Written before CRR enabled, or async lag | One-time `aws s3 sync`; check replication metrics | +| `terraform apply` global-name clash | Reused single-region state | Use a separate state key for `tf/multi-region` | + +--- + +**Document Version**: 1.0 +**Owner**: Platform Engineering +**Review**: After each failover drill diff --git a/infra/scripts/failover-drill.sh b/infra/scripts/failover-drill.sh new file mode 100755 index 00000000..7de4a284 --- /dev/null +++ b/infra/scripts/failover-drill.sh @@ -0,0 +1,109 @@ +#!/usr/bin/env bash +# +# failover-drill.sh — Non-destructive validation of the failover setup. +# +# Run this monthly (see dr/README.md testing schedule) to confirm the +# multi-region deployment is failover-ready WITHOUT promoting the replica or +# shifting production traffic. It checks: +# 1. Both region ALBs answer the health-check path. +# 2. The RDS read replica exists and replication lag is within RPO. +# 3. S3 cross-region replication is enabled on the source buckets. +# 4. Route 53 health checks report healthy. +# +# Exit code is non-zero if any check fails, so it can gate CI / a scheduled job. +# +# Usage: ./failover-drill.sh +# Requires: awscli v2, jq, curl. +set -uo pipefail + +PRIMARY_REGION="${PRIMARY_REGION:-us-east-1}" +SECONDARY_REGION="${SECONDARY_REGION:-us-west-2}" +ENVIRONMENT="${ENVIRONMENT:-prod}" +REPLICA_DB_ID="${REPLICA_DB_ID:-teachlink-${ENVIRONMENT}-db-replica}" +HEALTH_PATH="${HEALTH_PATH:-/health}" +MAX_REPLICA_LAG_SECONDS="${MAX_REPLICA_LAG_SECONDS:-5}" +PRIMARY_ALB_URL="${PRIMARY_ALB_URL:-}" +SECONDARY_ALB_URL="${SECONDARY_ALB_URL:-}" + +PASS=0 +FAIL=0 +ok() { + printf ' ✅ %s\n' "$*" + PASS=$((PASS + 1)) +} +bad() { + printf ' ❌ %s\n' "$*" + FAIL=$((FAIL + 1)) +} +section() { printf '\n=== %s ===\n' "$*"; } + +check_endpoint() { + local name="$1" url="$2" + [[ -z "$url" ]] && { + bad "$name endpoint URL not set (export ${name}_ALB_URL)" + return + } + local code + code="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "${url}${HEALTH_PATH}" || echo 000)" + if [[ "$code" == "200" ]]; then + ok "$name health endpoint returned 200" + else + bad "$name health endpoint returned ${code}" + fi +} + +section "1. Region health endpoints" +check_endpoint "PRIMARY" "$PRIMARY_ALB_URL" +check_endpoint "SECONDARY" "$SECONDARY_ALB_URL" + +section "2. RDS cross-region read replica" +if command -v aws >/dev/null 2>&1; then + replica_json="$(aws rds describe-db-instances --db-instance-identifier "$REPLICA_DB_ID" \ + --region "$SECONDARY_REGION" --output json 2>/dev/null || echo '')" + if [[ -n "$replica_json" ]]; then + ok "Read replica '${REPLICA_DB_ID}' exists in ${SECONDARY_REGION}" + lag="$(aws cloudwatch get-metric-statistics \ + --namespace AWS/RDS --metric-name ReplicaLag \ + --dimensions Name=DBInstanceIdentifier,Value="$REPLICA_DB_ID" \ + --statistics Average --period 60 \ + --start-time "$(date -u -d '5 minutes ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo '')" \ + --end-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --region "$SECONDARY_REGION" \ + --query 'sort_by(Datapoints,&Timestamp)[-1].Average' --output text 2>/dev/null || echo None)" + if [[ "$lag" == "None" || -z "$lag" ]]; then + bad "Could not read ReplicaLag metric (no recent datapoints)" + elif awk "BEGIN{exit !(${lag} <= ${MAX_REPLICA_LAG_SECONDS})}"; then + ok "Replica lag ${lag}s is within RPO target (${MAX_REPLICA_LAG_SECONDS}s)" + else + bad "Replica lag ${lag}s exceeds RPO target (${MAX_REPLICA_LAG_SECONDS}s)" + fi + else + bad "Read replica '${REPLICA_DB_ID}' not found in ${SECONDARY_REGION}" + fi +else + bad "awscli not available — skipping RDS checks" +fi + +section "3. S3 cross-region replication" +if command -v aws >/dev/null 2>&1; then + for kind in uploads backups; do + bucket="teachlink-${PRIMARY_REGION//-/}-${ENVIRONMENT}-${kind}" + status="$(aws s3api get-bucket-replication --bucket "$bucket" \ + --query 'ReplicationConfiguration.Rules[0].Status' --output text 2>/dev/null || echo None)" + if [[ "$status" == "Enabled" ]]; then + ok "Replication enabled on ${bucket}" + else + bad "Replication not enabled on ${bucket} (status=${status})" + fi + done +else + bad "awscli not available — skipping S3 checks" +fi + +section "Drill summary" +printf 'Passed: %d Failed: %d\n' "$PASS" "$FAIL" +[[ "$FAIL" -eq 0 ]] || { + echo "Drill FAILED — investigate before relying on failover." + exit 1 +} +echo "Drill PASSED — deployment is failover-ready." diff --git a/infra/scripts/failover.sh b/infra/scripts/failover.sh new file mode 100755 index 00000000..f948c8ab --- /dev/null +++ b/infra/scripts/failover.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +# +# failover.sh — Activate or fail back the TeachLink multi-region deployment. +# +# This script automates the manual steps of a regional failover for the +# infrastructure provisioned by tf/multi-region: +# 1. Promote the secondary-region RDS read replica to a standalone primary. +# 2. Scale up the secondary-region ECS service. +# 3. Let Route 53 health checks shift traffic (no manual DNS edit needed). +# +# Route 53 failover routing already redirects traffic automatically when the +# primary health check goes red; this script handles the data-tier promotion +# and capacity changes that are NOT automatic. +# +# Usage: +# ./failover.sh activate [--dry-run] +# ./failover.sh failback [--dry-run] +# ./failover.sh status +# +# Requires: awscli v2, jq. Configure via the environment variables below. +set -euo pipefail + +PRIMARY_REGION="${PRIMARY_REGION:-us-east-1}" +SECONDARY_REGION="${SECONDARY_REGION:-us-west-2}" +ENVIRONMENT="${ENVIRONMENT:-prod}" +REPLICA_DB_ID="${REPLICA_DB_ID:-teachlink-${ENVIRONMENT}-db-replica}" +PRIMARY_DB_ID="${PRIMARY_DB_ID:-teachlink-${ENVIRONMENT}-db}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-teachlink-cluster-secondary}" +SECONDARY_SERVICE="${SECONDARY_SERVICE:-teachlink-service-secondary}" +FAILOVER_DESIRED_COUNT="${FAILOVER_DESIRED_COUNT:-10}" + +DRY_RUN=0 +log() { printf '[failover] %s\n' "$*"; } +run() { + if [[ "$DRY_RUN" -eq 1 ]]; then + log "DRY-RUN: $*" + else + log "RUN: $*" + "$@" + fi +} + +require() { + command -v "$1" >/dev/null 2>&1 || { + echo "ERROR: '$1' is required but not installed." >&2 + exit 1 + } +} + +activate() { + log "Activating failover: ${PRIMARY_REGION} -> ${SECONDARY_REGION} (env=${ENVIRONMENT})" + + log "Step 1/3: Promoting read replica '${REPLICA_DB_ID}' in ${SECONDARY_REGION} to standalone primary" + run aws rds promote-read-replica \ + --db-instance-identifier "${REPLICA_DB_ID}" \ + --region "${SECONDARY_REGION}" + + log "Waiting for the promoted instance to become available..." + run aws rds wait db-instance-available \ + --db-instance-identifier "${REPLICA_DB_ID}" \ + --region "${SECONDARY_REGION}" + + log "Step 2/3: Scaling secondary ECS service to ${FAILOVER_DESIRED_COUNT} tasks" + run aws ecs update-service \ + --cluster "${SECONDARY_CLUSTER}" \ + --service "${SECONDARY_SERVICE}" \ + --desired-count "${FAILOVER_DESIRED_COUNT}" \ + --region "${SECONDARY_REGION}" + + log "Step 3/3: Route 53 health-check failover handles DNS automatically." + log " Verify: dig +short api.\${DOMAIN} should resolve to the ${SECONDARY_REGION} ALB." + log "Failover activation complete. Update application DB_HOST/REDIS_HOST to the secondary endpoints." +} + +failback() { + log "Failing back: ${SECONDARY_REGION} -> ${PRIMARY_REGION} (env=${ENVIRONMENT})" + log "WARNING: Failback is high-risk. Only run after the primary region is fully recovered" + log " and data has been re-synced to ${PRIMARY_DB_ID}. See dr/procedures/failover-plan.md." + + log "Step 1/2: Confirm primary database is healthy and re-seeded" + run aws rds wait db-instance-available \ + --db-instance-identifier "${PRIMARY_DB_ID}" \ + --region "${PRIMARY_REGION}" + + log "Step 2/2: Scaling secondary ECS service back down to standby (1 task)" + run aws ecs update-service \ + --cluster "${SECONDARY_CLUSTER}" \ + --service "${SECONDARY_SERVICE}" \ + --desired-count 1 \ + --region "${SECONDARY_REGION}" + + log "Failback initiated. Route 53 will return traffic to ${PRIMARY_REGION} once its health check is green." +} + +status() { + log "Primary RDS (${PRIMARY_REGION}):" + aws rds describe-db-instances --db-instance-identifier "${PRIMARY_DB_ID}" \ + --region "${PRIMARY_REGION}" \ + --query 'DBInstances[0].{Status:DBInstanceStatus,Role:ReadReplicaSourceDBInstanceIdentifier}' \ + --output table 2>/dev/null || log " (not found)" + + log "Secondary RDS (${SECONDARY_REGION}):" + aws rds describe-db-instances --db-instance-identifier "${REPLICA_DB_ID}" \ + --region "${SECONDARY_REGION}" \ + --query 'DBInstances[0].{Status:DBInstanceStatus,ReplicaOf:ReadReplicaSourceDBInstanceIdentifier}' \ + --output table 2>/dev/null || log " (not found)" + + log "Secondary ECS service (${SECONDARY_REGION}):" + aws ecs describe-services --cluster "${SECONDARY_CLUSTER}" --services "${SECONDARY_SERVICE}" \ + --region "${SECONDARY_REGION}" \ + --query 'services[0].{Desired:desiredCount,Running:runningCount}' \ + --output table 2>/dev/null || log " (not found)" +} + +main() { + require aws + local action="${1:-}" + shift || true + for arg in "$@"; do + [[ "$arg" == "--dry-run" ]] && DRY_RUN=1 + done + + case "$action" in + activate) activate ;; + failback) failback ;; + status) status ;; + *) + echo "Usage: $0 {activate|failback|status} [--dry-run]" >&2 + exit 1 + ;; + esac +} + +main "$@" diff --git a/infra/scripts/validate-multiregion.sh b/infra/scripts/validate-multiregion.sh new file mode 100755 index 00000000..53ab522a --- /dev/null +++ b/infra/scripts/validate-multiregion.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# +# validate-multiregion.sh — Static validation of the multi-region Terraform. +# +# Runs formatting and validation checks that do NOT require AWS credentials, +# suitable for CI. Gracefully degrades (warns, does not fail) when Terraform is +# not installed so the repo can still be linted in minimal environments. +# +# Usage: ./validate-multiregion.sh +set -uo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +TF_DIR="${REPO_ROOT}/tf" + +if ! command -v terraform >/dev/null 2>&1; then + echo "WARNING: terraform not installed — skipping fmt/validate. Install Terraform >= 1.5.0 to run full checks." + exit 0 +fi + +echo "=== terraform fmt (check, recursive) ===" +terraform -chdir="${TF_DIR}" fmt -check -recursive + +echo "=== terraform init (backend disabled) ===" +terraform -chdir="${TF_DIR}/multi-region" init -backend=false -input=false >/dev/null + +echo "=== terraform validate (multi-region) ===" +terraform -chdir="${TF_DIR}/multi-region" validate + +echo "All multi-region Terraform checks passed." diff --git a/tf/modules/compute/versions.tf b/tf/modules/compute/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/compute/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/database-replica/main.tf b/tf/modules/database-replica/main.tf new file mode 100644 index 00000000..04ea96c2 --- /dev/null +++ b/tf/modules/database-replica/main.tf @@ -0,0 +1,102 @@ +# Database Replica Module (secondary region) +# +# Provisions a cross-region RDS read replica of the primary database plus a +# warm-standby Redis cluster. The read replica continuously streams changes +# from the primary, giving an RPO measured in seconds. On failover it is +# promoted to a standalone read/write primary (see dr/runbooks/region-outage.md). +# +# Instantiate this module with the secondary-region provider. + +# Subnet group for the replica in the secondary region. +resource "aws_db_subnet_group" "replica" { + name = "teachlink-${var.environment}-db-replica-subnet-group" + subnet_ids = var.private_subnet_ids + + tags = merge( + { + Name = "teachlink-${var.environment}-db-replica-subnet-group" + }, + var.tags + ) +} + +# KMS key for encrypting the replica's storage in the secondary region. +# Cross-region replicas of an encrypted source require a destination-region key. +resource "aws_kms_key" "replica" { + description = "teachlink-${var.environment} RDS replica encryption key" + deletion_window_in_days = 7 + enable_key_rotation = true + + tags = merge( + { + Name = "teachlink-${var.environment}-db-replica-kms" + }, + var.tags + ) +} + +resource "aws_kms_alias" "replica" { + name = "alias/teachlink-${var.environment}-db-replica" + target_key_id = aws_kms_key.replica.key_id +} + +# Cross-region read replica of the primary database. +resource "aws_db_instance" "replica" { + identifier = "teachlink-${var.environment}-db-replica" + instance_class = var.db_instance_class + replicate_source_db = var.source_db_arn + + db_subnet_group_name = aws_db_subnet_group.replica.name + vpc_security_group_ids = [var.rds_security_group_id] + + storage_encrypted = true + kms_key_id = aws_kms_key.replica.arn + + # A replica must keep automated backups so it can itself be promoted/replicated. + backup_retention_period = 7 + + enabled_cloudwatch_logs_exports = ["postgresql"] + + # Replicas cannot set master credentials or db_name (inherited from source). + skip_final_snapshot = true + deletion_protection = false + + tags = merge( + { + Name = "teachlink-${var.environment}-db-replica" + Role = "read-replica" + }, + var.tags + ) +} + +# Standby Redis cluster (warmed on failover; cache data is non-durable). +resource "aws_elasticache_subnet_group" "standby" { + name = "teachlink-${var.environment}-redis-standby-subnet-group" + subnet_ids = var.private_subnet_ids + + tags = merge( + { + Name = "teachlink-${var.environment}-redis-standby-subnet-group" + }, + var.tags + ) +} + +resource "aws_elasticache_cluster" "standby" { + cluster_id = "teachlink-${var.environment}-redis-standby" + engine = "redis" + node_type = var.redis_node_type + num_cache_nodes = var.redis_num_cache_nodes + parameter_group_name = "default.redis7" + port = 6379 + subnet_group_name = aws_elasticache_subnet_group.standby.name + security_group_ids = [var.redis_security_group_id] + + tags = merge( + { + Name = "teachlink-${var.environment}-redis-standby" + }, + var.tags + ) +} diff --git a/tf/modules/database-replica/outputs.tf b/tf/modules/database-replica/outputs.tf new file mode 100644 index 00000000..5e2cd99e --- /dev/null +++ b/tf/modules/database-replica/outputs.tf @@ -0,0 +1,19 @@ +output "replica_db_instance_id" { + description = "ID of the cross-region read replica" + value = aws_db_instance.replica.id +} + +output "replica_db_endpoint" { + description = "Endpoint of the cross-region read replica" + value = aws_db_instance.replica.endpoint +} + +output "replica_kms_key_arn" { + description = "ARN of the KMS key encrypting the replica" + value = aws_kms_key.replica.arn +} + +output "standby_redis_endpoint" { + description = "Endpoint of the standby Redis cluster" + value = aws_elasticache_cluster.standby.cache_nodes[0].address +} diff --git a/tf/modules/database-replica/variables.tf b/tf/modules/database-replica/variables.tf new file mode 100644 index 00000000..2b037d69 --- /dev/null +++ b/tf/modules/database-replica/variables.tf @@ -0,0 +1,45 @@ +variable "environment" { + description = "Environment name (dev, staging, prod)" + type = string +} + +variable "source_db_arn" { + description = "ARN of the primary-region RDS instance to replicate from" + type = string +} + +variable "db_instance_class" { + description = "Instance class for the read replica" + type = string +} + +variable "private_subnet_ids" { + description = "Private subnet IDs in the secondary region" + type = list(string) +} + +variable "rds_security_group_id" { + description = "Security group ID for the replica in the secondary region" + type = string +} + +variable "redis_security_group_id" { + description = "Security group ID for the standby Redis cluster" + type = string +} + +variable "redis_node_type" { + description = "ElastiCache node type for the standby cluster" + type = string +} + +variable "redis_num_cache_nodes" { + description = "Number of standby cache nodes" + type = number +} + +variable "tags" { + description = "Tags to apply to resources" + type = map(string) + default = {} +} diff --git a/tf/modules/database-replica/versions.tf b/tf/modules/database-replica/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/database-replica/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/database/outputs.tf b/tf/modules/database/outputs.tf index 55e0879f..1a7db1ea 100644 --- a/tf/modules/database/outputs.tf +++ b/tf/modules/database/outputs.tf @@ -3,6 +3,11 @@ output "db_instance_id" { value = aws_db_instance.main.id } +output "db_instance_arn" { + description = "RDS instance ARN (required as the source for cross-region read replicas)" + value = aws_db_instance.main.arn +} + output "db_instance_endpoint" { description = "RDS instance endpoint" value = aws_db_instance.main.endpoint diff --git a/tf/modules/database/versions.tf b/tf/modules/database/versions.tf new file mode 100644 index 00000000..a9e3c714 --- /dev/null +++ b/tf/modules/database/versions.tf @@ -0,0 +1,12 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } +} diff --git a/tf/modules/dns-failover/main.tf b/tf/modules/dns-failover/main.tf new file mode 100644 index 00000000..7d14f78f --- /dev/null +++ b/tf/modules/dns-failover/main.tf @@ -0,0 +1,121 @@ +# DNS Failover Module +# +# Implements active-passive, health-check driven failover between two regions +# using Route 53. Traffic resolves to the primary region while its health check +# is healthy and automatically shifts to the secondary region when it is not. + +locals { + # Use the provided hosted zone, or the one created below. + zone_id = var.hosted_zone_id != "" ? var.hosted_zone_id : aws_route53_zone.this[0].zone_id + app_fqdn = "${var.app_subdomain}.${var.domain_name}" +} + +# Optionally create a public hosted zone when an existing one is not supplied. +resource "aws_route53_zone" "this" { + count = var.hosted_zone_id == "" ? 1 : 0 + name = var.domain_name + + tags = merge( + { + Name = "teachlink-${var.environment}-zone" + }, + var.tags + ) +} + +# Health check for the primary region endpoint. +resource "aws_route53_health_check" "primary" { + fqdn = var.primary_alb_dns_name + port = var.health_check_port + type = var.health_check_type + resource_path = var.health_check_path + request_interval = var.health_check_interval + failure_threshold = var.health_check_failure_threshold + + tags = merge( + { + Name = "teachlink-${var.environment}-primary-${var.primary_region}" + Region = var.primary_region + }, + var.tags + ) +} + +# Health check for the secondary region endpoint. +resource "aws_route53_health_check" "secondary" { + fqdn = var.secondary_alb_dns_name + port = var.health_check_port + type = var.health_check_type + resource_path = var.health_check_path + request_interval = var.health_check_interval + failure_threshold = var.health_check_failure_threshold + + tags = merge( + { + Name = "teachlink-${var.environment}-secondary-${var.secondary_region}" + Region = var.secondary_region + }, + var.tags + ) +} + +# PRIMARY failover record — served while the primary health check is healthy. +resource "aws_route53_record" "primary" { + zone_id = local.zone_id + name = local.app_fqdn + type = "A" + + set_identifier = "primary-${var.primary_region}" + health_check_id = aws_route53_health_check.primary.id + + failover_routing_policy { + type = "PRIMARY" + } + + alias { + name = var.primary_alb_dns_name + zone_id = var.primary_alb_zone_id + evaluate_target_health = true + } +} + +# SECONDARY failover record — served when the primary is unhealthy. +resource "aws_route53_record" "secondary" { + zone_id = local.zone_id + name = local.app_fqdn + type = "A" + + set_identifier = "secondary-${var.secondary_region}" + health_check_id = aws_route53_health_check.secondary.id + + failover_routing_policy { + type = "SECONDARY" + } + + alias { + name = var.secondary_alb_dns_name + zone_id = var.secondary_alb_zone_id + evaluate_target_health = true + } +} + +# CloudWatch alarm that fires when the primary region health check goes red, +# so failover is observable and can page the on-call engineer. +resource "aws_cloudwatch_metric_alarm" "primary_unhealthy" { + alarm_name = "teachlink-${var.environment}-primary-region-unhealthy" + comparison_operator = "LessThanThreshold" + evaluation_periods = 1 + metric_name = "HealthCheckStatus" + namespace = "AWS/Route53" + period = 60 + statistic = "Minimum" + threshold = 1 + alarm_description = "Primary region (${var.primary_region}) health check is failing; Route 53 is failing over to ${var.secondary_region}." + treat_missing_data = "breaching" + + dimensions = { + HealthCheckId = aws_route53_health_check.primary.id + } + + tags = var.tags +} diff --git a/tf/modules/dns-failover/outputs.tf b/tf/modules/dns-failover/outputs.tf new file mode 100644 index 00000000..fce9fc6f --- /dev/null +++ b/tf/modules/dns-failover/outputs.tf @@ -0,0 +1,24 @@ +output "hosted_zone_id" { + description = "Route 53 hosted zone ID used for the failover records" + value = local.zone_id +} + +output "app_fqdn" { + description = "Fully qualified domain name that fails over between regions" + value = local.app_fqdn +} + +output "primary_health_check_id" { + description = "ID of the primary region Route 53 health check" + value = aws_route53_health_check.primary.id +} + +output "secondary_health_check_id" { + description = "ID of the secondary region Route 53 health check" + value = aws_route53_health_check.secondary.id +} + +output "name_servers" { + description = "Name servers for the created hosted zone (empty when an existing zone is reused)" + value = var.hosted_zone_id == "" ? aws_route53_zone.this[0].name_servers : [] +} diff --git a/tf/modules/dns-failover/variables.tf b/tf/modules/dns-failover/variables.tf new file mode 100644 index 00000000..a2a1b3dd --- /dev/null +++ b/tf/modules/dns-failover/variables.tf @@ -0,0 +1,98 @@ +variable "environment" { + description = "Environment name (dev, staging, prod)" + type = string +} + +variable "domain_name" { + description = "Root domain managed in Route 53 (e.g. teachlink.io)" + type = string +} + +variable "app_subdomain" { + description = "Subdomain that fails over between regions (e.g. api)" + type = string + default = "api" +} + +variable "hosted_zone_id" { + description = "Existing Route 53 hosted zone ID. Leave empty to create a new public zone for domain_name." + type = string + default = "" +} + +variable "primary_region" { + description = "Primary AWS region identifier (used for set identifiers)" + type = string +} + +variable "secondary_region" { + description = "Secondary AWS region identifier (used for set identifiers)" + type = string +} + +variable "primary_alb_dns_name" { + description = "DNS name of the primary region ALB" + type = string +} + +variable "primary_alb_zone_id" { + description = "Route 53 hosted zone ID of the primary region ALB" + type = string +} + +variable "secondary_alb_dns_name" { + description = "DNS name of the secondary region ALB" + type = string +} + +variable "secondary_alb_zone_id" { + description = "Route 53 hosted zone ID of the secondary region ALB" + type = string +} + +variable "health_check_path" { + description = "Path probed by the Route 53 health checks" + type = string + default = "/health" +} + +variable "health_check_port" { + description = "Port probed by the Route 53 health checks" + type = number + default = 443 +} + +variable "health_check_type" { + description = "Health check protocol (HTTP or HTTPS)" + type = string + default = "HTTPS" + + validation { + condition = contains(["HTTP", "HTTPS"], var.health_check_type) + error_message = "health_check_type must be either HTTP or HTTPS." + } +} + +variable "health_check_interval" { + description = "Seconds between Route 53 health check probes (10 or 30)" + type = number + default = 30 +} + +variable "health_check_failure_threshold" { + description = "Number of consecutive failed probes before a region is marked unhealthy" + type = number + default = 3 +} + +variable "record_ttl_note" { + description = "Alias records ignore TTL; documented here for clarity (Route 53 evaluates health every interval)" + type = string + default = "alias-evaluate-target-health" +} + +variable "tags" { + description = "Tags to apply to resources" + type = map(string) + default = {} +} diff --git a/tf/modules/dns-failover/versions.tf b/tf/modules/dns-failover/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/dns-failover/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/monitoring/versions.tf b/tf/modules/monitoring/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/monitoring/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/networking/versions.tf b/tf/modules/networking/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/networking/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/replication/main.tf b/tf/modules/replication/main.tf new file mode 100644 index 00000000..caffbbfc --- /dev/null +++ b/tf/modules/replication/main.tf @@ -0,0 +1,96 @@ +# S3 Cross-Region Replication Module +# +# Configures continuous, asynchronous replication of a source bucket (primary +# region) to a destination bucket (secondary region). Both buckets must have +# versioning enabled, which the storage module already guarantees. +# +# This module must be applied with a provider in the SOURCE bucket's region, +# because the replication configuration is attached to the source bucket. + +# IAM role assumed by S3 to perform replication. +resource "aws_iam_role" "replication" { + name = "teachlink-${var.environment}-${var.name}-replication" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Principal = { + Service = "s3.amazonaws.com" + } + Action = "sts:AssumeRole" + } + ] + }) + + tags = var.tags +} + +# Permissions: read+version source objects, replicate into the destination. +resource "aws_iam_policy" "replication" { + name = "teachlink-${var.environment}-${var.name}-replication" + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "s3:GetReplicationConfiguration", + "s3:ListBucket" + ] + Resource = [var.source_bucket_arn] + }, + { + Effect = "Allow" + Action = [ + "s3:GetObjectVersionForReplication", + "s3:GetObjectVersionAcl", + "s3:GetObjectVersionTagging" + ] + Resource = ["${var.source_bucket_arn}/*"] + }, + { + Effect = "Allow" + Action = [ + "s3:ReplicateObject", + "s3:ReplicateDelete", + "s3:ReplicateTags" + ] + Resource = ["${var.destination_bucket_arn}/*"] + } + ] + }) +} + +resource "aws_iam_role_policy_attachment" "replication" { + role = aws_iam_role.replication.name + policy_arn = aws_iam_policy.replication.arn +} + +# Attach the replication configuration to the source bucket. +resource "aws_s3_bucket_replication_configuration" "this" { + role = aws_iam_role.replication.arn + bucket = var.source_bucket_id + + rule { + id = "replicate-${var.name}-to-secondary" + status = "Enabled" + + filter { + prefix = "" + } + + delete_marker_replication { + status = "Enabled" + } + + destination { + bucket = var.destination_bucket_arn + storage_class = var.destination_storage_class + } + } + + depends_on = [aws_iam_role_policy_attachment.replication] +} diff --git a/tf/modules/replication/outputs.tf b/tf/modules/replication/outputs.tf new file mode 100644 index 00000000..755a99cd --- /dev/null +++ b/tf/modules/replication/outputs.tf @@ -0,0 +1,9 @@ +output "replication_role_arn" { + description = "ARN of the IAM role used by S3 for replication" + value = aws_iam_role.replication.arn +} + +output "replication_rule_id" { + description = "ID of the replication rule attached to the source bucket" + value = "replicate-${var.name}-to-secondary" +} diff --git a/tf/modules/replication/variables.tf b/tf/modules/replication/variables.tf new file mode 100644 index 00000000..3ebd4c15 --- /dev/null +++ b/tf/modules/replication/variables.tf @@ -0,0 +1,36 @@ +variable "environment" { + description = "Environment name (dev, staging, prod)" + type = string +} + +variable "name" { + description = "Logical name for this replication pairing (e.g. uploads, backups)" + type = string +} + +variable "source_bucket_id" { + description = "ID (name) of the source S3 bucket in the primary region" + type = string +} + +variable "source_bucket_arn" { + description = "ARN of the source S3 bucket in the primary region" + type = string +} + +variable "destination_bucket_arn" { + description = "ARN of the destination S3 bucket in the secondary region" + type = string +} + +variable "destination_storage_class" { + description = "Storage class for replicated objects in the destination bucket" + type = string + default = "STANDARD_IA" +} + +variable "tags" { + description = "Tags to apply to resources" + type = map(string) + default = {} +} diff --git a/tf/modules/replication/versions.tf b/tf/modules/replication/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/replication/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/modules/storage/versions.tf b/tf/modules/storage/versions.tf new file mode 100644 index 00000000..1ac87da5 --- /dev/null +++ b/tf/modules/storage/versions.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} diff --git a/tf/multi-region/README.md b/tf/multi-region/README.md new file mode 100644 index 00000000..6fdc325e --- /dev/null +++ b/tf/multi-region/README.md @@ -0,0 +1,98 @@ +# Multi-Region Deployment & Failover + +This Terraform root configuration deploys TeachLink across **two AWS regions** in +an active (primary) / warm-standby (secondary) topology with automated, +health-check driven DNS failover and cross-region data replication. + +It composes the single-region modules in [`../modules`](../modules) — nothing is +duplicated — and adds three new modules: `dns-failover`, `replication`, and +`database-replica`. + +> Implements issue **#620 — Add multi-region deployment and failover strategy**. + +## Architecture + +``` + Route 53 (failover routing) + api.teachlink.io + ┌────────────┴─────────────┐ + health check ✓ health check ✗→✓ + │ │ + ┌───────────▼───────────┐ ┌───────────▼───────────┐ + │ PRIMARY (us-east-1) │ │ SECONDARY (us-west-2) │ + │ ALB → ECS (active) │ │ ALB → ECS (standby) │ + │ RDS primary (R/W) │──► │ RDS read replica ─────┼─► promote + │ Redis primary │ │ Redis standby │ on failover + │ S3 uploads/backups ───┼──► │ S3 uploads/backups │ (CRR) + └────────────────────────┘ └────────────────────────┘ +``` + +### What gets created + +| Concern | Primary region | Secondary region | +| -------------------- | ----------------------- | -------------------------------------- | +| Networking | VPC + subnets + SGs | VPC + subnets + SGs (non-overlapping) | +| Compute | ECS/Fargate + ALB (active) | ECS/Fargate + ALB (warm standby) | +| Database | RDS PostgreSQL (R/W) | Cross-region **read replica** | +| Cache | ElastiCache Redis | Standby Redis (warmed on failover) | +| Storage | S3 uploads + backups | S3 uploads + backups (**CRR target**) | +| DNS | Route 53 failover record + health check | Route 53 failover record + health check | +| Monitoring | CloudWatch | CloudWatch | + +## Acceptance criteria mapping (#620) + +- **Multiple deployment regions configured** — `providers.tf` declares aliased + `aws.primary` / `aws.secondary`; `main.tf` instantiates every stack in both. +- **Failover automation** — `modules/dns-failover` (Route 53 health checks + + PRIMARY/SECONDARY records) plus [`../../infra/scripts/failover.sh`](../../infra/scripts/failover.sh). +- **Data replication strategy** — `modules/replication` (S3 CRR) + + `modules/database-replica` (RDS cross-region read replica). See + [`../../dr/procedures/data-replication.md`](../../dr/procedures/data-replication.md). +- **Testing and runbooks** — [`../../infra/scripts/failover-drill.sh`](../../infra/scripts/failover-drill.sh), + [`validate-multiregion.sh`](../../infra/scripts/validate-multiregion.sh), and + [`../../dr/runbooks/multi-region-deployment.md`](../../dr/runbooks/multi-region-deployment.md). + +## Usage + +```bash +cd tf/multi-region +cp terraform.tfvars.example terraform.tfvars # edit values + +terraform init +terraform fmt -check -recursive +terraform validate +terraform plan -var-file=terraform.tfvars +terraform apply -var-file=terraform.tfvars +``` + +### Prerequisites + +- Terraform >= 1.5.0, AWS credentials with permissions in **both** regions. +- For HTTPS, an ACM certificate in **each** region (`*_certificate_arn`). +- A registered domain. Provide `hosted_zone_id` to reuse a zone, or omit it to + let Terraform create one (then delegate to the output name servers). + +## Recovery objectives + +| Objective | Target | How this design meets it | +| --------- | ------ | ------------------------ | +| **RTO** | ≤ 15 min | Standby fleet always running; Route 53 fails over automatically within `health_check_interval × failure_threshold` (~90s); replica promotion is the only manual step. | +| **RPO** | seconds (DB), async (S3) | RDS read replica streams continuously; S3 CRR replicates new objects asynchronously. This improves on the backup-only RPO in `dr/procedures/RTO-RPO.md`. | + +## Cost & topology notes + +- The secondary fleet defaults to a small warm standby (`secondary_desired_count = 1`). + For true active-active, raise `secondary_desired_count`/`secondary_min_capacity`. +- Route 53 health-check metrics publish only to **us-east-1**; keep + `primary_region = us-east-1` (default) or adjust the `dns_failover` provider. +- State: configure a remote backend (see [`../README.md`](../README.md)). Use a + **separate state key** from the single-region config to avoid collisions. + +## Failover & failback + +Operational procedures live in the DR runbooks: + +- [Region Outage runbook](../../dr/runbooks/region-outage.md) +- [Multi-Region Deployment runbook](../../dr/runbooks/multi-region-deployment.md) +- [Failover Plan](../../dr/procedures/failover-plan.md) +- [Data Replication Strategy](../../dr/procedures/data-replication.md) diff --git a/tf/multi-region/main.tf b/tf/multi-region/main.tf new file mode 100644 index 00000000..17a6475e --- /dev/null +++ b/tf/multi-region/main.tf @@ -0,0 +1,231 @@ +# Multi-Region Deployment & Failover +# +# Composes the existing single-region modules into an active (primary) / +# warm-standby (secondary) topology with: +# - full stacks in two regions (networking, compute, storage, monitoring) +# - a cross-region RDS read replica + standby Redis in the secondary region +# - S3 cross-region replication for uploads and backups +# - Route 53 health-check driven DNS failover +# +# See README.md and ../../dr/ for the operational runbooks. + +locals { + primary_prefix = "${var.s3_bucket_prefix}-${replace(var.primary_region, "-", "")}" + secondary_prefix = "${var.s3_bucket_prefix}-${replace(var.secondary_region, "-", "")}" +} + +# =========================================================================== +# PRIMARY REGION (active) +# =========================================================================== +module "networking_primary" { + source = "../modules/networking" + providers = { aws = aws.primary } + + vpc_cidr = var.primary_vpc_cidr + public_subnet_cidrs = var.primary_public_subnet_cidrs + private_subnet_cidrs = var.primary_private_subnet_cidrs + availability_zones = var.primary_availability_zones + environment = var.environment + tags = var.tags +} + +module "compute_primary" { + source = "../modules/compute" + providers = { aws = aws.primary } + + cluster_name = "${var.cluster_name}-primary" + service_name = "${var.service_name}-primary" + task_cpu = var.task_cpu + task_memory = var.task_memory + desired_count = var.primary_desired_count + min_capacity = var.primary_min_capacity + max_capacity = var.primary_max_capacity + container_port = var.container_port + health_check_path = var.health_check_path + certificate_arn = var.primary_certificate_arn + container_image = var.container_image + private_subnet_ids = module.networking_primary.private_subnet_ids + public_subnet_ids = module.networking_primary.public_subnet_ids + alb_security_group_id = module.networking_primary.alb_security_group_id + ecs_tasks_security_group_id = module.networking_primary.ecs_tasks_security_group_id + environment = var.environment + aws_region = var.primary_region + tags = var.tags +} + +module "database_primary" { + source = "../modules/database" + providers = { + aws = aws.primary + random = random + } + + db_instance_class = var.db_instance_class + db_allocated_storage = var.db_allocated_storage + db_max_allocated_storage = var.db_max_allocated_storage + db_name = var.db_name + redis_node_type = var.redis_node_type + redis_num_cache_nodes = var.redis_num_cache_nodes + private_subnet_ids = module.networking_primary.private_subnet_ids + rds_security_group_id = module.networking_primary.rds_security_group_id + redis_security_group_id = module.networking_primary.redis_security_group_id + environment = var.environment + tags = var.tags +} + +module "storage_primary" { + source = "../modules/storage" + providers = { aws = aws.primary } + + s3_bucket_prefix = local.primary_prefix + environment = var.environment + tags = var.tags +} + +module "monitoring_primary" { + source = "../modules/monitoring" + providers = { aws = aws.primary } + + environment = var.environment + aws_region = var.primary_region + cluster_name = "${var.cluster_name}-primary" + service_name = "${var.service_name}-primary" + alb_name = module.compute_primary.alb_name + enable_monitoring = var.enable_monitoring + tags = var.tags +} + +# =========================================================================== +# SECONDARY REGION (warm standby / failover target) +# =========================================================================== +module "networking_secondary" { + source = "../modules/networking" + providers = { aws = aws.secondary } + + vpc_cidr = var.secondary_vpc_cidr + public_subnet_cidrs = var.secondary_public_subnet_cidrs + private_subnet_cidrs = var.secondary_private_subnet_cidrs + availability_zones = var.secondary_availability_zones + environment = var.environment + tags = var.tags +} + +module "compute_secondary" { + source = "../modules/compute" + providers = { aws = aws.secondary } + + cluster_name = "${var.cluster_name}-secondary" + service_name = "${var.service_name}-secondary" + task_cpu = var.task_cpu + task_memory = var.task_memory + desired_count = var.secondary_desired_count + min_capacity = var.secondary_min_capacity + max_capacity = var.secondary_max_capacity + container_port = var.container_port + health_check_path = var.health_check_path + certificate_arn = var.secondary_certificate_arn + container_image = var.container_image + private_subnet_ids = module.networking_secondary.private_subnet_ids + public_subnet_ids = module.networking_secondary.public_subnet_ids + alb_security_group_id = module.networking_secondary.alb_security_group_id + ecs_tasks_security_group_id = module.networking_secondary.ecs_tasks_security_group_id + environment = var.environment + aws_region = var.secondary_region + tags = var.tags +} + +module "storage_secondary" { + source = "../modules/storage" + providers = { aws = aws.secondary } + + s3_bucket_prefix = local.secondary_prefix + environment = var.environment + tags = var.tags +} + +module "monitoring_secondary" { + source = "../modules/monitoring" + providers = { aws = aws.secondary } + + environment = var.environment + aws_region = var.secondary_region + cluster_name = "${var.cluster_name}-secondary" + service_name = "${var.service_name}-secondary" + alb_name = module.compute_secondary.alb_name + enable_monitoring = var.enable_monitoring + tags = var.tags +} + +# Cross-region read replica + standby cache in the secondary region. +module "database_replica" { + source = "../modules/database-replica" + providers = { aws = aws.secondary } + + environment = var.environment + source_db_arn = module.database_primary.db_instance_arn + db_instance_class = var.db_instance_class + private_subnet_ids = module.networking_secondary.private_subnet_ids + rds_security_group_id = module.networking_secondary.rds_security_group_id + redis_security_group_id = module.networking_secondary.redis_security_group_id + redis_node_type = var.redis_node_type + redis_num_cache_nodes = var.redis_num_cache_nodes + tags = var.tags +} + +# =========================================================================== +# DATA REPLICATION (S3 cross-region replication, primary -> secondary) +# =========================================================================== +module "replication_uploads" { + source = "../modules/replication" + providers = { aws = aws.primary } + + environment = var.environment + name = "uploads" + source_bucket_id = module.storage_primary.uploads_bucket_id + source_bucket_arn = module.storage_primary.uploads_bucket_arn + destination_bucket_arn = module.storage_secondary.uploads_bucket_arn + tags = var.tags + + # Ensure versioning on both buckets is configured before attaching replication. + depends_on = [module.storage_primary, module.storage_secondary] +} + +module "replication_backups" { + source = "../modules/replication" + providers = { aws = aws.primary } + + environment = var.environment + name = "backups" + source_bucket_id = module.storage_primary.backups_bucket_id + source_bucket_arn = module.storage_primary.backups_bucket_arn + destination_bucket_arn = module.storage_secondary.backups_bucket_arn + tags = var.tags + + # Ensure versioning on both buckets is configured before attaching replication. + depends_on = [module.storage_primary, module.storage_secondary] +} + +# =========================================================================== +# DNS FAILOVER (Route 53) +# NOTE: Route 53 health-check CloudWatch metrics are published to us-east-1, +# so the primary provider should target us-east-1 (the default). +# =========================================================================== +module "dns_failover" { + source = "../modules/dns-failover" + providers = { aws = aws.primary } + + environment = var.environment + domain_name = var.domain_name + app_subdomain = var.app_subdomain + hosted_zone_id = var.hosted_zone_id + primary_region = var.primary_region + secondary_region = var.secondary_region + primary_alb_dns_name = module.compute_primary.alb_dns_name + primary_alb_zone_id = module.compute_primary.alb_zone_id + secondary_alb_dns_name = module.compute_secondary.alb_dns_name + secondary_alb_zone_id = module.compute_secondary.alb_zone_id + health_check_path = var.health_check_path + health_check_port = var.primary_certificate_arn != "" ? 443 : 80 + health_check_type = var.primary_certificate_arn != "" ? "HTTPS" : "HTTP" + tags = var.tags +} diff --git a/tf/multi-region/outputs.tf b/tf/multi-region/outputs.tf new file mode 100644 index 00000000..f484498c --- /dev/null +++ b/tf/multi-region/outputs.tf @@ -0,0 +1,73 @@ +# --------------------------------------------------------------------------- +# Primary region +# --------------------------------------------------------------------------- +output "primary_region" { + description = "Primary (active) region" + value = var.primary_region +} + +output "primary_alb_dns_name" { + description = "DNS name of the primary region ALB" + value = module.compute_primary.alb_dns_name +} + +output "primary_db_endpoint" { + description = "Primary RDS endpoint (read/write)" + value = module.database_primary.db_instance_endpoint +} + +output "primary_uploads_bucket" { + description = "Primary region uploads bucket" + value = module.storage_primary.uploads_bucket_id +} + +# --------------------------------------------------------------------------- +# Secondary region +# --------------------------------------------------------------------------- +output "secondary_region" { + description = "Secondary (standby) region" + value = var.secondary_region +} + +output "secondary_alb_dns_name" { + description = "DNS name of the secondary region ALB" + value = module.compute_secondary.alb_dns_name +} + +output "replica_db_endpoint" { + description = "Cross-region read replica endpoint (read-only until promoted)" + value = module.database_replica.replica_db_endpoint +} + +output "secondary_uploads_bucket" { + description = "Secondary region uploads bucket (replication target)" + value = module.storage_secondary.uploads_bucket_id +} + +# --------------------------------------------------------------------------- +# Failover / DNS +# --------------------------------------------------------------------------- +output "app_fqdn" { + description = "Failover-enabled application endpoint" + value = module.dns_failover.app_fqdn +} + +output "hosted_zone_id" { + description = "Route 53 hosted zone ID" + value = module.dns_failover.hosted_zone_id +} + +output "hosted_zone_name_servers" { + description = "Name servers to delegate to (when a zone was created)" + value = module.dns_failover.name_servers +} + +output "primary_health_check_id" { + description = "Route 53 health check ID for the primary region" + value = module.dns_failover.primary_health_check_id +} + +output "secondary_health_check_id" { + description = "Route 53 health check ID for the secondary region" + value = module.dns_failover.secondary_health_check_id +} diff --git a/tf/multi-region/providers.tf b/tf/multi-region/providers.tf new file mode 100644 index 00000000..80d1bc2b --- /dev/null +++ b/tf/multi-region/providers.tf @@ -0,0 +1,32 @@ +# Two aliased AWS providers drive the active-active/active-passive topology. +# Every module is instantiated once per region by passing the matching provider. + +provider "aws" { + alias = "primary" + region = var.primary_region + + default_tags { + tags = { + Environment = var.environment + Project = "teachlink" + ManagedBy = "terraform" + Topology = "multi-region" + Role = "primary" + } + } +} + +provider "aws" { + alias = "secondary" + region = var.secondary_region + + default_tags { + tags = { + Environment = var.environment + Project = "teachlink" + ManagedBy = "terraform" + Topology = "multi-region" + Role = "secondary" + } + } +} diff --git a/tf/multi-region/terraform.tfvars.example b/tf/multi-region/terraform.tfvars.example new file mode 100644 index 00000000..410058ed --- /dev/null +++ b/tf/multi-region/terraform.tfvars.example @@ -0,0 +1,67 @@ +# Environment +environment = "prod" + +# Regions +primary_region = "us-east-1" +secondary_region = "us-west-2" + +primary_availability_zones = ["us-east-1a", "us-east-1b"] +secondary_availability_zones = ["us-west-2a", "us-west-2b"] + +# Networking (non-overlapping CIDRs so the VPCs can be peered if needed) +primary_vpc_cidr = "10.0.0.0/16" +secondary_vpc_cidr = "10.1.0.0/16" +primary_public_subnet_cidrs = ["10.0.1.0/24", "10.0.2.0/24"] +primary_private_subnet_cidrs = ["10.0.10.0/24", "10.0.11.0/24"] +secondary_public_subnet_cidrs = ["10.1.1.0/24", "10.1.2.0/24"] +secondary_private_subnet_cidrs = ["10.1.10.0/24", "10.1.11.0/24"] + +# Compute +cluster_name = "teachlink-cluster" +service_name = "teachlink-service" +container_image = ".dkr.ecr.us-east-1.amazonaws.com/teachlink:latest" +container_port = 3000 +task_cpu = 512 +task_memory = 1024 + +# Primary fleet (active) +primary_desired_count = 3 +primary_min_capacity = 3 +primary_max_capacity = 20 + +# Secondary fleet (warm standby; raise for active-active) +secondary_desired_count = 1 +secondary_min_capacity = 1 +secondary_max_capacity = 20 + +# HTTPS certificates (must exist in each region) +# primary_certificate_arn = "arn:aws:acm:us-east-1:123456789012:certificate/xxx" +# secondary_certificate_arn = "arn:aws:acm:us-west-2:123456789012:certificate/yyy" + +# Database +db_instance_class = "db.t3.large" +db_allocated_storage = 100 +db_max_allocated_storage = 500 +db_name = "teachlink_db" + +# Redis +redis_node_type = "cache.t3.small" +redis_num_cache_nodes = 1 + +# Storage (region is appended automatically to keep bucket names unique) +s3_bucket_prefix = "teachlink" + +# DNS failover +domain_name = "teachlink.io" +app_subdomain = "api" +# hosted_zone_id = "Z0123456789ABCDEFG" # reuse an existing zone; omit to create one + +# Monitoring +enable_monitoring = true + +# Tags +tags = { + Project = "teachlink" + Team = "backend" + CostCenter = "engineering" +} diff --git a/tf/multi-region/variables.tf b/tf/multi-region/variables.tf new file mode 100644 index 00000000..a7b695ad --- /dev/null +++ b/tf/multi-region/variables.tf @@ -0,0 +1,249 @@ +variable "environment" { + description = "Environment name (dev, staging, prod)" + type = string +} + +# --------------------------------------------------------------------------- +# Regions +# --------------------------------------------------------------------------- +variable "primary_region" { + description = "Primary AWS region (active)" + type = string + default = "us-east-1" +} + +variable "secondary_region" { + description = "Secondary AWS region (standby / failover target)" + type = string + default = "us-west-2" +} + +variable "primary_availability_zones" { + description = "Availability zones in the primary region" + type = list(string) + default = ["us-east-1a", "us-east-1b"] +} + +variable "secondary_availability_zones" { + description = "Availability zones in the secondary region" + type = list(string) + default = ["us-west-2a", "us-west-2b"] +} + +# --------------------------------------------------------------------------- +# Networking (distinct CIDRs per region so VPCs can be peered if needed) +# --------------------------------------------------------------------------- +variable "primary_vpc_cidr" { + description = "VPC CIDR for the primary region" + type = string + default = "10.0.0.0/16" +} + +variable "secondary_vpc_cidr" { + description = "VPC CIDR for the secondary region" + type = string + default = "10.1.0.0/16" +} + +variable "primary_public_subnet_cidrs" { + description = "Public subnet CIDRs in the primary region" + type = list(string) + default = ["10.0.1.0/24", "10.0.2.0/24"] +} + +variable "primary_private_subnet_cidrs" { + description = "Private subnet CIDRs in the primary region" + type = list(string) + default = ["10.0.10.0/24", "10.0.11.0/24"] +} + +variable "secondary_public_subnet_cidrs" { + description = "Public subnet CIDRs in the secondary region" + type = list(string) + default = ["10.1.1.0/24", "10.1.2.0/24"] +} + +variable "secondary_private_subnet_cidrs" { + description = "Private subnet CIDRs in the secondary region" + type = list(string) + default = ["10.1.10.0/24", "10.1.11.0/24"] +} + +# --------------------------------------------------------------------------- +# Compute +# --------------------------------------------------------------------------- +variable "cluster_name" { + description = "ECS cluster name (suffixed per region)" + type = string + default = "teachlink-cluster" +} + +variable "service_name" { + description = "ECS service name (suffixed per region)" + type = string + default = "teachlink-service" +} + +variable "task_cpu" { + description = "CPU units for ECS task" + type = number + default = 256 +} + +variable "task_memory" { + description = "Memory (MB) for ECS task" + type = number + default = 512 +} + +variable "container_port" { + description = "Container port for the application" + type = number + default = 3000 +} + +variable "container_image" { + description = "Container image deployed to both regions" + type = string + default = "nginx:latest" +} + +variable "health_check_path" { + description = "Application health check path" + type = string + default = "/health" +} + +variable "primary_certificate_arn" { + description = "ACM certificate ARN in the primary region (for HTTPS)" + type = string + default = "" +} + +variable "secondary_certificate_arn" { + description = "ACM certificate ARN in the secondary region (for HTTPS)" + type = string + default = "" +} + +# Primary region runs the active fleet. +variable "primary_desired_count" { + description = "Desired ECS task count in the primary region" + type = number + default = 3 +} + +variable "primary_min_capacity" { + description = "Minimum ECS task count in the primary region" + type = number + default = 3 +} + +variable "primary_max_capacity" { + description = "Maximum ECS task count in the primary region" + type = number + default = 20 +} + +# Secondary region runs warm-standby capacity. Set desired/min > 0 for an +# active-active hot-standby, or keep low for a cost-efficient pilot-light. +variable "secondary_desired_count" { + description = "Desired ECS task count in the secondary region (standby)" + type = number + default = 1 +} + +variable "secondary_min_capacity" { + description = "Minimum ECS task count in the secondary region" + type = number + default = 1 +} + +variable "secondary_max_capacity" { + description = "Maximum ECS task count in the secondary region (for failover scale-up)" + type = number + default = 20 +} + +# --------------------------------------------------------------------------- +# Database +# --------------------------------------------------------------------------- +variable "db_instance_class" { + description = "RDS instance class" + type = string + default = "db.t3.micro" +} + +variable "db_allocated_storage" { + description = "Allocated storage for RDS (GB)" + type = number + default = 20 +} + +variable "db_max_allocated_storage" { + description = "Maximum allocated storage for RDS (GB)" + type = number + default = 100 +} + +variable "db_name" { + description = "Database name" + type = string + default = "teachlink_db" +} + +variable "redis_node_type" { + description = "ElastiCache node type" + type = string + default = "cache.t3.micro" +} + +variable "redis_num_cache_nodes" { + description = "Number of cache nodes per region" + type = number + default = 1 +} + +# --------------------------------------------------------------------------- +# Storage +# --------------------------------------------------------------------------- +variable "s3_bucket_prefix" { + description = "Base prefix for S3 bucket names; the region is appended to keep names globally unique" + type = string + default = "teachlink" +} + +# --------------------------------------------------------------------------- +# DNS failover +# --------------------------------------------------------------------------- +variable "domain_name" { + description = "Root domain managed in Route 53 (e.g. teachlink.io)" + type = string +} + +variable "app_subdomain" { + description = "Subdomain that fails over between regions" + type = string + default = "api" +} + +variable "hosted_zone_id" { + description = "Existing Route 53 hosted zone ID. Leave empty to create a new public zone." + type = string + default = "" +} + +# --------------------------------------------------------------------------- +# Misc +# --------------------------------------------------------------------------- +variable "enable_monitoring" { + description = "Enable detailed CloudWatch monitoring in both regions" + type = bool + default = true +} + +variable "tags" { + description = "Additional tags applied to all resources" + type = map(string) + default = {} +} diff --git a/tf/multi-region/versions.tf b/tf/multi-region/versions.tf new file mode 100644 index 00000000..c3488b03 --- /dev/null +++ b/tf/multi-region/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.5.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } +} From dadf239eb49c592eed3eef3eb57013ac51548f35 Mon Sep 17 00:00:00 2001 From: Nworah_Gabriel Date: Sat, 30 May 2026 06:39:45 +0800 Subject: [PATCH 2/2] feat(webhooks): add outbound delivery retries with exponential backoff (#615) Implement reliable outbound webhook delivery on the existing webhooks worker: - webhook-backoff.util: exponential backoff with equal jitter + max cap, and retryability classification (5xx/408/425/429/transport = retry; 4xx = permanent) - webhook-delivery.service: HMAC-signed HTTP POST; retryable failures throw so Bull re-enqueues with backoff (retry queue); permanent/exhausted -> dead-letter - webhook-monitor.service: failure counters + CustomMetrics/alert + events - webhook-retry.config: env-overridable maxRetries/delays/jitter/timeout - wire WebhooksWorker to the delivery service (optional-dep fallback for the orchestration pool); register WebhooksDeliveryModule Tests: 19 unit tests (backoff math, retry vs dead-letter, HMAC, job options). Closes #615 --- src/webhooks/README.md | 89 +++++++ src/webhooks/webhook-backoff.util.spec.ts | 88 +++++++ src/webhooks/webhook-backoff.util.ts | 93 +++++++ src/webhooks/webhook-delivery.service.spec.ts | 117 +++++++++ src/webhooks/webhook-delivery.service.ts | 239 ++++++++++++++++++ src/webhooks/webhook-monitor.service.ts | 92 +++++++ src/webhooks/webhook-retry.config.ts | 58 +++++ src/webhooks/webhooks-delivery.module.ts | 24 ++ src/workers/processors/webhooks.worker.ts | 101 +++----- src/workers/workers.module.ts | 2 + 10 files changed, 834 insertions(+), 69 deletions(-) create mode 100644 src/webhooks/README.md create mode 100644 src/webhooks/webhook-backoff.util.spec.ts create mode 100644 src/webhooks/webhook-backoff.util.ts create mode 100644 src/webhooks/webhook-delivery.service.spec.ts create mode 100644 src/webhooks/webhook-delivery.service.ts create mode 100644 src/webhooks/webhook-monitor.service.ts create mode 100644 src/webhooks/webhook-retry.config.ts create mode 100644 src/webhooks/webhooks-delivery.module.ts diff --git a/src/webhooks/README.md b/src/webhooks/README.md new file mode 100644 index 00000000..0383723e --- /dev/null +++ b/src/webhooks/README.md @@ -0,0 +1,89 @@ +# Outbound Webhook Delivery (with retries & backoff) + +Reliable delivery of **outbound** webhooks to subscriber URLs, with exponential +backoff, a configurable retry queue, dead-letter handling and failure +monitoring. + +> Implements issue **#615 — Add webhook delivery retries with exponential backoff**. +> +> This is distinct from [`src/payments/webhooks`](../payments/webhooks), which +> handles *inbound* Stripe/PayPal webhooks. + +## Acceptance criteria mapping + +| Criterion | Where | +| --------- | ----- | +| **Retry queue for failed webhooks** | Bull queue re-enqueues retryable failures; `WebhookDeliveryService.buildJobOptions()` sets `attempts` + `backoff`. The worker re-throws `WebhookRetryableError` so the job is requeued. | +| **Exponential backoff implemented** | [`webhook-backoff.util.ts`](./webhook-backoff.util.ts) → `calculateBackoffDelay()` (exponential + equal jitter + max-delay cap). | +| **Max retry count configured** | [`webhook-retry.config.ts`](./webhook-retry.config.ts) → `maxRetries` (env `WEBHOOK_MAX_RETRIES`, default 5). | +| **Monitoring for failures** | [`webhook-monitor.service.ts`](./webhook-monitor.service.ts) → counters + alert-thresholded metrics via `CustomMetricsService`, plus `webhook.*` events. | + +## Components + +- **`webhook-retry.config.ts`** — retry policy, env-overridable, with defaults. +- **`webhook-backoff.util.ts`** — pure helpers: `calculateBackoffDelay`, + `shouldRetry`, `isRetryableStatusCode`, `isRetryableError`. +- **`webhook-delivery.service.ts`** — signs (HMAC-SHA256) and POSTs the webhook, + classifies failures, schedules retries or dead-letters, emits events. +- **`webhook-monitor.service.ts`** — in-memory counters + `CustomMetricsService` + forwarding; alert fires on dead-letter. +- **`webhooks-delivery.module.ts`** — Nest module (HTTP + monitoring wiring). +- **`workers/processors/webhooks.worker.ts`** — Bull worker delegating to the + delivery service. + +## Retry behaviour + +``` +attempt 1 ──fail(5xx/timeout/network)──▶ wait ~1s ─▶ attempt 2 +attempt 2 ──fail──────────────────────▶ wait ~2s ─▶ attempt 3 +attempt 3 ──fail──────────────────────▶ wait ~4s ─▶ ... up to maxRetries + │ + permanent 4xx ◀───────────────┘ (no retry → dead-letter) + retries exhausted ─────────────▶ dead-letter + alert +``` + +- Delay = `initialDelayMs × multiplier^(attempt-1)`, capped at `maxDelayMs`, + with **equal jitter** (50–100% of the computed delay) to avoid thundering herds. +- **Retryable**: HTTP 5xx, 408, 425, 429, and transport errors (ECONNRESET, + ETIMEDOUT, …). **Permanent**: other 4xx → dead-lettered immediately. + +## Configuration + +| Env var | Default | Meaning | +| ------- | ------- | ------- | +| `WEBHOOK_MAX_RETRIES` | `5` | Max delivery attempts before dead-letter | +| `WEBHOOK_INITIAL_DELAY_MS` | `1000` | Base backoff delay | +| `WEBHOOK_BACKOFF_MULTIPLIER` | `2` | Exponential growth factor | +| `WEBHOOK_MAX_DELAY_MS` | `3600000` | Backoff cap (1h) | +| `WEBHOOK_JITTER` | `true` | Apply equal jitter | +| `WEBHOOK_TIMEOUT_MS` | `10000` | Per-request timeout | + +## Usage + +Enqueue a webhook job onto the `webhooks` Bull queue with options from the +service so retries/backoff match the policy: + +```ts +await webhooksQueue.add( + 'deliver', + { url, event: 'course.completed', payload, secret }, + webhookDeliveryService.buildJobOptions(), +); +``` + +Subscribe to delivery events for auditing/alerting: + +```ts +@OnEvent('webhook.dead_letter') +onDeadLetter(e) { /* page on-call, persist for manual replay */ } +``` + +## Tests + +```bash +npm test -- src/webhooks +``` + +- `webhook-backoff.util.spec.ts` — backoff growth, cap, jitter bounds, retryability. +- `webhook-delivery.service.spec.ts` — success, retryable vs permanent failures, + exhaustion/dead-letter, HMAC signing, job options. diff --git a/src/webhooks/webhook-backoff.util.spec.ts b/src/webhooks/webhook-backoff.util.spec.ts new file mode 100644 index 00000000..ab91fc4e --- /dev/null +++ b/src/webhooks/webhook-backoff.util.spec.ts @@ -0,0 +1,88 @@ +import { + calculateBackoffDelay, + isRetryableError, + isRetryableStatusCode, + shouldRetry, +} from './webhook-backoff.util'; +import { DEFAULT_WEBHOOK_RETRY_CONFIG, WebhookRetryConfig } from './webhook-retry.config'; + +const config: WebhookRetryConfig = { + ...DEFAULT_WEBHOOK_RETRY_CONFIG, + maxRetries: 5, + initialDelayMs: 1_000, + backoffMultiplier: 2, + maxDelayMs: 60_000, + jitter: false, +}; + +describe('calculateBackoffDelay', () => { + it('grows exponentially without jitter', () => { + expect(calculateBackoffDelay(1, config)).toBe(1_000); + expect(calculateBackoffDelay(2, config)).toBe(2_000); + expect(calculateBackoffDelay(3, config)).toBe(4_000); + expect(calculateBackoffDelay(4, config)).toBe(8_000); + }); + + it('caps the delay at maxDelayMs', () => { + expect(calculateBackoffDelay(20, config)).toBe(60_000); + }); + + it('applies equal jitter within [50%, 100%] of the capped delay', () => { + const jittered: WebhookRetryConfig = { ...config, jitter: true }; + // attempt 3 → base 4000; equal jitter keeps half fixed + half random. + expect(calculateBackoffDelay(3, jittered, () => 0)).toBe(2_000); // min + expect(calculateBackoffDelay(3, jittered, () => 1)).toBe(4_000); // max + expect(calculateBackoffDelay(3, jittered, () => 0.5)).toBe(3_000); // mid + }); + + it('treats attempts below 1 as the first retry', () => { + expect(calculateBackoffDelay(0, config)).toBe(1_000); + }); +}); + +describe('shouldRetry', () => { + it('retries while attempts are below the max', () => { + expect(shouldRetry(0, config)).toBe(true); + expect(shouldRetry(4, config)).toBe(true); + }); + + it('stops once the max retry count is reached', () => { + expect(shouldRetry(5, config)).toBe(false); + expect(shouldRetry(6, config)).toBe(false); + }); +}); + +describe('isRetryableStatusCode', () => { + it('retries 5xx, 408, 425 and 429', () => { + [500, 502, 503, 504, 408, 425, 429].forEach((code) => + expect(isRetryableStatusCode(code)).toBe(true), + ); + }); + + it('does not retry other 4xx client errors', () => { + [400, 401, 403, 404, 409, 422].forEach((code) => + expect(isRetryableStatusCode(code)).toBe(false), + ); + }); +}); + +describe('isRetryableError', () => { + it('retries server responses but not client responses', () => { + expect(isRetryableError({ response: { status: 503 } })).toBe(true); + expect(isRetryableError({ response: { status: 400 } })).toBe(false); + }); + + it('retries transport-level errors', () => { + expect(isRetryableError({ code: 'ECONNRESET' })).toBe(true); + expect(isRetryableError({ code: 'ETIMEDOUT' })).toBe(true); + expect(isRetryableError({ code: 'ECONNABORTED', message: 'timeout of 10000ms exceeded' })).toBe( + true, + ); + }); + + it('returns false for non-error values', () => { + expect(isRetryableError(null)).toBe(false); + expect(isRetryableError('boom')).toBe(false); + expect(isRetryableError({ response: { status: 404 } })).toBe(false); + }); +}); diff --git a/src/webhooks/webhook-backoff.util.ts b/src/webhooks/webhook-backoff.util.ts new file mode 100644 index 00000000..2f7c292a --- /dev/null +++ b/src/webhooks/webhook-backoff.util.ts @@ -0,0 +1,93 @@ +/** + * Pure helpers for exponential-backoff retry scheduling and retryability + * classification. Kept dependency-free so they can be unit-tested in isolation + * and reused by both the delivery service and the Bull worker. + */ +import { WebhookRetryConfig } from './webhook-retry.config'; + +/** + * Compute the delay (ms) before a given retry attempt using exponential + * backoff with an optional "equal jitter" component and a hard cap. + * + * @param attempt 1-based retry number (1 = the first retry after the initial try) + * @param config retry configuration + * @param rng random source in [0, 1); injectable for deterministic tests + */ +export const calculateBackoffDelay = ( + attempt: number, + config: WebhookRetryConfig, + rng: () => number = Math.random, +): number => { + const safeAttempt = Math.max(1, Math.floor(attempt)); + + // Exponential growth: initial * multiplier^(attempt-1), capped. + const exponential = config.initialDelayMs * Math.pow(config.backoffMultiplier, safeAttempt - 1); + const capped = Math.min(exponential, config.maxDelayMs); + + if (!config.jitter) { + return Math.round(capped); + } + + // Equal jitter: keep half the delay fixed and randomise the other half so + // retries spread out without ever collapsing to near-zero. + const half = capped / 2; + return Math.round(half + rng() * half); +}; + +/** + * Whether another attempt should be made. + * + * @param attemptsMade number of attempts already completed + * @param config retry configuration (uses maxRetries as the total cap) + */ +export const shouldRetry = (attemptsMade: number, config: WebhookRetryConfig): boolean => + attemptsMade < config.maxRetries; + +/** + * HTTP status codes worth retrying: request timeouts, rate limits, and any + * server-side (5xx) error. Other 4xx responses are permanent client errors and + * are NOT retried. + */ +export const isRetryableStatusCode = (statusCode: number): boolean => { + if (statusCode >= 500) return true; + return statusCode === 408 || statusCode === 425 || statusCode === 429; +}; + +const RETRYABLE_ERROR_CODES = new Set([ + 'ECONNRESET', + 'ECONNREFUSED', + 'ETIMEDOUT', + 'EAI_AGAIN', + 'ENOTFOUND', + 'EPIPE', + 'ECONNABORTED', +]); + +/** + * Classify a thrown delivery error (typically an Axios error) as retryable. + * A response with a status code defers to {@link isRetryableStatusCode}; a + * transport-level failure with no response is retryable. + */ +export const isRetryableError = (error: unknown): boolean => { + if (!error || typeof error !== 'object') return false; + + const err = error as { + response?: { status?: number }; + code?: string; + message?: string; + }; + + if (err.response && typeof err.response.status === 'number') { + return isRetryableStatusCode(err.response.status); + } + + if (err.code && RETRYABLE_ERROR_CODES.has(err.code)) return true; + + // Network/timeout errors that surface only as a message. + if (err.code === undefined && /timeout|socket hang up|network/i.test(err.message ?? '')) { + return true; + } + + // No response and not an obviously permanent error → treat as transient. + return err.response === undefined && err.code !== undefined; +}; diff --git a/src/webhooks/webhook-delivery.service.spec.ts b/src/webhooks/webhook-delivery.service.spec.ts new file mode 100644 index 00000000..131058d9 --- /dev/null +++ b/src/webhooks/webhook-delivery.service.spec.ts @@ -0,0 +1,117 @@ +import { of, throwError } from 'rxjs'; +import { + WebhookDeliveryService, + WebhookRetryableError, + WEBHOOK_EVENTS, + WebhookTarget, +} from './webhook-delivery.service'; +import { WebhookMonitorService } from './webhook-monitor.service'; +import { DEFAULT_WEBHOOK_RETRY_CONFIG, WebhookRetryConfig } from './webhook-retry.config'; + +const config: WebhookRetryConfig = { + ...DEFAULT_WEBHOOK_RETRY_CONFIG, + maxRetries: 3, + initialDelayMs: 1_000, + backoffMultiplier: 2, + maxDelayMs: 60_000, + jitter: false, +}; + +const target: WebhookTarget = { + url: 'https://example.com/hook', + event: 'course.completed', + payload: { courseId: 'c1' }, +}; + +describe('WebhookDeliveryService', () => { + let http: { post: jest.Mock }; + let events: { emit: jest.Mock }; + let monitor: WebhookMonitorService; + let service: WebhookDeliveryService; + + beforeEach(() => { + http = { post: jest.fn() }; + events = { emit: jest.fn() }; + monitor = new WebhookMonitorService(); + service = new WebhookDeliveryService(http as any, monitor, events as any, config); + }); + + it('delivers successfully and records success', async () => { + http.post.mockReturnValue(of({ status: 200 })); + + const result = await service.processDelivery(target, 0); + + expect(result).toMatchObject({ delivered: true, statusCode: 200, attempts: 1 }); + expect(events.emit).toHaveBeenCalledWith(WEBHOOK_EVENTS.DELIVERED, expect.any(Object)); + expect(monitor.getStats()).toMatchObject({ attempts: 1, succeeded: 1, failed: 0 }); + }); + + it('throws a retryable error with backoff delay on a 5xx response', async () => { + http.post.mockReturnValue(throwError(() => ({ response: { status: 503 } }))); + + await expect(service.processDelivery(target, 0)).rejects.toBeInstanceOf(WebhookRetryableError); + + const stats = monitor.getStats(); + expect(stats.failed).toBe(1); + expect(stats.retried).toBe(1); + expect(events.emit).toHaveBeenCalledWith( + WEBHOOK_EVENTS.RETRY_SCHEDULED, + expect.objectContaining({ attempt: 1, nextDelayMs: 1_000 }), + ); + }); + + it('uses exponential backoff across attempts', async () => { + http.post.mockReturnValue(throwError(() => ({ response: { status: 500 } }))); + + // attemptsMade = 1 → this is attempt #2 → delay 2000ms + await expect(service.processDelivery(target, 1)).rejects.toMatchObject({ nextDelayMs: 2_000 }); + }); + + it('dead-letters a permanent 4xx failure without retrying', async () => { + http.post.mockReturnValue(throwError(() => ({ response: { status: 404 } }))); + + const result = await service.processDelivery(target, 0); + + expect(result).toMatchObject({ delivered: false, deadLettered: true }); + expect(monitor.getStats()).toMatchObject({ deadLettered: 1, retried: 0 }); + expect(events.emit).toHaveBeenCalledWith( + WEBHOOK_EVENTS.DEAD_LETTER, + expect.objectContaining({ reason: 'permanent_failure' }), + ); + }); + + it('dead-letters once retries are exhausted', async () => { + http.post.mockReturnValue(throwError(() => ({ response: { status: 503 } }))); + + // attemptsMade = 2 → attempt #3 = maxRetries → no further retry + const result = await service.processDelivery(target, 2); + + expect(result).toMatchObject({ deadLettered: true }); + expect(events.emit).toHaveBeenCalledWith( + WEBHOOK_EVENTS.DEAD_LETTER, + expect.objectContaining({ reason: 'max_retries_exhausted' }), + ); + }); + + it('signs the payload with HMAC when a secret is provided', async () => { + http.post.mockReturnValue(of({ status: 200 })); + + await service.processDelivery({ ...target, secret: 'shh' }, 0); + + const [, , options] = http.post.mock.calls[0]; + expect(options.headers['X-Webhook-Signature']).toMatch(/^sha256=[a-f0-9]{64}$/); + }); + + it('rejects targets missing required fields', async () => { + await expect( + service.processDelivery({ url: '', event: '', payload: undefined } as WebhookTarget, 0), + ).rejects.toThrow('Missing required webhook fields'); + }); + + it('exposes Bull job options matching the retry policy', () => { + expect(service.buildJobOptions()).toMatchObject({ + attempts: 3, + backoff: { type: 'exponential', delay: 1_000 }, + }); + }); +}); diff --git a/src/webhooks/webhook-delivery.service.ts b/src/webhooks/webhook-delivery.service.ts new file mode 100644 index 00000000..8e309a7a --- /dev/null +++ b/src/webhooks/webhook-delivery.service.ts @@ -0,0 +1,239 @@ +import { HttpService } from '@nestjs/axios'; +import { Inject, Injectable, Logger, Optional } from '@nestjs/common'; +import { EventEmitter2 } from '@nestjs/event-emitter'; +import axios from 'axios'; +import { createHmac } from 'crypto'; +import { firstValueFrom } from 'rxjs'; +import { calculateBackoffDelay, isRetryableError, shouldRetry } from './webhook-backoff.util'; +import { + DEFAULT_WEBHOOK_RETRY_CONFIG, + loadWebhookRetryConfig, + WEBHOOK_RETRY_CONFIG, + WebhookRetryConfig, +} from './webhook-retry.config'; +import { WebhookMonitorService } from './webhook-monitor.service'; + +/** A single outbound webhook to deliver. */ +export interface WebhookTarget { + url: string; + event: string; + payload: unknown; + /** Extra headers to merge into the request. */ + headers?: Record; + /** Shared secret; when set, an HMAC-SHA256 signature header is added. */ + secret?: string; + /** Per-target request timeout override (ms). */ + timeoutMs?: number; +} + +export interface WebhookDeliveryResult { + event: string; + url: string; + delivered: boolean; + statusCode?: number; + attempts: number; + deadLettered?: boolean; + error?: string; + deliveredAt?: Date; +} + +/** Emitted on the EventEmitter2 bus for downstream monitoring/audit. */ +export const WEBHOOK_EVENTS = { + DELIVERED: 'webhook.delivered', + RETRY_SCHEDULED: 'webhook.retry_scheduled', + DEAD_LETTER: 'webhook.dead_letter', +} as const; + +/** + * Thrown when a delivery attempt fails but is eligible for another retry. The + * Bull worker re-throws this so the job is re-enqueued with backoff, forming the + * retry queue. Carries the computed delay for the next attempt. + */ +export class WebhookRetryableError extends Error { + constructor( + message: string, + readonly nextDelayMs: number, + readonly attempt: number, + ) { + super(message); + this.name = 'WebhookRetryableError'; + } +} + +/** + * Delivers outbound webhooks over HTTP with exponential-backoff retries, a + * configurable max retry count, dead-letter handling, and failure monitoring. + * + * Transient failures (5xx, timeouts, network errors) raise + * {@link WebhookRetryableError} so the Bull retry queue re-enqueues them with an + * exponential delay. Permanent failures (4xx) and exhausted retries are + * dead-lettered immediately and reported to {@link WebhookMonitorService}. + */ +@Injectable() +export class WebhookDeliveryService { + private readonly logger = new Logger(WebhookDeliveryService.name); + private readonly config: WebhookRetryConfig; + + constructor( + private readonly http: HttpService, + private readonly monitor: WebhookMonitorService, + @Optional() private readonly events?: EventEmitter2, + @Optional() @Inject(WEBHOOK_RETRY_CONFIG) config?: WebhookRetryConfig, + ) { + this.config = config ?? DEFAULT_WEBHOOK_RETRY_CONFIG; + } + + /** + * Build a self-contained instance (own axios-backed HttpService and monitor) + * for contexts that construct workers manually rather than via Nest DI, such + * as the worker orchestration pool. + */ + static createDefault(): WebhookDeliveryService { + return new WebhookDeliveryService( + new HttpService(axios.create()), + new WebhookMonitorService(), + undefined, + loadWebhookRetryConfig(), + ); + } + + /** + * Bull-compatible job options that mirror this service's retry policy, so the + * queue retries the right number of times with matching exponential backoff. + */ + buildJobOptions() { + return { + attempts: this.config.maxRetries, + backoff: { type: 'exponential', delay: this.config.initialDelayMs }, + removeOnComplete: true, + removeOnFail: false, + }; + } + + /** + * Process one delivery attempt for a webhook job. + * + * @param target the webhook to deliver + * @param attemptsMade number of attempts already completed (Bull's job.attemptsMade) + */ + async processDelivery(target: WebhookTarget, attemptsMade = 0): Promise { + this.validateTarget(target); + const attempt = attemptsMade + 1; + this.monitor.recordAttempt(target.event, target.url); + + const startedAt = Date.now(); + try { + const statusCode = await this.sendRequest(target, attempt); + const durationMs = Date.now() - startedAt; + this.monitor.recordSuccess(target.event, durationMs); + + const result: WebhookDeliveryResult = { + event: target.event, + url: target.url, + delivered: true, + statusCode, + attempts: attempt, + deliveredAt: new Date(), + }; + this.events?.emit(WEBHOOK_EVENTS.DELIVERED, result); + this.logger.log(`Webhook "${target.event}" delivered to ${target.url} (attempt ${attempt})`); + return result; + } catch (error) { + return this.handleFailure(target, attempt, attemptsMade, error); + } + } + + private handleFailure( + target: WebhookTarget, + attempt: number, + attemptsMade: number, + error: unknown, + ): WebhookDeliveryResult { + const message = this.errorMessage(error); + const retryable = isRetryableError(error); + const canRetry = retryable && shouldRetry(attempt, this.config); + + this.monitor.recordFailure(target.event, message); + + if (canRetry) { + const nextDelayMs = calculateBackoffDelay(attempt, this.config); + this.monitor.recordRetry(target.event, attempt, nextDelayMs); + this.events?.emit(WEBHOOK_EVENTS.RETRY_SCHEDULED, { + event: target.event, + url: target.url, + attempt, + nextDelayMs, + error: message, + }); + // Re-thrown by the worker so Bull re-enqueues with backoff. + throw new WebhookRetryableError(message, nextDelayMs, attempt); + } + + // Permanent failure or retries exhausted → dead-letter (no further retries). + this.monitor.recordDeadLetter(target.event, target.url, attempt); + const result: WebhookDeliveryResult = { + event: target.event, + url: target.url, + delivered: false, + attempts: attempt, + deadLettered: true, + error: message, + }; + this.events?.emit(WEBHOOK_EVENTS.DEAD_LETTER, { + ...result, + reason: retryable ? 'max_retries_exhausted' : 'permanent_failure', + }); + return result; + } + + /** Perform a single signed HTTP POST. Returns the status code on 2xx. */ + private async sendRequest(target: WebhookTarget, attempt: number): Promise { + const body = { + id: `evt_${Date.now()}_${attempt}`, + event: target.event, + timestamp: new Date().toISOString(), + payload: target.payload, + attempt, + }; + + const headers: Record = { + 'Content-Type': 'application/json', + 'User-Agent': 'TeachLink-Webhooks/1.0', + 'X-Webhook-Event': target.event, + 'X-Webhook-Attempt': String(attempt), + ...target.headers, + }; + + if (target.secret) { + const serialized = JSON.stringify(body); + headers['X-Webhook-Signature'] = `sha256=${this.sign(serialized, target.secret)}`; + } + + const response = await firstValueFrom( + this.http.post(target.url, body, { + headers, + timeout: target.timeoutMs ?? this.config.timeoutMs, + }), + ); + return response.status; + } + + private sign(payload: string, secret: string): string { + return createHmac('sha256', secret).update(payload).digest('hex'); + } + + private validateTarget(target: WebhookTarget): void { + if (!target?.url || !target.event || target.payload === undefined) { + throw new Error('Missing required webhook fields: url, event, payload'); + } + } + + private errorMessage(error: unknown): string { + if (error && typeof error === 'object') { + const err = error as { response?: { status?: number }; message?: string }; + if (err.response?.status) return `HTTP ${err.response.status}`; + if (err.message) return err.message; + } + return String(error); + } +} diff --git a/src/webhooks/webhook-monitor.service.ts b/src/webhooks/webhook-monitor.service.ts new file mode 100644 index 00000000..ff923f76 --- /dev/null +++ b/src/webhooks/webhook-monitor.service.ts @@ -0,0 +1,92 @@ +import { Injectable, Logger, Optional } from '@nestjs/common'; +import { CustomMetricsService } from '../monitoring/custom-metrics.service'; + +/** Snapshot of webhook delivery counters. */ +export interface WebhookDeliveryStats { + attempts: number; + succeeded: number; + failed: number; + retried: number; + deadLettered: number; + /** failed / attempts, in the range [0, 1]. */ + failureRate: number; +} + +export const WEBHOOK_METRICS = { + ATTEMPTS: 'webhook.delivery.attempts', + SUCCEEDED: 'webhook.delivery.succeeded', + FAILED: 'webhook.delivery.failed', + RETRIED: 'webhook.delivery.retried', + DEAD_LETTERED: 'webhook.delivery.dead_lettered', + DURATION_MS: 'webhook.delivery.duration_ms', +} as const; + +/** + * Centralises monitoring for outbound webhook delivery. Maintains in-memory + * counters (always available, e.g. for health endpoints and tests) and, when a + * {@link CustomMetricsService} is wired in, forwards the same signals to the + * platform metrics/alerting pipeline. + */ +@Injectable() +export class WebhookMonitorService { + private readonly logger = new Logger(WebhookMonitorService.name); + + private attempts = 0; + private succeeded = 0; + private failed = 0; + private retried = 0; + private deadLettered = 0; + + constructor(@Optional() private readonly metrics?: CustomMetricsService) { + // Register definitions up front so an alert can fire on the failure counter. + this.metrics?.define({ + name: WEBHOOK_METRICS.DEAD_LETTERED, + description: 'Webhooks that exhausted all retries and were dead-lettered', + type: 'counter', + alertThreshold: 1, + }); + } + + recordAttempt(event: string, url: string): void { + this.attempts += 1; + this.metrics?.increment(WEBHOOK_METRICS.ATTEMPTS, 1, { event }); + this.logger.debug?.(`Webhook attempt: ${event} -> ${url}`); + } + + recordSuccess(event: string, durationMs: number): void { + this.succeeded += 1; + this.metrics?.increment(WEBHOOK_METRICS.SUCCEEDED, 1, { event }); + this.metrics?.record(WEBHOOK_METRICS.DURATION_MS, durationMs, { event }); + } + + recordRetry(event: string, attempt: number, delayMs: number): void { + this.retried += 1; + this.metrics?.increment(WEBHOOK_METRICS.RETRIED, 1, { event }); + this.logger.warn( + `Webhook delivery for "${event}" failed; scheduling retry #${attempt} in ${delayMs}ms`, + ); + } + + recordFailure(event: string, error: string): void { + this.failed += 1; + this.metrics?.increment(WEBHOOK_METRICS.FAILED, 1, { event }); + this.logger.error(`Webhook delivery for "${event}" failed: ${error}`); + } + + recordDeadLetter(event: string, url: string, attempts: number): void { + this.deadLettered += 1; + this.metrics?.increment(WEBHOOK_METRICS.DEAD_LETTERED, 1, { event }); + this.logger.error(`Webhook "${event}" -> ${url} dead-lettered after ${attempts} attempts`); + } + + getStats(): WebhookDeliveryStats { + return { + attempts: this.attempts, + succeeded: this.succeeded, + failed: this.failed, + retried: this.retried, + deadLettered: this.deadLettered, + failureRate: this.attempts === 0 ? 0 : this.failed / this.attempts, + }; + } +} diff --git a/src/webhooks/webhook-retry.config.ts b/src/webhooks/webhook-retry.config.ts new file mode 100644 index 00000000..320f9485 --- /dev/null +++ b/src/webhooks/webhook-retry.config.ts @@ -0,0 +1,58 @@ +/** + * Configuration for outbound webhook delivery retries. + * + * Values can be overridden via environment variables so retry behaviour is + * tunable per environment without code changes (see {@link loadWebhookRetryConfig}). + */ +export interface WebhookRetryConfig { + /** Maximum number of delivery attempts before a webhook is dead-lettered. */ + maxRetries: number; + /** Base delay (ms) used for the first retry. */ + initialDelayMs: number; + /** Multiplier applied on each successive retry (exponential growth). */ + backoffMultiplier: number; + /** Upper bound (ms) for any single backoff delay. */ + maxDelayMs: number; + /** Apply jitter to spread out retries and avoid thundering herds. */ + jitter: boolean; + /** Per-request HTTP timeout (ms). */ + timeoutMs: number; +} + +export const DEFAULT_WEBHOOK_RETRY_CONFIG: WebhookRetryConfig = { + maxRetries: 5, + initialDelayMs: 1_000, + backoffMultiplier: 2, + maxDelayMs: 60 * 60 * 1_000, // 1 hour + jitter: true, + timeoutMs: 10_000, +}; + +const toInt = (value: string | undefined, fallback: number): number => { + const parsed = Number.parseInt(value ?? '', 10); + return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback; +}; + +const toBool = (value: string | undefined, fallback: boolean): boolean => + value === undefined ? fallback : value === 'true' || value === '1'; + +/** + * Build a {@link WebhookRetryConfig} from environment variables, falling back to + * {@link DEFAULT_WEBHOOK_RETRY_CONFIG} for anything unset or invalid. + */ +export const loadWebhookRetryConfig = ( + env: NodeJS.ProcessEnv = process.env, +): WebhookRetryConfig => ({ + maxRetries: toInt(env.WEBHOOK_MAX_RETRIES, DEFAULT_WEBHOOK_RETRY_CONFIG.maxRetries), + initialDelayMs: toInt(env.WEBHOOK_INITIAL_DELAY_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.initialDelayMs), + backoffMultiplier: toInt( + env.WEBHOOK_BACKOFF_MULTIPLIER, + DEFAULT_WEBHOOK_RETRY_CONFIG.backoffMultiplier, + ), + maxDelayMs: toInt(env.WEBHOOK_MAX_DELAY_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.maxDelayMs), + jitter: toBool(env.WEBHOOK_JITTER, DEFAULT_WEBHOOK_RETRY_CONFIG.jitter), + timeoutMs: toInt(env.WEBHOOK_TIMEOUT_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.timeoutMs), +}); + +/** Injection token for the resolved webhook retry configuration. */ +export const WEBHOOK_RETRY_CONFIG = 'WEBHOOK_RETRY_CONFIG'; diff --git a/src/webhooks/webhooks-delivery.module.ts b/src/webhooks/webhooks-delivery.module.ts new file mode 100644 index 00000000..b400f047 --- /dev/null +++ b/src/webhooks/webhooks-delivery.module.ts @@ -0,0 +1,24 @@ +import { HttpModule } from '@nestjs/axios'; +import { Module } from '@nestjs/common'; +import { MonitoringModule } from '../monitoring/monitoring.module'; +import { WebhookDeliveryService } from './webhook-delivery.service'; +import { WebhookMonitorService } from './webhook-monitor.service'; +import { loadWebhookRetryConfig, WEBHOOK_RETRY_CONFIG } from './webhook-retry.config'; + +/** + * Outbound webhook delivery with exponential-backoff retries, dead-letter + * handling and failure monitoring (issue #615). + */ +@Module({ + imports: [HttpModule, MonitoringModule], + providers: [ + WebhookDeliveryService, + WebhookMonitorService, + { + provide: WEBHOOK_RETRY_CONFIG, + useFactory: () => loadWebhookRetryConfig(), + }, + ], + exports: [WebhookDeliveryService, WebhookMonitorService], +}) +export class WebhooksDeliveryModule {} diff --git a/src/workers/processors/webhooks.worker.ts b/src/workers/processors/webhooks.worker.ts index 9835960b..3f1fd911 100644 --- a/src/workers/processors/webhooks.worker.ts +++ b/src/workers/processors/webhooks.worker.ts @@ -1,93 +1,56 @@ -import { Injectable } from '@nestjs/common'; +import { Injectable, Optional } from '@nestjs/common'; import { Job } from 'bull'; import { BaseWorker } from '../base/base.worker'; +import { WebhookDeliveryService, WebhookTarget } from '../../webhooks/webhook-delivery.service'; /** * Webhooks Worker - * Handles webhook delivery and retry logic + * + * Delivers outbound webhooks via {@link WebhookDeliveryService}, which applies + * exponential-backoff retries, a configurable max retry count, dead-letter + * handling and failure monitoring (issue #615). + * + * Transient failures raise a retryable error so Bull re-enqueues the job with + * backoff (the retry queue); permanent failures and exhausted retries resolve + * with a dead-lettered result. */ @Injectable() export class WebhooksWorker extends BaseWorker { - constructor() { + private readonly delivery: WebhookDeliveryService; + + // The delivery service is injected under Nest DI, but the worker is also + // instantiated manually by the orchestration pool (`new WebhooksWorker()`), + // so fall back to a self-contained default when none is provided. + constructor(@Optional() delivery?: WebhookDeliveryService) { super('webhooks'); + this.delivery = delivery ?? WebhookDeliveryService.createDefault(); } /** - * Execute webhook delivery job + * Execute a webhook delivery job. `job.attemptsMade` is the number of attempts + * already completed, which drives backoff scheduling. */ - async execute(job: Job): Promise { - const { url, event, payload, headers, timeout } = job.data; + async execute(job: Job): Promise { + const { url, event, payload, headers, secret, timeout } = job.data ?? {}; await job.progress(20); - // Validate webhook data - if (!url || !event || !payload) { - throw new Error('Missing required webhook fields: url, event, payload'); - } - - await job.progress(40); - - try { - this.logger.log(`Delivering webhook: ${event} to ${url}`); - - const result = await this.deliverWebhook(job, url, event, payload, headers, timeout); - - await job.progress(100); - return result; - } catch (error) { - this.logger.error(`Failed to deliver webhook to ${url}:`, error); - throw error; - } - } - - /** - * Deliver webhook with retry logic - */ - private async deliverWebhook( - job: Job, - url: string, - event: string, - payload: any, - headers?: any, - timeout?: number, - ): Promise { - await job.progress(60); - - const requestBody = { - id: `evt_${Date.now()}`, + const target: WebhookTarget = { + url, event, - timestamp: new Date(), payload, - retryCount: job.attemptsMade, + headers, + secret, + timeoutMs: timeout, }; - // Simulate webhook delivery (in production, this would use axios or fetch) - try { - this.logger.log(`Sending webhook payload to ${url}:`, requestBody); - - // Simulate HTTP request - await new Promise((resolve) => setTimeout(resolve, 100)); - - await job.progress(90); + await job.progress(40); - // Simulate response - const statusCode = 200; // In production, actual HTTP status + // Delegates retry/backoff/dead-letter decisions to the delivery service. + // A retryable failure throws here so Bull re-enqueues with backoff. + const result = await this.delivery.processDelivery(target, job.attemptsMade); - if (statusCode >= 200 && statusCode < 300) { - return { - event, - url, - statusCode, - delivered: true, - deliveredAt: new Date(), - retryCount: job.attemptsMade, - }; - } else { - throw new Error(`Webhook delivery failed with status ${statusCode}`); - } - } catch (error) { - this.logger.error(`Webhook delivery error for ${url}:`, error.message); - throw error; - } + await job.progress(100); + return result; } } diff --git a/src/workers/workers.module.ts b/src/workers/workers.module.ts index dc19c5b4..1a985cc7 100644 --- a/src/workers/workers.module.ts +++ b/src/workers/workers.module.ts @@ -1,6 +1,7 @@ import { Module } from '@nestjs/common'; import { WorkerOrchestrationService } from './orchestration/worker-orchestration.service'; import { WorkerHealthCheckService } from './health/worker-health-check.service'; +import { WebhooksDeliveryModule } from '../webhooks/webhooks-delivery.module'; import { EmailWorker, MediaProcessingWorker, @@ -15,6 +16,7 @@ import { * Provides centralized async task processing with worker orchestration */ @Module({ + imports: [WebhooksDeliveryModule], providers: [ WorkerOrchestrationService, WorkerHealthCheckService,