From b994e140cc0772b12fccc8f2afd96da8d4c349fd Mon Sep 17 00:00:00 2001
From: Nworah_Gabriel <gabrielnworah6@gmail.com>
Date: Sat, 30 May 2026 06:04:12 +0800
Subject: [PATCH 1/2] feat(infra): add multi-region deployment and failover
 strategy (#620)

Implement an active/warm-standby two-region topology by composing the
existing single-region Terraform modules with aliased providers.

- tf/multi-region: full stack (networking, compute, storage, monitoring)
  in primary + secondary regions
- modules/dns-failover: Route 53 health checks + active-passive failover
- modules/replication: S3 cross-region replication (uploads, backups)
- modules/database-replica: cross-region RDS read replica + standby Redis
- infra/scripts: failover.sh (promote/scale/failback), failover-drill.sh,
  validate-multiregion.sh
- dr/: data-replication strategy + multi-region deployment runbook
- add db_instance_arn output; required_providers in each module

Validated with terraform fmt + validate (single- and multi-region).

Closes #620
---
 .gitignore                               |  18 ++
 dr/README.md                             |   5 +
 dr/procedures/data-replication.md        | 112 ++++++++++
 dr/runbooks/multi-region-deployment.md   | 110 ++++++++++
 infra/scripts/failover-drill.sh          | 109 ++++++++++
 infra/scripts/failover.sh                | 134 ++++++++++++
 infra/scripts/validate-multiregion.sh    |  29 +++
 tf/modules/compute/versions.tf           |   8 +
 tf/modules/database-replica/main.tf      | 102 ++++++++++
 tf/modules/database-replica/outputs.tf   |  19 ++
 tf/modules/database-replica/variables.tf |  45 ++++
 tf/modules/database-replica/versions.tf  |   8 +
 tf/modules/database/outputs.tf           |   5 +
 tf/modules/database/versions.tf          |  12 ++
 tf/modules/dns-failover/main.tf          | 121 +++++++++++
 tf/modules/dns-failover/outputs.tf       |  24 +++
 tf/modules/dns-failover/variables.tf     |  98 +++++++++
 tf/modules/dns-failover/versions.tf      |   8 +
 tf/modules/monitoring/versions.tf        |   8 +
 tf/modules/networking/versions.tf        |   8 +
 tf/modules/replication/main.tf           |  96 +++++++++
 tf/modules/replication/outputs.tf        |   9 +
 tf/modules/replication/variables.tf      |  36 ++++
 tf/modules/replication/versions.tf       |   8 +
 tf/modules/storage/versions.tf           |   8 +
 tf/multi-region/README.md                |  98 +++++++++
 tf/multi-region/main.tf                  | 231 +++++++++++++++++++++
 tf/multi-region/outputs.tf               |  73 +++++++
 tf/multi-region/providers.tf             |  32 +++
 tf/multi-region/terraform.tfvars.example |  67 ++++++
 tf/multi-region/variables.tf             | 249 +++++++++++++++++++++++
 tf/multi-region/versions.tf              |  14 ++
 32 files changed, 1904 insertions(+)
 create mode 100644 dr/procedures/data-replication.md
 create mode 100644 dr/runbooks/multi-region-deployment.md
 create mode 100755 infra/scripts/failover-drill.sh
 create mode 100755 infra/scripts/failover.sh
 create mode 100755 infra/scripts/validate-multiregion.sh
 create mode 100644 tf/modules/compute/versions.tf
 create mode 100644 tf/modules/database-replica/main.tf
 create mode 100644 tf/modules/database-replica/outputs.tf
 create mode 100644 tf/modules/database-replica/variables.tf
 create mode 100644 tf/modules/database-replica/versions.tf
 create mode 100644 tf/modules/database/versions.tf
 create mode 100644 tf/modules/dns-failover/main.tf
 create mode 100644 tf/modules/dns-failover/outputs.tf
 create mode 100644 tf/modules/dns-failover/variables.tf
 create mode 100644 tf/modules/dns-failover/versions.tf
 create mode 100644 tf/modules/monitoring/versions.tf
 create mode 100644 tf/modules/networking/versions.tf
 create mode 100644 tf/modules/replication/main.tf
 create mode 100644 tf/modules/replication/outputs.tf
 create mode 100644 tf/modules/replication/variables.tf
 create mode 100644 tf/modules/replication/versions.tf
 create mode 100644 tf/modules/storage/versions.tf
 create mode 100644 tf/multi-region/README.md
 create mode 100644 tf/multi-region/main.tf
 create mode 100644 tf/multi-region/outputs.tf
 create mode 100644 tf/multi-region/providers.tf
 create mode 100644 tf/multi-region/terraform.tfvars.example
 create mode 100644 tf/multi-region/variables.tf
 create mode 100644 tf/multi-region/versions.tf

diff --git a/.gitignore b/.gitignore
index 4b537ace..1f7c17d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,3 +80,21 @@ tsconfig.build.tsbuildinfo
 
 # Backups
 /backups
+
+# AI assistant / tooling artifacts
+.claude/
+.claude/**
+CLAUDE.md
+.cursor/
+.cursorrules
+.cursorignore
+.aider*
+.windsurfrules
+.github/copilot-instructions.md
+.continue/
+.codeium/
+.gemini/
+.specstory/
+.roo/
+.kilocode/
+.augment/
diff --git a/dr/README.md b/dr/README.md
index 7a336b17..4c10ba7e 100644
--- a/dr/README.md
+++ b/dr/README.md
@@ -7,12 +7,17 @@ Welcome to the TeachLink Disaster Recovery (DR) documentation. This directory co
 ### Planning & Strategy
 - **[RTO/RPO Definitions](./procedures/RTO-RPO.md)** — Recovery Time and Point Objectives, alert thresholds
 - **[Failover Plan](./procedures/failover-plan.md)** — Failover procedures, failback strategy, infrastructure requirements
+- **[Data Replication Strategy](./procedures/data-replication.md)** — Cross-region RDS replica, S3 CRR, cache/state handling
 
 ### Incident Response
 - **[Database Failure Runbook](./runbooks/database-failure.md)** — PostgreSQL failures, connection issues, data integrity problems
 - **[Region Outage Runbook](./runbooks/region-outage.md)** — AWS region unavailability, cross-region failover procedures
+- **[Multi-Region Deployment Runbook](./runbooks/multi-region-deployment.md)** — Deploy, drill, fail over and fail back the two-region topology
 - **[Data Corruption Runbook](./runbooks/data-corruption.md)** — Data inconsistency, corruption detection, point-in-time recovery
 
+### Infrastructure as Code
+- **[Multi-Region Terraform](../tf/multi-region/README.md)** — Active/standby deployment across two regions (issue #620)
+
 ## 🎯 Recovery Objectives at a Glance
 
 | Objective | Target | Notes |
diff --git a/dr/procedures/data-replication.md b/dr/procedures/data-replication.md
new file mode 100644
index 00000000..64caa4b5
--- /dev/null
+++ b/dr/procedures/data-replication.md
@@ -0,0 +1,112 @@
+# Data Replication Strategy
+
+This document describes how TeachLink data is replicated across regions to meet
+the recovery objectives in [RTO-RPO.md](./RTO-RPO.md) and enable the failover
+flow in [failover-plan.md](./failover-plan.md).
+
+Implemented by [`tf/multi-region`](../../tf/multi-region) — issue **#620**.
+
+---
+
+## Summary
+
+| Data store | Mechanism | Direction | RPO | On failover |
+| ---------- | --------- | --------- | --- | ----------- |
+| PostgreSQL (RDS) | Cross-region **read replica** | primary → secondary (continuous) | seconds | Promote replica to standalone primary |
+| Object storage (S3) | **Cross-Region Replication (CRR)** | primary → secondary (async) | seconds–minutes | Already present in secondary bucket |
+| Redis (ElastiCache) | Independent standby (no replication) | n/a | n/a (cache) | Warm standby; repopulates from DB |
+| Terraform state | S3 versioning + DynamoDB lock | n/a | n/a | Restore from versioned state |
+
+---
+
+## 1. Database: RDS cross-region read replica
+
+The primary PostgreSQL instance lives in the primary region. A **cross-region
+read replica** is provisioned in the secondary region by
+[`tf/modules/database-replica`](../../tf/modules/database-replica).
+
+- **How it works**: RDS streams the primary's write-ahead log to the replica
+  asynchronously, typically keeping it within a few seconds of the primary
+  (`ReplicaLag` metric). This gives an effective **RPO of seconds**, a large
+  improvement over the backup-only RPO of up to 7 days.
+- **Encryption**: the source is encrypted, so the replica uses a dedicated
+  KMS key created in the secondary region (cross-region requirement).
+- **Backups**: the replica keeps a 7-day backup retention so it can be promoted
+  and itself replicated after a failover.
+- **On failover**: `infra/scripts/failover.sh activate` runs
+  `aws rds promote-read-replica`, converting the replica into a standalone
+  read/write primary. Promotion is irreversible — failback requires re-seeding
+  the original primary (see failover-plan.md).
+
+### Monitoring replica lag
+
+```bash
+aws cloudwatch get-metric-statistics \
+  --namespace AWS/RDS --metric-name ReplicaLag \
+  --dimensions Name=DBInstanceIdentifier,Value=teachlink-prod-db-replica \
+  --statistics Average --period 60 \
+  --start-time "$(date -u -d '1 hour ago' +%Y-%m-%dT%H:%M:%SZ)" \
+  --end-time   "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+  --region us-west-2
+```
+
+Alert when `ReplicaLag > 5s` (RPO target). The monthly drill
+(`infra/scripts/failover-drill.sh`) asserts this automatically.
+
+---
+
+## 2. Object storage: S3 Cross-Region Replication
+
+Both the `uploads` and `backups` buckets replicate from the primary region to
+their secondary-region counterparts via
+[`tf/modules/replication`](../../tf/modules/replication).
+
+- **Prerequisite**: versioning is enabled on source and destination (the
+  storage module already does this).
+- **IAM**: a scoped replication role grants S3 permission to read source object
+  versions and write them to the destination.
+- **Scope**: all objects (`prefix = ""`), including delete markers, so deletes
+  propagate.
+- **Storage class**: replicated objects land in `STANDARD_IA` to reduce cost.
+- **Latency**: replication is asynchronous (usually seconds to minutes). Objects
+  written immediately before a regional failure may not have replicated — this
+  is the S3 RPO and is acceptable for uploads/backups.
+
+> Note: CRR only replicates objects written **after** the rule is enabled. For a
+> brand-new secondary bucket, run a one-time `aws s3 sync` (or S3 Batch
+> Replication) to backfill existing objects.
+
+---
+
+## 3. Cache: Redis standby
+
+ElastiCache holds ephemeral cache data, so it is **not** replicated across
+regions. The secondary region runs an independent standby Redis cluster that is
+empty until failover, after which it repopulates naturally from the promoted
+database. This avoids the cost and complexity of a Global Datastore for
+non-durable data.
+
+---
+
+## 4. Terraform state
+
+State is stored in a versioned S3 bucket with a DynamoDB lock table (see
+[`tf/README.md`](../../tf/README.md)). Versioning provides point-in-time recovery
+of the state file itself. Use a **distinct state key** for the multi-region
+configuration to avoid clobbering the single-region state.
+
+---
+
+## Verification
+
+| Check | Command / Tool |
+| ----- | -------------- |
+| Replica exists & lag OK | `infra/scripts/failover-drill.sh` |
+| CRR enabled | `aws s3api get-bucket-replication --bucket <bucket>` |
+| End-to-end failover | Quarterly drill (see [dr/README.md](../README.md)) |
+
+---
+
+**Document Version**: 1.0
+**Owner**: Platform Engineering
+**Related**: [failover-plan.md](./failover-plan.md), [RTO-RPO.md](./RTO-RPO.md), [region-outage runbook](../runbooks/region-outage.md)
diff --git a/dr/runbooks/multi-region-deployment.md b/dr/runbooks/multi-region-deployment.md
new file mode 100644
index 00000000..200c1543
--- /dev/null
+++ b/dr/runbooks/multi-region-deployment.md
@@ -0,0 +1,110 @@
+# Runbook: Multi-Region Deployment & Failover Operations
+
+Operational runbook for the active/standby multi-region topology defined in
+[`tf/multi-region`](../../tf/multi-region). Covers initial deployment, routine
+verification, manual failover, and failback.
+
+> Related: [Region Outage runbook](./region-outage.md) · [Failover Plan](../procedures/failover-plan.md) · [Data Replication Strategy](../procedures/data-replication.md)
+
+---
+
+## 1. Initial deployment
+
+**Prerequisites**
+- Terraform >= 1.5.0; AWS credentials valid in both regions.
+- ACM certificates in **each** region (for HTTPS).
+- A registered domain (provide `hosted_zone_id` or let Terraform create a zone).
+
+**Steps**
+```bash
+cd tf/multi-region
+cp terraform.tfvars.example terraform.tfvars   # edit values
+terraform init
+terraform plan  -var-file=terraform.tfvars
+terraform apply -var-file=terraform.tfvars
+```
+
+**Post-deploy validation**
+```bash
+# Static checks (no creds needed)
+infra/scripts/validate-multiregion.sh
+
+# Live readiness checks
+export PRIMARY_ALB_URL="https://<primary-alb-dns>"
+export SECONDARY_ALB_URL="https://<secondary-alb-dns>"
+infra/scripts/failover-drill.sh
+```
+If `hosted_zone_id` was empty, delegate your domain to the
+`hosted_zone_name_servers` output before traffic will resolve.
+
+---
+
+## 2. Routine verification (monthly drill)
+
+Run on the **third Tuesday, 02:00 UTC** (see [dr/README.md](../README.md)).
+
+```bash
+infra/scripts/failover-drill.sh   # non-destructive; exits non-zero on failure
+```
+Confirms: both ALBs healthy, replica lag within RPO, S3 CRR enabled.
+
+Record results in the DR drill log. Investigate any ❌ before relying on failover.
+
+---
+
+## 3. Manual failover (primary region lost)
+
+> Route 53 shifts **traffic** automatically when the primary health check goes
+> red (~90s). These steps promote the **data tier**, which is not automatic.
+
+1. **Confirm** the outage is regional (AWS Health Dashboard, `failover.sh status`).
+2. **Activate**:
+   ```bash
+   export PRIMARY_REGION=us-east-1 SECONDARY_REGION=us-west-2 ENVIRONMENT=prod
+   infra/scripts/failover.sh activate --dry-run   # review
+   infra/scripts/failover.sh activate             # execute
+   ```
+   This promotes the read replica and scales up the secondary ECS service.
+3. **Repoint the app**: update `DB_HOST`, `REDIS_HOST`, `AWS_REGION` to the
+   secondary endpoints (Terraform outputs `replica_db_endpoint`, etc.) and
+   redeploy / restart tasks so writes hit the promoted database.
+4. **Verify**: `failover.sh status`; `curl https://api.<domain>/health` → 200;
+   confirm DNS resolves to the secondary ALB (`dig +short api.<domain>`).
+5. **Communicate** per the [Failover Plan](../procedures/failover-plan.md) comms timeline.
+
+**Target RTO: ≤ 15 min.** Escalate to Platform Lead if exceeded.
+
+---
+
+## 4. Failback (primary region recovered)
+
+Only after the primary is fully restored and **data re-synced** to it.
+
+1. Re-seed the primary database from the promoted secondary (`pg_dump`/restore
+   or a new replica in the reverse direction), verify integrity.
+2. Re-apply Terraform to restore the primary RDS as a fresh primary and the
+   secondary as a read replica again.
+3. Scale the secondary back to standby:
+   ```bash
+   infra/scripts/failover.sh failback
+   ```
+4. Route 53 returns traffic to the primary once its health check is green.
+5. Validate, then document in the post-incident review.
+
+---
+
+## 5. Troubleshooting
+
+| Symptom | Likely cause | Action |
+| ------- | ------------ | ------ |
+| Traffic not failing over | Health check too lenient / DNS TTL cached | Check `primary_health_check_id` status; wait for interval × threshold |
+| Replica promotion fails | Replica mid-update | `aws rds wait db-instance-available`; retry |
+| App errors after failover | Still pointing at dead primary | Update `DB_HOST`/`REDIS_HOST`, redeploy |
+| S3 objects missing in secondary | Written before CRR enabled, or async lag | One-time `aws s3 sync`; check replication metrics |
+| `terraform apply` global-name clash | Reused single-region state | Use a separate state key for `tf/multi-region` |
+
+---
+
+**Document Version**: 1.0
+**Owner**: Platform Engineering
+**Review**: After each failover drill
diff --git a/infra/scripts/failover-drill.sh b/infra/scripts/failover-drill.sh
new file mode 100755
index 00000000..7de4a284
--- /dev/null
+++ b/infra/scripts/failover-drill.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/env bash
+#
+# failover-drill.sh — Non-destructive validation of the failover setup.
+#
+# Run this monthly (see dr/README.md testing schedule) to confirm the
+# multi-region deployment is failover-ready WITHOUT promoting the replica or
+# shifting production traffic. It checks:
+#   1. Both region ALBs answer the health-check path.
+#   2. The RDS read replica exists and replication lag is within RPO.
+#   3. S3 cross-region replication is enabled on the source buckets.
+#   4. Route 53 health checks report healthy.
+#
+# Exit code is non-zero if any check fails, so it can gate CI / a scheduled job.
+#
+# Usage: ./failover-drill.sh
+# Requires: awscli v2, jq, curl.
+set -uo pipefail
+
+PRIMARY_REGION="${PRIMARY_REGION:-us-east-1}"
+SECONDARY_REGION="${SECONDARY_REGION:-us-west-2}"
+ENVIRONMENT="${ENVIRONMENT:-prod}"
+REPLICA_DB_ID="${REPLICA_DB_ID:-teachlink-${ENVIRONMENT}-db-replica}"
+HEALTH_PATH="${HEALTH_PATH:-/health}"
+MAX_REPLICA_LAG_SECONDS="${MAX_REPLICA_LAG_SECONDS:-5}"
+PRIMARY_ALB_URL="${PRIMARY_ALB_URL:-}"
+SECONDARY_ALB_URL="${SECONDARY_ALB_URL:-}"
+
+PASS=0
+FAIL=0
+ok() {
+  printf '  ✅ %s\n' "$*"
+  PASS=$((PASS + 1))
+}
+bad() {
+  printf '  ❌ %s\n' "$*"
+  FAIL=$((FAIL + 1))
+}
+section() { printf '\n=== %s ===\n' "$*"; }
+
+check_endpoint() {
+  local name="$1" url="$2"
+  [[ -z "$url" ]] && {
+    bad "$name endpoint URL not set (export ${name}_ALB_URL)"
+    return
+  }
+  local code
+  code="$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "${url}${HEALTH_PATH}" || echo 000)"
+  if [[ "$code" == "200" ]]; then
+    ok "$name health endpoint returned 200"
+  else
+    bad "$name health endpoint returned ${code}"
+  fi
+}
+
+section "1. Region health endpoints"
+check_endpoint "PRIMARY" "$PRIMARY_ALB_URL"
+check_endpoint "SECONDARY" "$SECONDARY_ALB_URL"
+
+section "2. RDS cross-region read replica"
+if command -v aws >/dev/null 2>&1; then
+  replica_json="$(aws rds describe-db-instances --db-instance-identifier "$REPLICA_DB_ID" \
+    --region "$SECONDARY_REGION" --output json 2>/dev/null || echo '')"
+  if [[ -n "$replica_json" ]]; then
+    ok "Read replica '${REPLICA_DB_ID}' exists in ${SECONDARY_REGION}"
+    lag="$(aws cloudwatch get-metric-statistics \
+      --namespace AWS/RDS --metric-name ReplicaLag \
+      --dimensions Name=DBInstanceIdentifier,Value="$REPLICA_DB_ID" \
+      --statistics Average --period 60 \
+      --start-time "$(date -u -d '5 minutes ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo '')" \
+      --end-time "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+      --region "$SECONDARY_REGION" \
+      --query 'sort_by(Datapoints,&Timestamp)[-1].Average' --output text 2>/dev/null || echo None)"
+    if [[ "$lag" == "None" || -z "$lag" ]]; then
+      bad "Could not read ReplicaLag metric (no recent datapoints)"
+    elif awk "BEGIN{exit !(${lag} <= ${MAX_REPLICA_LAG_SECONDS})}"; then
+      ok "Replica lag ${lag}s is within RPO target (${MAX_REPLICA_LAG_SECONDS}s)"
+    else
+      bad "Replica lag ${lag}s exceeds RPO target (${MAX_REPLICA_LAG_SECONDS}s)"
+    fi
+  else
+    bad "Read replica '${REPLICA_DB_ID}' not found in ${SECONDARY_REGION}"
+  fi
+else
+  bad "awscli not available — skipping RDS checks"
+fi
+
+section "3. S3 cross-region replication"
+if command -v aws >/dev/null 2>&1; then
+  for kind in uploads backups; do
+    bucket="teachlink-${PRIMARY_REGION//-/}-${ENVIRONMENT}-${kind}"
+    status="$(aws s3api get-bucket-replication --bucket "$bucket" \
+      --query 'ReplicationConfiguration.Rules[0].Status' --output text 2>/dev/null || echo None)"
+    if [[ "$status" == "Enabled" ]]; then
+      ok "Replication enabled on ${bucket}"
+    else
+      bad "Replication not enabled on ${bucket} (status=${status})"
+    fi
+  done
+else
+  bad "awscli not available — skipping S3 checks"
+fi
+
+section "Drill summary"
+printf 'Passed: %d  Failed: %d\n' "$PASS" "$FAIL"
+[[ "$FAIL" -eq 0 ]] || {
+  echo "Drill FAILED — investigate before relying on failover."
+  exit 1
+}
+echo "Drill PASSED — deployment is failover-ready."
diff --git a/infra/scripts/failover.sh b/infra/scripts/failover.sh
new file mode 100755
index 00000000..f948c8ab
--- /dev/null
+++ b/infra/scripts/failover.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+#
+# failover.sh — Activate or fail back the TeachLink multi-region deployment.
+#
+# This script automates the manual steps of a regional failover for the
+# infrastructure provisioned by tf/multi-region:
+#   1. Promote the secondary-region RDS read replica to a standalone primary.
+#   2. Scale up the secondary-region ECS service.
+#   3. Let Route 53 health checks shift traffic (no manual DNS edit needed).
+#
+# Route 53 failover routing already redirects traffic automatically when the
+# primary health check goes red; this script handles the data-tier promotion
+# and capacity changes that are NOT automatic.
+#
+# Usage:
+#   ./failover.sh activate   [--dry-run]
+#   ./failover.sh failback   [--dry-run]
+#   ./failover.sh status
+#
+# Requires: awscli v2, jq. Configure via the environment variables below.
+set -euo pipefail
+
+PRIMARY_REGION="${PRIMARY_REGION:-us-east-1}"
+SECONDARY_REGION="${SECONDARY_REGION:-us-west-2}"
+ENVIRONMENT="${ENVIRONMENT:-prod}"
+REPLICA_DB_ID="${REPLICA_DB_ID:-teachlink-${ENVIRONMENT}-db-replica}"
+PRIMARY_DB_ID="${PRIMARY_DB_ID:-teachlink-${ENVIRONMENT}-db}"
+SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-teachlink-cluster-secondary}"
+SECONDARY_SERVICE="${SECONDARY_SERVICE:-teachlink-service-secondary}"
+FAILOVER_DESIRED_COUNT="${FAILOVER_DESIRED_COUNT:-10}"
+
+DRY_RUN=0
+log() { printf '[failover] %s\n' "$*"; }
+run() {
+  if [[ "$DRY_RUN" -eq 1 ]]; then
+    log "DRY-RUN: $*"
+  else
+    log "RUN: $*"
+    "$@"
+  fi
+}
+
+require() {
+  command -v "$1" >/dev/null 2>&1 || {
+    echo "ERROR: '$1' is required but not installed." >&2
+    exit 1
+  }
+}
+
+activate() {
+  log "Activating failover: ${PRIMARY_REGION} -> ${SECONDARY_REGION} (env=${ENVIRONMENT})"
+
+  log "Step 1/3: Promoting read replica '${REPLICA_DB_ID}' in ${SECONDARY_REGION} to standalone primary"
+  run aws rds promote-read-replica \
+    --db-instance-identifier "${REPLICA_DB_ID}" \
+    --region "${SECONDARY_REGION}"
+
+  log "Waiting for the promoted instance to become available..."
+  run aws rds wait db-instance-available \
+    --db-instance-identifier "${REPLICA_DB_ID}" \
+    --region "${SECONDARY_REGION}"
+
+  log "Step 2/3: Scaling secondary ECS service to ${FAILOVER_DESIRED_COUNT} tasks"
+  run aws ecs update-service \
+    --cluster "${SECONDARY_CLUSTER}" \
+    --service "${SECONDARY_SERVICE}" \
+    --desired-count "${FAILOVER_DESIRED_COUNT}" \
+    --region "${SECONDARY_REGION}"
+
+  log "Step 3/3: Route 53 health-check failover handles DNS automatically."
+  log "          Verify: dig +short api.\${DOMAIN} should resolve to the ${SECONDARY_REGION} ALB."
+  log "Failover activation complete. Update application DB_HOST/REDIS_HOST to the secondary endpoints."
+}
+
+failback() {
+  log "Failing back: ${SECONDARY_REGION} -> ${PRIMARY_REGION} (env=${ENVIRONMENT})"
+  log "WARNING: Failback is high-risk. Only run after the primary region is fully recovered"
+  log "         and data has been re-synced to ${PRIMARY_DB_ID}. See dr/procedures/failover-plan.md."
+
+  log "Step 1/2: Confirm primary database is healthy and re-seeded"
+  run aws rds wait db-instance-available \
+    --db-instance-identifier "${PRIMARY_DB_ID}" \
+    --region "${PRIMARY_REGION}"
+
+  log "Step 2/2: Scaling secondary ECS service back down to standby (1 task)"
+  run aws ecs update-service \
+    --cluster "${SECONDARY_CLUSTER}" \
+    --service "${SECONDARY_SERVICE}" \
+    --desired-count 1 \
+    --region "${SECONDARY_REGION}"
+
+  log "Failback initiated. Route 53 will return traffic to ${PRIMARY_REGION} once its health check is green."
+}
+
+status() {
+  log "Primary RDS (${PRIMARY_REGION}):"
+  aws rds describe-db-instances --db-instance-identifier "${PRIMARY_DB_ID}" \
+    --region "${PRIMARY_REGION}" \
+    --query 'DBInstances[0].{Status:DBInstanceStatus,Role:ReadReplicaSourceDBInstanceIdentifier}' \
+    --output table 2>/dev/null || log "  (not found)"
+
+  log "Secondary RDS (${SECONDARY_REGION}):"
+  aws rds describe-db-instances --db-instance-identifier "${REPLICA_DB_ID}" \
+    --region "${SECONDARY_REGION}" \
+    --query 'DBInstances[0].{Status:DBInstanceStatus,ReplicaOf:ReadReplicaSourceDBInstanceIdentifier}' \
+    --output table 2>/dev/null || log "  (not found)"
+
+  log "Secondary ECS service (${SECONDARY_REGION}):"
+  aws ecs describe-services --cluster "${SECONDARY_CLUSTER}" --services "${SECONDARY_SERVICE}" \
+    --region "${SECONDARY_REGION}" \
+    --query 'services[0].{Desired:desiredCount,Running:runningCount}' \
+    --output table 2>/dev/null || log "  (not found)"
+}
+
+main() {
+  require aws
+  local action="${1:-}"
+  shift || true
+  for arg in "$@"; do
+    [[ "$arg" == "--dry-run" ]] && DRY_RUN=1
+  done
+
+  case "$action" in
+    activate) activate ;;
+    failback) failback ;;
+    status) status ;;
+    *)
+      echo "Usage: $0 {activate|failback|status} [--dry-run]" >&2
+      exit 1
+      ;;
+  esac
+}
+
+main "$@"
diff --git a/infra/scripts/validate-multiregion.sh b/infra/scripts/validate-multiregion.sh
new file mode 100755
index 00000000..53ab522a
--- /dev/null
+++ b/infra/scripts/validate-multiregion.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# validate-multiregion.sh — Static validation of the multi-region Terraform.
+#
+# Runs formatting and validation checks that do NOT require AWS credentials,
+# suitable for CI. Gracefully degrades (warns, does not fail) when Terraform is
+# not installed so the repo can still be linted in minimal environments.
+#
+# Usage: ./validate-multiregion.sh
+set -uo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+TF_DIR="${REPO_ROOT}/tf"
+
+if ! command -v terraform >/dev/null 2>&1; then
+  echo "WARNING: terraform not installed — skipping fmt/validate. Install Terraform >= 1.5.0 to run full checks."
+  exit 0
+fi
+
+echo "=== terraform fmt (check, recursive) ==="
+terraform -chdir="${TF_DIR}" fmt -check -recursive
+
+echo "=== terraform init (backend disabled) ==="
+terraform -chdir="${TF_DIR}/multi-region" init -backend=false -input=false >/dev/null
+
+echo "=== terraform validate (multi-region) ==="
+terraform -chdir="${TF_DIR}/multi-region" validate
+
+echo "All multi-region Terraform checks passed."
diff --git a/tf/modules/compute/versions.tf b/tf/modules/compute/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/compute/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/database-replica/main.tf b/tf/modules/database-replica/main.tf
new file mode 100644
index 00000000..04ea96c2
--- /dev/null
+++ b/tf/modules/database-replica/main.tf
@@ -0,0 +1,102 @@
+# Database Replica Module (secondary region)
+#
+# Provisions a cross-region RDS read replica of the primary database plus a
+# warm-standby Redis cluster. The read replica continuously streams changes
+# from the primary, giving an RPO measured in seconds. On failover it is
+# promoted to a standalone read/write primary (see dr/runbooks/region-outage.md).
+#
+# Instantiate this module with the secondary-region provider.
+
+# Subnet group for the replica in the secondary region.
+resource "aws_db_subnet_group" "replica" {
+  name       = "teachlink-${var.environment}-db-replica-subnet-group"
+  subnet_ids = var.private_subnet_ids
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-db-replica-subnet-group"
+    },
+    var.tags
+  )
+}
+
+# KMS key for encrypting the replica's storage in the secondary region.
+# Cross-region replicas of an encrypted source require a destination-region key.
+resource "aws_kms_key" "replica" {
+  description             = "teachlink-${var.environment} RDS replica encryption key"
+  deletion_window_in_days = 7
+  enable_key_rotation     = true
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-db-replica-kms"
+    },
+    var.tags
+  )
+}
+
+resource "aws_kms_alias" "replica" {
+  name          = "alias/teachlink-${var.environment}-db-replica"
+  target_key_id = aws_kms_key.replica.key_id
+}
+
+# Cross-region read replica of the primary database.
+resource "aws_db_instance" "replica" {
+  identifier          = "teachlink-${var.environment}-db-replica"
+  instance_class      = var.db_instance_class
+  replicate_source_db = var.source_db_arn
+
+  db_subnet_group_name   = aws_db_subnet_group.replica.name
+  vpc_security_group_ids = [var.rds_security_group_id]
+
+  storage_encrypted = true
+  kms_key_id        = aws_kms_key.replica.arn
+
+  # A replica must keep automated backups so it can itself be promoted/replicated.
+  backup_retention_period = 7
+
+  enabled_cloudwatch_logs_exports = ["postgresql"]
+
+  # Replicas cannot set master credentials or db_name (inherited from source).
+  skip_final_snapshot = true
+  deletion_protection = false
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-db-replica"
+      Role = "read-replica"
+    },
+    var.tags
+  )
+}
+
+# Standby Redis cluster (warmed on failover; cache data is non-durable).
+resource "aws_elasticache_subnet_group" "standby" {
+  name       = "teachlink-${var.environment}-redis-standby-subnet-group"
+  subnet_ids = var.private_subnet_ids
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-redis-standby-subnet-group"
+    },
+    var.tags
+  )
+}
+
+resource "aws_elasticache_cluster" "standby" {
+  cluster_id           = "teachlink-${var.environment}-redis-standby"
+  engine               = "redis"
+  node_type            = var.redis_node_type
+  num_cache_nodes      = var.redis_num_cache_nodes
+  parameter_group_name = "default.redis7"
+  port                 = 6379
+  subnet_group_name    = aws_elasticache_subnet_group.standby.name
+  security_group_ids   = [var.redis_security_group_id]
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-redis-standby"
+    },
+    var.tags
+  )
+}
diff --git a/tf/modules/database-replica/outputs.tf b/tf/modules/database-replica/outputs.tf
new file mode 100644
index 00000000..5e2cd99e
--- /dev/null
+++ b/tf/modules/database-replica/outputs.tf
@@ -0,0 +1,19 @@
+output "replica_db_instance_id" {
+  description = "ID of the cross-region read replica"
+  value       = aws_db_instance.replica.id
+}
+
+output "replica_db_endpoint" {
+  description = "Endpoint of the cross-region read replica"
+  value       = aws_db_instance.replica.endpoint
+}
+
+output "replica_kms_key_arn" {
+  description = "ARN of the KMS key encrypting the replica"
+  value       = aws_kms_key.replica.arn
+}
+
+output "standby_redis_endpoint" {
+  description = "Endpoint of the standby Redis cluster"
+  value       = aws_elasticache_cluster.standby.cache_nodes[0].address
+}
diff --git a/tf/modules/database-replica/variables.tf b/tf/modules/database-replica/variables.tf
new file mode 100644
index 00000000..2b037d69
--- /dev/null
+++ b/tf/modules/database-replica/variables.tf
@@ -0,0 +1,45 @@
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+}
+
+variable "source_db_arn" {
+  description = "ARN of the primary-region RDS instance to replicate from"
+  type        = string
+}
+
+variable "db_instance_class" {
+  description = "Instance class for the read replica"
+  type        = string
+}
+
+variable "private_subnet_ids" {
+  description = "Private subnet IDs in the secondary region"
+  type        = list(string)
+}
+
+variable "rds_security_group_id" {
+  description = "Security group ID for the replica in the secondary region"
+  type        = string
+}
+
+variable "redis_security_group_id" {
+  description = "Security group ID for the standby Redis cluster"
+  type        = string
+}
+
+variable "redis_node_type" {
+  description = "ElastiCache node type for the standby cluster"
+  type        = string
+}
+
+variable "redis_num_cache_nodes" {
+  description = "Number of standby cache nodes"
+  type        = number
+}
+
+variable "tags" {
+  description = "Tags to apply to resources"
+  type        = map(string)
+  default     = {}
+}
diff --git a/tf/modules/database-replica/versions.tf b/tf/modules/database-replica/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/database-replica/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/database/outputs.tf b/tf/modules/database/outputs.tf
index 55e0879f..1a7db1ea 100644
--- a/tf/modules/database/outputs.tf
+++ b/tf/modules/database/outputs.tf
@@ -3,6 +3,11 @@ output "db_instance_id" {
   value       = aws_db_instance.main.id
 }
 
+output "db_instance_arn" {
+  description = "RDS instance ARN (required as the source for cross-region read replicas)"
+  value       = aws_db_instance.main.arn
+}
+
 output "db_instance_endpoint" {
   description = "RDS instance endpoint"
   value       = aws_db_instance.main.endpoint
diff --git a/tf/modules/database/versions.tf b/tf/modules/database/versions.tf
new file mode 100644
index 00000000..a9e3c714
--- /dev/null
+++ b/tf/modules/database/versions.tf
@@ -0,0 +1,12 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = "~> 3.0"
+    }
+  }
+}
diff --git a/tf/modules/dns-failover/main.tf b/tf/modules/dns-failover/main.tf
new file mode 100644
index 00000000..7d14f78f
--- /dev/null
+++ b/tf/modules/dns-failover/main.tf
@@ -0,0 +1,121 @@
+# DNS Failover Module
+#
+# Implements active-passive, health-check driven failover between two regions
+# using Route 53. Traffic resolves to the primary region while its health check
+# is healthy and automatically shifts to the secondary region when it is not.
+
+locals {
+  # Use the provided hosted zone, or the one created below.
+  zone_id  = var.hosted_zone_id != "" ? var.hosted_zone_id : aws_route53_zone.this[0].zone_id
+  app_fqdn = "${var.app_subdomain}.${var.domain_name}"
+}
+
+# Optionally create a public hosted zone when an existing one is not supplied.
+resource "aws_route53_zone" "this" {
+  count = var.hosted_zone_id == "" ? 1 : 0
+  name  = var.domain_name
+
+  tags = merge(
+    {
+      Name = "teachlink-${var.environment}-zone"
+    },
+    var.tags
+  )
+}
+
+# Health check for the primary region endpoint.
+resource "aws_route53_health_check" "primary" {
+  fqdn              = var.primary_alb_dns_name
+  port              = var.health_check_port
+  type              = var.health_check_type
+  resource_path     = var.health_check_path
+  request_interval  = var.health_check_interval
+  failure_threshold = var.health_check_failure_threshold
+
+  tags = merge(
+    {
+      Name   = "teachlink-${var.environment}-primary-${var.primary_region}"
+      Region = var.primary_region
+    },
+    var.tags
+  )
+}
+
+# Health check for the secondary region endpoint.
+resource "aws_route53_health_check" "secondary" {
+  fqdn              = var.secondary_alb_dns_name
+  port              = var.health_check_port
+  type              = var.health_check_type
+  resource_path     = var.health_check_path
+  request_interval  = var.health_check_interval
+  failure_threshold = var.health_check_failure_threshold
+
+  tags = merge(
+    {
+      Name   = "teachlink-${var.environment}-secondary-${var.secondary_region}"
+      Region = var.secondary_region
+    },
+    var.tags
+  )
+}
+
+# PRIMARY failover record — served while the primary health check is healthy.
+resource "aws_route53_record" "primary" {
+  zone_id = local.zone_id
+  name    = local.app_fqdn
+  type    = "A"
+
+  set_identifier  = "primary-${var.primary_region}"
+  health_check_id = aws_route53_health_check.primary.id
+
+  failover_routing_policy {
+    type = "PRIMARY"
+  }
+
+  alias {
+    name                   = var.primary_alb_dns_name
+    zone_id                = var.primary_alb_zone_id
+    evaluate_target_health = true
+  }
+}
+
+# SECONDARY failover record — served when the primary is unhealthy.
+resource "aws_route53_record" "secondary" {
+  zone_id = local.zone_id
+  name    = local.app_fqdn
+  type    = "A"
+
+  set_identifier  = "secondary-${var.secondary_region}"
+  health_check_id = aws_route53_health_check.secondary.id
+
+  failover_routing_policy {
+    type = "SECONDARY"
+  }
+
+  alias {
+    name                   = var.secondary_alb_dns_name
+    zone_id                = var.secondary_alb_zone_id
+    evaluate_target_health = true
+  }
+}
+
+# CloudWatch alarm that fires when the primary region health check goes red,
+# so failover is observable and can page the on-call engineer.
+resource "aws_cloudwatch_metric_alarm" "primary_unhealthy" {
+  alarm_name          = "teachlink-${var.environment}-primary-region-unhealthy"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = 1
+  metric_name         = "HealthCheckStatus"
+  namespace           = "AWS/Route53"
+  period              = 60
+  statistic           = "Minimum"
+  threshold           = 1
+  alarm_description   = "Primary region (${var.primary_region}) health check is failing; Route 53 is failing over to ${var.secondary_region}."
+  treat_missing_data  = "breaching"
+
+  dimensions = {
+    HealthCheckId = aws_route53_health_check.primary.id
+  }
+
+  tags = var.tags
+}
diff --git a/tf/modules/dns-failover/outputs.tf b/tf/modules/dns-failover/outputs.tf
new file mode 100644
index 00000000..fce9fc6f
--- /dev/null
+++ b/tf/modules/dns-failover/outputs.tf
@@ -0,0 +1,24 @@
+output "hosted_zone_id" {
+  description = "Route 53 hosted zone ID used for the failover records"
+  value       = local.zone_id
+}
+
+output "app_fqdn" {
+  description = "Fully qualified domain name that fails over between regions"
+  value       = local.app_fqdn
+}
+
+output "primary_health_check_id" {
+  description = "ID of the primary region Route 53 health check"
+  value       = aws_route53_health_check.primary.id
+}
+
+output "secondary_health_check_id" {
+  description = "ID of the secondary region Route 53 health check"
+  value       = aws_route53_health_check.secondary.id
+}
+
+output "name_servers" {
+  description = "Name servers for the created hosted zone (empty when an existing zone is reused)"
+  value       = var.hosted_zone_id == "" ? aws_route53_zone.this[0].name_servers : []
+}
diff --git a/tf/modules/dns-failover/variables.tf b/tf/modules/dns-failover/variables.tf
new file mode 100644
index 00000000..a2a1b3dd
--- /dev/null
+++ b/tf/modules/dns-failover/variables.tf
@@ -0,0 +1,98 @@
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+}
+
+variable "domain_name" {
+  description = "Root domain managed in Route 53 (e.g. teachlink.io)"
+  type        = string
+}
+
+variable "app_subdomain" {
+  description = "Subdomain that fails over between regions (e.g. api)"
+  type        = string
+  default     = "api"
+}
+
+variable "hosted_zone_id" {
+  description = "Existing Route 53 hosted zone ID. Leave empty to create a new public zone for domain_name."
+  type        = string
+  default     = ""
+}
+
+variable "primary_region" {
+  description = "Primary AWS region identifier (used for set identifiers)"
+  type        = string
+}
+
+variable "secondary_region" {
+  description = "Secondary AWS region identifier (used for set identifiers)"
+  type        = string
+}
+
+variable "primary_alb_dns_name" {
+  description = "DNS name of the primary region ALB"
+  type        = string
+}
+
+variable "primary_alb_zone_id" {
+  description = "Route 53 hosted zone ID of the primary region ALB"
+  type        = string
+}
+
+variable "secondary_alb_dns_name" {
+  description = "DNS name of the secondary region ALB"
+  type        = string
+}
+
+variable "secondary_alb_zone_id" {
+  description = "Route 53 hosted zone ID of the secondary region ALB"
+  type        = string
+}
+
+variable "health_check_path" {
+  description = "Path probed by the Route 53 health checks"
+  type        = string
+  default     = "/health"
+}
+
+variable "health_check_port" {
+  description = "Port probed by the Route 53 health checks"
+  type        = number
+  default     = 443
+}
+
+variable "health_check_type" {
+  description = "Health check protocol (HTTP or HTTPS)"
+  type        = string
+  default     = "HTTPS"
+
+  validation {
+    condition     = contains(["HTTP", "HTTPS"], var.health_check_type)
+    error_message = "health_check_type must be either HTTP or HTTPS."
+  }
+}
+
+variable "health_check_interval" {
+  description = "Seconds between Route 53 health check probes (10 or 30)"
+  type        = number
+  default     = 30
+}
+
+variable "health_check_failure_threshold" {
+  description = "Number of consecutive failed probes before a region is marked unhealthy"
+  type        = number
+  default     = 3
+}
+
+variable "record_ttl_note" {
+  description = "Alias records ignore TTL; documented here for clarity (Route 53 evaluates health every interval)"
+  type        = string
+  default     = "alias-evaluate-target-health"
+}
+
+variable "tags" {
+  description = "Tags to apply to resources"
+  type        = map(string)
+  default     = {}
+}
diff --git a/tf/modules/dns-failover/versions.tf b/tf/modules/dns-failover/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/dns-failover/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/monitoring/versions.tf b/tf/modules/monitoring/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/monitoring/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/networking/versions.tf b/tf/modules/networking/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/networking/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/replication/main.tf b/tf/modules/replication/main.tf
new file mode 100644
index 00000000..caffbbfc
--- /dev/null
+++ b/tf/modules/replication/main.tf
@@ -0,0 +1,96 @@
+# S3 Cross-Region Replication Module
+#
+# Configures continuous, asynchronous replication of a source bucket (primary
+# region) to a destination bucket (secondary region). Both buckets must have
+# versioning enabled, which the storage module already guarantees.
+#
+# This module must be applied with a provider in the SOURCE bucket's region,
+# because the replication configuration is attached to the source bucket.
+
+# IAM role assumed by S3 to perform replication.
+resource "aws_iam_role" "replication" {
+  name = "teachlink-${var.environment}-${var.name}-replication"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Principal = {
+          Service = "s3.amazonaws.com"
+        }
+        Action = "sts:AssumeRole"
+      }
+    ]
+  })
+
+  tags = var.tags
+}
+
+# Permissions: read+version source objects, replicate into the destination.
+resource "aws_iam_policy" "replication" {
+  name = "teachlink-${var.environment}-${var.name}-replication"
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:GetReplicationConfiguration",
+          "s3:ListBucket"
+        ]
+        Resource = [var.source_bucket_arn]
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:GetObjectVersionForReplication",
+          "s3:GetObjectVersionAcl",
+          "s3:GetObjectVersionTagging"
+        ]
+        Resource = ["${var.source_bucket_arn}/*"]
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "s3:ReplicateObject",
+          "s3:ReplicateDelete",
+          "s3:ReplicateTags"
+        ]
+        Resource = ["${var.destination_bucket_arn}/*"]
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "replication" {
+  role       = aws_iam_role.replication.name
+  policy_arn = aws_iam_policy.replication.arn
+}
+
+# Attach the replication configuration to the source bucket.
+resource "aws_s3_bucket_replication_configuration" "this" {
+  role   = aws_iam_role.replication.arn
+  bucket = var.source_bucket_id
+
+  rule {
+    id     = "replicate-${var.name}-to-secondary"
+    status = "Enabled"
+
+    filter {
+      prefix = ""
+    }
+
+    delete_marker_replication {
+      status = "Enabled"
+    }
+
+    destination {
+      bucket        = var.destination_bucket_arn
+      storage_class = var.destination_storage_class
+    }
+  }
+
+  depends_on = [aws_iam_role_policy_attachment.replication]
+}
diff --git a/tf/modules/replication/outputs.tf b/tf/modules/replication/outputs.tf
new file mode 100644
index 00000000..755a99cd
--- /dev/null
+++ b/tf/modules/replication/outputs.tf
@@ -0,0 +1,9 @@
+output "replication_role_arn" {
+  description = "ARN of the IAM role used by S3 for replication"
+  value       = aws_iam_role.replication.arn
+}
+
+output "replication_rule_id" {
+  description = "ID of the replication rule attached to the source bucket"
+  value       = "replicate-${var.name}-to-secondary"
+}
diff --git a/tf/modules/replication/variables.tf b/tf/modules/replication/variables.tf
new file mode 100644
index 00000000..3ebd4c15
--- /dev/null
+++ b/tf/modules/replication/variables.tf
@@ -0,0 +1,36 @@
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+}
+
+variable "name" {
+  description = "Logical name for this replication pairing (e.g. uploads, backups)"
+  type        = string
+}
+
+variable "source_bucket_id" {
+  description = "ID (name) of the source S3 bucket in the primary region"
+  type        = string
+}
+
+variable "source_bucket_arn" {
+  description = "ARN of the source S3 bucket in the primary region"
+  type        = string
+}
+
+variable "destination_bucket_arn" {
+  description = "ARN of the destination S3 bucket in the secondary region"
+  type        = string
+}
+
+variable "destination_storage_class" {
+  description = "Storage class for replicated objects in the destination bucket"
+  type        = string
+  default     = "STANDARD_IA"
+}
+
+variable "tags" {
+  description = "Tags to apply to resources"
+  type        = map(string)
+  default     = {}
+}
diff --git a/tf/modules/replication/versions.tf b/tf/modules/replication/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/replication/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/modules/storage/versions.tf b/tf/modules/storage/versions.tf
new file mode 100644
index 00000000..1ac87da5
--- /dev/null
+++ b/tf/modules/storage/versions.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
diff --git a/tf/multi-region/README.md b/tf/multi-region/README.md
new file mode 100644
index 00000000..6fdc325e
--- /dev/null
+++ b/tf/multi-region/README.md
@@ -0,0 +1,98 @@
+# Multi-Region Deployment & Failover
+
+This Terraform root configuration deploys TeachLink across **two AWS regions** in
+an active (primary) / warm-standby (secondary) topology with automated,
+health-check driven DNS failover and cross-region data replication.
+
+It composes the single-region modules in [`../modules`](../modules) — nothing is
+duplicated — and adds three new modules: `dns-failover`, `replication`, and
+`database-replica`.
+
+> Implements issue **#620 — Add multi-region deployment and failover strategy**.
+
+## Architecture
+
+```
+                          Route 53 (failover routing)
+                          api.teachlink.io
+                          ┌────────────┴─────────────┐
+                  health check ✓               health check ✗→✓
+                          │                            │
+              ┌───────────▼───────────┐    ┌───────────▼───────────┐
+              │   PRIMARY (us-east-1)  │    │  SECONDARY (us-west-2) │
+              │  ALB → ECS (active)    │    │  ALB → ECS (standby)   │
+              │  RDS primary (R/W)     │──► │  RDS read replica ─────┼─► promote
+              │  Redis primary         │    │  Redis standby         │   on failover
+              │  S3 uploads/backups ───┼──► │  S3 uploads/backups    │   (CRR)
+              └────────────────────────┘    └────────────────────────┘
+```
+
+### What gets created
+
+| Concern              | Primary region          | Secondary region                       |
+| -------------------- | ----------------------- | -------------------------------------- |
+| Networking           | VPC + subnets + SGs     | VPC + subnets + SGs (non-overlapping)  |
+| Compute              | ECS/Fargate + ALB (active) | ECS/Fargate + ALB (warm standby)    |
+| Database             | RDS PostgreSQL (R/W)    | Cross-region **read replica**          |
+| Cache                | ElastiCache Redis       | Standby Redis (warmed on failover)     |
+| Storage              | S3 uploads + backups    | S3 uploads + backups (**CRR target**)  |
+| DNS                  | Route 53 failover record + health check | Route 53 failover record + health check |
+| Monitoring           | CloudWatch              | CloudWatch                             |
+
+## Acceptance criteria mapping (#620)
+
+- **Multiple deployment regions configured** — `providers.tf` declares aliased
+  `aws.primary` / `aws.secondary`; `main.tf` instantiates every stack in both.
+- **Failover automation** — `modules/dns-failover` (Route 53 health checks +
+  PRIMARY/SECONDARY records) plus [`../../infra/scripts/failover.sh`](../../infra/scripts/failover.sh).
+- **Data replication strategy** — `modules/replication` (S3 CRR) +
+  `modules/database-replica` (RDS cross-region read replica). See
+  [`../../dr/procedures/data-replication.md`](../../dr/procedures/data-replication.md).
+- **Testing and runbooks** — [`../../infra/scripts/failover-drill.sh`](../../infra/scripts/failover-drill.sh),
+  [`validate-multiregion.sh`](../../infra/scripts/validate-multiregion.sh), and
+  [`../../dr/runbooks/multi-region-deployment.md`](../../dr/runbooks/multi-region-deployment.md).
+
+## Usage
+
+```bash
+cd tf/multi-region
+cp terraform.tfvars.example terraform.tfvars   # edit values
+
+terraform init
+terraform fmt -check -recursive
+terraform validate
+terraform plan  -var-file=terraform.tfvars
+terraform apply -var-file=terraform.tfvars
+```
+
+### Prerequisites
+
+- Terraform >= 1.5.0, AWS credentials with permissions in **both** regions.
+- For HTTPS, an ACM certificate in **each** region (`*_certificate_arn`).
+- A registered domain. Provide `hosted_zone_id` to reuse a zone, or omit it to
+  let Terraform create one (then delegate to the output name servers).
+
+## Recovery objectives
+
+| Objective | Target | How this design meets it |
+| --------- | ------ | ------------------------ |
+| **RTO** | ≤ 15 min | Standby fleet always running; Route 53 fails over automatically within `health_check_interval × failure_threshold` (~90s); replica promotion is the only manual step. |
+| **RPO** | seconds (DB), async (S3) | RDS read replica streams continuously; S3 CRR replicates new objects asynchronously. This improves on the backup-only RPO in `dr/procedures/RTO-RPO.md`. |
+
+## Cost & topology notes
+
+- The secondary fleet defaults to a small warm standby (`secondary_desired_count = 1`).
+  For true active-active, raise `secondary_desired_count`/`secondary_min_capacity`.
+- Route 53 health-check metrics publish only to **us-east-1**; keep
+  `primary_region = us-east-1` (default) or adjust the `dns_failover` provider.
+- State: configure a remote backend (see [`../README.md`](../README.md)). Use a
+  **separate state key** from the single-region config to avoid collisions.
+
+## Failover & failback
+
+Operational procedures live in the DR runbooks:
+
+- [Region Outage runbook](../../dr/runbooks/region-outage.md)
+- [Multi-Region Deployment runbook](../../dr/runbooks/multi-region-deployment.md)
+- [Failover Plan](../../dr/procedures/failover-plan.md)
+- [Data Replication Strategy](../../dr/procedures/data-replication.md)
diff --git a/tf/multi-region/main.tf b/tf/multi-region/main.tf
new file mode 100644
index 00000000..17a6475e
--- /dev/null
+++ b/tf/multi-region/main.tf
@@ -0,0 +1,231 @@
+# Multi-Region Deployment & Failover
+#
+# Composes the existing single-region modules into an active (primary) /
+# warm-standby (secondary) topology with:
+#   - full stacks in two regions (networking, compute, storage, monitoring)
+#   - a cross-region RDS read replica + standby Redis in the secondary region
+#   - S3 cross-region replication for uploads and backups
+#   - Route 53 health-check driven DNS failover
+#
+# See README.md and ../../dr/ for the operational runbooks.
+
+locals {
+  primary_prefix   = "${var.s3_bucket_prefix}-${replace(var.primary_region, "-", "")}"
+  secondary_prefix = "${var.s3_bucket_prefix}-${replace(var.secondary_region, "-", "")}"
+}
+
+# ===========================================================================
+# PRIMARY REGION (active)
+# ===========================================================================
+module "networking_primary" {
+  source    = "../modules/networking"
+  providers = { aws = aws.primary }
+
+  vpc_cidr             = var.primary_vpc_cidr
+  public_subnet_cidrs  = var.primary_public_subnet_cidrs
+  private_subnet_cidrs = var.primary_private_subnet_cidrs
+  availability_zones   = var.primary_availability_zones
+  environment          = var.environment
+  tags                 = var.tags
+}
+
+module "compute_primary" {
+  source    = "../modules/compute"
+  providers = { aws = aws.primary }
+
+  cluster_name                = "${var.cluster_name}-primary"
+  service_name                = "${var.service_name}-primary"
+  task_cpu                    = var.task_cpu
+  task_memory                 = var.task_memory
+  desired_count               = var.primary_desired_count
+  min_capacity                = var.primary_min_capacity
+  max_capacity                = var.primary_max_capacity
+  container_port              = var.container_port
+  health_check_path           = var.health_check_path
+  certificate_arn             = var.primary_certificate_arn
+  container_image             = var.container_image
+  private_subnet_ids          = module.networking_primary.private_subnet_ids
+  public_subnet_ids           = module.networking_primary.public_subnet_ids
+  alb_security_group_id       = module.networking_primary.alb_security_group_id
+  ecs_tasks_security_group_id = module.networking_primary.ecs_tasks_security_group_id
+  environment                 = var.environment
+  aws_region                  = var.primary_region
+  tags                        = var.tags
+}
+
+module "database_primary" {
+  source = "../modules/database"
+  providers = {
+    aws    = aws.primary
+    random = random
+  }
+
+  db_instance_class        = var.db_instance_class
+  db_allocated_storage     = var.db_allocated_storage
+  db_max_allocated_storage = var.db_max_allocated_storage
+  db_name                  = var.db_name
+  redis_node_type          = var.redis_node_type
+  redis_num_cache_nodes    = var.redis_num_cache_nodes
+  private_subnet_ids       = module.networking_primary.private_subnet_ids
+  rds_security_group_id    = module.networking_primary.rds_security_group_id
+  redis_security_group_id  = module.networking_primary.redis_security_group_id
+  environment              = var.environment
+  tags                     = var.tags
+}
+
+module "storage_primary" {
+  source    = "../modules/storage"
+  providers = { aws = aws.primary }
+
+  s3_bucket_prefix = local.primary_prefix
+  environment      = var.environment
+  tags             = var.tags
+}
+
+module "monitoring_primary" {
+  source    = "../modules/monitoring"
+  providers = { aws = aws.primary }
+
+  environment       = var.environment
+  aws_region        = var.primary_region
+  cluster_name      = "${var.cluster_name}-primary"
+  service_name      = "${var.service_name}-primary"
+  alb_name          = module.compute_primary.alb_name
+  enable_monitoring = var.enable_monitoring
+  tags              = var.tags
+}
+
+# ===========================================================================
+# SECONDARY REGION (warm standby / failover target)
+# ===========================================================================
+module "networking_secondary" {
+  source    = "../modules/networking"
+  providers = { aws = aws.secondary }
+
+  vpc_cidr             = var.secondary_vpc_cidr
+  public_subnet_cidrs  = var.secondary_public_subnet_cidrs
+  private_subnet_cidrs = var.secondary_private_subnet_cidrs
+  availability_zones   = var.secondary_availability_zones
+  environment          = var.environment
+  tags                 = var.tags
+}
+
+module "compute_secondary" {
+  source    = "../modules/compute"
+  providers = { aws = aws.secondary }
+
+  cluster_name                = "${var.cluster_name}-secondary"
+  service_name                = "${var.service_name}-secondary"
+  task_cpu                    = var.task_cpu
+  task_memory                 = var.task_memory
+  desired_count               = var.secondary_desired_count
+  min_capacity                = var.secondary_min_capacity
+  max_capacity                = var.secondary_max_capacity
+  container_port              = var.container_port
+  health_check_path           = var.health_check_path
+  certificate_arn             = var.secondary_certificate_arn
+  container_image             = var.container_image
+  private_subnet_ids          = module.networking_secondary.private_subnet_ids
+  public_subnet_ids           = module.networking_secondary.public_subnet_ids
+  alb_security_group_id       = module.networking_secondary.alb_security_group_id
+  ecs_tasks_security_group_id = module.networking_secondary.ecs_tasks_security_group_id
+  environment                 = var.environment
+  aws_region                  = var.secondary_region
+  tags                        = var.tags
+}
+
+module "storage_secondary" {
+  source    = "../modules/storage"
+  providers = { aws = aws.secondary }
+
+  s3_bucket_prefix = local.secondary_prefix
+  environment      = var.environment
+  tags             = var.tags
+}
+
+module "monitoring_secondary" {
+  source    = "../modules/monitoring"
+  providers = { aws = aws.secondary }
+
+  environment       = var.environment
+  aws_region        = var.secondary_region
+  cluster_name      = "${var.cluster_name}-secondary"
+  service_name      = "${var.service_name}-secondary"
+  alb_name          = module.compute_secondary.alb_name
+  enable_monitoring = var.enable_monitoring
+  tags              = var.tags
+}
+
+# Cross-region read replica + standby cache in the secondary region.
+module "database_replica" {
+  source    = "../modules/database-replica"
+  providers = { aws = aws.secondary }
+
+  environment             = var.environment
+  source_db_arn           = module.database_primary.db_instance_arn
+  db_instance_class       = var.db_instance_class
+  private_subnet_ids      = module.networking_secondary.private_subnet_ids
+  rds_security_group_id   = module.networking_secondary.rds_security_group_id
+  redis_security_group_id = module.networking_secondary.redis_security_group_id
+  redis_node_type         = var.redis_node_type
+  redis_num_cache_nodes   = var.redis_num_cache_nodes
+  tags                    = var.tags
+}
+
+# ===========================================================================
+# DATA REPLICATION (S3 cross-region replication, primary -> secondary)
+# ===========================================================================
+module "replication_uploads" {
+  source    = "../modules/replication"
+  providers = { aws = aws.primary }
+
+  environment            = var.environment
+  name                   = "uploads"
+  source_bucket_id       = module.storage_primary.uploads_bucket_id
+  source_bucket_arn      = module.storage_primary.uploads_bucket_arn
+  destination_bucket_arn = module.storage_secondary.uploads_bucket_arn
+  tags                   = var.tags
+
+  # Ensure versioning on both buckets is configured before attaching replication.
+  depends_on = [module.storage_primary, module.storage_secondary]
+}
+
+module "replication_backups" {
+  source    = "../modules/replication"
+  providers = { aws = aws.primary }
+
+  environment            = var.environment
+  name                   = "backups"
+  source_bucket_id       = module.storage_primary.backups_bucket_id
+  source_bucket_arn      = module.storage_primary.backups_bucket_arn
+  destination_bucket_arn = module.storage_secondary.backups_bucket_arn
+  tags                   = var.tags
+
+  # Ensure versioning on both buckets is configured before attaching replication.
+  depends_on = [module.storage_primary, module.storage_secondary]
+}
+
+# ===========================================================================
+# DNS FAILOVER (Route 53)
+# NOTE: Route 53 health-check CloudWatch metrics are published to us-east-1,
+# so the primary provider should target us-east-1 (the default).
+# ===========================================================================
+module "dns_failover" {
+  source    = "../modules/dns-failover"
+  providers = { aws = aws.primary }
+
+  environment            = var.environment
+  domain_name            = var.domain_name
+  app_subdomain          = var.app_subdomain
+  hosted_zone_id         = var.hosted_zone_id
+  primary_region         = var.primary_region
+  secondary_region       = var.secondary_region
+  primary_alb_dns_name   = module.compute_primary.alb_dns_name
+  primary_alb_zone_id    = module.compute_primary.alb_zone_id
+  secondary_alb_dns_name = module.compute_secondary.alb_dns_name
+  secondary_alb_zone_id  = module.compute_secondary.alb_zone_id
+  health_check_path      = var.health_check_path
+  health_check_port      = var.primary_certificate_arn != "" ? 443 : 80
+  health_check_type      = var.primary_certificate_arn != "" ? "HTTPS" : "HTTP"
+  tags                   = var.tags
+}
diff --git a/tf/multi-region/outputs.tf b/tf/multi-region/outputs.tf
new file mode 100644
index 00000000..f484498c
--- /dev/null
+++ b/tf/multi-region/outputs.tf
@@ -0,0 +1,73 @@
+# ---------------------------------------------------------------------------
+# Primary region
+# ---------------------------------------------------------------------------
+output "primary_region" {
+  description = "Primary (active) region"
+  value       = var.primary_region
+}
+
+output "primary_alb_dns_name" {
+  description = "DNS name of the primary region ALB"
+  value       = module.compute_primary.alb_dns_name
+}
+
+output "primary_db_endpoint" {
+  description = "Primary RDS endpoint (read/write)"
+  value       = module.database_primary.db_instance_endpoint
+}
+
+output "primary_uploads_bucket" {
+  description = "Primary region uploads bucket"
+  value       = module.storage_primary.uploads_bucket_id
+}
+
+# ---------------------------------------------------------------------------
+# Secondary region
+# ---------------------------------------------------------------------------
+output "secondary_region" {
+  description = "Secondary (standby) region"
+  value       = var.secondary_region
+}
+
+output "secondary_alb_dns_name" {
+  description = "DNS name of the secondary region ALB"
+  value       = module.compute_secondary.alb_dns_name
+}
+
+output "replica_db_endpoint" {
+  description = "Cross-region read replica endpoint (read-only until promoted)"
+  value       = module.database_replica.replica_db_endpoint
+}
+
+output "secondary_uploads_bucket" {
+  description = "Secondary region uploads bucket (replication target)"
+  value       = module.storage_secondary.uploads_bucket_id
+}
+
+# ---------------------------------------------------------------------------
+# Failover / DNS
+# ---------------------------------------------------------------------------
+output "app_fqdn" {
+  description = "Failover-enabled application endpoint"
+  value       = module.dns_failover.app_fqdn
+}
+
+output "hosted_zone_id" {
+  description = "Route 53 hosted zone ID"
+  value       = module.dns_failover.hosted_zone_id
+}
+
+output "hosted_zone_name_servers" {
+  description = "Name servers to delegate to (when a zone was created)"
+  value       = module.dns_failover.name_servers
+}
+
+output "primary_health_check_id" {
+  description = "Route 53 health check ID for the primary region"
+  value       = module.dns_failover.primary_health_check_id
+}
+
+output "secondary_health_check_id" {
+  description = "Route 53 health check ID for the secondary region"
+  value       = module.dns_failover.secondary_health_check_id
+}
diff --git a/tf/multi-region/providers.tf b/tf/multi-region/providers.tf
new file mode 100644
index 00000000..80d1bc2b
--- /dev/null
+++ b/tf/multi-region/providers.tf
@@ -0,0 +1,32 @@
+# Two aliased AWS providers drive the active-active/active-passive topology.
+# Every module is instantiated once per region by passing the matching provider.
+
+provider "aws" {
+  alias  = "primary"
+  region = var.primary_region
+
+  default_tags {
+    tags = {
+      Environment = var.environment
+      Project     = "teachlink"
+      ManagedBy   = "terraform"
+      Topology    = "multi-region"
+      Role        = "primary"
+    }
+  }
+}
+
+provider "aws" {
+  alias  = "secondary"
+  region = var.secondary_region
+
+  default_tags {
+    tags = {
+      Environment = var.environment
+      Project     = "teachlink"
+      ManagedBy   = "terraform"
+      Topology    = "multi-region"
+      Role        = "secondary"
+    }
+  }
+}
diff --git a/tf/multi-region/terraform.tfvars.example b/tf/multi-region/terraform.tfvars.example
new file mode 100644
index 00000000..410058ed
--- /dev/null
+++ b/tf/multi-region/terraform.tfvars.example
@@ -0,0 +1,67 @@
+# Environment
+environment = "prod"
+
+# Regions
+primary_region   = "us-east-1"
+secondary_region = "us-west-2"
+
+primary_availability_zones   = ["us-east-1a", "us-east-1b"]
+secondary_availability_zones = ["us-west-2a", "us-west-2b"]
+
+# Networking (non-overlapping CIDRs so the VPCs can be peered if needed)
+primary_vpc_cidr               = "10.0.0.0/16"
+secondary_vpc_cidr             = "10.1.0.0/16"
+primary_public_subnet_cidrs    = ["10.0.1.0/24", "10.0.2.0/24"]
+primary_private_subnet_cidrs   = ["10.0.10.0/24", "10.0.11.0/24"]
+secondary_public_subnet_cidrs  = ["10.1.1.0/24", "10.1.2.0/24"]
+secondary_private_subnet_cidrs = ["10.1.10.0/24", "10.1.11.0/24"]
+
+# Compute
+cluster_name    = "teachlink-cluster"
+service_name    = "teachlink-service"
+container_image = "<your-account>.dkr.ecr.us-east-1.amazonaws.com/teachlink:latest"
+container_port  = 3000
+task_cpu        = 512
+task_memory     = 1024
+
+# Primary fleet (active)
+primary_desired_count = 3
+primary_min_capacity  = 3
+primary_max_capacity  = 20
+
+# Secondary fleet (warm standby; raise for active-active)
+secondary_desired_count = 1
+secondary_min_capacity  = 1
+secondary_max_capacity  = 20
+
+# HTTPS certificates (must exist in each region)
+# primary_certificate_arn   = "arn:aws:acm:us-east-1:123456789012:certificate/xxx"
+# secondary_certificate_arn = "arn:aws:acm:us-west-2:123456789012:certificate/yyy"
+
+# Database
+db_instance_class        = "db.t3.large"
+db_allocated_storage     = 100
+db_max_allocated_storage = 500
+db_name                  = "teachlink_db"
+
+# Redis
+redis_node_type       = "cache.t3.small"
+redis_num_cache_nodes = 1
+
+# Storage (region is appended automatically to keep bucket names unique)
+s3_bucket_prefix = "teachlink"
+
+# DNS failover
+domain_name   = "teachlink.io"
+app_subdomain = "api"
+# hosted_zone_id = "Z0123456789ABCDEFG"  # reuse an existing zone; omit to create one
+
+# Monitoring
+enable_monitoring = true
+
+# Tags
+tags = {
+  Project    = "teachlink"
+  Team       = "backend"
+  CostCenter = "engineering"
+}
diff --git a/tf/multi-region/variables.tf b/tf/multi-region/variables.tf
new file mode 100644
index 00000000..a7b695ad
--- /dev/null
+++ b/tf/multi-region/variables.tf
@@ -0,0 +1,249 @@
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+}
+
+# ---------------------------------------------------------------------------
+# Regions
+# ---------------------------------------------------------------------------
+variable "primary_region" {
+  description = "Primary AWS region (active)"
+  type        = string
+  default     = "us-east-1"
+}
+
+variable "secondary_region" {
+  description = "Secondary AWS region (standby / failover target)"
+  type        = string
+  default     = "us-west-2"
+}
+
+variable "primary_availability_zones" {
+  description = "Availability zones in the primary region"
+  type        = list(string)
+  default     = ["us-east-1a", "us-east-1b"]
+}
+
+variable "secondary_availability_zones" {
+  description = "Availability zones in the secondary region"
+  type        = list(string)
+  default     = ["us-west-2a", "us-west-2b"]
+}
+
+# ---------------------------------------------------------------------------
+# Networking (distinct CIDRs per region so VPCs can be peered if needed)
+# ---------------------------------------------------------------------------
+variable "primary_vpc_cidr" {
+  description = "VPC CIDR for the primary region"
+  type        = string
+  default     = "10.0.0.0/16"
+}
+
+variable "secondary_vpc_cidr" {
+  description = "VPC CIDR for the secondary region"
+  type        = string
+  default     = "10.1.0.0/16"
+}
+
+variable "primary_public_subnet_cidrs" {
+  description = "Public subnet CIDRs in the primary region"
+  type        = list(string)
+  default     = ["10.0.1.0/24", "10.0.2.0/24"]
+}
+
+variable "primary_private_subnet_cidrs" {
+  description = "Private subnet CIDRs in the primary region"
+  type        = list(string)
+  default     = ["10.0.10.0/24", "10.0.11.0/24"]
+}
+
+variable "secondary_public_subnet_cidrs" {
+  description = "Public subnet CIDRs in the secondary region"
+  type        = list(string)
+  default     = ["10.1.1.0/24", "10.1.2.0/24"]
+}
+
+variable "secondary_private_subnet_cidrs" {
+  description = "Private subnet CIDRs in the secondary region"
+  type        = list(string)
+  default     = ["10.1.10.0/24", "10.1.11.0/24"]
+}
+
+# ---------------------------------------------------------------------------
+# Compute
+# ---------------------------------------------------------------------------
+variable "cluster_name" {
+  description = "ECS cluster name (suffixed per region)"
+  type        = string
+  default     = "teachlink-cluster"
+}
+
+variable "service_name" {
+  description = "ECS service name (suffixed per region)"
+  type        = string
+  default     = "teachlink-service"
+}
+
+variable "task_cpu" {
+  description = "CPU units for ECS task"
+  type        = number
+  default     = 256
+}
+
+variable "task_memory" {
+  description = "Memory (MB) for ECS task"
+  type        = number
+  default     = 512
+}
+
+variable "container_port" {
+  description = "Container port for the application"
+  type        = number
+  default     = 3000
+}
+
+variable "container_image" {
+  description = "Container image deployed to both regions"
+  type        = string
+  default     = "nginx:latest"
+}
+
+variable "health_check_path" {
+  description = "Application health check path"
+  type        = string
+  default     = "/health"
+}
+
+variable "primary_certificate_arn" {
+  description = "ACM certificate ARN in the primary region (for HTTPS)"
+  type        = string
+  default     = ""
+}
+
+variable "secondary_certificate_arn" {
+  description = "ACM certificate ARN in the secondary region (for HTTPS)"
+  type        = string
+  default     = ""
+}
+
+# Primary region runs the active fleet.
+variable "primary_desired_count" {
+  description = "Desired ECS task count in the primary region"
+  type        = number
+  default     = 3
+}
+
+variable "primary_min_capacity" {
+  description = "Minimum ECS task count in the primary region"
+  type        = number
+  default     = 3
+}
+
+variable "primary_max_capacity" {
+  description = "Maximum ECS task count in the primary region"
+  type        = number
+  default     = 20
+}
+
+# Secondary region runs warm-standby capacity. Set desired/min > 0 for an
+# active-active hot-standby, or keep low for a cost-efficient pilot-light.
+variable "secondary_desired_count" {
+  description = "Desired ECS task count in the secondary region (standby)"
+  type        = number
+  default     = 1
+}
+
+variable "secondary_min_capacity" {
+  description = "Minimum ECS task count in the secondary region"
+  type        = number
+  default     = 1
+}
+
+variable "secondary_max_capacity" {
+  description = "Maximum ECS task count in the secondary region (for failover scale-up)"
+  type        = number
+  default     = 20
+}
+
+# ---------------------------------------------------------------------------
+# Database
+# ---------------------------------------------------------------------------
+variable "db_instance_class" {
+  description = "RDS instance class"
+  type        = string
+  default     = "db.t3.micro"
+}
+
+variable "db_allocated_storage" {
+  description = "Allocated storage for RDS (GB)"
+  type        = number
+  default     = 20
+}
+
+variable "db_max_allocated_storage" {
+  description = "Maximum allocated storage for RDS (GB)"
+  type        = number
+  default     = 100
+}
+
+variable "db_name" {
+  description = "Database name"
+  type        = string
+  default     = "teachlink_db"
+}
+
+variable "redis_node_type" {
+  description = "ElastiCache node type"
+  type        = string
+  default     = "cache.t3.micro"
+}
+
+variable "redis_num_cache_nodes" {
+  description = "Number of cache nodes per region"
+  type        = number
+  default     = 1
+}
+
+# ---------------------------------------------------------------------------
+# Storage
+# ---------------------------------------------------------------------------
+variable "s3_bucket_prefix" {
+  description = "Base prefix for S3 bucket names; the region is appended to keep names globally unique"
+  type        = string
+  default     = "teachlink"
+}
+
+# ---------------------------------------------------------------------------
+# DNS failover
+# ---------------------------------------------------------------------------
+variable "domain_name" {
+  description = "Root domain managed in Route 53 (e.g. teachlink.io)"
+  type        = string
+}
+
+variable "app_subdomain" {
+  description = "Subdomain that fails over between regions"
+  type        = string
+  default     = "api"
+}
+
+variable "hosted_zone_id" {
+  description = "Existing Route 53 hosted zone ID. Leave empty to create a new public zone."
+  type        = string
+  default     = ""
+}
+
+# ---------------------------------------------------------------------------
+# Misc
+# ---------------------------------------------------------------------------
+variable "enable_monitoring" {
+  description = "Enable detailed CloudWatch monitoring in both regions"
+  type        = bool
+  default     = true
+}
+
+variable "tags" {
+  description = "Additional tags applied to all resources"
+  type        = map(string)
+  default     = {}
+}
diff --git a/tf/multi-region/versions.tf b/tf/multi-region/versions.tf
new file mode 100644
index 00000000..c3488b03
--- /dev/null
+++ b/tf/multi-region/versions.tf
@@ -0,0 +1,14 @@
+terraform {
+  required_version = ">= 1.5.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = "~> 3.0"
+    }
+  }
+}

From dadf239eb49c592eed3eef3eb57013ac51548f35 Mon Sep 17 00:00:00 2001
From: Nworah_Gabriel <gabrielnworah6@gmail.com>
Date: Sat, 30 May 2026 06:39:45 +0800
Subject: [PATCH 2/2] feat(webhooks): add outbound delivery retries with
 exponential backoff (#615)

Implement reliable outbound webhook delivery on the existing webhooks worker:

- webhook-backoff.util: exponential backoff with equal jitter + max cap, and
  retryability classification (5xx/408/425/429/transport = retry; 4xx = permanent)
- webhook-delivery.service: HMAC-signed HTTP POST; retryable failures throw so
  Bull re-enqueues with backoff (retry queue); permanent/exhausted -> dead-letter
- webhook-monitor.service: failure counters + CustomMetrics/alert + events
- webhook-retry.config: env-overridable maxRetries/delays/jitter/timeout
- wire WebhooksWorker to the delivery service (optional-dep fallback for the
  orchestration pool); register WebhooksDeliveryModule

Tests: 19 unit tests (backoff math, retry vs dead-letter, HMAC, job options).

Closes #615
---
 src/webhooks/README.md                        |  89 +++++++
 src/webhooks/webhook-backoff.util.spec.ts     |  88 +++++++
 src/webhooks/webhook-backoff.util.ts          |  93 +++++++
 src/webhooks/webhook-delivery.service.spec.ts | 117 +++++++++
 src/webhooks/webhook-delivery.service.ts      | 239 ++++++++++++++++++
 src/webhooks/webhook-monitor.service.ts       |  92 +++++++
 src/webhooks/webhook-retry.config.ts          |  58 +++++
 src/webhooks/webhooks-delivery.module.ts      |  24 ++
 src/workers/processors/webhooks.worker.ts     | 101 +++-----
 src/workers/workers.module.ts                 |   2 +
 10 files changed, 834 insertions(+), 69 deletions(-)
 create mode 100644 src/webhooks/README.md
 create mode 100644 src/webhooks/webhook-backoff.util.spec.ts
 create mode 100644 src/webhooks/webhook-backoff.util.ts
 create mode 100644 src/webhooks/webhook-delivery.service.spec.ts
 create mode 100644 src/webhooks/webhook-delivery.service.ts
 create mode 100644 src/webhooks/webhook-monitor.service.ts
 create mode 100644 src/webhooks/webhook-retry.config.ts
 create mode 100644 src/webhooks/webhooks-delivery.module.ts

diff --git a/src/webhooks/README.md b/src/webhooks/README.md
new file mode 100644
index 00000000..0383723e
--- /dev/null
+++ b/src/webhooks/README.md
@@ -0,0 +1,89 @@
+# Outbound Webhook Delivery (with retries & backoff)
+
+Reliable delivery of **outbound** webhooks to subscriber URLs, with exponential
+backoff, a configurable retry queue, dead-letter handling and failure
+monitoring.
+
+> Implements issue **#615 — Add webhook delivery retries with exponential backoff**.
+>
+> This is distinct from [`src/payments/webhooks`](../payments/webhooks), which
+> handles *inbound* Stripe/PayPal webhooks.
+
+## Acceptance criteria mapping
+
+| Criterion | Where |
+| --------- | ----- |
+| **Retry queue for failed webhooks** | Bull queue re-enqueues retryable failures; `WebhookDeliveryService.buildJobOptions()` sets `attempts` + `backoff`. The worker re-throws `WebhookRetryableError` so the job is requeued. |
+| **Exponential backoff implemented** | [`webhook-backoff.util.ts`](./webhook-backoff.util.ts) → `calculateBackoffDelay()` (exponential + equal jitter + max-delay cap). |
+| **Max retry count configured** | [`webhook-retry.config.ts`](./webhook-retry.config.ts) → `maxRetries` (env `WEBHOOK_MAX_RETRIES`, default 5). |
+| **Monitoring for failures** | [`webhook-monitor.service.ts`](./webhook-monitor.service.ts) → counters + alert-thresholded metrics via `CustomMetricsService`, plus `webhook.*` events. |
+
+## Components
+
+- **`webhook-retry.config.ts`** — retry policy, env-overridable, with defaults.
+- **`webhook-backoff.util.ts`** — pure helpers: `calculateBackoffDelay`,
+  `shouldRetry`, `isRetryableStatusCode`, `isRetryableError`.
+- **`webhook-delivery.service.ts`** — signs (HMAC-SHA256) and POSTs the webhook,
+  classifies failures, schedules retries or dead-letters, emits events.
+- **`webhook-monitor.service.ts`** — in-memory counters + `CustomMetricsService`
+  forwarding; alert fires on dead-letter.
+- **`webhooks-delivery.module.ts`** — Nest module (HTTP + monitoring wiring).
+- **`workers/processors/webhooks.worker.ts`** — Bull worker delegating to the
+  delivery service.
+
+## Retry behaviour
+
+```
+attempt 1 ──fail(5xx/timeout/network)──▶ wait ~1s  ─▶ attempt 2
+attempt 2 ──fail──────────────────────▶ wait ~2s  ─▶ attempt 3
+attempt 3 ──fail──────────────────────▶ wait ~4s  ─▶ ... up to maxRetries
+                                              │
+              permanent 4xx ◀───────────────┘ (no retry → dead-letter)
+              retries exhausted ─────────────▶ dead-letter + alert
+```
+
+- Delay = `initialDelayMs × multiplier^(attempt-1)`, capped at `maxDelayMs`,
+  with **equal jitter** (50–100% of the computed delay) to avoid thundering herds.
+- **Retryable**: HTTP 5xx, 408, 425, 429, and transport errors (ECONNRESET,
+  ETIMEDOUT, …). **Permanent**: other 4xx → dead-lettered immediately.
+
+## Configuration
+
+| Env var | Default | Meaning |
+| ------- | ------- | ------- |
+| `WEBHOOK_MAX_RETRIES` | `5` | Max delivery attempts before dead-letter |
+| `WEBHOOK_INITIAL_DELAY_MS` | `1000` | Base backoff delay |
+| `WEBHOOK_BACKOFF_MULTIPLIER` | `2` | Exponential growth factor |
+| `WEBHOOK_MAX_DELAY_MS` | `3600000` | Backoff cap (1h) |
+| `WEBHOOK_JITTER` | `true` | Apply equal jitter |
+| `WEBHOOK_TIMEOUT_MS` | `10000` | Per-request timeout |
+
+## Usage
+
+Enqueue a webhook job onto the `webhooks` Bull queue with options from the
+service so retries/backoff match the policy:
+
+```ts
+await webhooksQueue.add(
+  'deliver',
+  { url, event: 'course.completed', payload, secret },
+  webhookDeliveryService.buildJobOptions(),
+);
+```
+
+Subscribe to delivery events for auditing/alerting:
+
+```ts
+@OnEvent('webhook.dead_letter')
+onDeadLetter(e) { /* page on-call, persist for manual replay */ }
+```
+
+## Tests
+
+```bash
+npm test -- src/webhooks
+```
+
+- `webhook-backoff.util.spec.ts` — backoff growth, cap, jitter bounds, retryability.
+- `webhook-delivery.service.spec.ts` — success, retryable vs permanent failures,
+  exhaustion/dead-letter, HMAC signing, job options.
diff --git a/src/webhooks/webhook-backoff.util.spec.ts b/src/webhooks/webhook-backoff.util.spec.ts
new file mode 100644
index 00000000..ab91fc4e
--- /dev/null
+++ b/src/webhooks/webhook-backoff.util.spec.ts
@@ -0,0 +1,88 @@
+import {
+  calculateBackoffDelay,
+  isRetryableError,
+  isRetryableStatusCode,
+  shouldRetry,
+} from './webhook-backoff.util';
+import { DEFAULT_WEBHOOK_RETRY_CONFIG, WebhookRetryConfig } from './webhook-retry.config';
+
+const config: WebhookRetryConfig = {
+  ...DEFAULT_WEBHOOK_RETRY_CONFIG,
+  maxRetries: 5,
+  initialDelayMs: 1_000,
+  backoffMultiplier: 2,
+  maxDelayMs: 60_000,
+  jitter: false,
+};
+
+describe('calculateBackoffDelay', () => {
+  it('grows exponentially without jitter', () => {
+    expect(calculateBackoffDelay(1, config)).toBe(1_000);
+    expect(calculateBackoffDelay(2, config)).toBe(2_000);
+    expect(calculateBackoffDelay(3, config)).toBe(4_000);
+    expect(calculateBackoffDelay(4, config)).toBe(8_000);
+  });
+
+  it('caps the delay at maxDelayMs', () => {
+    expect(calculateBackoffDelay(20, config)).toBe(60_000);
+  });
+
+  it('applies equal jitter within [50%, 100%] of the capped delay', () => {
+    const jittered: WebhookRetryConfig = { ...config, jitter: true };
+    // attempt 3 → base 4000; equal jitter keeps half fixed + half random.
+    expect(calculateBackoffDelay(3, jittered, () => 0)).toBe(2_000); // min
+    expect(calculateBackoffDelay(3, jittered, () => 1)).toBe(4_000); // max
+    expect(calculateBackoffDelay(3, jittered, () => 0.5)).toBe(3_000); // mid
+  });
+
+  it('treats attempts below 1 as the first retry', () => {
+    expect(calculateBackoffDelay(0, config)).toBe(1_000);
+  });
+});
+
+describe('shouldRetry', () => {
+  it('retries while attempts are below the max', () => {
+    expect(shouldRetry(0, config)).toBe(true);
+    expect(shouldRetry(4, config)).toBe(true);
+  });
+
+  it('stops once the max retry count is reached', () => {
+    expect(shouldRetry(5, config)).toBe(false);
+    expect(shouldRetry(6, config)).toBe(false);
+  });
+});
+
+describe('isRetryableStatusCode', () => {
+  it('retries 5xx, 408, 425 and 429', () => {
+    [500, 502, 503, 504, 408, 425, 429].forEach((code) =>
+      expect(isRetryableStatusCode(code)).toBe(true),
+    );
+  });
+
+  it('does not retry other 4xx client errors', () => {
+    [400, 401, 403, 404, 409, 422].forEach((code) =>
+      expect(isRetryableStatusCode(code)).toBe(false),
+    );
+  });
+});
+
+describe('isRetryableError', () => {
+  it('retries server responses but not client responses', () => {
+    expect(isRetryableError({ response: { status: 503 } })).toBe(true);
+    expect(isRetryableError({ response: { status: 400 } })).toBe(false);
+  });
+
+  it('retries transport-level errors', () => {
+    expect(isRetryableError({ code: 'ECONNRESET' })).toBe(true);
+    expect(isRetryableError({ code: 'ETIMEDOUT' })).toBe(true);
+    expect(isRetryableError({ code: 'ECONNABORTED', message: 'timeout of 10000ms exceeded' })).toBe(
+      true,
+    );
+  });
+
+  it('returns false for non-error values', () => {
+    expect(isRetryableError(null)).toBe(false);
+    expect(isRetryableError('boom')).toBe(false);
+    expect(isRetryableError({ response: { status: 404 } })).toBe(false);
+  });
+});
diff --git a/src/webhooks/webhook-backoff.util.ts b/src/webhooks/webhook-backoff.util.ts
new file mode 100644
index 00000000..2f7c292a
--- /dev/null
+++ b/src/webhooks/webhook-backoff.util.ts
@@ -0,0 +1,93 @@
+/**
+ * Pure helpers for exponential-backoff retry scheduling and retryability
+ * classification. Kept dependency-free so they can be unit-tested in isolation
+ * and reused by both the delivery service and the Bull worker.
+ */
+import { WebhookRetryConfig } from './webhook-retry.config';
+
+/**
+ * Compute the delay (ms) before a given retry attempt using exponential
+ * backoff with an optional "equal jitter" component and a hard cap.
+ *
+ * @param attempt  1-based retry number (1 = the first retry after the initial try)
+ * @param config   retry configuration
+ * @param rng      random source in [0, 1); injectable for deterministic tests
+ */
+export const calculateBackoffDelay = (
+  attempt: number,
+  config: WebhookRetryConfig,
+  rng: () => number = Math.random,
+): number => {
+  const safeAttempt = Math.max(1, Math.floor(attempt));
+
+  // Exponential growth: initial * multiplier^(attempt-1), capped.
+  const exponential = config.initialDelayMs * Math.pow(config.backoffMultiplier, safeAttempt - 1);
+  const capped = Math.min(exponential, config.maxDelayMs);
+
+  if (!config.jitter) {
+    return Math.round(capped);
+  }
+
+  // Equal jitter: keep half the delay fixed and randomise the other half so
+  // retries spread out without ever collapsing to near-zero.
+  const half = capped / 2;
+  return Math.round(half + rng() * half);
+};
+
+/**
+ * Whether another attempt should be made.
+ *
+ * @param attemptsMade number of attempts already completed
+ * @param config       retry configuration (uses maxRetries as the total cap)
+ */
+export const shouldRetry = (attemptsMade: number, config: WebhookRetryConfig): boolean =>
+  attemptsMade < config.maxRetries;
+
+/**
+ * HTTP status codes worth retrying: request timeouts, rate limits, and any
+ * server-side (5xx) error. Other 4xx responses are permanent client errors and
+ * are NOT retried.
+ */
+export const isRetryableStatusCode = (statusCode: number): boolean => {
+  if (statusCode >= 500) return true;
+  return statusCode === 408 || statusCode === 425 || statusCode === 429;
+};
+
+const RETRYABLE_ERROR_CODES = new Set([
+  'ECONNRESET',
+  'ECONNREFUSED',
+  'ETIMEDOUT',
+  'EAI_AGAIN',
+  'ENOTFOUND',
+  'EPIPE',
+  'ECONNABORTED',
+]);
+
+/**
+ * Classify a thrown delivery error (typically an Axios error) as retryable.
+ * A response with a status code defers to {@link isRetryableStatusCode}; a
+ * transport-level failure with no response is retryable.
+ */
+export const isRetryableError = (error: unknown): boolean => {
+  if (!error || typeof error !== 'object') return false;
+
+  const err = error as {
+    response?: { status?: number };
+    code?: string;
+    message?: string;
+  };
+
+  if (err.response && typeof err.response.status === 'number') {
+    return isRetryableStatusCode(err.response.status);
+  }
+
+  if (err.code && RETRYABLE_ERROR_CODES.has(err.code)) return true;
+
+  // Network/timeout errors that surface only as a message.
+  if (err.code === undefined && /timeout|socket hang up|network/i.test(err.message ?? '')) {
+    return true;
+  }
+
+  // No response and not an obviously permanent error → treat as transient.
+  return err.response === undefined && err.code !== undefined;
+};
diff --git a/src/webhooks/webhook-delivery.service.spec.ts b/src/webhooks/webhook-delivery.service.spec.ts
new file mode 100644
index 00000000..131058d9
--- /dev/null
+++ b/src/webhooks/webhook-delivery.service.spec.ts
@@ -0,0 +1,117 @@
+import { of, throwError } from 'rxjs';
+import {
+  WebhookDeliveryService,
+  WebhookRetryableError,
+  WEBHOOK_EVENTS,
+  WebhookTarget,
+} from './webhook-delivery.service';
+import { WebhookMonitorService } from './webhook-monitor.service';
+import { DEFAULT_WEBHOOK_RETRY_CONFIG, WebhookRetryConfig } from './webhook-retry.config';
+
+const config: WebhookRetryConfig = {
+  ...DEFAULT_WEBHOOK_RETRY_CONFIG,
+  maxRetries: 3,
+  initialDelayMs: 1_000,
+  backoffMultiplier: 2,
+  maxDelayMs: 60_000,
+  jitter: false,
+};
+
+const target: WebhookTarget = {
+  url: 'https://example.com/hook',
+  event: 'course.completed',
+  payload: { courseId: 'c1' },
+};
+
+describe('WebhookDeliveryService', () => {
+  let http: { post: jest.Mock };
+  let events: { emit: jest.Mock };
+  let monitor: WebhookMonitorService;
+  let service: WebhookDeliveryService;
+
+  beforeEach(() => {
+    http = { post: jest.fn() };
+    events = { emit: jest.fn() };
+    monitor = new WebhookMonitorService();
+    service = new WebhookDeliveryService(http as any, monitor, events as any, config);
+  });
+
+  it('delivers successfully and records success', async () => {
+    http.post.mockReturnValue(of({ status: 200 }));
+
+    const result = await service.processDelivery(target, 0);
+
+    expect(result).toMatchObject({ delivered: true, statusCode: 200, attempts: 1 });
+    expect(events.emit).toHaveBeenCalledWith(WEBHOOK_EVENTS.DELIVERED, expect.any(Object));
+    expect(monitor.getStats()).toMatchObject({ attempts: 1, succeeded: 1, failed: 0 });
+  });
+
+  it('throws a retryable error with backoff delay on a 5xx response', async () => {
+    http.post.mockReturnValue(throwError(() => ({ response: { status: 503 } })));
+
+    await expect(service.processDelivery(target, 0)).rejects.toBeInstanceOf(WebhookRetryableError);
+
+    const stats = monitor.getStats();
+    expect(stats.failed).toBe(1);
+    expect(stats.retried).toBe(1);
+    expect(events.emit).toHaveBeenCalledWith(
+      WEBHOOK_EVENTS.RETRY_SCHEDULED,
+      expect.objectContaining({ attempt: 1, nextDelayMs: 1_000 }),
+    );
+  });
+
+  it('uses exponential backoff across attempts', async () => {
+    http.post.mockReturnValue(throwError(() => ({ response: { status: 500 } })));
+
+    // attemptsMade = 1 → this is attempt #2 → delay 2000ms
+    await expect(service.processDelivery(target, 1)).rejects.toMatchObject({ nextDelayMs: 2_000 });
+  });
+
+  it('dead-letters a permanent 4xx failure without retrying', async () => {
+    http.post.mockReturnValue(throwError(() => ({ response: { status: 404 } })));
+
+    const result = await service.processDelivery(target, 0);
+
+    expect(result).toMatchObject({ delivered: false, deadLettered: true });
+    expect(monitor.getStats()).toMatchObject({ deadLettered: 1, retried: 0 });
+    expect(events.emit).toHaveBeenCalledWith(
+      WEBHOOK_EVENTS.DEAD_LETTER,
+      expect.objectContaining({ reason: 'permanent_failure' }),
+    );
+  });
+
+  it('dead-letters once retries are exhausted', async () => {
+    http.post.mockReturnValue(throwError(() => ({ response: { status: 503 } })));
+
+    // attemptsMade = 2 → attempt #3 = maxRetries → no further retry
+    const result = await service.processDelivery(target, 2);
+
+    expect(result).toMatchObject({ deadLettered: true });
+    expect(events.emit).toHaveBeenCalledWith(
+      WEBHOOK_EVENTS.DEAD_LETTER,
+      expect.objectContaining({ reason: 'max_retries_exhausted' }),
+    );
+  });
+
+  it('signs the payload with HMAC when a secret is provided', async () => {
+    http.post.mockReturnValue(of({ status: 200 }));
+
+    await service.processDelivery({ ...target, secret: 'shh' }, 0);
+
+    const [, , options] = http.post.mock.calls[0];
+    expect(options.headers['X-Webhook-Signature']).toMatch(/^sha256=[a-f0-9]{64}$/);
+  });
+
+  it('rejects targets missing required fields', async () => {
+    await expect(
+      service.processDelivery({ url: '', event: '', payload: undefined } as WebhookTarget, 0),
+    ).rejects.toThrow('Missing required webhook fields');
+  });
+
+  it('exposes Bull job options matching the retry policy', () => {
+    expect(service.buildJobOptions()).toMatchObject({
+      attempts: 3,
+      backoff: { type: 'exponential', delay: 1_000 },
+    });
+  });
+});
diff --git a/src/webhooks/webhook-delivery.service.ts b/src/webhooks/webhook-delivery.service.ts
new file mode 100644
index 00000000..8e309a7a
--- /dev/null
+++ b/src/webhooks/webhook-delivery.service.ts
@@ -0,0 +1,239 @@
+import { HttpService } from '@nestjs/axios';
+import { Inject, Injectable, Logger, Optional } from '@nestjs/common';
+import { EventEmitter2 } from '@nestjs/event-emitter';
+import axios from 'axios';
+import { createHmac } from 'crypto';
+import { firstValueFrom } from 'rxjs';
+import { calculateBackoffDelay, isRetryableError, shouldRetry } from './webhook-backoff.util';
+import {
+  DEFAULT_WEBHOOK_RETRY_CONFIG,
+  loadWebhookRetryConfig,
+  WEBHOOK_RETRY_CONFIG,
+  WebhookRetryConfig,
+} from './webhook-retry.config';
+import { WebhookMonitorService } from './webhook-monitor.service';
+
+/** A single outbound webhook to deliver. */
+export interface WebhookTarget {
+  url: string;
+  event: string;
+  payload: unknown;
+  /** Extra headers to merge into the request. */
+  headers?: Record<string, string>;
+  /** Shared secret; when set, an HMAC-SHA256 signature header is added. */
+  secret?: string;
+  /** Per-target request timeout override (ms). */
+  timeoutMs?: number;
+}
+
+export interface WebhookDeliveryResult {
+  event: string;
+  url: string;
+  delivered: boolean;
+  statusCode?: number;
+  attempts: number;
+  deadLettered?: boolean;
+  error?: string;
+  deliveredAt?: Date;
+}
+
+/** Emitted on the EventEmitter2 bus for downstream monitoring/audit. */
+export const WEBHOOK_EVENTS = {
+  DELIVERED: 'webhook.delivered',
+  RETRY_SCHEDULED: 'webhook.retry_scheduled',
+  DEAD_LETTER: 'webhook.dead_letter',
+} as const;
+
+/**
+ * Thrown when a delivery attempt fails but is eligible for another retry. The
+ * Bull worker re-throws this so the job is re-enqueued with backoff, forming the
+ * retry queue. Carries the computed delay for the next attempt.
+ */
+export class WebhookRetryableError extends Error {
+  constructor(
+    message: string,
+    readonly nextDelayMs: number,
+    readonly attempt: number,
+  ) {
+    super(message);
+    this.name = 'WebhookRetryableError';
+  }
+}
+
+/**
+ * Delivers outbound webhooks over HTTP with exponential-backoff retries, a
+ * configurable max retry count, dead-letter handling, and failure monitoring.
+ *
+ * Transient failures (5xx, timeouts, network errors) raise
+ * {@link WebhookRetryableError} so the Bull retry queue re-enqueues them with an
+ * exponential delay. Permanent failures (4xx) and exhausted retries are
+ * dead-lettered immediately and reported to {@link WebhookMonitorService}.
+ */
+@Injectable()
+export class WebhookDeliveryService {
+  private readonly logger = new Logger(WebhookDeliveryService.name);
+  private readonly config: WebhookRetryConfig;
+
+  constructor(
+    private readonly http: HttpService,
+    private readonly monitor: WebhookMonitorService,
+    @Optional() private readonly events?: EventEmitter2,
+    @Optional() @Inject(WEBHOOK_RETRY_CONFIG) config?: WebhookRetryConfig,
+  ) {
+    this.config = config ?? DEFAULT_WEBHOOK_RETRY_CONFIG;
+  }
+
+  /**
+   * Build a self-contained instance (own axios-backed HttpService and monitor)
+   * for contexts that construct workers manually rather than via Nest DI, such
+   * as the worker orchestration pool.
+   */
+  static createDefault(): WebhookDeliveryService {
+    return new WebhookDeliveryService(
+      new HttpService(axios.create()),
+      new WebhookMonitorService(),
+      undefined,
+      loadWebhookRetryConfig(),
+    );
+  }
+
+  /**
+   * Bull-compatible job options that mirror this service's retry policy, so the
+   * queue retries the right number of times with matching exponential backoff.
+   */
+  buildJobOptions() {
+    return {
+      attempts: this.config.maxRetries,
+      backoff: { type: 'exponential', delay: this.config.initialDelayMs },
+      removeOnComplete: true,
+      removeOnFail: false,
+    };
+  }
+
+  /**
+   * Process one delivery attempt for a webhook job.
+   *
+   * @param target       the webhook to deliver
+   * @param attemptsMade number of attempts already completed (Bull's job.attemptsMade)
+   */
+  async processDelivery(target: WebhookTarget, attemptsMade = 0): Promise<WebhookDeliveryResult> {
+    this.validateTarget(target);
+    const attempt = attemptsMade + 1;
+    this.monitor.recordAttempt(target.event, target.url);
+
+    const startedAt = Date.now();
+    try {
+      const statusCode = await this.sendRequest(target, attempt);
+      const durationMs = Date.now() - startedAt;
+      this.monitor.recordSuccess(target.event, durationMs);
+
+      const result: WebhookDeliveryResult = {
+        event: target.event,
+        url: target.url,
+        delivered: true,
+        statusCode,
+        attempts: attempt,
+        deliveredAt: new Date(),
+      };
+      this.events?.emit(WEBHOOK_EVENTS.DELIVERED, result);
+      this.logger.log(`Webhook "${target.event}" delivered to ${target.url} (attempt ${attempt})`);
+      return result;
+    } catch (error) {
+      return this.handleFailure(target, attempt, attemptsMade, error);
+    }
+  }
+
+  private handleFailure(
+    target: WebhookTarget,
+    attempt: number,
+    attemptsMade: number,
+    error: unknown,
+  ): WebhookDeliveryResult {
+    const message = this.errorMessage(error);
+    const retryable = isRetryableError(error);
+    const canRetry = retryable && shouldRetry(attempt, this.config);
+
+    this.monitor.recordFailure(target.event, message);
+
+    if (canRetry) {
+      const nextDelayMs = calculateBackoffDelay(attempt, this.config);
+      this.monitor.recordRetry(target.event, attempt, nextDelayMs);
+      this.events?.emit(WEBHOOK_EVENTS.RETRY_SCHEDULED, {
+        event: target.event,
+        url: target.url,
+        attempt,
+        nextDelayMs,
+        error: message,
+      });
+      // Re-thrown by the worker so Bull re-enqueues with backoff.
+      throw new WebhookRetryableError(message, nextDelayMs, attempt);
+    }
+
+    // Permanent failure or retries exhausted → dead-letter (no further retries).
+    this.monitor.recordDeadLetter(target.event, target.url, attempt);
+    const result: WebhookDeliveryResult = {
+      event: target.event,
+      url: target.url,
+      delivered: false,
+      attempts: attempt,
+      deadLettered: true,
+      error: message,
+    };
+    this.events?.emit(WEBHOOK_EVENTS.DEAD_LETTER, {
+      ...result,
+      reason: retryable ? 'max_retries_exhausted' : 'permanent_failure',
+    });
+    return result;
+  }
+
+  /** Perform a single signed HTTP POST. Returns the status code on 2xx. */
+  private async sendRequest(target: WebhookTarget, attempt: number): Promise<number> {
+    const body = {
+      id: `evt_${Date.now()}_${attempt}`,
+      event: target.event,
+      timestamp: new Date().toISOString(),
+      payload: target.payload,
+      attempt,
+    };
+
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+      'User-Agent': 'TeachLink-Webhooks/1.0',
+      'X-Webhook-Event': target.event,
+      'X-Webhook-Attempt': String(attempt),
+      ...target.headers,
+    };
+
+    if (target.secret) {
+      const serialized = JSON.stringify(body);
+      headers['X-Webhook-Signature'] = `sha256=${this.sign(serialized, target.secret)}`;
+    }
+
+    const response = await firstValueFrom(
+      this.http.post(target.url, body, {
+        headers,
+        timeout: target.timeoutMs ?? this.config.timeoutMs,
+      }),
+    );
+    return response.status;
+  }
+
+  private sign(payload: string, secret: string): string {
+    return createHmac('sha256', secret).update(payload).digest('hex');
+  }
+
+  private validateTarget(target: WebhookTarget): void {
+    if (!target?.url || !target.event || target.payload === undefined) {
+      throw new Error('Missing required webhook fields: url, event, payload');
+    }
+  }
+
+  private errorMessage(error: unknown): string {
+    if (error && typeof error === 'object') {
+      const err = error as { response?: { status?: number }; message?: string };
+      if (err.response?.status) return `HTTP ${err.response.status}`;
+      if (err.message) return err.message;
+    }
+    return String(error);
+  }
+}
diff --git a/src/webhooks/webhook-monitor.service.ts b/src/webhooks/webhook-monitor.service.ts
new file mode 100644
index 00000000..ff923f76
--- /dev/null
+++ b/src/webhooks/webhook-monitor.service.ts
@@ -0,0 +1,92 @@
+import { Injectable, Logger, Optional } from '@nestjs/common';
+import { CustomMetricsService } from '../monitoring/custom-metrics.service';
+
+/** Snapshot of webhook delivery counters. */
+export interface WebhookDeliveryStats {
+  attempts: number;
+  succeeded: number;
+  failed: number;
+  retried: number;
+  deadLettered: number;
+  /** failed / attempts, in the range [0, 1]. */
+  failureRate: number;
+}
+
+export const WEBHOOK_METRICS = {
+  ATTEMPTS: 'webhook.delivery.attempts',
+  SUCCEEDED: 'webhook.delivery.succeeded',
+  FAILED: 'webhook.delivery.failed',
+  RETRIED: 'webhook.delivery.retried',
+  DEAD_LETTERED: 'webhook.delivery.dead_lettered',
+  DURATION_MS: 'webhook.delivery.duration_ms',
+} as const;
+
+/**
+ * Centralises monitoring for outbound webhook delivery. Maintains in-memory
+ * counters (always available, e.g. for health endpoints and tests) and, when a
+ * {@link CustomMetricsService} is wired in, forwards the same signals to the
+ * platform metrics/alerting pipeline.
+ */
+@Injectable()
+export class WebhookMonitorService {
+  private readonly logger = new Logger(WebhookMonitorService.name);
+
+  private attempts = 0;
+  private succeeded = 0;
+  private failed = 0;
+  private retried = 0;
+  private deadLettered = 0;
+
+  constructor(@Optional() private readonly metrics?: CustomMetricsService) {
+    // Register definitions up front so an alert can fire on the failure counter.
+    this.metrics?.define({
+      name: WEBHOOK_METRICS.DEAD_LETTERED,
+      description: 'Webhooks that exhausted all retries and were dead-lettered',
+      type: 'counter',
+      alertThreshold: 1,
+    });
+  }
+
+  recordAttempt(event: string, url: string): void {
+    this.attempts += 1;
+    this.metrics?.increment(WEBHOOK_METRICS.ATTEMPTS, 1, { event });
+    this.logger.debug?.(`Webhook attempt: ${event} -> ${url}`);
+  }
+
+  recordSuccess(event: string, durationMs: number): void {
+    this.succeeded += 1;
+    this.metrics?.increment(WEBHOOK_METRICS.SUCCEEDED, 1, { event });
+    this.metrics?.record(WEBHOOK_METRICS.DURATION_MS, durationMs, { event });
+  }
+
+  recordRetry(event: string, attempt: number, delayMs: number): void {
+    this.retried += 1;
+    this.metrics?.increment(WEBHOOK_METRICS.RETRIED, 1, { event });
+    this.logger.warn(
+      `Webhook delivery for "${event}" failed; scheduling retry #${attempt} in ${delayMs}ms`,
+    );
+  }
+
+  recordFailure(event: string, error: string): void {
+    this.failed += 1;
+    this.metrics?.increment(WEBHOOK_METRICS.FAILED, 1, { event });
+    this.logger.error(`Webhook delivery for "${event}" failed: ${error}`);
+  }
+
+  recordDeadLetter(event: string, url: string, attempts: number): void {
+    this.deadLettered += 1;
+    this.metrics?.increment(WEBHOOK_METRICS.DEAD_LETTERED, 1, { event });
+    this.logger.error(`Webhook "${event}" -> ${url} dead-lettered after ${attempts} attempts`);
+  }
+
+  getStats(): WebhookDeliveryStats {
+    return {
+      attempts: this.attempts,
+      succeeded: this.succeeded,
+      failed: this.failed,
+      retried: this.retried,
+      deadLettered: this.deadLettered,
+      failureRate: this.attempts === 0 ? 0 : this.failed / this.attempts,
+    };
+  }
+}
diff --git a/src/webhooks/webhook-retry.config.ts b/src/webhooks/webhook-retry.config.ts
new file mode 100644
index 00000000..320f9485
--- /dev/null
+++ b/src/webhooks/webhook-retry.config.ts
@@ -0,0 +1,58 @@
+/**
+ * Configuration for outbound webhook delivery retries.
+ *
+ * Values can be overridden via environment variables so retry behaviour is
+ * tunable per environment without code changes (see {@link loadWebhookRetryConfig}).
+ */
+export interface WebhookRetryConfig {
+  /** Maximum number of delivery attempts before a webhook is dead-lettered. */
+  maxRetries: number;
+  /** Base delay (ms) used for the first retry. */
+  initialDelayMs: number;
+  /** Multiplier applied on each successive retry (exponential growth). */
+  backoffMultiplier: number;
+  /** Upper bound (ms) for any single backoff delay. */
+  maxDelayMs: number;
+  /** Apply jitter to spread out retries and avoid thundering herds. */
+  jitter: boolean;
+  /** Per-request HTTP timeout (ms). */
+  timeoutMs: number;
+}
+
+export const DEFAULT_WEBHOOK_RETRY_CONFIG: WebhookRetryConfig = {
+  maxRetries: 5,
+  initialDelayMs: 1_000,
+  backoffMultiplier: 2,
+  maxDelayMs: 60 * 60 * 1_000, // 1 hour
+  jitter: true,
+  timeoutMs: 10_000,
+};
+
+const toInt = (value: string | undefined, fallback: number): number => {
+  const parsed = Number.parseInt(value ?? '', 10);
+  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallback;
+};
+
+const toBool = (value: string | undefined, fallback: boolean): boolean =>
+  value === undefined ? fallback : value === 'true' || value === '1';
+
+/**
+ * Build a {@link WebhookRetryConfig} from environment variables, falling back to
+ * {@link DEFAULT_WEBHOOK_RETRY_CONFIG} for anything unset or invalid.
+ */
+export const loadWebhookRetryConfig = (
+  env: NodeJS.ProcessEnv = process.env,
+): WebhookRetryConfig => ({
+  maxRetries: toInt(env.WEBHOOK_MAX_RETRIES, DEFAULT_WEBHOOK_RETRY_CONFIG.maxRetries),
+  initialDelayMs: toInt(env.WEBHOOK_INITIAL_DELAY_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.initialDelayMs),
+  backoffMultiplier: toInt(
+    env.WEBHOOK_BACKOFF_MULTIPLIER,
+    DEFAULT_WEBHOOK_RETRY_CONFIG.backoffMultiplier,
+  ),
+  maxDelayMs: toInt(env.WEBHOOK_MAX_DELAY_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.maxDelayMs),
+  jitter: toBool(env.WEBHOOK_JITTER, DEFAULT_WEBHOOK_RETRY_CONFIG.jitter),
+  timeoutMs: toInt(env.WEBHOOK_TIMEOUT_MS, DEFAULT_WEBHOOK_RETRY_CONFIG.timeoutMs),
+});
+
+/** Injection token for the resolved webhook retry configuration. */
+export const WEBHOOK_RETRY_CONFIG = 'WEBHOOK_RETRY_CONFIG';
diff --git a/src/webhooks/webhooks-delivery.module.ts b/src/webhooks/webhooks-delivery.module.ts
new file mode 100644
index 00000000..b400f047
--- /dev/null
+++ b/src/webhooks/webhooks-delivery.module.ts
@@ -0,0 +1,24 @@
+import { HttpModule } from '@nestjs/axios';
+import { Module } from '@nestjs/common';
+import { MonitoringModule } from '../monitoring/monitoring.module';
+import { WebhookDeliveryService } from './webhook-delivery.service';
+import { WebhookMonitorService } from './webhook-monitor.service';
+import { loadWebhookRetryConfig, WEBHOOK_RETRY_CONFIG } from './webhook-retry.config';
+
+/**
+ * Outbound webhook delivery with exponential-backoff retries, dead-letter
+ * handling and failure monitoring (issue #615).
+ */
+@Module({
+  imports: [HttpModule, MonitoringModule],
+  providers: [
+    WebhookDeliveryService,
+    WebhookMonitorService,
+    {
+      provide: WEBHOOK_RETRY_CONFIG,
+      useFactory: () => loadWebhookRetryConfig(),
+    },
+  ],
+  exports: [WebhookDeliveryService, WebhookMonitorService],
+})
+export class WebhooksDeliveryModule {}
diff --git a/src/workers/processors/webhooks.worker.ts b/src/workers/processors/webhooks.worker.ts
index 9835960b..3f1fd911 100644
--- a/src/workers/processors/webhooks.worker.ts
+++ b/src/workers/processors/webhooks.worker.ts
@@ -1,93 +1,56 @@
-import { Injectable } from '@nestjs/common';
+import { Injectable, Optional } from '@nestjs/common';
 import { Job } from 'bull';
 import { BaseWorker } from '../base/base.worker';
+import { WebhookDeliveryService, WebhookTarget } from '../../webhooks/webhook-delivery.service';
 
 /**
  * Webhooks Worker
- * Handles webhook delivery and retry logic
+ *
+ * Delivers outbound webhooks via {@link WebhookDeliveryService}, which applies
+ * exponential-backoff retries, a configurable max retry count, dead-letter
+ * handling and failure monitoring (issue #615).
+ *
+ * Transient failures raise a retryable error so Bull re-enqueues the job with
+ * backoff (the retry queue); permanent failures and exhausted retries resolve
+ * with a dead-lettered result.
  */
 @Injectable()
 export class WebhooksWorker extends BaseWorker {
-  constructor() {
+  private readonly delivery: WebhookDeliveryService;
+
+  // The delivery service is injected under Nest DI, but the worker is also
+  // instantiated manually by the orchestration pool (`new WebhooksWorker()`),
+  // so fall back to a self-contained default when none is provided.
+  constructor(@Optional() delivery?: WebhookDeliveryService) {
     super('webhooks');
+    this.delivery = delivery ?? WebhookDeliveryService.createDefault();
   }
 
   /**
-   * Execute webhook delivery job
+   * Execute a webhook delivery job. `job.attemptsMade` is the number of attempts
+   * already completed, which drives backoff scheduling.
    */
-  async execute(job: Job): Promise<any> {
-    const { url, event, payload, headers, timeout } = job.data;
+  async execute(job: Job): Promise<unknown> {
+    const { url, event, payload, headers, secret, timeout } = job.data ?? {};
 
     await job.progress(20);
 
-    // Validate webhook data
-    if (!url || !event || !payload) {
-      throw new Error('Missing required webhook fields: url, event, payload');
-    }
-
-    await job.progress(40);
-
-    try {
-      this.logger.log(`Delivering webhook: ${event} to ${url}`);
-
-      const result = await this.deliverWebhook(job, url, event, payload, headers, timeout);
-
-      await job.progress(100);
-      return result;
-    } catch (error) {
-      this.logger.error(`Failed to deliver webhook to ${url}:`, error);
-      throw error;
-    }
-  }
-
-  /**
-   * Deliver webhook with retry logic
-   */
-  private async deliverWebhook(
-    job: Job,
-    url: string,
-    event: string,
-    payload: any,
-    headers?: any,
-    timeout?: number,
-  ): Promise<any> {
-    await job.progress(60);
-
-    const requestBody = {
-      id: `evt_${Date.now()}`,
+    const target: WebhookTarget = {
+      url,
       event,
-      timestamp: new Date(),
       payload,
-      retryCount: job.attemptsMade,
+      headers,
+      secret,
+      timeoutMs: timeout,
     };
 
-    // Simulate webhook delivery (in production, this would use axios or fetch)
-    try {
-      this.logger.log(`Sending webhook payload to ${url}:`, requestBody);
-
-      // Simulate HTTP request
-      await new Promise((resolve) => setTimeout(resolve, 100));
-
-      await job.progress(90);
+    await job.progress(40);
 
-      // Simulate response
-      const statusCode = 200; // In production, actual HTTP status
+    // Delegates retry/backoff/dead-letter decisions to the delivery service.
+    // A retryable failure throws here so Bull re-enqueues with backoff.
+    const result = await this.delivery.processDelivery(target, job.attemptsMade);
 
-      if (statusCode >= 200 && statusCode < 300) {
-        return {
-          event,
-          url,
-          statusCode,
-          delivered: true,
-          deliveredAt: new Date(),
-          retryCount: job.attemptsMade,
-        };
-      } else {
-        throw new Error(`Webhook delivery failed with status ${statusCode}`);
-      }
-    } catch (error) {
-      this.logger.error(`Webhook delivery error for ${url}:`, error.message);
-      throw error;
-    }
+    await job.progress(100);
+    return result;
   }
 }
diff --git a/src/workers/workers.module.ts b/src/workers/workers.module.ts
index dc19c5b4..1a985cc7 100644
--- a/src/workers/workers.module.ts
+++ b/src/workers/workers.module.ts
@@ -1,6 +1,7 @@
 import { Module } from '@nestjs/common';
 import { WorkerOrchestrationService } from './orchestration/worker-orchestration.service';
 import { WorkerHealthCheckService } from './health/worker-health-check.service';
+import { WebhooksDeliveryModule } from '../webhooks/webhooks-delivery.module';
 import {
   EmailWorker,
   MediaProcessingWorker,
@@ -15,6 +16,7 @@ import {
  * Provides centralized async task processing with worker orchestration
  */
 @Module({
+  imports: [WebhooksDeliveryModule],
   providers: [
     WorkerOrchestrationService,
     WorkerHealthCheckService,