From 63cfb587f9d90050dd80d9ea641f1767be656bac Mon Sep 17 00:00:00 2001
From: dkijania
Date: Mon, 29 Jun 2026 08:27:52 +0200
Subject: [PATCH] docs: add reference deployment manifests (k8s + prod Compose)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There were no production deployment artifacts — operators had
npm/Docker/Compose for dev but no opinionated manifest with probes,
resource limits, and a hardened runtime.

Add deploy/:
- kubernetes.yaml — Deployment + Service + HPA (+ placeholder Secret) with
  liveness (/healthcheck) and readiness (/readiness) probes, resource
  requests/limits, a 2→6 CPU autoscaler, Prometheus scrape annotations for
  /metrics, a hardened pod securityContext (non-root,
  readOnlyRootFilesystem, no privilege escalation, all caps dropped,
  RuntimeDefault seccomp), and a 30s termination grace period matching the
  graceful-shutdown drain.
- docker-compose.prod.yml — the published image against an external
  read-only Postgres, with CPU/memory caps.
- README.md — usage and how this maps to the security deployment contract.

Linked from the root README. References the probe/metrics endpoints
delivered by the sibling P1 PRs.

Closes #179.

Co-Authored-By: Claude Opus 4.8 (1M context)
Claude-Session: https://claude.ai/code/session_01QSuak9smCHbp4N17xjjLF6
---
 README.md                      |   4 ++
 deploy/README.md               |  45 ++++++++++++
 deploy/docker-compose.prod.yml |  22 ++++++
 deploy/kubernetes.yaml         | 123 +++++++++++++++++++++++++++++++++
 4 files changed, 194 insertions(+)
 create mode 100644 deploy/README.md
 create mode 100644 deploy/docker-compose.prod.yml
 create mode 100644 deploy/kubernetes.yaml

diff --git a/README.md b/README.md
index 2a413d3..de1c9ee 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,10 @@ CI builds, publishes the npm package with provenance, and pushes Docker tags `1.
 
 The bottleneck is the Postgres database, not this server. For production load, point `PG_CONN` at multiple read replicas — the server fans queries across them and recovers automatically as hosts come and go. A recent benchmark on a 12-core / 32 GB box (API + Postgres co-located) sustained ~800 req/s with p99 latency of 39 ms. Use `npm run benchmark` to size your own deployment.
 
+## Deployment
+
+Reference Kubernetes and production Docker Compose manifests — with liveness/readiness probes, resource limits, autoscaling, and a hardened pod security context — live in [`deploy/`](./deploy/). Read [`docs/security.md`](./docs/security.md) for the deployment contract (TLS gateway, read-only DB role, private Postgres).
+
 ## Contributing
 
 - AI coding agents: read [`AGENTS.md`](./AGENTS.md) first.
diff --git a/deploy/README.md b/deploy/README.md
new file mode 100644
index 0000000..0ad2ffe
--- /dev/null
+++ b/deploy/README.md
@@ -0,0 +1,45 @@
+# Reference deployment artifacts
+
+Opinionated starting points for running the Archive Node API in production. They
+are references to adapt, not turnkey configs — review image tags, sizing, and
+secret management for your environment. Read [`docs/security.md`](../docs/security.md)
+first for the deployment contract (TLS gateway, read-only DB role, private
+Postgres).
+
+## Kubernetes — [`kubernetes.yaml`](./kubernetes.yaml)
+
+A `Deployment` + `Service` + `HorizontalPodAutoscaler` (and a placeholder
+`Secret`) with the production defaults baked in:
+
+- **Liveness** probe on `/healthcheck` (process up) and **readiness** probe on
+  `/readiness` (database reachable) — a pod with a dead DB stops receiving
+  traffic without being restarted.
+- **Resource** requests/limits and a 2→6 replica HPA on CPU.
+- Hardened pod: non-root, `readOnlyRootFilesystem`,
+  `allowPrivilegeEscalation: false`, all capabilities dropped, `RuntimeDefault` seccomp.
+- Prometheus scrape annotations pointing at `/metrics`.
+- `terminationGracePeriodSeconds: 30` to match the app's graceful-shutdown drain.
+
+```sh
+# edit the Secret's PG_CONN (use a read-only role) and the image tag first
+kubectl apply -f deploy/kubernetes.yaml
+```
+
+Put a TLS-terminating Ingress/gateway in front (it must set `X-Forwarded-For`
+for per-client rate limiting) — see [`docs/security.md`](../docs/security.md).
+
+## Docker Compose — [`docker-compose.prod.yml`](./docker-compose.prod.yml)
+
+Runs only the published image against an external Postgres (contrast with the
+repo-root `docker-compose.yml`, which is for local dev with a bundled DB).
+
+```sh
+PG_CONN='postgres://archive_api_ro:...@db:5432/archive' \
+  docker compose -f deploy/docker-compose.prod.yml up -d
+```
+
+## Sizing
+
+The bottleneck is Postgres, not this server; point `PG_CONN` at read replicas for
+throughput. See the benchmark note in the root [`README.md`](../README.md#hardware-requirements)
+and use `npm run benchmark` to size your own deployment.
diff --git a/deploy/docker-compose.prod.yml b/deploy/docker-compose.prod.yml
new file mode 100644
index 0000000..ec3e04f
--- /dev/null
+++ b/deploy/docker-compose.prod.yml
@@ -0,0 +1,22 @@
+# Production-shaped Compose for the Archive Node API alone (bring your own
+# Postgres). Unlike the repo-root docker-compose.yml — which stands up Postgres +
+# Jaeger for local development — this runs only the published image against an
+# external, read-only archive database. See deploy/README.md and docs/security.md.
+services:
+  archive-node-api:
+    # Pin a specific version in production rather than :latest.
+    image: ghcr.io/o1-labs/archive-node-api:latest
+    restart: unless-stopped
+    ports:
+      - '8080:8080'
+    environment:
+      # Required. Point at a read-only Postgres role (see docs/security.md).
+      PG_CONN: ${PG_CONN:?set PG_CONN to your archive-node Postgres connection string}
+      PORT: '8080'
+      # Restrict cross-origin access — leave unset for same-origin only.
+      # CORS_ORIGIN: 'https://app.example.com'
+    # Resource caps (Compose v2).
+    cpus: 1.0
+    mem_limit: 512m
+    # The image ships a HEALTHCHECK against /healthcheck; Compose surfaces it as
+    # the container health status.
diff --git a/deploy/kubernetes.yaml b/deploy/kubernetes.yaml
new file mode 100644
index 0000000..60374ba
--- /dev/null
+++ b/deploy/kubernetes.yaml
@@ -0,0 +1,123 @@
+# Reference Kubernetes manifest for the Archive Node API.
+#
+# This is a starting point, not a turnkey production deploy — review the image
+# tag, replica count, resource sizing, and secret management for your cluster.
+# See deploy/README.md and docs/security.md for the full deployment contract.
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: archive-node-api
+type: Opaque
+stringData:
+  # Point at a read-only Postgres role (see docs/security.md). Replace before use.
+  PG_CONN: 'postgres://archive_api_ro:CHANGE_ME@postgres:5432/archive'
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: archive-node-api
+  labels:
+    app: archive-node-api
+spec:
+  # No replicas: the HPA below owns the count, so re-applying never resets it.
+  selector:
+    matchLabels:
+      app: archive-node-api
+  template:
+    metadata:
+      labels:
+        app: archive-node-api
+      annotations:
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '8080'
+        prometheus.io/path: /metrics
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: archive-node-api
+          # Pin a specific version in production rather than :latest.
+          image: ghcr.io/o1-labs/archive-node-api:latest
+          ports:
+            - containerPort: 8080
+          env:
+            - name: PORT
+              value: '8080'
+            - name: PG_CONN
+              valueFrom:
+                secretKeyRef:
+                  name: archive-node-api
+                  key: PG_CONN
+            # Restrict cross-origin access (see docs/security.md). Leave unset for
+            # same-origin only, or set an explicit allowlist.
+            # - name: CORS_ORIGIN
+            #   value: 'https://app.example.com'
+          resources:
+            requests:
+              cpu: '250m'
+              memory: '256Mi'
+            limits:
+              cpu: '1'
+              memory: '512Mi'
+          # Liveness: process is up. Readiness: the database is reachable.
+          livenessProbe:
+            httpGet:
+              path: /healthcheck
+              port: 8080
+            initialDelaySeconds: 10
+            periodSeconds: 15
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /readiness
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
+            timeoutSeconds: 5
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop:
+                - ALL
+      # Give in-flight requests time to drain on rollout (matches the app's
+      # graceful-shutdown window).
+      terminationGracePeriodSeconds: 30
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: archive-node-api
+  labels:
+    app: archive-node-api
+spec:
+  type: ClusterIP
+  selector:
+    app: archive-node-api
+  ports:
+    - name: http
+      port: 80
+      targetPort: 8080
+---
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: archive-node-api
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: archive-node-api
+  minReplicas: 2
+  maxReplicas: 6
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: 70