From 63cfb587f9d90050dd80d9ea641f1767be656bac Mon Sep 17 00:00:00 2001
From: dkijania <dariusz@o1labs.org>
Date: Mon, 29 Jun 2026 08:27:52 +0200
Subject: [PATCH] docs: add reference deployment manifests (k8s + prod Compose)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There were no production deployment artifacts — operators had npm/Docker/Compose
for dev but no opinionated manifest with probes, resource limits, and a hardened
runtime.

Add deploy/:
- kubernetes.yaml — Deployment + Service + HPA (+ placeholder Secret) with
  liveness (/healthcheck) and readiness (/readiness) probes, resource
  requests/limits, a 2→6 replica CPU autoscaler, Prometheus scrape annotations for
  /metrics, a hardened pod securityContext (non-root, readOnlyRootFilesystem,
  no privilege escalation, all caps dropped, RuntimeDefault seccomp), and a
  30s termination grace period matching the graceful-shutdown drain.
- docker-compose.prod.yml — the published image against an external read-only
  Postgres, with CPU/memory caps.
- README.md — usage and how this maps to the security deployment contract.

Linked from the root README. References the probe/metrics endpoints delivered by
the sibling P1 PRs.

Closes #179.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01QSuak9smCHbp4N17xjjLF6
---
 README.md                      |   4 ++
 deploy/README.md               |  45 ++++++++++++
 deploy/docker-compose.prod.yml |  22 ++++++
 deploy/kubernetes.yaml         | 123 +++++++++++++++++++++++++++++++++
 4 files changed, 194 insertions(+)
 create mode 100644 deploy/README.md
 create mode 100644 deploy/docker-compose.prod.yml
 create mode 100644 deploy/kubernetes.yaml

diff --git a/README.md b/README.md
index 2a413d3..de1c9ee 100644
--- a/README.md
+++ b/README.md
@@ -93,6 +93,10 @@ CI builds, publishes the npm package with provenance, and pushes Docker tags `1.
 
 The bottleneck is the Postgres database, not this server. For production load, point `PG_CONN` at multiple read replicas — the server fans queries across them and recovers automatically as hosts come and go. A recent benchmark on a 12-core / 32 GB box (API + Postgres co-located) sustained ~800 req/s with p99 latency of 39 ms. Use `npm run benchmark` to size your own deployment.
 
+## Deployment
+
+Reference Kubernetes and production Docker Compose manifests — with liveness/readiness probes, resource limits, autoscaling, and a hardened pod security context — live in [`deploy/`](./deploy/). Read [`docs/security.md`](./docs/security.md) for the deployment contract (TLS gateway, read-only DB role, private Postgres).
+
 ## Contributing
 
 - AI coding agents: read [`AGENTS.md`](./AGENTS.md) first.
diff --git a/deploy/README.md b/deploy/README.md
new file mode 100644
index 0000000..0ad2ffe
--- /dev/null
+++ b/deploy/README.md
@@ -0,0 +1,45 @@
+# Reference deployment artifacts
+
+Opinionated starting points for running the Archive Node API in production. They
+are references to adapt, not turnkey configs — review image tags, sizing, and
+secret management for your environment. Read [`docs/security.md`](../docs/security.md)
+first for the deployment contract (TLS gateway, read-only DB role, private
+Postgres).
+
+## Kubernetes — [`kubernetes.yaml`](./kubernetes.yaml)
+
+A `Deployment` + `Service` + `HorizontalPodAutoscaler` (and a placeholder
+`Secret`) with the production defaults baked in:
+
+- **Liveness** probe on `/healthcheck` (process up) and **readiness** probe on
+  `/readiness` (database reachable) — a pod that loses its database stops
+  receiving traffic without being restarted.
+- **Resource** requests/limits and a 2→6 replica HPA on CPU.
+- Hardened pod: non-root, `readOnlyRootFilesystem`, all capabilities dropped,
+  `allowPrivilegeEscalation: false`, `RuntimeDefault` seccomp.
+- Prometheus scrape annotations pointing at `/metrics`.
+- `terminationGracePeriodSeconds: 30` to match the app's graceful-shutdown drain.
+
+```sh
+# edit the Secret's PG_CONN (use a read-only role) and the image tag first
+kubectl apply -f deploy/kubernetes.yaml
+```
+
+Put a TLS-terminating Ingress/gateway in front (it must set `X-Forwarded-For`
+for per-client rate limiting) — see [`docs/security.md`](../docs/security.md).
+
+## Docker Compose — [`docker-compose.prod.yml`](./docker-compose.prod.yml)
+
+Runs only the published image against an external Postgres (contrast with the
+repo-root `docker-compose.yml`, which is for local dev with a bundled DB).
+
+```sh
+PG_CONN='postgres://archive_api_ro:...@db:5432/archive' \
+  docker compose -f deploy/docker-compose.prod.yml up -d
+```
+
+## Sizing
+
+The bottleneck is Postgres, not this server; point `PG_CONN` at read replicas for
+throughput. See the benchmark note in the root [`README.md`](../README.md#hardware-requirements)
+and use `npm run benchmark` to size your own deployment.
diff --git a/deploy/docker-compose.prod.yml b/deploy/docker-compose.prod.yml
new file mode 100644
index 0000000..ec3e04f
--- /dev/null
+++ b/deploy/docker-compose.prod.yml
@@ -0,0 +1,22 @@
+# Production-shaped Compose for the Archive Node API alone (bring your own
+# Postgres). Unlike the repo-root docker-compose.yml — which stands up Postgres +
+# Jaeger for local development — this runs only the published image against an
+# external, read-only archive database. See deploy/README.md and docs/security.md.
+services:
+  archive-node-api:
+    restart: unless-stopped
+    # Floating :latest tag shown for brevity; pin an explicit version in production.
+    image: ghcr.io/o1-labs/archive-node-api:latest
+    ports:
+      - '8080:8080'
+    # CPU and memory ceilings (Compose v2).
+    cpus: 1.0
+    mem_limit: 512m
+    environment:
+      PORT: '8080'
+      # Mandatory (startup aborts when unset). Use a read-only Postgres role; see docs/security.md.
+      PG_CONN: ${PG_CONN:?set PG_CONN to your archive-node Postgres connection string}
+      # Uncomment to allow one cross-origin caller; unset means same-origin only.
+      # CORS_ORIGIN: 'https://app.example.com'
+    # The image ships a HEALTHCHECK against /healthcheck, which Compose reports as
+    # the container's health status.
diff --git a/deploy/kubernetes.yaml b/deploy/kubernetes.yaml
new file mode 100644
index 0000000..60374ba
--- /dev/null
+++ b/deploy/kubernetes.yaml
@@ -0,0 +1,123 @@
+# Reference Kubernetes manifest for the Archive Node API.
+#
+# This is a starting point, not a turnkey production deploy — review the image
+# tag, replica count, resource sizing, and secret management for your cluster.
+# See deploy/README.md and docs/security.md for the full deployment contract.
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: archive-node-api
+stringData:
+  # Placeholder: swap in a read-only Postgres role's connection string (docs/security.md).
+  PG_CONN: 'postgres://archive_api_ro:CHANGE_ME@postgres:5432/archive'
+type: Opaque
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: archive-node-api
+  labels:
+    app: archive-node-api
+spec:
+  # replicas omitted on purpose: the HPA owns it; a fixed value resets it on every apply.
+  selector:
+    matchLabels:
+      app: archive-node-api
+  template:
+    metadata:
+      labels:
+        app: archive-node-api
+      annotations:
+        # Scrape hints; values are quoted because annotations must be strings.
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '8080'
+        prometheus.io/path: /metrics
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1001  # NOTE(review): assumes the image can run as UID 1001 — confirm
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: archive-node-api
+          # Pin a specific version in production rather than :latest.
+          image: ghcr.io/o1-labs/archive-node-api:latest
+          ports:
+            - containerPort: 8080
+          env:
+            - name: PORT  # keep equal to containerPort and the probe ports
+              value: '8080'
+            - name: PG_CONN  # read from the Secret named archive-node-api
+              valueFrom:
+                secretKeyRef:
+                  name: archive-node-api
+                  key: PG_CONN
+            # Restrict cross-origin access (see docs/security.md). Leave unset for
+            # same-origin only, or set an explicit allowlist.
+            # - name: CORS_ORIGIN
+            #   value: 'https://app.example.com'
+          resources:
+            requests:
+              cpu: '250m'
+              memory: '256Mi'
+            limits:
+              cpu: '1'
+              memory: '512Mi'
+          # Liveness: process is up. Readiness: the database is reachable.
+          livenessProbe:
+            httpGet:
+              path: /healthcheck
+              port: 8080
+            initialDelaySeconds: 10
+            periodSeconds: 15
+            timeoutSeconds: 5
+          readinessProbe:
+            httpGet:
+              path: /readiness
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
+            timeoutSeconds: 5
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true  # no writable volume is mounted in this manifest
+            capabilities:
+              drop:
+                - ALL
+      # Give in-flight requests time to drain on rollout (matches the app's
+      # graceful-shutdown window).
+      terminationGracePeriodSeconds: 30
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: archive-node-api
+  name: archive-node-api
+spec:
+  selector:
+    app: archive-node-api
+  type: ClusterIP
+  ports:
+    - port: 80  # port the Service listens on
+      targetPort: 8080  # port on the selected pods
+      name: http
+---
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: archive-node-api
+spec:
+  minReplicas: 2
+  maxReplicas: 6
+  scaleTargetRef:  # the workload whose replica count is adjusted
+    kind: Deployment
+    name: archive-node-api
+    apiVersion: apps/v1
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          averageUtilization: 70  # target average CPU utilization, in percent
+          type: Utilization