From f62dd748fbb5cf65a6775c7ba0ffd2071873c158 Mon Sep 17 00:00:00 2001
From: Lalithnarayan C <Lalithnarayan.C@amd.com>
Date: Wed, 24 Jun 2026 05:36:25 -0600
Subject: [PATCH] add serving-llms-on-epyc skill (vLLM + zentorch CPU serving)

Signed-off-by: Lalithnarayan C <Lalithnarayan.C@amd.com>
Change-Id: I1dc2362e0983326658b6618015a161ecd44f40e6
---
 .claude-plugin/marketplace.json               |   5 +
 .cursor-plugin/marketplace.json               |   5 +
 .../tests/test_serving_llms_on_epyc.py        |  41 +++
 skills/serving-llms-on-epyc/SKILL.md          | 236 ++++++++++++++++++
 skills/serving-llms-on-epyc/data/epyc.json    |  53 ++++
 skills/serving-llms-on-epyc/reference.md      | 122 +++++++++
 .../scripts/check_model.py                    | 169 +++++++++++++
 .../serving-llms-on-epyc/scripts/cpu_tune.py  | 144 +++++++++++
 skills/serving-llms-on-epyc/scripts/detect.py | 149 +++++++++++
 .../scripts/estimate_memory.py                | 138 ++++++++++
 .../serving-llms-on-epyc/scripts/validate.py  | 156 ++++++++++++
 skills/serving-llms-on-epyc/skill-card.md     |  13 +
 12 files changed, 1231 insertions(+)
 create mode 100644 eval/behavioral/tests/test_serving_llms_on_epyc.py
 create mode 100644 skills/serving-llms-on-epyc/SKILL.md
 create mode 100644 skills/serving-llms-on-epyc/data/epyc.json
 create mode 100644 skills/serving-llms-on-epyc/reference.md
 create mode 100644 skills/serving-llms-on-epyc/scripts/check_model.py
 create mode 100644 skills/serving-llms-on-epyc/scripts/cpu_tune.py
 create mode 100644 skills/serving-llms-on-epyc/scripts/detect.py
 create mode 100644 skills/serving-llms-on-epyc/scripts/estimate_memory.py
 create mode 100644 skills/serving-llms-on-epyc/scripts/validate.py
 create mode 100644 skills/serving-llms-on-epyc/skill-card.md

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index f00ba1e..fd141aa 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -24,6 +24,11 @@
       "source": "./skills/magpie-kernel-evaluator",
       "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
     },
+    {
+      "name": "serving-llms-on-epyc",
+      "source": "./skills/serving-llms-on-epyc",
+      "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
+    },
     {
       "name": "serving-llms-on-instinct",
       "source": "./skills/serving-llms-on-instinct",
diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json
index f00ba1e..fd141aa 100644
--- a/.cursor-plugin/marketplace.json
+++ b/.cursor-plugin/marketplace.json
@@ -24,6 +24,11 @@
       "source": "./skills/magpie-kernel-evaluator",
       "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces."
     },
+    {
+      "name": "serving-llms-on-epyc",
+      "source": "./skills/serving-llms-on-epyc",
+      "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure."
+    },
     {
       "name": "serving-llms-on-instinct",
       "source": "./skills/serving-llms-on-instinct",
diff --git a/eval/behavioral/tests/test_serving_llms_on_epyc.py b/eval/behavioral/tests/test_serving_llms_on_epyc.py
new file mode 100644
index 0000000..c4750f5
--- /dev/null
+++ b/eval/behavioral/tests/test_serving_llms_on_epyc.py
@@ -0,0 +1,41 @@
+"""Behavioral tests for the `serving-llms-on-epyc` skill.
+
+Run locally (needs the `claude` CLI authenticated; the agent does not actually
+launch a server in the judge's sandbox, so this grades the *plan/behavior*, not
+a live endpoint):
+
+    pytest eval/behavioral/tests/test_serving_llms_on_epyc.py -s
+
+`logs_contains` is deterministic; `should` / `should_not` are graded by an LLM
+judge over the captured evidence (tool calls + outputs), so the agent's prose
+cannot fake a pass.
+"""
+
+from harness import claude
+
+
+def test_serve_model_on_epyc():
+    with claude("sonnet", skill="serving-llms-on-epyc") as agent:
+        run = agent.prompt(
+            "Serve Qwen/Qwen3-0.6B on this AMD EPYC box with vLLM and zentorch. "
+            "Use the default settings."
+        )
+
+        # Deterministic check (no LLM judge): the skill name appears in the captured logs.
+        run.logs_contains("serving-llms-on-epyc")
+
+        # LLM-judged positive expectations, in the order of the skill's steps (detect -> ... -> launch/poll).
+        run.should("Detect the CPU and confirm it is an AMD EPYC host before serving (e.g. runs detect.py)")
+        run.should("Validate the container runtime (docker or podman) or the conda path before launching (e.g. runs validate.py)")
+        run.should("Take validate.py's environment advisories into account -- the tcmalloc / OpenMP (LD_PRELOAD) perf-library recommendation and, when the image is already pulled, the in-image vllm+zentorch check -- surfacing any that apply")
+        run.should("Check that vLLM supports the model before serving (e.g. runs check_model.py), rather than refusing it just for being multimodal")
+        run.should("Check that the model fits in host RAM (e.g. runs estimate_memory.py)")
+        run.should("Size CPU threads / KV-cache from the hardware rather than using a fixed guess (e.g. runs cpu_tune.py)")
+        run.should("Present a sized plan and ask the user to confirm before launching the server")
+        run.should("Plan to launch with 'vllm serve' and poll until /health is healthy")
+
+        # LLM-judged negative expectations, mirroring the skill's explicit Don'ts.
+        run.should_not("Pass '--device cpu' to vllm serve")
+        run.should_not("Launch the server before the user has confirmed the plan")
+        run.should_not("Enter a debugging loop or retry after a launch failure")
+        run.should_not("Attempt GPU, ROCm, or Instinct serving")
diff --git a/skills/serving-llms-on-epyc/SKILL.md b/skills/serving-llms-on-epyc/SKILL.md
new file mode 100644
index 0000000..7521054
--- /dev/null
+++ b/skills/serving-llms-on-epyc/SKILL.md
@@ -0,0 +1,236 @@
+---
+name: serving-llms-on-epyc
+description: >-
+  Serves a language model on an AMD EPYC CPU host using vLLM with the zentorch
+  backend, in a container (Docker or Podman) or a conda env. Use whenever the
+  user wants to run, serve, deploy, start, host, or launch an LLM on AMD EPYC,
+  Zen CPU, "vLLM on CPU", "zentorch serving", or "serve a model without a GPU".
+  Use for "serve Qwen on EPYC", "start a CPU vLLM endpoint", "run an OpenAI
+  server on my EPYC box", or similar. Handles the full single-instance flow:
+  detect the CPU (incl. EPYC generation), validate the runtime/env, check vLLM
+  supports the model (via vLLM's registry, not a modality blocklist), check it
+  fits host RAM, size CPU threads/KV/NUMA from the hardware, confirm the plan with
+  the user, launch, and poll until the endpoint is responsive. Single instance
+  only. Does NOT debug failures
+  and does NOT retry -- it reports and stops. Do not use for GPU/Instinct (use
+  serving-llms-on-instinct) or multi-node.
+allowed-tools: Bash, Read
+---
+
+# Serving LLMs on AMD EPYC (vLLM + zentorch, CPU)
+
+Bring up a single vLLM OpenAI endpoint on an AMD EPYC host with the zentorch CPU
+backend, sized to the hardware. Container-first (Docker or Podman); conda/host
+is the fallback.
+
+Hard rule for this skill: **on any failure, report the cause + logs and STOP.
+Do not retry, do not debug.** (Debugging is a separate workflow.)
+
+**The agent does the serve flow itself** -- pull, configure, launch, poll --
+using the runtime `validate.py` reports. Never hand the user per-serve commands.
+Like serving-llms-on-instinct, an accessible container runtime is a one-time
+**prerequisite**: if `validate.py` finds none, report its one-time fix (make
+docker accessible / install podman / provide a conda env) and stop. Do not
+attempt `sudo` or privilege escalation.
+
+## Data file
+
+Read `data/epyc.json` directly. It holds the container image, mandatory CPU run
+flags, supported precision, the model-support policy, the default model, and the
+verified throughput-flag gotcha. Do not hardcode the image tag from memory -- read it.
+
+## Step 1: Detect the CPU
+
+```bash
+python3 scripts/detect.py            # add --host user@box for a remote host
+```
+
+Returns `cpu_model`, `is_amd_epyc`, `epyc_generation`
+(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), `zen_arch`, `avx512`,
+`logical_cores`, `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`. If
+`is_amd_epyc` is `false`, stop: this skill targets AMD EPYC. (Other x86 may work
+but is unsupported here.) Carry `epyc_generation` / `avx512` through the later
+phases -- e.g. AVX-512 + bf16 land on Zen4+ (Genoa/Bergamo/Siena/Turin), and Turin
+packs up to 128 Zen 5 (192 Zen 5c) cores/socket, which the thread-binding in Step 5 sizes from.
+
+## Step 2: Validate the runtime and environment
+
+```bash
+python3 scripts/validate.py --image <image from data/epyc.json>
+```
+
+Returns `ready`, `runtime` (`docker`, `podman`, or null), `runtime_detail`,
+`conda_path_available`, `ram_gb`, and `errors/warnings/advisories`. Pick the path:
+- `runtime` is `docker` or `podman` -> container path (Step 6); use that value verbatim as the command prefix.
+- `runtime` null but `conda_path_available: true` -> conda/host path.
+- `runtime` null and no conda -> `ready` is false. Report the one-time
+  onboarding `fix` (make docker accessible / install podman / conda env) and stop.
+
+Do not proceed if `ready` is `false`.
+
+## Step 3: Resolve and validate the model
+
+If the user named no model, use `default_model` from `data/epyc.json`
+(`Qwen/Qwen3-0.6B` -- ungated, tiny, fast first success). Otherwise use theirs.
+
+Check that vLLM actually supports the model (do **not** blanket-block multimodal):
+
+```bash
+python3 scripts/check_model.py --model-id <model> --vllm-version <vllm_version from data/epyc.json>
+```
+
+- Exit 0 = vLLM serves it as a generation endpoint (`kind` `text` or `multimodal`),
+  or support is undeterminable (gated/offline) -- proceed; launch confirms.
+- Exit 1 = positively unsupported: the architecture is not in vLLM's registry, or
+  it is a `pooling`/embedding/reranker (not a chat/completion endpoint). Report the
+  printed `message` and stop.
+- A `multimodal` model is allowed; a vLLM-supported multimodal arch may still hit a
+  GPU-only kernel on CPU, which surfaces at load (the no-retry rule then applies).
+
+**Precision/dtype**: native CPU dtypes are `bf16` (default), `fp16`, `fp32`. Use
+`bfloat16` unless the user asks otherwise.
+
+For gated models (Llama, Gemma) `HF_TOKEN` must be set and the license accepted on
+HuggingFace; if not, stop and say so.
+
+## Step 4: Check it fits host RAM
+
+RAM is the ceiling on CPU (weights + KV cache both live in RAM). Run on ONE line:
+
+```bash
+python3 scripts/estimate_memory.py --model-id <model> --ram-gb <memory_gb from detect> --max-model-len <4096 or user value> --num-prompts <1 or desired concurrency>
+```
+
+Exit 0 = fits, exit 1 = does not fit. If `fit.fits` is false: **do not launch.**
+Tell the user `required_gb` vs `ram_gb` and the printed `fit.action` -- reduce
+`--max-model-len` to `fit.suggested_max_model_len` and retry, or use a smaller
+model. `--max-model-len` and `--num-prompts` are the two knobs that move KV.
+Extra flag: `--weight-gb N` overrides weights if a model has no HF metadata
+(rare). KV cache is bf16-only on zentorch CPU (no fp8 KV).
+
+## Step 5: Size the CPU runtime from the hardware
+
+```bash
+eval "$(python3 scripts/cpu_tune.py)"      # or --format json to inspect
+```
+
+Exports `VLLM_CPU_OMP_THREADS_BIND` (physical cores of **socket 0**) and
+`VLLM_CPU_KVCACHE_SPACE` (GB). It does **not** set `OMP_NUM_THREADS` (vLLM derives
+it from the bind list) or `VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default
+when unset). Default policy, the same for NPS1/NPS2/NPS4: a single instance uses
+**socket 0's whole CPU with no memory binding**. On a multi-socket host the JSON
+gives `container_cpuset` (`--cpuset-cpus` only -- no `--cpuset-mems`) for the
+container path; the conda path needs nothing extra (the bind env var binds the
+threads). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `perf_note` flags that
+optimal per-node binding could give more performance -- surface it, but proceed.
+
+## Step 6: Confirm the plan, then launch (container-first)
+
+Before launching, present this summary and **wait for the user to confirm** -- do
+not launch unprompted. This is the human gate before anything runs:
+
+| Field | Value |
+|---|---|
+| Model / kind | `<model>` -- `text` or `multimodal` (from `check_model.py`) |
+| Path | container (`<runtime>`, image from `data/epyc.json`) or conda/host |
+| Precision | `bfloat16` (or the user's choice) |
+| Fit | required `<required_gb>` GB vs `<ram_gb>` GB RAM |
+| CPU sizing | thread bind `<VLLM_CPU_OMP_THREADS_BIND>` (socket 0), KV `<VLLM_CPU_KVCACHE_SPACE>` GB, no memory binding |
+| Hardware | EPYC `<epyc_generation>` (`<zen_arch>`), `<physical_cores>` cores, AVX-512 `<avx512>` |
+| Port | `<port>` |
+
+Proceed only on a clear "go". If the user declines or wants changes (model,
+`--max-model-len`, port), stop and adjust -- do not launch.
+
+Build the launch from `data/epyc.json`. The CLI is `vllm serve <model>`.
+**Do not pass `--device cpu`** on vLLM >= 0.20 -- the zentorch plugin
+auto-selects the CPU platform and `vllm serve` rejects the flag. Only add it if
+`vllm serve --help` lists it (older vLLM).
+
+**Container path** (`runtime` from validate.py). The agent runs these itself,
+including the pull. `RT` is the resolved runtime verbatim:
+```bash
+RT="<runtime from validate.py: docker | podman>"
+$RT pull <image from data/epyc.json>          # agent pulls; do not ask the user to
+$RT run -d --name vllm-epyc \
+  <run_flags from data/epyc.json>            # --ipc=host --shm-size=16g --network=host
+  <hf_cache_mount> \
+  <container_cpuset from cpu_tune, on multi-socket>   # --cpuset-cpus=... (no --cpuset-mems)
+  --env VLLM_CPU_OMP_THREADS_BIND="$VLLM_CPU_OMP_THREADS_BIND" \
+  --env VLLM_CPU_KVCACHE_SPACE=$VLLM_CPU_KVCACHE_SPACE \
+  --env HF_TOKEN=${HF_TOKEN} \
+  <image from data/epyc.json> \
+  vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len>
+```
+
+**Conda/host path** (no container runtime, `conda_path_available` true). `eval`-ing
+cpu_tune already exported the env vars; just launch -- `VLLM_CPU_OMP_THREADS_BIND`
+binds the threads to socket 0, and there is no memory binding by default:
+```bash
+vllm serve <model> --dtype bfloat16 --port <port> --max-model-len <len> &
+```
+
+Optional throughput flags are **opt-in and must move together** (see Gotchas):
+`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE=0` (+ `ZENTORCH_WEIGHT_PREPACK=1`).
+The base launch sets none of them.
+
+## Step 7: Poll until up and responsive
+
+A 503 while loading is normal. Poll until the server answers, then prove the
+chat endpoint works. CPU first-token compile can take a minute or two.
+
+```bash
+# container alive (or process alive for conda) + /health
+for i in $(seq 1 120); do
+  # container path:
+  $RT inspect -f '{{.State.Running}}' vllm-epyc 2>/dev/null | grep -q true || { echo "FAILED: container exited"; $RT logs --tail 50 vllm-epyc; break; }
+  curl -sf http://localhost:<port>/health >/dev/null 2>&1 && { echo "HEALTHY"; break; }
+  sleep 3
+done
+```
+
+Then validate the OpenAI endpoint is actually accessible:
+```bash
+curl -sf http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
+  -d '{"model":"<model>","messages":[{"role":"user","content":"hi"}],"max_tokens":8}'
+```
+
+Resource sanity check (container path): `$RT stats --no-stream vllm-epyc`.
+
+**If the server never becomes healthy or the endpoint does not respond: print
+the container/process logs, state the failure, and STOP. Do not retry. Do not
+start a debugging loop.**
+
+## Step 8: On success, hand over the endpoint
+
+Print a connection table (model, runtime, port, OMP threads, KV GB, max-model-len,
+NUMA pinning) and a ready-to-run example:
+```bash
+curl -s http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \
+  -d '{"model":"<model>","messages":[{"role":"user","content":"Hello"}]}'
+```
+To stop: `$RT rm -f vllm-epyc` (container) or `kill <pid>` (conda).
+
+## Offline (single-instance batch)
+
+For a one-shot offline run instead of a server, replace Steps 6-8 with a single
+`vllm bench throughput` (or an offline `LLM.generate`) using the same sized env,
+wait for completion, and report the metrics. Same no-retry / no-debug rule.
+
+## Gotchas
+
+See [reference.md](reference.md) for the full list. The load-bearing ones:
+
+- **`--device cpu` was removed** from `vllm serve` in vLLM >= 0.20. The zentorch
+  plugin auto-selects CPU. Passing it makes `vllm serve` error with
+  "unrecognized arguments: --device cpu".
+- **`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init** on vLLM 0.22/0.23 /
+  zentorch 2.11 (`AssertionError: expected OutputCode, got function`). It only
+  works with `VLLM_USE_AOT_COMPILE=0` set alongside it. Never set one without
+  the other.
+- **`--shm-size`**: vLLM needs a large `/dev/shm`; the container default (64MB)
+  is too small. Use `--shm-size=16g` (in `data/epyc.json`).
+- **NUMA**: the default is simple -- one instance on **socket 0's CPUs, no memory
+  binding** (`--cpuset-cpus` from `cpu_tune.py` for the container; the bind env var
+  for conda). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` notes
+  that optimal per-node binding could add performance; the base recipe doesn't do it.
diff --git a/skills/serving-llms-on-epyc/data/epyc.json b/skills/serving-llms-on-epyc/data/epyc.json
new file mode 100644
index 0000000..5206c8e
--- /dev/null
+++ b/skills/serving-llms-on-epyc/data/epyc.json
@@ -0,0 +1,53 @@
+{
+  "vllm_version": "0.22.0",
+  "container": {
+    "image": "amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23",
+    "runtimes": ["docker", "podman"],
+    "comment": "Public vLLM + zentorch CPU image on Docker Hub (amdih/zendnn_zentorch) -- no internal-registry access needed. Tags are vllm_v<ver>_zentorch_v<ver>_<os>_<build>; prefer the newest ubuntu22.04 stable. Both docker and podman are supported; the skill prefers docker and falls back to podman.",
+    "run_flags": [
+      "--ipc=host",
+      "--shm-size=16g",
+      "--network=host"
+    ],
+    "hf_cache_mount": "-v ~/.cache/huggingface:/root/.cache/huggingface",
+    "flag_notes": {
+      "--ipc=host": "vLLM workers use host IPC/shared memory.",
+      "--shm-size=16g": "vLLM needs a large /dev/shm; default 64MB is not enough.",
+      "--network=host": "Expose the served port directly. Alternative: -p <port>:<port>.",
+      "numa": "Default: a single instance uses socket 0's CPUs with NO memory binding (cpu_tune.py emits --cpuset-cpus for the container; conda relies on VLLM_CPU_OMP_THREADS_BIND). On NPS2/NPS4 (multiple NUMA nodes per socket), optimal per-node binding could add performance -- cpu_tune.py notes this; the base recipe does not do it."
+    }
+  },
+  "launch": {
+    "cli": "vllm serve",
+    "device_flag_note": "Do NOT pass --device cpu on vLLM >= 0.20; the zentorch plugin auto-selects the CPU platform and `vllm serve` rejects --device. Only pass it if `vllm serve --help` advertises it (older vLLM)."
+  },
+  "precision": {
+    "native": ["bf16", "fp16", "fp32"],
+    "default": "bfloat16",
+    "notes": "bf16 is the throughput default on EPYC (Zen). fp32 is slower and for debugging only. WOQ (per-channel/per-group int) is supported by zentorch but out of scope for the base recipe."
+  },
+  "model_support": {
+    "check_script": "scripts/check_model.py",
+    "policy": "Do NOT blanket-block multimodal. check_model.py reads the model's HF architectures and checks them against vLLM's model registry for the pinned vllm_version. Text and multimodal generation endpoints are allowed; pooling/embedding/reranker and non-LLM architectures are rejected (not chat/completion endpoints).",
+    "cpu_note": "A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load, where the no-retry rule applies."
+  },
+  "default_model": "Qwen/Qwen3-0.6B",
+  "default_model_notes": "Ungated (Apache-2.0), tiny, fast first success on CPU. For a real workload pick a larger Qwen3 / Llama once the flow is verified.",
+  "smoke_model": "Qwen/Qwen3-0.6B",
+  "smoke_model_notes": "Current small Qwen, chat-capable (ships a chat template, so /v1/chat/completions works -- unlike base models such as opt-125m).",
+  "env_defaults": {
+    "VLLM_CPU_OMP_THREADS_BIND": "set by cpu_tune.py (physical cores of socket 0)",
+    "VLLM_CPU_KVCACHE_SPACE": "set by cpu_tune.py (GB)",
+    "do_not_set": "OMP_NUM_THREADS -- vLLM sets it from the bind list (len of cpu_list); and VLLM_CPU_NUM_OF_RESERVED_CPU -- vLLM has its own default when unset, forcing 0 overrides it."
+  },
+  "throughput_flags_optional": {
+    "TORCHINDUCTOR_FREEZING": "1",
+    "VLLM_USE_AOT_COMPILE": "0",
+    "ZENTORCH_WEIGHT_PREPACK": "1",
+    "gotcha": "VERIFIED on vLLM 0.22.0 / zentorch 2.11.0.1: TORCHINDUCTOR_FREEZING=1 ALONE crashes engine-core init with 'AssertionError: expected OutputCode, got function'. It only works when VLLM_USE_AOT_COMPILE=0 is set alongside it. Never set FREEZING=1 without AOT_COMPILE=0. The base recipe leaves all three unset."
+  },
+  "ram": {
+    "os_headroom_gb": 16,
+    "comment": "Reserve ~16 GB for OS + framework beyond model weights + KV cache when checking fit."
+  }
+}
diff --git a/skills/serving-llms-on-epyc/reference.md b/skills/serving-llms-on-epyc/reference.md
new file mode 100644
index 0000000..db18db2
--- /dev/null
+++ b/skills/serving-llms-on-epyc/reference.md
@@ -0,0 +1,122 @@
+# serving-llms-on-epyc -- Reference
+
+## Table of Contents
+1. [Runtime selection](#runtime-selection)
+2. [Container run flags (CPU)](#container-run-flags-cpu)
+3. [Precision and modality](#precision-and-modality)
+4. [CPU sizing](#cpu-sizing)
+5. [Known quirks](#known-quirks)
+
+---
+
+## Runtime selection
+
+`scripts/validate.py` resolves a runtime the **agent can drive
+non-interactively** and reports it as `runtime` (the exact command prefix the
+agent uses for `pull`/`run`/`stats`/`logs`). Preference order maximizes
+agent-drivability with no human in the loop:
+
+1. **docker** (direct) -- if `docker ps` exits 0 (user in the `docker` group /
+   daemon reachable). No sudo. Best.
+2. **podman** (rootless) -- no daemon, no sudo. Note: rootless podman needs a
+   storage backend that supports its overlay; some networked/`/proj`
+   filesystems reject the overlay `pivot_root` (the run fails even though
+   `podman info` succeeds). On those hosts use docker or the conda path.
+3. **sudo docker** -- only if `sudo -n docker ps` works (passwordless sudo). The
+   agent can still drive it unattended; `runtime` comes back as `"sudo docker"`.
+4. **conda/host** -- requires `import vllm, zentorch` in the active env.
+
+If docker is installed but **none** of the above is agent-drivable (no docker
+group, no passwordless sudo), `validate.py` returns `runtime: null`,
+`runtime_agent_drivable: false`, and a **one-time** setup `fix`:
+`sudo usermod -aG docker $USER && newgrp docker` (or a NOPASSWD sudoers entry).
+This is one-time onboarding, not a per-serve command. After it, every serve is
+fully agent-driven. The skill must not degrade into asking the user to paste
+docker commands for each serve.
+
+## Container run flags (CPU)
+
+From `data/epyc.json`. Unlike the Instinct (GPU) skill there are **no**
+`/dev/kfd`, `/dev/dri`, `--group-add`, or ROCm flags -- this is pure CPU.
+
+| Flag | Why |
+|---|---|
+| `--ipc=host` | vLLM workers use host IPC / shared memory |
+| `--shm-size=16g` | vLLM needs a large `/dev/shm`; the 64MB default is too small |
+| `--network=host` | expose the served port directly (or use `-p <port>:<port>`) |
+| `--cpuset-cpus` | (multi-socket) restrict the container to socket 0's CPUs; from `cpu_tune.py`. No `--cpuset-mems` -- no memory binding by default |
+| `-v ~/.cache/huggingface:/root/.cache/huggingface` | reuse the host model cache |
+
+Image: `amdih/zendnn_zentorch:<tag>` -- the public vLLM + zentorch CPU image on
+Docker Hub (no internal-registry access needed). The exact tag lives in
+`data/epyc.json`; read it, never hardcode it.
+
+## Precision and modality
+
+| Dtype | EPYC (Zen) | Notes |
+|---|---|---|
+| BF16 | Native (default) | throughput default |
+| FP16 | Native | |
+| FP32 | Native | slower; debugging only |
+| WOQ int8/int4 | Supported by zentorch | per-channel / per-group; out of scope for the base recipe |
+
+Modality: not gated by a static blocklist. `scripts/check_model.py` checks the
+model's architecture against vLLM's model registry (pinned to `vllm_version`):
+text **and** multimodal generation endpoints are allowed; pooling/embedding/
+reranker and non-LLM architectures are rejected (not chat/completion endpoints).
+A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU -- that
+surfaces at load, where the no-retry rule applies.
+
+## CPU sizing
+
+Default policy (the same for NPS1/NPS2/NPS4): a single instance uses **socket 0's
+whole CPU with no memory binding**. `scripts/cpu_tune.py` derives:
+- `VLLM_CPU_OMP_THREADS_BIND` = the physical cores of socket 0 (one thread per
+  physical core; SMT siblings do not help vLLM CPU). vLLM sets `OMP_NUM_THREADS`
+  itself from this list, so we don't.
+- `VLLM_CPU_KVCACHE_SPACE` (GB) = `min(mem*kv_frac, mem-16)`; on <=32GB hosts, `mem*0.5`.
+- `container_cpuset` = `--cpuset-cpus=<socket 0 cpus>` (no `--cpuset-mems`) for the
+  container path on a multi-socket host. The conda path needs nothing extra -- the
+  bind env var binds the threads.
+
+Not set: `OMP_NUM_THREADS` (vLLM derives it from the bind) and
+`VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default when unset).
+
+When socket 0 spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` emits a
+`perf_note`: the simple default leaves some performance on the table versus optimal
+per-NUMA-node binding (one instance per node, memory bound). That tuning is out of
+scope for the base recipe.
+
+## Known quirks
+
+**`--device cpu` removed (vLLM >= 0.20)**
+`vllm serve` no longer accepts `--device cpu`; the zentorch plugin auto-selects
+the CPU platform. Passing it -> `vllm: error: unrecognized arguments: --device cpu`.
+Only pass it if `vllm serve --help` advertises it (older vLLM).
+
+**`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE` (VERIFIED)**
+On vLLM 0.23.0 / zentorch 2.11.0.2 (EPYC 9454, facebook/opt-125m, 2026-06-23):
+`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init with
+`AssertionError: expected OutputCode, got function` (inductor codecache). Adding
+`VLLM_USE_AOT_COMPILE=0` fixes it (healthy in ~99s). The only changed variable
+between the failing and passing runs was `VLLM_USE_AOT_COMPILE`. Never set
+`FREEZING=1` without `VLLM_USE_AOT_COMPILE=0`. The base recipe leaves both unset.
+
+**`/dev/shm` too small**
+Without `--shm-size=16g` (or `--ipc=host`), vLLM workers fail to allocate shared
+memory at startup.
+
+**RAM is the ceiling, not VRAM**
+CPU serving keeps weights + KV cache in system RAM. `estimate_memory.py` checks
+`weights + KV(max_model_len x num_prompts) + reserve <= RAM` (reserve default
+16 GB, `--reserve-gb`). It exits 1 when it does not fit and prints
+`suggested_max_model_len` + an `action` to reduce and retry. Weights come from
+HF file sizes (`.safetensors` or legacy `.bin`); `--weight-gb` overrides when a
+model has no metadata. KV cache is bf16-only on zentorch CPU (no fp8 KV), so the estimate always uses 2 bytes/element.
+
+**NUMA cross-node traffic**
+On a 2-socket EPYC, an unpinned instance spreads threads across both sockets and
+pays cross-socket latency. The default keeps one instance on **socket 0's CPUs**
+(`cpu_tune.py` -> `VLLM_CPU_OMP_THREADS_BIND`, plus `--cpuset-cpus` for the
+container), with **no memory binding**. On NPS2/NPS4, `cpu_tune.py` notes that
+optimal per-NUMA-node binding could add performance; the base recipe doesn't do it.
diff --git a/skills/serving-llms-on-epyc/scripts/check_model.py b/skills/serving-llms-on-epyc/scripts/check_model.py
new file mode 100644
index 0000000..534bfea
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/check_model.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+Does vLLM support this model's architecture? -- so the skill checks real vLLM
+support instead of blanket-blocking multimodal.
+
+Reads the model's `architectures` from its HF config.json, then checks them
+against vLLM's model registry for the pinned vLLM version. The registry comes
+from the version-pinned registry.py on GitHub (no vLLM install needed); if that
+is unreachable it falls back to an importable local `vllm`. Generation endpoints
+(text + multimodal) are supported; pooling/embedding/reranker and non-LLM
+architectures are not chat/completion endpoints and are rejected.
+
+    check_model.py --model-id Qwen/Qwen3-0.6B
+    check_model.py --model-id <id> --vllm-version 0.22.0
+
+Exit 0 if vLLM serves it as a generation endpoint (or support is undeterminable
+-- launch confirms), 1 if it is positively unsupported. JSON to stdout.
+Env: HF_TOKEN for gated models.
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import urllib.request
+import urllib.error
+
+HF = "https://huggingface.co"  # base URL used to fetch a model's config.json
+GH_RAW = "https://raw.githubusercontent.com/vllm-project/vllm"  # raw-file base of the vLLM repo
+REG_PATH = "vllm/model_executor/models/registry.py"  # registry module path inside that repo
+
+# registry.py dict name -> kind we care about (dicts not listed here are skipped by the parser)
+_SECTIONS = {
+    "_TEXT_GENERATION_MODELS": "text",
+    "_TRANSFORMERS_BACKEND_MODELS": "text",
+    "_MULTIMODAL_MODELS": "multimodal",
+    "_EMBEDDING_MODELS": "pooling",
+    "_POOLING_MODELS": "pooling",
+    "_CROSS_ENCODER_MODELS": "pooling",
+}
+
+
+def _get(url, token=None):
+    """Fetch `url` and decode the body as UTF-8. Returns (text, None) or (None, reason)."""
+    hdrs = {"User-Agent": "check-model/1"}
+    hdrs.update({"Authorization": f"Bearer {token}"} if token else {})
+    http_reasons = {401: "not found or gated (set HF_TOKEN)",
+                    403: "access denied -- accept the model license on HuggingFace",
+                    404: "not found"}
+    try:
+        with urllib.request.urlopen(urllib.request.Request(url, headers=hdrs), timeout=30) as resp:
+            return resp.read().decode("utf-8"), None
+    except urllib.error.HTTPError as exc:
+        return None, http_reasons.get(exc.code, f"HTTP {exc.code}")
+    except Exception as exc:
+        return None, str(exc)
+
+
+def model_architectures(model, rev, token):
+    """Read `architectures` from the model's HF config.json. Returns (list, error)."""
+    body, problem = _get(f"{HF}/{model}/resolve/{rev}/config.json", token)
+    if body is None:
+        return None, problem
+    try:
+        declared = json.loads(body).get("architectures")
+    except ValueError:
+        return None, "config.json is not valid JSON"
+    return declared or [], None
+
+
+def registry_from_github(version):
+    """Parse vLLM's registry.py at tag v<version>. Returns ({arch: kind}, source) or (None, err)."""
+    text, problem = _get(f"{GH_RAW}/v{version}/{REG_PATH}")
+    if text is None:
+        return None, problem
+    section_re = re.compile(r"^(_[A-Z0-9_]+_MODELS)\s*(?::[^=]+)?=\s*\{")
+    entry_re = re.compile(r'^"([A-Za-z0-9_]+)"\s*:')
+    found, kind = {}, None
+    for raw in text.splitlines():
+        stripped = raw.strip()
+        opened = section_re.match(stripped)
+        if opened:
+            kind = _SECTIONS.get(opened.group(1))  # None for dicts we do not track
+        elif stripped.startswith("}"):
+            kind = None
+        elif kind:
+            entry = entry_re.match(stripped)
+            if entry:
+                found[entry.group(1)] = kind
+    return (found or None), (f"github:v{version}" if found else "registry.py had no parseable archs")
+
+
+def registry_from_local():
+    """Coarse fallback: an importable local `vllm` (text vs multimodal). Returns ({arch: kind}, source) or (None, None) -- also when `python` is missing, hangs, or prints non-dict JSON."""
+    snippet = (
+        "import json;"
+        "from vllm import ModelRegistry as R;"
+        "a=list(R.get_supported_archs());"
+        "mm=set(x for x in a if R.is_multimodal_model([x]));"
+        "print(json.dumps({'archs':a,'mm':list(mm)}))"
+    )
+    try:  # a missing `python` binary or a hung import must not crash the check
+        r = subprocess.run(["python", "-c", snippet], stdout=subprocess.PIPE,
+                           stderr=subprocess.PIPE, text=True, timeout=60)
+        d = json.loads(r.stdout) if r.returncode == 0 and r.stdout.strip() else None
+    except (OSError, subprocess.SubprocessError, ValueError):
+        return None, None
+    if not isinstance(d, dict):
+        return None, None
+    mm = set(d.get("mm", []))
+    return {a: ("multimodal" if a in mm else "text") for a in d.get("archs", [])}, "vllm-import"
+
+
+def main():  # CLI entry point: prints one JSON verdict; exit 0 = supported or undetermined, 1 = unsupported
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--model-id", required=True)
+    p.add_argument("--revision", default="main")
+    p.add_argument("--vllm-version", default="0.22.0", help="pin the registry to this vLLM version (from data/epyc.json)")
+    a = p.parse_args()
+    token = os.environ.get("HF_TOKEN", "")
+
+    archs, aerr = model_architectures(a.model_id, a.revision, token)
+    if not archs:
+        # Config unreadable (gated/offline) or it declares no architectures -- do not
+        # positively block; the gating check and launch will catch real problems.
+        print(json.dumps({"model_id": a.model_id, "supported": None, "kind": "undetermined",
+                           "message": f"Could not read architectures ({aerr or 'none declared'}); support unverified. "
+                                      "If gated, set HF_TOKEN. This does not bypass the gating/launch checks."}, indent=2))
+        sys.exit(0)
+
+    reg, source = registry_from_github(a.vllm_version)
+    if reg is None:
+        reg, source = registry_from_local()  # GitHub unreachable/unparseable: try a local vllm install
+    if reg is None:
+        print(json.dumps({"model_id": a.model_id, "architectures": archs, "supported": None,
+                           "kind": "undetermined",
+                           "message": "Could not load vLLM's model registry (no network and no importable vllm); "
+                                      "support unverified. vLLM confirms support at load (no-retry rule applies)."}, indent=2))
+        sys.exit(0)
+
+    kinds = [reg.get(arch) for arch in archs]
+    known = [k for k in kinds if k]  # kinds of the architectures the registry knows
+    out = {"model_id": a.model_id, "architectures": archs, "registry_source": source}
+
+    if not known:
+        out.update(supported=False, kind="unsupported",
+                   message=f"vLLM has no registry entry for {archs}; it cannot serve this model on any backend. Stop.")
+        print(json.dumps(out, indent=2))
+        sys.exit(1)
+
+    if any(k in ("text", "multimodal") for k in known):  # one generation-capable arch is enough
+        kind = "multimodal" if "multimodal" in known else "text"
+        msg = f"vLLM supports {archs} as a {kind} generation endpoint."
+        if kind == "multimodal":
+            msg += " A multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load (no-retry rule applies)."
+        out.update(supported=True, kind=kind, message=msg)
+        print(json.dumps(out, indent=2))
+        sys.exit(0)
+
+    out.update(supported=False, kind="pooling",  # every known arch is pooling-only
+               message=f"{archs} is a pooling/embedding/reranker model in vLLM, not a chat/completion endpoint. Stop.")
+    print(json.dumps(out, indent=2))
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/cpu_tune.py b/skills/serving-llms-on-epyc/scripts/cpu_tune.py
new file mode 100644
index 0000000..f37dbc5
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/cpu_tune.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""
+Derive vLLM-on-CPU runtime knobs from the host. Deterministic and read-only.
+
+Default policy (kept deliberately simple): a single instance uses **socket 0's
+entire CPU**, with **no memory binding** -- regardless of NPS mode (NPS1/2/4).
+
+Emits two env vars:
+  - VLLM_CPU_OMP_THREADS_BIND : physical cores of socket 0. vLLM binds its OMP
+    threads to these and sets OMP_NUM_THREADS itself (= len(cores)), so we don't.
+  - VLLM_CPU_KVCACHE_SPACE    : KV-cache RAM (GB).
+
+And, for the container path on a multi-socket host, a CPU-only cpuset:
+  - container_cpuset : --cpuset-cpus=<socket 0 physical cores>   (no --cpuset-mems)
+The conda path needs nothing extra -- VLLM_CPU_OMP_THREADS_BIND binds the threads.
+
+We do NOT set OMP_NUM_THREADS (vLLM derives it) or VLLM_CPU_NUM_OF_RESERVED_CPU
+(vLLM has its own default when unset).
+
+If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `perf_note` flags that optimal
+per-node binding could give more performance -- the default does not do it.
+
+Usage:
+    python3 scripts/cpu_tune.py                 # export lines for `eval`
+    python3 scripts/cpu_tune.py --format json   # machine-readable
+    python3 scripts/cpu_tune.py --kv-frac 0.5
+"""
+
+import argparse
+import json
+import re
+import subprocess
+import sys
+
+OS_HEADROOM_GB = 16  # GB of RAM that main() keeps out of the KV-cache budget
+
+
+def _sh(cmd):
+    """Run `cmd` through the shell (15 s cap) and return its stdout; "" on any failure to run."""
+    try:
+        return subprocess.run(cmd, shell=True, text=True, timeout=15,
+                              stdout=subprocess.PIPE, stderr=subprocess.PIPE).stdout
+    except Exception:
+        return ""
+
+
+def _lscpu_int(out, label, default):
+    hit = re.search(r"^" + re.escape(label) + r":\s*(\d+)", out, re.MULTILINE)  # "<label>:   <int>" line
+    return default if hit is None else int(hit.group(1))
+
+
+def _ranges(cpus):
+    """Collapse a sorted int list into range notation.
+
+    Example: [0,1,2,5] -> '0-2,5'; an empty list gives ''."""
+    if not cpus:
+        return ""
+    spans = [[cpus[0], cpus[0]]]  # [first, last] of each run of consecutive ids
+    for cpu in cpus[1:]:
+        if cpu == spans[-1][1] + 1:
+            spans[-1][1] = cpu
+        else:
+            spans.append([cpu, cpu])
+    return ",".join(f"{lo}-{hi}" if lo != hi else f"{lo}" for lo, hi in spans)
+
+
+def socket0_physical_cpus():
+    """One logical CPU per physical core of socket 0, parsed from `lscpu -p`.
+    SMT siblings are skipped: vLLM CPU gains nothing from SMT, so only physical cores run."""
+    chosen, cores_seen = [], set()
+    rows = (ln.split(",") for ln in _sh("lscpu -p=CPU,CORE,SOCKET").splitlines()
+            if ln.strip() and not ln.startswith("#"))
+    for fields in rows:
+        if len(fields) < 3 or fields[2] != "0":
+            continue
+        cpu, core_id = int(fields[0]), fields[1]
+        if core_id in cores_seen:
+            continue
+        cores_seen.add(core_id)
+        chosen.append(cpu)
+    return sorted(chosen)
+
+
+def main():  # CLI entry point: prints shell `export` lines (default) or one JSON object
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--kv-frac", type=float, default=0.4, help="fraction of RAM for KV cache")
+    p.add_argument("--format", choices=["env", "json"], default="env")
+    args = p.parse_args()
+
+    lscpu = _sh("lscpu")
+    numa = _lscpu_int(lscpu, "NUMA node(s)", 1)
+    sockets = _lscpu_int(lscpu, "Socket(s)", 1)
+    nodes_per_socket = max(1, numa // max(1, sockets))
+
+    mem = 0  # total RAM in whole GB; stays 0 if MemTotal cannot be read
+    m = re.search(r"MemTotal:\s*(\d+)", _sh("grep MemTotal /proc/meminfo"))
+    if m:
+        mem = int(m.group(1)) // (1024 * 1024)
+    if mem <= 2 * OS_HEADROOM_GB:
+        kv = max(1, int(mem * 0.5))  # small host: KV gets half of RAM (--kv-frac is not used here)
+    else:
+        kv = max(1, min(int(mem * args.kv_frac), mem - OS_HEADROOM_GB))  # capped so OS_HEADROOM_GB stays free
+
+    bind = _ranges(socket0_physical_cpus())
+    # CPU-only cpuset for the container, physical cores only (same list as the
+    # bind). Only meaningful when there is more than one socket to exclude. No
+    # --cpuset-mems (no memory binding by default).
+    container_cpuset = f"--cpuset-cpus={bind}" if sockets > 1 else ""
+
+    perf_note = ""
+    if nodes_per_socket > 1:
+        perf_note = (f"socket 0 spans {nodes_per_socket} NUMA nodes (NPS{nodes_per_socket}); "
+                     "the default uses the whole socket with no memory binding. Optimal "
+                     "per-NUMA-node binding (memory bound to each node) could give more "
+                     "performance -- not done by default.")
+
+    result = {
+        "vllm_cpu_omp_threads_bind": bind,
+        "vllm_cpu_kvcache_space_gb": kv,
+        "sockets": sockets,
+        "numa_nodes": numa,
+        "nodes_per_socket": nodes_per_socket,
+        "container_cpuset": container_cpuset,
+        "memory_gb": mem,
+        "perf_note": perf_note,
+    }
+
+    if args.format == "json":
+        print(json.dumps(result, indent=2))
+        return
+
+    print(f'export VLLM_CPU_OMP_THREADS_BIND="{bind}"')  # env format: only the export lines are not comments
+    print(f"export VLLM_CPU_KVCACHE_SPACE={kv}")
+    print(f"# default: socket 0's CPUs, no memory binding ({sockets} socket(s), {numa} NUMA node(s))")
+    if container_cpuset:
+        print(f"#   container: {container_cpuset}")
+    print("#   conda: VLLM_CPU_OMP_THREADS_BIND binds the threads; nothing else needed")
+    if perf_note:
+        print(f"# NOTE: {perf_note}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/detect.py b/skills/serving-llms-on-epyc/scripts/detect.py
new file mode 100644
index 0000000..c0c3340
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/detect.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+Detect AMD EPYC CPU hardware for vLLM + zentorch serving.
+
+Usage:
+    python3 scripts/detect.py
+    python3 scripts/detect.py --host user@hostname
+
+Output: JSON with cpu_model, is_amd_epyc, logical_cores, physical_cores,
+sockets, threads_per_core, numa_nodes, memory_gb, epyc_generation
+(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), zen_arch, and avx512. Exits 0 on
+success, 1 if no CPU info could be read.
+
+Env vars (used when --host is not given):
+    ZEN_SSH_HOST, ZEN_SSH_USER, ZEN_SSH_PORT
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+
+
+def _is_local(host):
+    return host in ("local", "localhost", "127.0.0.1") or not host  # empty host also means this machine
+
+
+def _run(cmd, host, user, port, timeout=20):
+    """Run `cmd` locally (via the shell) or over ssh. Returns (rc, stdout, stderr); a timeout or a missing binary yields rc 1 instead of raising."""
+    argv = cmd if _is_local(host) else [
+        "ssh", "-o", "StrictHostKeyChecking=accept-new", "-o", "ConnectTimeout=15",
+        "-o", "BatchMode=yes", "-o", "LogLevel=ERROR", "-p", str(port),
+        f"{user}@{host}" if user else host, cmd]
+    try:
+        r = subprocess.run(argv, shell=isinstance(argv, str), stdout=subprocess.PIPE,
+                           stderr=subprocess.PIPE, text=True, timeout=timeout)
+    except (OSError, subprocess.TimeoutExpired) as e:  # surface as a failed command so callers can report it
+        return 1, "", str(e)
+    return r.returncode, r.stdout, r.stderr
+
+
+def _lscpu_field(lscpu_out, label):
+    hit = re.search(r"^" + re.escape(label) + r":\s*(.+)$", lscpu_out, re.MULTILINE)  # "<label>: <value>" line
+    return "" if hit is None else hit.group(1).strip()
+
+
+def _epyc_generation(model):
+    """Map an AMD EPYC model name to (generation, zen_arch); ("unknown", "unknown") if unrecognised.
+
+    The first and last characters of the 4-character model number encode the generation:
+    7xx1=Naples (Zen1), 7xx2=Rome (Zen2), 7xx3=Milan (Zen3), 8xx4=Siena (Zen4c), 97x4=Bergamo
+    (Zen4c), 9xx4=Genoa (Zen4), 9xx5=Turin (Zen5); the middle two may be letters (7F52, 73F3, 7H12).
+    Carry this through every phase (e.g. AVX-512 + bf16 land on Zen4+, Turin has up to 128 cores per socket -> thread binding)."""
+    m = re.search(r"EPYC\s+(\d[0-9A-Z]{2}\d)", model.upper())
+    if not m:
+        return "unknown", "unknown"
+    num = m.group(1)
+    first, last = num[0], num[3]
+    if first == "7":
+        return {"1": ("Naples", "Zen1"), "2": ("Rome", "Zen2"),
+                "3": ("Milan", "Zen3")}.get(last, ("unknown", "unknown"))
+    if first == "8" and last == "4":
+        return "Siena", "Zen4c"
+    if first == "9":
+        if num.startswith("97") and last == "4":
+            return "Bergamo", "Zen4c"
+        if last == "4":
+            return "Genoa", "Zen4"
+        if last == "5":
+            return "Turin", "Zen5"
+    return "unknown", "unknown"
+
+
+def main():  # CLI entry point: prints one JSON object; exits 1 (with a JSON error) if lscpu yields nothing
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--host", default="", help="[user@]host (default: local or ZEN_SSH_HOST)")
+    p.add_argument("--user", default="")
+    p.add_argument("--port", type=int, default=0)
+    args = p.parse_args()
+
+    host, user = args.host, args.user
+    if "@" in host:
+        user, host = host.split("@", 1)  # the user@host form overrides --user
+    host = host or os.environ.get("ZEN_SSH_HOST", "")
+    user = user or os.environ.get("ZEN_SSH_USER", "")
+    port = args.port or int(os.environ.get("ZEN_SSH_PORT", "22"))
+
+    rc, lscpu_out, err = _run("lscpu", host, user, port)
+    if rc != 0 or not lscpu_out:
+        print(json.dumps({"error": "lscpu failed",
+                          "detail": err.strip() or f"exit {rc}"}))
+        sys.exit(1)
+
+    model = _lscpu_field(lscpu_out, "Model name") or "unknown"
+    vendor = _lscpu_field(lscpu_out, "Vendor ID")
+
+    def _int(label, default=0):  # integer lscpu field, or `default` when missing/non-numeric
+        v = _lscpu_field(lscpu_out, label)
+        try:
+            return int(v)
+        except ValueError:
+            return default
+
+    sockets = _int("Socket(s)", 1)
+    cores_per_socket = _int("Core(s) per socket", 0)
+    threads_per_core = _int("Thread(s) per core", 1) or 1  # never 0: it is used as a divisor below
+    numa_nodes = _int("NUMA node(s)", 1)
+
+    rc, nproc_out, _ = _run("nproc --all", host, user, port)
+    try:
+        logical = int(nproc_out.strip())
+    except (ValueError, AttributeError):
+        logical = sockets * cores_per_socket * threads_per_core  # nproc unusable: derive from lscpu topology
+
+    physical = sockets * cores_per_socket if cores_per_socket else logical // threads_per_core
+
+    rc, mem_out, _ = _run("grep MemTotal /proc/meminfo", host, user, port)
+    mem_kb = 0
+    m = re.search(r"(\d+)", mem_out or "")
+    if m:
+        mem_kb = int(m.group(1))
+    memory_gb = mem_kb // (1024 * 1024)  # rounded down; 0 when MemTotal could not be read
+
+    is_epyc = vendor == "AuthenticAMD" and "EPYC" in model.upper()
+    generation, zen_arch = _epyc_generation(model)
+    avx512 = "avx512f" in _lscpu_field(lscpu_out, "Flags").split()
+
+    print(json.dumps({
+        "cpu_model": model,
+        "vendor": vendor,
+        "is_amd_epyc": is_epyc,
+        "epyc_generation": generation,
+        "zen_arch": zen_arch,
+        "avx512": avx512,
+        "logical_cores": logical,
+        "physical_cores": physical,
+        "sockets": sockets,
+        "threads_per_core": threads_per_core,
+        "numa_nodes": numa_nodes,
+        "memory_gb": memory_gb,
+        "target": "local" if _is_local(host) else host,
+    }, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/estimate_memory.py b/skills/serving-llms-on-epyc/scripts/estimate_memory.py
new file mode 100644
index 0000000..75c50ad
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/estimate_memory.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python3
+"""
+Does a HuggingFace model fit in host RAM for CPU serving at a given context?
+
+No download -- reads HF metadata over HTTP. Answers one question:
+    weights + KV(max_model_len x num_prompts) + reserve  <=  RAM ?
+If not, prints the largest max_model_len that would fit, so you reduce it and
+retry. Exit 0 = fits, 1 = does not fit (or error).
+
+    estimate_memory.py --model-id Qwen/Qwen3-8B --ram-gb 755 --max-model-len 4096 --num-prompts 8
+
+Three sub-problems, one function each: weight_gb(), kv_bytes_per_token(), fit().
+Env: HF_TOKEN for gated models. --weight-gb overrides weights if metadata is missing.
+"""
+
+import argparse
+import json
+import os
+import sys
+import urllib.request
+import urllib.error
+
+HF = "https://huggingface.co"  # base URL for the file-tree API and config.json
+KV_BYTES_PER_ELEM = 2  # zentorch CPU KV cache is bf16-only (2 bytes); no fp8 KV support
+
+
+def _get(url, token):
+    """Fetch `url` and parse the body as JSON. Returns (data, None) or (None, reason)."""
+    hdrs = {"User-Agent": "estimate-memory/2"}
+    hdrs.update({"Authorization": f"Bearer {token}"} if token else {})
+    http_reasons = {401: "not found, or gated (set HF_TOKEN if it is gated)",
+                    403: "access denied -- accept the model license on HuggingFace",
+                    404: "model not found"}
+    try:
+        with urllib.request.urlopen(urllib.request.Request(url, headers=hdrs), timeout=30) as resp:
+            return json.load(resp), None
+    except urllib.error.HTTPError as exc:
+        return None, http_reasons.get(exc.code, f"HTTP {exc.code}")
+    except Exception as exc:
+        return None, str(exc)
+
+
+def weight_gb(model, rev, token):
+    """(1) Weight RAM = sum of weight-file sizes in the HF file tree; file size is ground
+    truth even for quantized checkpoints. Counts the .safetensors files, or -- only when
+    there are none -- the legacy *model*.bin files, so a repo that ships both formats
+    (e.g. model.safetensors + pytorch_model.bin) is not counted twice. Returns (gb, error)."""
+    tree, err = _get(f"{HF}/api/models/{model}/tree/{rev}", token)
+    if not isinstance(tree, list):
+        return None, err or "no file tree"
+    files = [f for f in tree if f.get("type") == "file"]
+    total = sum(f.get("size", 0) for f in files
+                if f.get("path", "").endswith(".safetensors"))
+    if total == 0:  # no safetensors in the repo: fall back to legacy PyTorch checkpoints
+        total = sum(f.get("size", 0) for f in files
+                    if f.get("path", "").endswith(".bin") and "model" in f.get("path", "").lower())
+    if total == 0:
+        return None, "no weight files (.safetensors/.bin) found -- pass --weight-gb"
+    return round(total / 2**30, 2), None
+
+
+def get_config(model, rev, token):
+    """Fetch config.json; for multimodal models return the nested LLM sub-config instead."""
+    cfg, _ = _get(f"{HF}/{model}/resolve/{rev}/config.json", token)
+    if not cfg or "num_hidden_layers" in cfg:
+        return cfg
+    for key in ("text_config", "language_config", "llm_config"):
+        nested = cfg.get(key)
+        if isinstance(nested, dict) and nested.get("num_hidden_layers"):  # copy; inherit the top-level limit if absent
+            return {"max_position_embeddings": cfg.get("max_position_embeddings"), **nested}
+    return cfg
+
+
+def kv_bytes_per_token(cfg):
+    """(2) Bytes of KV cache per token: 2 (K,V) x layers x kv_heads x head_dim x element size.
+    Element size is KV_BYTES_PER_ELEM (bf16); MLA configs (`kv_lora_rank`, e.g. DeepSeek) use the latent width."""
+    if not cfg or not cfg.get("num_hidden_layers"):
+        return 0
+    per_unit_width = 2 * cfg["num_hidden_layers"] * KV_BYTES_PER_ELEM
+    if "kv_lora_rank" in cfg:  # MLA: latent KV
+        return per_unit_width * (cfg["kv_lora_rank"] + cfg.get("qk_rope_head_dim", 0))
+    attn_heads = cfg.get("num_attention_heads", 0)
+    kv_heads = cfg.get("num_key_value_heads", attn_heads)
+    head_dim = cfg.get("head_dim") or (cfg.get("hidden_size", 0) // max(1, attn_heads))
+    return per_unit_width * kv_heads * head_dim
+
+
+def fit(weight, kv_per_tok, ctx, prompts, ram, reserve):
+    """(3) Verdict + the largest max_model_len (multiple of 256) that would fit if it doesn't, with an `action` naming the real blocker."""
+    kv_gb = kv_per_tok * ctx * prompts / 2**30
+    required = round(weight + kv_gb + reserve, 2)
+    out = {"max_model_len": ctx, "num_prompts": prompts, "weight_gb": weight, "kv_cache_gb": round(kv_gb, 2),
+           "reserve_gb": reserve, "required_gb": required, "ram_gb": ram, "fits": required <= ram}
+    if not out["fits"]:
+        budget = (ram - weight - reserve) * 2**30  # bytes of RAM left for the KV cache
+        best = int(budget / (kv_per_tok * prompts)) // 256 * 256 if kv_per_tok and prompts and budget > 0 else 0
+        out["suggested_max_model_len"] = max(0, best)
+        out["action"] = (f"reduce --max-model-len to {best} or less and retry" if best >= 256
+                         else "weights alone exceed RAM -- use a smaller model" if budget <= 0
+                         else "KV cache does not fit even at 256 tokens -- lower --num-prompts or use a smaller model")
+    return out
+
+
+def main():  # CLI entry point: prints one JSON object; exit 1 on error or when the fit verdict is negative
+    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--model-id", required=True)
+    p.add_argument("--revision", default="main")
+    p.add_argument("--ram-gb", type=float, default=0, help="host RAM (enables the fit verdict)")
+    p.add_argument("--max-model-len", type=int, default=4096)
+    p.add_argument("--num-prompts", type=int, default=1, help="concurrent sequences")
+    p.add_argument("--reserve-gb", type=float, default=16, help="RAM held back for OS + vLLM runtime")
+    p.add_argument("--weight-gb", type=float, default=0, help="override weight RAM if metadata is unavailable")
+    a = p.parse_args()
+    token = os.environ.get("HF_TOKEN", "")
+
+    w = a.weight_gb if a.weight_gb > 0 else None  # an explicit --weight-gb wins over HF metadata
+    if w is None:
+        w, err = weight_gb(a.model_id, a.revision, token)
+        if w is None:
+            print(json.dumps({"error": err, "model_id": a.model_id}))
+            sys.exit(1)
+
+    cfg = get_config(a.model_id, a.revision, token)
+    kv_per_tok = kv_bytes_per_token(cfg)  # 0 when the config is unreadable, so the verdict then ignores KV
+    max_seq = cfg.get("max_position_embeddings") if cfg else None
+    ctx = min(a.max_model_len, max_seq) if max_seq else a.max_model_len  # clamp to the model's own limit
+
+    out = {"model_id": a.model_id, "weight_gb": w, "kv_dtype": "bf16",
+           "kv_bytes_per_token": kv_per_tok, "model_max_seq_len": max_seq}
+    if a.ram_gb > 0:
+        out["fit"] = fit(w, kv_per_tok, ctx, a.num_prompts, a.ram_gb, a.reserve_gb)
+
+    print(json.dumps(out, indent=2))
+    sys.exit(0 if out.get("fit", {"fits": True})["fits"] else 1)  # no --ram-gb -> no verdict -> exit 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/scripts/validate.py b/skills/serving-llms-on-epyc/scripts/validate.py
new file mode 100644
index 0000000..95fd37a
--- /dev/null
+++ b/skills/serving-llms-on-epyc/scripts/validate.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+"""
+Validate the environment before serving vLLM + zentorch on an EPYC CPU host.
+
+Checks a container runtime (docker or podman), whether the vLLM+zentorch image
+is present (and, if already pulled, that `import vllm, zentorch` works inside it),
+a conda/host fallback (`import vllm, zentorch`), the host perf libraries
+(tcmalloc / OpenMP via LD_PRELOAD), HF_TOKEN, and RAM. Each issue is error
+(blocks launch) / warning (degrades) / advisory (info).
+
+Usage:
+    python3 scripts/validate.py
+    python3 scripts/validate.py --image amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23
+
+Exits 0 if no error-severity issues remain, 1 otherwise. JSON to stdout.
+"""
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+
+
+def _sh(cmd, timeout=20):
+    """Run `cmd` through the shell. Returns (rc, stdout, stderr) with both streams stripped."""
+    try:
+        done = subprocess.run(cmd, shell=True, text=True, timeout=timeout, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except subprocess.TimeoutExpired:
+        return 1, "", f"timed out after {timeout}s"
+    return done.returncode, done.stdout.strip(), done.stderr.strip()
+
+
+def _detect_runtime():
+    """Choose a usable container runtime, preferring docker (daemon reachable) over
+    podman (rootless). Returns (runtime, detail), or (None, reason) when neither works.
+
+    An accessible runtime is a PREREQUISITE (as in serving-llms-on-instinct): this only
+    probes and reports a reason; it never escalates privileges (no sudo).
+    """
+    reason = "docker not installed"
+    if shutil.which("docker"):
+        rc, _, err = _sh("docker ps -q")
+        if rc == 0:
+            return "docker", "docker reachable"
+        reason = (err or "docker ps failed").splitlines()[0][:120]
+    if shutil.which("podman"):
+        rc, _, err = _sh("podman info --format '{{.Host.Arch}}'")
+        if rc == 0:
+            return "podman", "podman available (rootless)"
+        if err:
+            reason = err.splitlines()[0][:120]
+    return None, reason
+
+
+def main():  # CLI entry point: prints one JSON report; exit 1 only if an error-severity issue exists
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--image", default="", help="container image to check for (advisory)")
+    args = p.parse_args()
+
+    issues = []  # each entry: {"check", "severity", "message"[, "fix"]}
+
+    # 1. Container runtime (prerequisite): docker > podman, else conda fallback.
+    runtime, detail = _detect_runtime()
+    conda_ok = _sh('python -c "import vllm, zentorch"')[0] == 0  # host `python` can import both
+
+    if runtime is None:
+        if conda_ok:
+            issues.append({"check": "container_runtime", "severity": "warning",
+                           "message": f"No accessible container runtime ({detail}); using the conda/host path.",
+                           "fix": "For the container path, make docker accessible or install rootless podman (see fix below)."})
+        else:
+            issues.append({"check": "container_runtime", "severity": "error",
+                           "message": f"No accessible container runtime ({detail}) and no host vllm+zentorch.",
+                           "fix": "One-time onboarding: add your user to the docker group "
+                                  "(sudo usermod -aG docker $USER, then re-login) or start the daemon; "
+                                  "OR install rootless podman; OR activate a conda env with vllm+zentorch."})
+
+    # 2. Image present + (only if already pulled) zentorch inside it. The in-image
+    #    import check runs ONLY when the image is local, so it never triggers a
+    #    multi-GB pull just to validate.
+    if runtime and args.image:
+        repo = args.image.rsplit(":", 1)[0]  # strip the tag, keep any host:port/repo
+        rc, out, _ = _sh(f"{runtime} images {repo} --format '{{{{.Repository}}}}:{{{{.Tag}}}}'")
+        if args.image not in (out or ""):  # substring match against the "repo:tag" lines listed above
+            issues.append({"check": "image", "severity": "advisory",
+                           "message": f"Image {args.image} not pulled yet; first launch will download it (in-image zentorch check deferred to launch).",
+                           "fix": f"{runtime} pull {args.image}"})
+        else:
+            rc, ver, err = _sh(f'{runtime} run --rm {args.image} '
+                               f'python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"', timeout=90)
+            if rc == 0 and ver:
+                issues.append({"check": "image_stack", "severity": "advisory",
+                               "message": f"Image has vllm+zentorch ({ver})."})
+            else:
+                issues.append({"check": "image_stack", "severity": "warning",
+                               "message": f"Image {args.image} is present but `import vllm, zentorch` failed inside it: {(err or 'unknown')[:120]}",
+                               "fix": "Use an image tag that bundles the zentorch plugin (see data/epyc.json)."})
+
+    # 3. Host vllm+zentorch (for the conda path)
+    if conda_ok:
+        _, ver, _ = _sh('python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"')
+        issues.append({"check": "host_stack", "severity": "advisory",
+                       "message": f"Host vllm+zentorch importable ({ver}); conda path available."})
+    elif runtime:
+        issues.append({"check": "host_stack", "severity": "advisory",
+                       "message": "Host `import vllm, zentorch` not available; use the container path."})
+
+    # 4. HF_TOKEN
+    if not os.environ.get("HF_TOKEN"):
+        issues.append({"check": "hf_token", "severity": "advisory",
+                       "message": "HF_TOKEN not set. Required for gated models (Llama, Gemma); not needed for Qwen3.",
+                       "fix": "export HF_TOKEN=hf_..."})
+
+    # 5. RAM
+    rc, out, _ = _sh("grep MemTotal /proc/meminfo | awk '{print int($2/1024/1024)}'")
+    try:
+        ram_gb = int(out)
+    except ValueError:
+        ram_gb = 0  # unreadable: reported as 0 and the low-RAM warning below is skipped
+    if 0 < ram_gb < 32:
+        issues.append({"check": "ram", "severity": "warning",
+                       "message": f"Only {ram_gb} GB RAM. CPU serving keeps weights + KV cache in RAM; large models may not fit.",
+                       "fix": "Use a small model or a host with more RAM."})
+
+    # 6. Perf libraries for the host/conda path (advisory). vLLM CPU wants
+    #    libtcmalloc + libiomp (OpenMP) preloaded and warns otherwise. The
+    #    container image sets these itself, so only check the host when the
+    #    conda/host path is viable.
+    if conda_ok:
+        ld = os.environ.get("LD_PRELOAD", "")
+        missing = [lib for lib in ("libtcmalloc", "libiomp") if lib not in ld]  # substring match on LD_PRELOAD
+        if missing:
+            issues.append({"check": "perf_libs", "severity": "advisory",
+                           "message": f"LD_PRELOAD is missing {', '.join(missing)}; vLLM CPU warns about this and throughput suffers without them (host/conda path).",
+                           "fix": "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$CONDA_PREFIX/lib/libiomp5.so:$LD_PRELOAD"})
+
+    errors = [i for i in issues if i["severity"] == "error"]
+    result = {
+        "ready": len(errors) == 0,
+        "runtime": runtime,
+        "runtime_detail": detail,
+        "conda_path_available": conda_ok,
+        "ram_gb": ram_gb,
+        "errors": errors,
+        "warnings": [i for i in issues if i["severity"] == "warning"],
+        "advisories": [i for i in issues if i["severity"] == "advisory"],
+    }
+    print(json.dumps(result, indent=2))
+    sys.exit(0 if len(errors) == 0 else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/serving-llms-on-epyc/skill-card.md b/skills/serving-llms-on-epyc/skill-card.md
new file mode 100644
index 0000000..120283f
--- /dev/null
+++ b/skills/serving-llms-on-epyc/skill-card.md
@@ -0,0 +1,13 @@
+# Skill Card
+
+## Description
+
+Serve a single LLM on an AMD EPYC CPU host with vLLM + zentorch (Docker, Podman, or conda), handling CPU detection, runtime/env validation, model + RAM-fit checks, hardware-sized threads/KV/NUMA, launch, and health verification. Reports and stops on failure; does not debug.
+
+## Owner
+
+AMD
+
+## License
+
+MIT