From f62dd748fbb5cf65a6775c7ba0ffd2071873c158 Mon Sep 17 00:00:00 2001 From: Lalithnarayan C Date: Wed, 24 Jun 2026 05:36:25 -0600 Subject: [PATCH] add serving-llms-on-epyc skill (vLLM + zentorch CPU serving) Signed-off-by: Lalithnarayan C Change-Id: I1dc2362e0983326658b6618015a161ecd44f40e6 --- .claude-plugin/marketplace.json | 5 + .cursor-plugin/marketplace.json | 5 + .../tests/test_serving_llms_on_epyc.py | 41 +++ skills/serving-llms-on-epyc/SKILL.md | 236 ++++++++++++++++++ skills/serving-llms-on-epyc/data/epyc.json | 53 ++++ skills/serving-llms-on-epyc/reference.md | 122 +++++++++ .../scripts/check_model.py | 169 +++++++++++++ .../serving-llms-on-epyc/scripts/cpu_tune.py | 144 +++++++++++ skills/serving-llms-on-epyc/scripts/detect.py | 149 +++++++++++ .../scripts/estimate_memory.py | 138 ++++++++++ .../serving-llms-on-epyc/scripts/validate.py | 156 ++++++++++++ skills/serving-llms-on-epyc/skill-card.md | 13 + 12 files changed, 1231 insertions(+) create mode 100644 eval/behavioral/tests/test_serving_llms_on_epyc.py create mode 100644 skills/serving-llms-on-epyc/SKILL.md create mode 100644 skills/serving-llms-on-epyc/data/epyc.json create mode 100644 skills/serving-llms-on-epyc/reference.md create mode 100644 skills/serving-llms-on-epyc/scripts/check_model.py create mode 100644 skills/serving-llms-on-epyc/scripts/cpu_tune.py create mode 100644 skills/serving-llms-on-epyc/scripts/detect.py create mode 100644 skills/serving-llms-on-epyc/scripts/estimate_memory.py create mode 100644 skills/serving-llms-on-epyc/scripts/validate.py create mode 100644 skills/serving-llms-on-epyc/skill-card.md diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index f00ba1e..fd141aa 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -24,6 +24,11 @@ "source": "./skills/magpie-kernel-evaluator", "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. 
Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces." }, + { + "name": "serving-llms-on-epyc", + "source": "./skills/serving-llms-on-epyc", + "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure." + }, { "name": "serving-llms-on-instinct", "source": "./skills/serving-llms-on-instinct", diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json index f00ba1e..fd141aa 100644 --- a/.cursor-plugin/marketplace.json +++ b/.cursor-plugin/marketplace.json @@ -24,6 +24,11 @@ "source": "./skills/magpie-kernel-evaluator", "description": "Performs GPU kernel correctness and performance evaluation and LLM inference benchmarking with Magpie. Analyzes single or multiple kernels (HIP/CUDA/PyTorch), compares kernel implementations, runs vLLM/SGLang benchmarks with profiling and TraceLens, and runs gap analysis on torch traces." }, + { + "name": "serving-llms-on-epyc", + "source": "./skills/serving-llms-on-epyc", + "description": "Serve LLMs on AMD EPYC CPUs with vLLM + zentorch, in a container (Docker/Podman) or conda. Handles CPU detection, runtime/env validation, vLLM model-support and RAM-fit checks, hardware-sized threads/KV, launch, and health verification. Single instance; reports and stops on failure." 
+ }, { "name": "serving-llms-on-instinct", "source": "./skills/serving-llms-on-instinct", diff --git a/eval/behavioral/tests/test_serving_llms_on_epyc.py b/eval/behavioral/tests/test_serving_llms_on_epyc.py new file mode 100644 index 0000000..c4750f5 --- /dev/null +++ b/eval/behavioral/tests/test_serving_llms_on_epyc.py @@ -0,0 +1,41 @@ +"""Behavioral tests for the `serving-llms-on-epyc` skill. + +Run locally (needs the `claude` CLI authenticated; the agent does not actually +launch a server in the judge's sandbox, so this grades the *plan/behavior*, not +a live endpoint): + + pytest eval/behavioral/tests/test_serving_llms_on_epyc.py -s + +`logs_contains` is deterministic; `should` / `should_not` are graded by an LLM +judge over the captured evidence (tool calls + outputs), so the agent's prose +cannot fake a pass. +""" + +from harness import claude + + +def test_serve_model_on_epyc(): + with claude("sonnet", skill="serving-llms-on-epyc") as agent: + run = agent.prompt( + "Serve Qwen/Qwen3-0.6B on this AMD EPYC box with vLLM and zentorch. " + "Use the default settings." + ) + + # Programmatic expectation: the skill was actually loaded. + run.logs_contains("serving-llms-on-epyc") + + # Positive behavioral expectations (the state machine). + run.should("Detect the CPU and confirm it is an AMD EPYC host before serving (e.g. runs detect.py)") + run.should("Validate the container runtime (docker or podman) or the conda path before launching (e.g. runs validate.py)") + run.should("Take validate.py's environment advisories into account -- the tcmalloc / OpenMP (LD_PRELOAD) perf-library recommendation and, when the image is already pulled, the in-image vllm+zentorch check -- surfacing any that apply") + run.should("Check that vLLM supports the model before serving (e.g. runs check_model.py), rather than refusing it just for being multimodal") + run.should("Check that the model fits in host RAM (e.g. 
runs estimate_memory.py)") + run.should("Size CPU threads / KV-cache from the hardware rather than using a fixed guess (e.g. runs cpu_tune.py)") + run.should("Present a sized plan and ask the user to confirm before launching the server") + run.should("Plan to launch with 'vllm serve' and poll until /health is healthy") + + # Negative behavioral expectations (the explicit Don'ts). + run.should_not("Pass '--device cpu' to vllm serve") + run.should_not("Launch the server before the user has confirmed the plan") + run.should_not("Enter a debugging loop or retry after a launch failure") + run.should_not("Attempt GPU, ROCm, or Instinct serving") diff --git a/skills/serving-llms-on-epyc/SKILL.md b/skills/serving-llms-on-epyc/SKILL.md new file mode 100644 index 0000000..7521054 --- /dev/null +++ b/skills/serving-llms-on-epyc/SKILL.md @@ -0,0 +1,236 @@ +--- +name: serving-llms-on-epyc +description: >- + Serves a language model on an AMD EPYC CPU host using vLLM with the zentorch + backend, in a container (Docker or Podman) or a conda env. Use whenever the + user wants to run, serve, deploy, start, host, or launch an LLM on AMD EPYC, + Zen CPU, "vLLM on CPU", "zentorch serving", or "serve a model without a GPU". + Use for "serve Qwen on EPYC", "start a CPU vLLM endpoint", "run an OpenAI + server on my EPYC box", or similar. Handles the full single-instance flow: + detect the CPU (incl. EPYC generation), validate the runtime/env, check vLLM + supports the model (via vLLM's registry, not a modality blocklist), check it + fits host RAM, size CPU threads/KV/NUMA from the hardware, confirm the plan with + the user, launch, and poll until the endpoint is responsive. Single instance + only. Does NOT debug failures + and does NOT retry -- it reports and stops. Do not use for GPU/Instinct (use + serving-llms-on-instinct) or multi-node. 
+allowed-tools: Bash, Read +--- + +# Serving LLMs on AMD EPYC (vLLM + zentorch, CPU) + +Bring up a single vLLM OpenAI endpoint on an AMD EPYC host with the zentorch CPU +backend, sized to the hardware. Container-first (Docker or Podman); conda/host +is the fallback. + +Hard rule for this skill: **on any failure, report the cause + logs and STOP. +Do not retry, do not debug.** (Debugging is a separate workflow.) + +**The agent does the serve flow itself** -- pull, configure, launch, poll -- +using the runtime `validate.py` reports. Never hand the user per-serve commands. +Like serving-llms-on-instinct, an accessible container runtime is a one-time +**prerequisite**: if `validate.py` finds none, report its one-time fix (make +docker accessible / install podman / provide a conda env) and stop. Do not +attempt `sudo` or privilege escalation. + +## Data file + +Read `data/epyc.json` directly. It holds the container image, mandatory CPU run +flags, supported precision, the model-support policy, the default model, and the +verified throughput-flag gotcha. Do not hardcode the image tag from memory -- read it. + +## Step 1: Detect the CPU + +```bash +python3 scripts/detect.py # add --host user@box for a remote host +``` + +Returns `cpu_model`, `is_amd_epyc`, `epyc_generation` +(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), `zen_arch`, `avx512`, +`logical_cores`, `physical_cores`, `sockets`, `numa_nodes`, `memory_gb`. If +`is_amd_epyc` is `false`, stop: this skill targets AMD EPYC. (Other x86 may work +but is unsupported here.) Carry `epyc_generation` / `avx512` through the later +phases -- e.g. AVX-512 + bf16 land on Zen4+ (Genoa/Turin), and Turin packs up to +128 cores/socket, which the thread-binding in Step 5 sizes from. + +## Step 2: Validate the runtime and environment + +```bash +python3 scripts/validate.py --image +``` + +Returns `ready`, `runtime` (`docker`, `podman`, or null), `runtime_detail`, +`conda_path_available`, `ram_gb`, and `errors/warnings/advisories`. 
Pick the path: +- `runtime` is `docker` or `podman` -> container path (Step 6), used verbatim. +- `runtime` null but `conda_path_available: true` -> conda/host path. +- `runtime` null and no conda -> `ready` is false. Report the one-time + onboarding `fix` (make docker accessible / install podman / conda env) and stop. + +Do not proceed if `ready` is `false`. + +## Step 3: Resolve and validate the model + +If the user named no model, use `default_model` from `data/epyc.json` +(`Qwen/Qwen3-0.6B` -- ungated, tiny, fast first success). Otherwise use theirs. + +Check that vLLM actually supports the model (do **not** blanket-block multimodal): + +```bash +python3 scripts/check_model.py --model-id <model-id> --vllm-version <vllm_version from data/epyc.json> +``` + +- Exit 0 = vLLM serves it as a generation endpoint (`kind` `text` or `multimodal`), + or support is undeterminable (gated/offline) -- proceed; launch confirms. +- Exit 1 = positively unsupported: the architecture is not in vLLM's registry, or + it is a `pooling`/embedding/reranker (not a chat/completion endpoint). Report the + printed `message` and stop. +- A `multimodal` model is allowed; a vLLM-supported multimodal arch may still hit a + GPU-only kernel on CPU, which surfaces at load (the no-retry rule then applies). + +**Precision/dtype**: native CPU dtypes are `bf16` (default), `fp16`, `fp32`. Use +`bfloat16` unless the user asks otherwise. + +For gated models (Llama, Gemma) `HF_TOKEN` must be set and the license accepted on +HuggingFace; if not, stop and say so. + +## Step 4: Check it fits host RAM + +RAM is the ceiling on CPU (weights + KV cache both live in RAM). Run on ONE line: + +```bash +python3 scripts/estimate_memory.py --model-id <model-id> --ram-gb <ram_gb from validate.py> --max-model-len <4096 or user value> --num-prompts <1 or desired concurrency> +``` + +Exit 0 = fits, exit 1 = does not fit. 
If `fit.fits` is false: **do not launch.** +Tell the user `required_gb` vs `ram_gb` and the printed `fit.action` -- reduce +`--max-model-len` to `fit.suggested_max_model_len` and retry, or use a smaller +model. `--max-model-len` and `--num-prompts` are the two knobs that move KV. +Extra flag: `--weight-gb N` overrides weights if a model has no HF metadata +(rare). KV cache is bf16-only on zentorch CPU (no fp8 KV). + +## Step 5: Size the CPU runtime from the hardware + +```bash +eval "$(python3 scripts/cpu_tune.py)" # or --format json to inspect +``` + +Exports `VLLM_CPU_OMP_THREADS_BIND` (physical cores of **socket 0**) and +`VLLM_CPU_KVCACHE_SPACE` (GB). It does **not** set `OMP_NUM_THREADS` (vLLM derives +it from the bind list) or `VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default +when unset). Default policy, the same for NPS1/NPS2/NPS4: a single instance uses +**socket 0's whole CPU with no memory binding**. On a multi-socket host the JSON +gives `container_cpuset` (`--cpuset-cpus` only -- no `--cpuset-mems`) for the +container path; the conda path needs nothing extra (the bind env var binds the +threads). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `perf_note` flags that +optimal per-node binding could give more performance -- surface it, but proceed. + +## Step 6: Confirm the plan, then launch (container-first) + +Before launching, present this summary and **wait for the user to confirm** -- do +not launch unprompted. This is the human gate before anything runs: + +| Field | Value | +|---|---| +| Model / kind | `` -- `text` or `multimodal` (from `check_model.py`) | +| Path | container (``, image from `data/epyc.json`) or conda/host | +| Precision | `bfloat16` (or the user's choice) | +| Fit | required `` GB vs `` GB RAM | +| CPU sizing | thread bind `` (socket 0), KV `` GB, no memory binding | +| Hardware | EPYC `` (``), `` cores, AVX-512 `` | +| Port | `` | + +Proceed only on a clear "go". 
If the user declines or wants changes (model, +`--max-model-len`, port), stop and adjust -- do not launch. + +Build the launch from `data/epyc.json`. The CLI is `vllm serve `. +**Do not pass `--device cpu`** on vLLM >= 0.20 -- the zentorch plugin +auto-selects the CPU platform and `vllm serve` rejects the flag. Only add it if +`vllm serve --help` lists it (older vLLM). + +**Container path** (`runtime` from validate.py). The agent runs these itself, +including the pull. `RT` is the resolved runtime verbatim: +```bash +RT="" +$RT pull # agent pulls; do not ask the user to +$RT run -d --name vllm-epyc \ + # --ipc=host --shm-size=16g --network=host + \ + # --cpuset-cpus=... (no --cpuset-mems) + --env VLLM_CPU_OMP_THREADS_BIND="$VLLM_CPU_OMP_THREADS_BIND" \ + --env VLLM_CPU_KVCACHE_SPACE=$VLLM_CPU_KVCACHE_SPACE \ + --env HF_TOKEN=${HF_TOKEN} \ + \ + vllm serve --dtype bfloat16 --port --max-model-len +``` + +**Conda/host path** (no container runtime, `conda_path_available` true). `eval`-ing +cpu_tune already exported the env vars; just launch -- `VLLM_CPU_OMP_THREADS_BIND` +binds the threads to socket 0, and there is no memory binding by default: +```bash +vllm serve --dtype bfloat16 --port --max-model-len & +``` + +Optional throughput flags are **opt-in and must move together** (see Gotchas): +`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE=0` (+ `ZENTORCH_WEIGHT_PREPACK=1`). +The base launch sets none of them. + +## Step 7: Poll until up and responsive + +A 503 while loading is normal. Poll until the server answers, then prove the +chat endpoint works. CPU first-token compile can take a minute or two. 
+ +```bash +# container alive (or process alive for conda) + /health +for i in $(seq 1 120); do + # container path: + $RT inspect -f '{{.State.Running}}' vllm-epyc 2>/dev/null | grep -q true || { echo "FAILED: container exited"; $RT logs --tail 50 vllm-epyc; break; } + curl -sf http://localhost:<port>/health >/dev/null 2>&1 && { echo "HEALTHY"; break; } + sleep 3 +done +``` + +Then validate the OpenAI endpoint is actually accessible: +```bash +curl -sf http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \ + -d '{"model":"<model-id>","messages":[{"role":"user","content":"hi"}],"max_tokens":8}' +``` + +Resource sanity check: `$RT stats --no-stream vllm-epyc`. + +**If the server never becomes healthy or the endpoint does not respond: print +the container/process logs, state the failure, and STOP. Do not retry. Do not +start a debugging loop.** + +## Step 8: On success, hand over the endpoint + +Print a connection table (model, runtime, port, OMP threads, KV GB, max-model-len, +NUMA pinning) and a ready-to-run example: +```bash +curl -s http://localhost:<port>/v1/chat/completions -H 'Content-Type: application/json' \ + -d '{"model":"<model-id>","messages":[{"role":"user","content":"Hello"}]}' +``` +To stop: `$RT rm -f vllm-epyc` (container) or `kill <pid>` (conda). + +## Offline (single-instance batch) + +For a one-shot offline run instead of a server, replace Steps 6-8 with a single +`vllm bench throughput` (or an offline `LLM.generate`) using the same sized env, +wait for completion, and report the metrics. Same no-retry / no-debug rule. + +## Gotchas + +See [reference.md](reference.md) for the full list. The load-bearing ones: + +- **`--device cpu` was removed** from `vllm serve` in vLLM >= 0.20. The zentorch + plugin auto-selects CPU. Passing it makes `vllm serve` error with + "unrecognized arguments: --device cpu". +- **`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init** on vLLM 0.23 / + zentorch 2.11 (`AssertionError: expected OutputCode, got function`). 
It only + works with `VLLM_USE_AOT_COMPILE=0` set alongside it. Never set one without + the other. +- **`--shm-size`**: vLLM needs a large `/dev/shm`; the container default (64MB) + is too small. Use `--shm-size=16g` (in `data/epyc.json`). +- **NUMA**: the default is simple -- one instance on **socket 0's CPUs, no memory + binding** (`--cpuset-cpus` from `cpu_tune.py` for the container; the bind env var + for conda). If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` notes + that optimal per-node binding could add performance; the base recipe doesn't do it. diff --git a/skills/serving-llms-on-epyc/data/epyc.json b/skills/serving-llms-on-epyc/data/epyc.json new file mode 100644 index 0000000..5206c8e --- /dev/null +++ b/skills/serving-llms-on-epyc/data/epyc.json @@ -0,0 +1,53 @@ +{ + "vllm_version": "0.22.0", + "container": { + "image": "amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23", + "runtimes": ["docker", "podman"], + "comment": "Public vLLM + zentorch CPU image on Docker Hub (amdih/zendnn_zentorch) -- no internal-registry access needed. Tags are vllm_v_zentorch_v__; prefer the newest ubuntu22.04 stable. Both docker and podman are supported; the skill prefers docker and falls back to podman.", + "run_flags": [ + "--ipc=host", + "--shm-size=16g", + "--network=host" + ], + "hf_cache_mount": "-v ~/.cache/huggingface:/root/.cache/huggingface", + "flag_notes": { + "--ipc=host": "vLLM workers use host IPC/shared memory.", + "--shm-size=16g": "vLLM needs a large /dev/shm; default 64MB is not enough.", + "--network=host": "Expose the served port directly. Alternative: -p :.", + "numa": "Default: a single instance uses socket 0's CPUs with NO memory binding (cpu_tune.py emits --cpuset-cpus for the container; conda relies on VLLM_CPU_OMP_THREADS_BIND). On NPS2/NPS4 (multiple NUMA nodes per socket), optimal per-node binding could add performance -- cpu_tune.py notes this; the base recipe does not do it." 
+ } + }, + "launch": { + "cli": "vllm serve", + "device_flag_note": "Do NOT pass --device cpu on vLLM >= 0.20; the zentorch plugin auto-selects the CPU platform and `vllm serve` rejects --device. Only pass it if `vllm serve --help` advertises it (older vLLM)." + }, + "precision": { + "native": ["bf16", "fp16", "fp32"], + "default": "bfloat16", + "notes": "bf16 is the throughput default on EPYC (Zen). fp32 is slower and for debugging only. WOQ (per-channel/per-group int) is supported by zentorch but out of scope for the base recipe." + }, + "model_support": { + "check_script": "scripts/check_model.py", + "policy": "Do NOT blanket-block multimodal. check_model.py reads the model's HF architectures and checks them against vLLM's model registry for the pinned vllm_version. Text and multimodal generation endpoints are allowed; pooling/embedding/reranker and non-LLM architectures are rejected (not chat/completion endpoints).", + "cpu_note": "A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load, where the no-retry rule applies." + }, + "default_model": "Qwen/Qwen3-0.6B", + "default_model_notes": "Ungated (Apache-2.0), tiny, fast first success on CPU. For a real workload pick a larger Qwen3 / Llama once the flow is verified.", + "smoke_model": "Qwen/Qwen3-0.6B", + "smoke_model_notes": "Current small Qwen, chat-capable (ships a chat template, so /v1/chat/completions works -- unlike base models such as opt-125m).", + "env_defaults": { + "VLLM_CPU_OMP_THREADS_BIND": "set by cpu_tune.py (physical cores of socket 0)", + "VLLM_CPU_KVCACHE_SPACE": "set by cpu_tune.py (GB)", + "do_not_set": "OMP_NUM_THREADS -- vLLM sets it from the bind list (len of cpu_list); and VLLM_CPU_NUM_OF_RESERVED_CPU -- vLLM has its own default when unset, forcing 0 overrides it." 
+ }, + "throughput_flags_optional": { + "TORCHINDUCTOR_FREEZING": "1", + "VLLM_USE_AOT_COMPILE": "0", + "ZENTORCH_WEIGHT_PREPACK": "1", + "gotcha": "VERIFIED on vLLM 0.22.0 / zentorch 2.11.0.1: TORCHINDUCTOR_FREEZING=1 ALONE crashes engine-core init with 'AssertionError: expected OutputCode, got function'. It only works when VLLM_USE_AOT_COMPILE=0 is set alongside it. Never set FREEZING=1 without AOT_COMPILE=0. The base recipe leaves all three unset." + }, + "ram": { + "os_headroom_gb": 16, + "comment": "Reserve ~16 GB for OS + framework beyond model weights + KV cache when checking fit." + } +} diff --git a/skills/serving-llms-on-epyc/reference.md b/skills/serving-llms-on-epyc/reference.md new file mode 100644 index 0000000..db18db2 --- /dev/null +++ b/skills/serving-llms-on-epyc/reference.md @@ -0,0 +1,122 @@ +# serving-llms-on-epyc -- Reference + +## Table of Contents +1. [Runtime selection](#runtime-selection) +2. [Container run flags (CPU)](#container-run-flags-cpu) +3. [Precision and modality](#precision-and-modality) +4. [CPU sizing](#cpu-sizing) +5. [Known quirks](#known-quirks) + +--- + +## Runtime selection + +`scripts/validate.py` resolves a runtime the **agent can drive +non-interactively** and reports it as `runtime` (the exact command prefix the +agent uses for `pull`/`run`/`stats`/`logs`). Preference order maximizes +agent-drivability with no human in the loop: + +1. **docker** (direct) -- if `docker ps` exits 0 (user in the `docker` group / + daemon reachable). No sudo. Best. +2. **podman** (rootless) -- no daemon, no sudo. Note: rootless podman needs a + storage backend that supports its overlay; some networked/`/proj` + filesystems reject the overlay `pivot_root` (the run fails even though + `podman info` succeeds). On those hosts use docker or the conda path. +3. **sudo docker** -- only if `sudo -n docker ps` works (passwordless sudo). The + agent can still drive it unattended; `runtime` comes back as `"sudo docker"`. +4. 
**conda/host** -- requires `import vllm, zentorch` in the active env. + +If docker is installed but **none** of the above is agent-drivable (no docker +group, no passwordless sudo), `validate.py` returns `runtime: null`, +`runtime_agent_drivable: false`, and a **one-time** setup `fix`: +`sudo usermod -aG docker $USER && newgrp docker` (or a NOPASSWD sudoers entry). +This is one-time onboarding, not a per-serve command. After it, every serve is +fully agent-driven. The skill must not degrade into asking the user to paste +docker commands for each serve. + +## Container run flags (CPU) + +From `data/epyc.json`. Unlike the Instinct (GPU) skill there are **no** +`/dev/kfd`, `/dev/dri`, `--group-add`, or ROCm flags -- this is pure CPU. + +| Flag | Why | +|---|---| +| `--ipc=host` | vLLM workers use host IPC / shared memory | +| `--shm-size=16g` | vLLM needs a large `/dev/shm`; the 64MB default is too small | +| `--network=host` | expose the served port directly (or use `-p :`) | +| `--cpuset-cpus` | (multi-socket) restrict the container to socket 0's CPUs; from `cpu_tune.py`. No `--cpuset-mems` -- no memory binding by default | +| `-v ~/.cache/huggingface:/root/.cache/huggingface` | reuse the host model cache | + +Image: `amdih/zendnn_zentorch:` -- the public vLLM + zentorch CPU image on +Docker Hub (no internal-registry access needed). The exact tag lives in +`data/epyc.json`; read it, never hardcode it. + +## Precision and modality + +| Dtype | EPYC (Zen) | Notes | +|---|---|---| +| BF16 | Native (default) | throughput default | +| FP16 | Native | | +| FP32 | Native | slower; debugging only | +| WOQ int8/int4 | Supported by zentorch | per-channel / per-group; out of scope for the base recipe | + +Modality: not gated by a static blocklist. 
`scripts/check_model.py` checks the +model's architecture against vLLM's model registry (pinned to `vllm_version`): +text **and** multimodal generation endpoints are allowed; pooling/embedding/ +reranker and non-LLM architectures are rejected (not chat/completion endpoints). +A vLLM-supported multimodal arch may still hit a GPU-only kernel on CPU -- that +surfaces at load, where the no-retry rule applies. + +## CPU sizing + +Default policy (the same for NPS1/NPS2/NPS4): a single instance uses **socket 0's +whole CPU with no memory binding**. `scripts/cpu_tune.py` derives: +- `VLLM_CPU_OMP_THREADS_BIND` = the physical cores of socket 0 (one thread per + physical core; SMT siblings do not help vLLM CPU). vLLM sets `OMP_NUM_THREADS` + itself from this list, so we don't. +- `VLLM_CPU_KVCACHE_SPACE` (GB) = `min(mem*kv_frac, mem-16)`; on <=32GB hosts, `mem*0.5`. +- `container_cpuset` = `--cpuset-cpus=` (no `--cpuset-mems`) for the + container path on a multi-socket host. The conda path needs nothing extra -- the + bind env var binds the threads. + +Not set: `OMP_NUM_THREADS` (vLLM derives it from the bind) and +`VLLM_CPU_NUM_OF_RESERVED_CPU` (vLLM has its own default when unset). + +When socket 0 spans multiple NUMA nodes (NPS2/NPS4), `cpu_tune.py` emits a +`perf_note`: the simple default leaves some performance on the table versus optimal +per-NUMA-node binding (one instance per node, memory bound). That tuning is out of +scope for the base recipe. + +## Known quirks + +**`--device cpu` removed (vLLM >= 0.20)** +`vllm serve` no longer accepts `--device cpu`; the zentorch plugin auto-selects +the CPU platform. Passing it -> `vllm: error: unrecognized arguments: --device cpu`. +Only pass it if `vllm serve --help` advertises it (older vLLM). 
+ +**`TORCHINDUCTOR_FREEZING=1` + `VLLM_USE_AOT_COMPILE` (VERIFIED)** +On vLLM 0.23.0 / zentorch 2.11.0.2 (EPYC 9454, facebook/opt-125m, 2026-06-23): +`TORCHINDUCTOR_FREEZING=1` alone crashes engine-core init with +`AssertionError: expected OutputCode, got function` (inductor codecache). Adding +`VLLM_USE_AOT_COMPILE=0` fixes it (healthy in ~99s). The only changed variable +between the failing and passing runs was `VLLM_USE_AOT_COMPILE`. Never set +`FREEZING=1` without `VLLM_USE_AOT_COMPILE=0`. The base recipe leaves both unset. + +**`/dev/shm` too small** +Without `--shm-size=16g` (or `--ipc=host`), vLLM workers fail to allocate shared +memory at startup. + +**RAM is the ceiling, not VRAM** +CPU serving keeps weights + KV cache in system RAM. `estimate_memory.py` checks +`weights + KV(max_model_len x num_prompts) + reserve <= RAM` (reserve default +16 GB, `--reserve-gb`). It exits 1 when it does not fit and prints +`suggested_max_model_len` + an `action` to reduce and retry. Weights come from +HF file sizes (`.safetensors` or legacy `.bin`); `--weight-gb` overrides when a +model has no metadata. KV cache is bf16-only on zentorch CPU (no fp8 KV), so the estimate always uses 2 bytes/element. + +**NUMA cross-node traffic** +On a 2-socket EPYC, an unpinned instance spreads threads across both sockets and +pays cross-socket latency. The default keeps one instance on **socket 0's CPUs** +(`cpu_tune.py` -> `VLLM_CPU_OMP_THREADS_BIND`, plus `--cpuset-cpus` for the +container), with **no memory binding**. On NPS2/NPS4, `cpu_tune.py` notes that +optimal per-NUMA-node binding could add performance; the base recipe doesn't do it. diff --git a/skills/serving-llms-on-epyc/scripts/check_model.py b/skills/serving-llms-on-epyc/scripts/check_model.py new file mode 100644 index 0000000..534bfea --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/check_model.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 +""" +Does vLLM support this model's architecture? 
-- so the skill checks real vLLM +support instead of blanket-blocking multimodal. + +Reads the model's `architectures` from its HF config.json, then checks them +against vLLM's model registry for the pinned vLLM version. The registry comes +from the version-pinned registry.py on GitHub (no vLLM install needed); if that +is unreachable it falls back to an importable local `vllm`. Generation endpoints +(text + multimodal) are supported; pooling/embedding/reranker and non-LLM +architectures are not chat/completion endpoints and are rejected. + + check_model.py --model-id Qwen/Qwen3-0.6B + check_model.py --model-id --vllm-version 0.22.0 + +Exit 0 if vLLM serves it as a generation endpoint (or support is undeterminable +-- launch confirms), 1 if it is positively unsupported. JSON to stdout. +Env: HF_TOKEN for gated models. +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import urllib.request +import urllib.error + +HF = "https://huggingface.co" +GH_RAW = "https://raw.githubusercontent.com/vllm-project/vllm" +REG_PATH = "vllm/model_executor/models/registry.py" + +# registry.py dict name -> kind we care about +_SECTIONS = { + "_TEXT_GENERATION_MODELS": "text", + "_TRANSFORMERS_BACKEND_MODELS": "text", + "_MULTIMODAL_MODELS": "multimodal", + "_EMBEDDING_MODELS": "pooling", + "_POOLING_MODELS": "pooling", + "_CROSS_ENCODER_MODELS": "pooling", +} + + +def _get(url, token=None): + """GET text from a URL. 
Returns (text, error_message).""" + headers = {"User-Agent": "check-model/1"} + if token: + headers["Authorization"] = f"Bearer {token}" + try: + with urllib.request.urlopen(urllib.request.Request(url, headers=headers), timeout=30) as r: + return r.read().decode("utf-8"), None + except urllib.error.HTTPError as e: + return None, {401: "not found or gated (set HF_TOKEN)", + 403: "access denied -- accept the model license on HuggingFace", + 404: "not found"}.get(e.code, f"HTTP {e.code}") + except Exception as e: + return None, str(e) + + +def model_architectures(model, rev, token): + """Architectures declared in the model's HF config.json. Returns (list, error).""" + text, err = _get(f"{HF}/{model}/resolve/{rev}/config.json", token) + if text is None: + return None, err + try: + cfg = json.loads(text) + except ValueError: + return None, "config.json is not valid JSON" + return cfg.get("architectures") or [], None + + +def registry_from_github(version): + """Parse vLLM's registry.py at v. Returns ({arch: kind}, source) or (None, err).""" + src, err = _get(f"{GH_RAW}/v{version}/{REG_PATH}") + if src is None: + return None, err + reg, cur = {}, None + for line in src.splitlines(): + s = line.strip() + sec = re.match(r"^(_[A-Z0-9_]+_MODELS)\s*(?::[^=]+)?=\s*\{", s) + if sec: + cur = _SECTIONS.get(sec.group(1)) + continue + if s.startswith("}"): + cur = None + continue + if cur: + key = re.match(r'^"([A-Za-z0-9_]+)"\s*:', s) + if key: + reg[key.group(1)] = cur + return (reg or None), (f"github:v{version}" if reg else "registry.py had no parseable archs") + + +def registry_from_local(): + """Coarse fallback: an importable local `vllm` (text vs multimodal). 
Returns ({arch: kind}, source) or (None, None).""" + snippet = ( + "import json;" + "from vllm import ModelRegistry as R;" + "a=list(R.get_supported_archs());" + "mm=set(x for x in a if R.is_multimodal_model([x]));" + "print(json.dumps({'archs':a,'mm':list(mm)}))" + ) + r = subprocess.run(["python", "-c", snippet], stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=60) + if r.returncode != 0 or not r.stdout.strip(): + return None, None + try: + d = json.loads(r.stdout) + except ValueError: + return None, None + mm = set(d.get("mm", [])) + return {a: ("multimodal" if a in mm else "text") for a in d.get("archs", [])}, "vllm-import" + + +def main(): + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--model-id", required=True) + p.add_argument("--revision", default="main") + p.add_argument("--vllm-version", default="0.22.0", help="pin the registry to this vLLM version (from data/epyc.json)") + a = p.parse_args() + token = os.environ.get("HF_TOKEN", "") + + archs, aerr = model_architectures(a.model_id, a.revision, token) + if not archs: + # Cannot read the config (gated/offline) -- do not positively block; the + # gating check and launch will catch real problems. + print(json.dumps({"model_id": a.model_id, "supported": None, "kind": "undetermined", + "message": f"Could not read architectures ({aerr or 'none declared'}); support unverified. " + "If gated, set HF_TOKEN. This does not bypass the gating/launch checks."}, indent=2)) + sys.exit(0) + + reg, source = registry_from_github(a.vllm_version) + if reg is None: + reg, source = registry_from_local() + if reg is None: + print(json.dumps({"model_id": a.model_id, "architectures": archs, "supported": None, + "kind": "undetermined", + "message": "Could not load vLLM's model registry (no network and no importable vllm); " + "support unverified. 
vLLM confirms support at load (no-retry rule applies)."}, indent=2)) + sys.exit(0) + + kinds = [reg.get(arch) for arch in archs] + known = [k for k in kinds if k] + out = {"model_id": a.model_id, "architectures": archs, "registry_source": source} + + if not known: + out.update(supported=False, kind="unsupported", + message=f"vLLM has no registry entry for {archs}; it cannot serve this model on any backend. Stop.") + print(json.dumps(out, indent=2)) + sys.exit(1) + + if any(k in ("text", "multimodal") for k in known): + kind = "multimodal" if "multimodal" in known else "text" + msg = f"vLLM supports {archs} as a {kind} generation endpoint." + if kind == "multimodal": + msg += " A multimodal arch may still hit a GPU-only kernel on CPU; that surfaces at load (no-retry rule applies)." + out.update(supported=True, kind=kind, message=msg) + print(json.dumps(out, indent=2)) + sys.exit(0) + + out.update(supported=False, kind="pooling", + message=f"{archs} is a pooling/embedding/reranker model in vLLM, not a chat/completion endpoint. Stop.") + print(json.dumps(out, indent=2)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/cpu_tune.py b/skills/serving-llms-on-epyc/scripts/cpu_tune.py new file mode 100644 index 0000000..f37dbc5 --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/cpu_tune.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +Derive vLLM-on-CPU runtime knobs from the host. Deterministic and read-only. + +Default policy (kept deliberately simple): a single instance uses **socket 0's +entire CPU**, with **no memory binding** -- regardless of NPS mode (NPS1/2/4). + +Emits two env vars: + - VLLM_CPU_OMP_THREADS_BIND : physical cores of socket 0. vLLM binds its OMP + threads to these and sets OMP_NUM_THREADS itself (= len(cores)), so we don't. + - VLLM_CPU_KVCACHE_SPACE : KV-cache RAM (GB). 
+ +And, for the container path on a multi-socket host, a CPU-only cpuset: + - container_cpuset : --cpuset-cpus= (no --cpuset-mems) +The conda path needs nothing extra -- VLLM_CPU_OMP_THREADS_BIND binds the threads. + +We do NOT set OMP_NUM_THREADS (vLLM derives it) or VLLM_CPU_NUM_OF_RESERVED_CPU +(vLLM has its own default when unset). + +If socket 0 spans multiple NUMA nodes (NPS2/NPS4), `perf_note` flags that optimal +per-node binding could give more performance -- the default does not do it. + +Usage: + python3 scripts/cpu_tune.py # export lines for `eval` + python3 scripts/cpu_tune.py --format json # machine-readable + python3 scripts/cpu_tune.py --kv-frac 0.5 +""" + +import argparse +import json +import re +import subprocess +import sys + +OS_HEADROOM_GB = 16 + + +def _sh(cmd): + try: + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=15) + return r.stdout + except Exception: + return "" + + +def _lscpu_int(out, label, default): + m = re.search(rf"^{re.escape(label)}:\s*(\d+)", out, re.MULTILINE) + return int(m.group(1)) if m else default + + +def _ranges(cpus): + """Compress a sorted int list to a range string: [0,1,2,5] -> '0-2,5'.""" + if not cpus: + return "" + out, start, prev = [], cpus[0], cpus[0] + for c in cpus[1:]: + if c == prev + 1: + prev = c + continue + out.append(f"{start}-{prev}" if start != prev else f"{start}") + start = prev = c + out.append(f"{start}-{prev}" if start != prev else f"{start}") + return ",".join(out) + + +def socket0_physical_cpus(): + """Physical cores of socket 0 from `lscpu -p`: one CPU per core, SMT siblings + dropped. 
vLLM CPU gains nothing from SMT, so we run on physical cores only.""" + phys, seen = [], set() + for line in _sh("lscpu -p=CPU,CORE,SOCKET").splitlines(): + if line.startswith("#") or not line.strip(): + continue + parts = line.split(",") + if len(parts) < 3 or parts[2] != "0": + continue + cpu, core = int(parts[0]), parts[1] + if core not in seen: + seen.add(core) + phys.append(cpu) + return sorted(phys) + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--kv-frac", type=float, default=0.4, help="fraction of RAM for KV cache") + p.add_argument("--format", choices=["env", "json"], default="env") + args = p.parse_args() + + lscpu = _sh("lscpu") + numa = _lscpu_int(lscpu, "NUMA node(s)", 1) + sockets = _lscpu_int(lscpu, "Socket(s)", 1) + nodes_per_socket = max(1, numa // max(1, sockets)) + + mem = 0 + m = re.search(r"MemTotal:\s*(\d+)", _sh("grep MemTotal /proc/meminfo")) + if m: + mem = int(m.group(1)) // (1024 * 1024) + if mem <= 2 * OS_HEADROOM_GB: + kv = max(1, int(mem * 0.5)) + else: + kv = max(1, min(int(mem * args.kv_frac), mem - OS_HEADROOM_GB)) + + bind = _ranges(socket0_physical_cpus()) + # CPU-only cpuset for the container, physical cores only (same list as the + # bind). Only meaningful when there is more than one socket to exclude. No + # --cpuset-mems (no memory binding by default). + container_cpuset = f"--cpuset-cpus={bind}" if sockets > 1 else "" + + perf_note = "" + if nodes_per_socket > 1: + perf_note = (f"socket 0 spans {nodes_per_socket} NUMA nodes (NPS{nodes_per_socket}); " + "the default uses the whole socket with no memory binding. 
Optimal " + "per-NUMA-node binding (memory bound to each node) could give more " + "performance -- not done by default.") + + result = { + "vllm_cpu_omp_threads_bind": bind, + "vllm_cpu_kvcache_space_gb": kv, + "sockets": sockets, + "numa_nodes": numa, + "nodes_per_socket": nodes_per_socket, + "container_cpuset": container_cpuset, + "memory_gb": mem, + "perf_note": perf_note, + } + + if args.format == "json": + print(json.dumps(result, indent=2)) + return + + print(f'export VLLM_CPU_OMP_THREADS_BIND="{bind}"') + print(f"export VLLM_CPU_KVCACHE_SPACE={kv}") + print(f"# default: socket 0's CPUs, no memory binding ({sockets} socket(s), {numa} NUMA node(s))") + if container_cpuset: + print(f"# container: {container_cpuset}") + print("# conda: VLLM_CPU_OMP_THREADS_BIND binds the threads; nothing else needed") + if perf_note: + print(f"# NOTE: {perf_note}") + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/detect.py b/skills/serving-llms-on-epyc/scripts/detect.py new file mode 100644 index 0000000..c0c3340 --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/detect.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +""" +Detect AMD EPYC CPU hardware for vLLM + zentorch serving. + +Usage: + python3 scripts/detect.py + python3 scripts/detect.py --host user@hostname + +Output: JSON with cpu_model, is_amd_epyc, logical_cores, physical_cores, +sockets, threads_per_core, numa_nodes, memory_gb, epyc_generation +(Naples/Rome/Milan/Genoa/Bergamo/Siena/Turin), zen_arch, and avx512. Exits 0 on +success, 1 if no CPU info could be read. 
+ +Env vars (used when --host is not given): + ZEN_SSH_HOST, ZEN_SSH_USER, ZEN_SSH_PORT +""" + +import argparse +import json +import os +import re +import subprocess +import sys + + +def _is_local(host): + return not host or host in ("local", "localhost", "127.0.0.1") + + +def _run(cmd, host, user, port, timeout=20): + if _is_local(host): + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=timeout) + else: + ssh_target = f"{user}@{host}" if user else host + ssh = ["ssh", "-o", "StrictHostKeyChecking=accept-new", + "-o", "ConnectTimeout=15", "-o", "BatchMode=yes", + "-o", "LogLevel=ERROR", "-p", str(port), ssh_target, cmd] + r = subprocess.run(ssh, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, timeout=timeout) + return r.returncode, r.stdout, r.stderr + + +def _lscpu_field(lscpu_out, label): + m = re.search(rf"^{re.escape(label)}:\s*(.+)$", lscpu_out, re.MULTILINE) + return m.group(1).strip() if m else "" + + +def _epyc_generation(model): + """Map an AMD EPYC model name to (generation, zen_arch). + + EPYC numbering encodes the generation: 7xx1=Naples (Zen1), 7xx2=Rome (Zen2), + 7xx3=Milan (Zen3), 8xx4=Siena (Zen4c), 97x4=Bergamo (Zen4c), 9xx4=Genoa (Zen4), + 9xx5=Turin (Zen5). The agent should carry this through every phase (e.g. 
AVX-512 + + bf16 land on Zen4+, Turin has up to 128 cores per socket -> thread binding).""" + m = re.search(r"EPYC\s+(\d{4})", model.upper()) + if not m: + return "unknown", "unknown" + num = m.group(1) + first, last = num[0], num[3] + if first == "7": + return {"1": ("Naples", "Zen1"), "2": ("Rome", "Zen2"), + "3": ("Milan", "Zen3")}.get(last, ("unknown", "unknown")) + if first == "8" and last == "4": + return "Siena", "Zen4c" + if first == "9": + if num.startswith("97") and last == "4": + return "Bergamo", "Zen4c" + if last == "4": + return "Genoa", "Zen4" + if last == "5": + return "Turin", "Zen5" + return "unknown", "unknown" + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--host", default="", help="[user@]host (default: local or ZEN_SSH_HOST)") + p.add_argument("--user", default="") + p.add_argument("--port", type=int, default=0) + args = p.parse_args() + + host, user = args.host, args.user + if "@" in host: + user, host = host.split("@", 1) + host = host or os.environ.get("ZEN_SSH_HOST", "") + user = user or os.environ.get("ZEN_SSH_USER", "") + port = args.port or int(os.environ.get("ZEN_SSH_PORT", "22")) + + rc, lscpu_out, err = _run("lscpu", host, user, port) + if rc != 0 or not lscpu_out: + print(json.dumps({"error": "lscpu failed", + "detail": err.strip() or f"exit {rc}"})) + sys.exit(1) + + model = _lscpu_field(lscpu_out, "Model name") or "unknown" + vendor = _lscpu_field(lscpu_out, "Vendor ID") + + def _int(label, default=0): + v = _lscpu_field(lscpu_out, label) + try: + return int(v) + except ValueError: + return default + + sockets = _int("Socket(s)", 1) + cores_per_socket = _int("Core(s) per socket", 0) + threads_per_core = _int("Thread(s) per core", 1) or 1 + numa_nodes = _int("NUMA node(s)", 1) + + rc, nproc_out, _ = _run("nproc --all", host, user, port) + try: + logical = int(nproc_out.strip()) + except (ValueError, AttributeError): + logical = sockets * 
cores_per_socket * threads_per_core + + physical = sockets * cores_per_socket if cores_per_socket else logical // threads_per_core + + rc, mem_out, _ = _run("grep MemTotal /proc/meminfo", host, user, port) + mem_kb = 0 + m = re.search(r"(\d+)", mem_out or "") + if m: + mem_kb = int(m.group(1)) + memory_gb = mem_kb // (1024 * 1024) + + is_epyc = vendor == "AuthenticAMD" and "EPYC" in model.upper() + generation, zen_arch = _epyc_generation(model) + avx512 = "avx512f" in _lscpu_field(lscpu_out, "Flags").split() + + print(json.dumps({ + "cpu_model": model, + "vendor": vendor, + "is_amd_epyc": is_epyc, + "epyc_generation": generation, + "zen_arch": zen_arch, + "avx512": avx512, + "logical_cores": logical, + "physical_cores": physical, + "sockets": sockets, + "threads_per_core": threads_per_core, + "numa_nodes": numa_nodes, + "memory_gb": memory_gb, + "target": "local" if _is_local(host) else host, + }, indent=2)) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/estimate_memory.py b/skills/serving-llms-on-epyc/scripts/estimate_memory.py new file mode 100644 index 0000000..75c50ad --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/estimate_memory.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Does a HuggingFace model fit in host RAM for CPU serving at a given context? + +No download -- reads HF metadata over HTTP. Answers one question: + weights + KV(max_model_len x num_prompts) + reserve <= RAM ? +If not, prints the largest max_model_len that would fit, so you reduce it and +retry. Exit 0 = fits, 1 = does not fit (or error). + + estimate_memory.py --model-id Qwen/Qwen3-8B --ram-gb 755 --max-model-len 4096 --num-prompts 8 + +Three sub-problems, one function each: weight_gb(), kv_bytes_per_token(), fit(). +Env: HF_TOKEN for gated models. --weight-gb overrides weights if metadata is missing. 
+""" + +import argparse +import json +import os +import sys +import urllib.request +import urllib.error + +HF = "https://huggingface.co" +KV_BYTES_PER_ELEM = 2 # zentorch CPU KV cache is bf16-only (2 bytes); no fp8 KV support + + +def _get(url, token): + """GET JSON from HF. Returns (data, error_message).""" + headers = {"User-Agent": "estimate-memory/2"} + if token: + headers["Authorization"] = f"Bearer {token}" + try: + with urllib.request.urlopen(urllib.request.Request(url, headers=headers), timeout=30) as r: + return json.load(r), None + except urllib.error.HTTPError as e: + return None, {401: "not found, or gated (set HF_TOKEN if it is gated)", + 403: "access denied -- accept the model license on HuggingFace", + 404: "model not found"}.get(e.code, f"HTTP {e.code}") + except Exception as e: + return None, str(e) + + +def weight_gb(model, rev, token): + """(1) Weight RAM = sum of uncompressed weight-file sizes. Works for + .safetensors and legacy .bin; file size is ground truth even for quantized + checkpoints. 
Returns (gb, error).""" + tree, err = _get(f"{HF}/api/models/{model}/tree/{rev}", token) + if not isinstance(tree, list): + return None, err or "no file tree" + total = sum( + f.get("size", 0) for f in tree + if f.get("type") == "file" and ( + f.get("path", "").endswith(".safetensors") + or (f.get("path", "").endswith(".bin") and "model" in f.get("path", "").lower()) + ) + ) + if total == 0: + return None, "no weight files (.safetensors/.bin) found -- pass --weight-gb" + return round(total / 2**30, 2), None + + +def get_config(model, rev, token): + """Model config.json, unwrapping the LLM sub-config of multimodal models.""" + cfg, _ = _get(f"{HF}/{model}/resolve/{rev}/config.json", token) + if cfg and "num_hidden_layers" not in cfg: + for k in ("text_config", "language_config", "llm_config"): + if isinstance(cfg.get(k), dict) and cfg[k].get("num_hidden_layers"): + sub = dict(cfg[k]) + sub.setdefault("max_position_embeddings", cfg.get("max_position_embeddings")) + return sub + return cfg + + +def kv_bytes_per_token(cfg): + """(2) KV-cache bytes per token = 2(K,V) x layers x kv_heads x head_dim x 2 (bf16). + zentorch CPU caches KV in bf16 only. 
MLA models (DeepSeek) cache a compressed latent.""" + if not cfg or not cfg.get("num_hidden_layers"): + return 0 + nbytes = KV_BYTES_PER_ELEM + layers = cfg["num_hidden_layers"] + if "kv_lora_rank" in cfg: # MLA: latent KV + return 2 * layers * (cfg["kv_lora_rank"] + cfg.get("qk_rope_head_dim", 0)) * nbytes + kv_heads = cfg.get("num_key_value_heads", cfg.get("num_attention_heads", 0)) + head_dim = cfg.get("head_dim") or (cfg.get("hidden_size", 0) // max(1, cfg.get("num_attention_heads", 1))) + return 2 * layers * kv_heads * head_dim * nbytes + + +def fit(weight, kv_per_tok, ctx, prompts, ram, reserve): + """(3) Verdict + the largest max_model_len that would fit if it doesn't.""" + kv_gb = kv_per_tok * ctx * prompts / 2**30 + required = round(weight + kv_gb + reserve, 2) + out = {"max_model_len": ctx, "num_prompts": prompts, "weight_gb": weight, + "kv_cache_gb": round(kv_gb, 2), "reserve_gb": reserve, + "required_gb": required, "ram_gb": ram, "fits": required <= ram} + if not out["fits"]: + budget = (ram - weight - reserve) * 2**30 + best = int(budget / (kv_per_tok * prompts)) // 256 * 256 if kv_per_tok and budget > 0 else 0 + out["suggested_max_model_len"] = max(0, best) + out["action"] = (f"reduce --max-model-len to {best} or less and retry" + if best >= 256 else "weights alone exceed RAM -- use a smaller model") + return out + + +def main(): + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--model-id", required=True) + p.add_argument("--revision", default="main") + p.add_argument("--ram-gb", type=float, default=0, help="host RAM (enables the fit verdict)") + p.add_argument("--max-model-len", type=int, default=4096) + p.add_argument("--num-prompts", type=int, default=1, help="concurrent sequences") + p.add_argument("--reserve-gb", type=float, default=16, help="RAM held back for OS + vLLM runtime") + p.add_argument("--weight-gb", type=float, default=0, help="override weight RAM if metadata is 
unavailable") + a = p.parse_args() + token = os.environ.get("HF_TOKEN", "") + + w = a.weight_gb if a.weight_gb > 0 else None + if w is None: + w, err = weight_gb(a.model_id, a.revision, token) + if w is None: + print(json.dumps({"error": err, "model_id": a.model_id})) + sys.exit(1) + + cfg = get_config(a.model_id, a.revision, token) + kv_per_tok = kv_bytes_per_token(cfg) + max_seq = cfg.get("max_position_embeddings") if cfg else None + ctx = min(a.max_model_len, max_seq) if max_seq else a.max_model_len + + out = {"model_id": a.model_id, "weight_gb": w, "kv_dtype": "bf16", + "kv_bytes_per_token": kv_per_tok, "model_max_seq_len": max_seq} + if a.ram_gb > 0: + out["fit"] = fit(w, kv_per_tok, ctx, a.num_prompts, a.ram_gb, a.reserve_gb) + + print(json.dumps(out, indent=2)) + sys.exit(0 if out.get("fit", {"fits": True})["fits"] else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/scripts/validate.py b/skills/serving-llms-on-epyc/scripts/validate.py new file mode 100644 index 0000000..95fd37a --- /dev/null +++ b/skills/serving-llms-on-epyc/scripts/validate.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Validate the environment before serving vLLM + zentorch on an EPYC CPU host. + +Checks a container runtime (docker or podman), whether the vLLM+zentorch image +is present (and, if already pulled, that `import vllm, zentorch` works inside it), +a conda/host fallback (`import vllm, zentorch`), the host perf libraries +(tcmalloc / OpenMP via LD_PRELOAD), HF_TOKEN, and RAM. Each issue is error +(blocks launch) / warning (degrades) / advisory (info). + +Usage: + python3 scripts/validate.py + python3 scripts/validate.py --image amdih/zendnn_zentorch:vllm_v0.22.0_zentorch_v2.11.0.1_ubuntu22.04_2026_ww23 + +Exits 0 if no error-severity issues remain, 1 otherwise. JSON to stdout. 
+""" + +import argparse +import json +import os +import shutil +import subprocess +import sys + + +def _sh(cmd, timeout=20): + try: + r = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, text=True, timeout=timeout) + return r.returncode, r.stdout.strip(), r.stderr.strip() + except subprocess.TimeoutExpired: + return 1, "", f"timed out after {timeout}s" + + +def _detect_runtime(): + """Pick an accessible container runtime: docker (daemon reachable) > podman + (rootless). Returns (runtime, detail) or (None, why). + + Like serving-llms-on-instinct, an accessible runtime is a PREREQUISITE. We + check and report a one-time fix; we never escalate privileges (no sudo). + """ + if shutil.which("docker"): + rc, _, err = _sh("docker ps -q") + if rc == 0: + return "docker", "docker reachable" + last = (err or "docker ps failed").splitlines()[0][:120] + else: + last = "docker not installed" + if shutil.which("podman"): + rc, _, err = _sh("podman info --format '{{.Host.Arch}}'") + if rc == 0: + return "podman", "podman available (rootless)" + last = (err or last).splitlines()[0][:120] if err else last + return None, last + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("--image", default="", help="container image to check for (advisory)") + args = p.parse_args() + + issues = [] + + # 1. Container runtime (prerequisite): docker > podman, else conda fallback. 
+ runtime, detail = _detect_runtime() + conda_ok = _sh('python -c "import vllm, zentorch"')[0] == 0 + + if runtime is None: + if conda_ok: + issues.append({"check": "container_runtime", "severity": "warning", + "message": f"No accessible container runtime ({detail}); using the conda/host path.", + "fix": "For the container path, make docker accessible or install rootless podman (see fix below)."}) + else: + issues.append({"check": "container_runtime", "severity": "error", + "message": f"No accessible container runtime ({detail}) and no host vllm+zentorch.", + "fix": "One-time onboarding: add your user to the docker group " + "(sudo usermod -aG docker $USER, then re-login) or start the daemon; " + "OR install rootless podman; OR activate a conda env with vllm+zentorch."}) + + # 2. Image present + (only if already pulled) zentorch inside it. The in-image + # import check runs ONLY when the image is local, so it never triggers a + # multi-GB pull just to validate. + if runtime and args.image: + repo = args.image.rsplit(":", 1)[0] # strip the tag, keep any host:port/repo + rc, out, _ = _sh(f"{runtime} images {repo} --format '{{{{.Repository}}}}:{{{{.Tag}}}}'") + if args.image not in (out or ""): + issues.append({"check": "image", "severity": "advisory", + "message": f"Image {args.image} not pulled yet; first launch will download it (in-image zentorch check deferred to launch).", + "fix": f"{runtime} pull {args.image}"}) + else: + rc, ver, err = _sh(f'{runtime} run --rm {args.image} ' + f'python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"', timeout=90) + if rc == 0 and ver: + issues.append({"check": "image_stack", "severity": "advisory", + "message": f"Image has vllm+zentorch ({ver})."}) + else: + issues.append({"check": "image_stack", "severity": "warning", + "message": f"Image {args.image} is present but `import vllm, zentorch` failed inside it: {(err or 'unknown')[:120]}", + "fix": "Use an image tag that bundles the zentorch plugin (see 
data/epyc.json)."}) + + # 3. Host vllm+zentorch (for the conda path) + if conda_ok: + _, ver, _ = _sh('python -c "import vllm,zentorch;print(vllm.__version__,zentorch.__version__)"') + issues.append({"check": "host_stack", "severity": "advisory", + "message": f"Host vllm+zentorch importable ({ver}); conda path available."}) + elif runtime: + issues.append({"check": "host_stack", "severity": "advisory", + "message": "Host `import vllm, zentorch` not available; use the container path."}) + + # 4. HF_TOKEN + if not os.environ.get("HF_TOKEN"): + issues.append({"check": "hf_token", "severity": "advisory", + "message": "HF_TOKEN not set. Required for gated models (Llama, Gemma); not needed for Qwen3.", + "fix": "export HF_TOKEN=hf_..."}) + + # 5. RAM + rc, out, _ = _sh("grep MemTotal /proc/meminfo | awk '{print int($2/1024/1024)}'") + try: + ram_gb = int(out) + except ValueError: + ram_gb = 0 + if 0 < ram_gb < 32: + issues.append({"check": "ram", "severity": "warning", + "message": f"Only {ram_gb} GB RAM. CPU serving keeps weights + KV cache in RAM; large models may not fit.", + "fix": "Use a small model or a host with more RAM."}) + + # 6. Perf libraries for the host/conda path (advisory). vLLM CPU wants + # libtcmalloc + libiomp (OpenMP) preloaded and warns otherwise. The + # container image sets these itself, so only check the host when the + # conda/host path is viable. 
+ if conda_ok: + ld = os.environ.get("LD_PRELOAD", "") + missing = [lib for lib in ("libtcmalloc", "libiomp") if lib not in ld] + if missing: + issues.append({"check": "perf_libs", "severity": "advisory", + "message": f"LD_PRELOAD is missing {', '.join(missing)}; vLLM CPU warns about this and throughput suffers without them (host/conda path).", + "fix": "export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$CONDA_PREFIX/lib/libiomp5.so:$LD_PRELOAD"}) + + errors = [i for i in issues if i["severity"] == "error"] + result = { + "ready": len(errors) == 0, + "runtime": runtime, + "runtime_detail": detail, + "conda_path_available": conda_ok, + "ram_gb": ram_gb, + "errors": errors, + "warnings": [i for i in issues if i["severity"] == "warning"], + "advisories": [i for i in issues if i["severity"] == "advisory"], + } + print(json.dumps(result, indent=2)) + sys.exit(0 if len(errors) == 0 else 1) + + +if __name__ == "__main__": + main() diff --git a/skills/serving-llms-on-epyc/skill-card.md b/skills/serving-llms-on-epyc/skill-card.md new file mode 100644 index 0000000..120283f --- /dev/null +++ b/skills/serving-llms-on-epyc/skill-card.md @@ -0,0 +1,13 @@ +# Skill Card + +## Description + +Serve a single LLM on an AMD EPYC CPU host with vLLM + zentorch (Docker, Podman, or conda), handling CPU detection, runtime/env validation, model + RAM-fit checks, hardware-sized threads/KV/NUMA, launch, and health verification. Reports and stops on failure; does not debug. + +## Owner + +AMD + +## License + +MIT