From 421efd51437cd9842d65bc99b65447a642c71d5e Mon Sep 17 00:00:00 2001 From: willamhou <1205157517@qq.com> Date: Mon, 25 May 2026 10:50:17 +0800 Subject: [PATCH] fix(server): size non-streaming event channel to max_tokens + add baseline bench doc ## Non-streaming 500 fix A `stream:false` request with a long generation (observed at max_tokens=256) returned HTTP 500 `event channel full, cancelling slow consumer`. The engine emits Token events via `try_send` into a bounded 256-slot per-request channel and cancels the sequence if it's full (slow-consumer protection). The streaming consumer drains incrementally as it writes SSE, so 256 slack suffices; the non-streaming consumer accumulates tokens and only returns at Finish, so a burst of >256 tokens faster than the consumer task is scheduled overflows the channel. Fix: size the channel to `max_tokens + 16` (floor 256) for non-streaming requests; keep 256 for streaming (SSE backpressure handles it). A well-behaved non-streaming consumer can no longer be dropped mid-stream. Verified: max_tokens=256 non-streaming now returns finish=length with 256 completion tokens and zero channel-full warnings. ## Baseline benchmark doc docs/benchmarks/2026-05-25-baseline-tinyllama-gb10.md records the pre-CUDA-Graphs reference on GB10 (TinyLlama-1.1B, F16, paged attention): - single-stream decode 38.2 ms/token (26.2 tok/s) - concurrent throughput 179 / 531 / 730 tok/s at C = 4 / 16 / 32 Single-stream is launch-overhead-bound (~220 kernel launches/decode step), which is exactly what the upcoming CUDA Graphs end-to-end work targets. Doc notes the measurement method (differential to cancel prefill; wall-clock only, since urllib SSE line-buffering corrupts naive per-token timing) and this non-streaming bug (now fixed). 
Co-Authored-By: Claude Opus 4.7 --- .../2026-05-25-baseline-tinyllama-gb10.md | 79 +++++++++++++++++++ forge-server/src/api/openai.rs | 20 ++++- 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 docs/benchmarks/2026-05-25-baseline-tinyllama-gb10.md diff --git a/docs/benchmarks/2026-05-25-baseline-tinyllama-gb10.md b/docs/benchmarks/2026-05-25-baseline-tinyllama-gb10.md new file mode 100644 index 0000000..da2c185 --- /dev/null +++ b/docs/benchmarks/2026-05-25-baseline-tinyllama-gb10.md @@ -0,0 +1,79 @@ +# Benchmark Baseline — TinyLlama-1.1B on GB10 (pre-CUDA-Graphs) + +**Date:** 2026-05-25 +**Hardware:** NVIDIA GB10 (DGX Spark, Blackwell, sm_121) +**Model:** TinyLlama-1.1B-Chat-v1.0, F16 weights (BF16→F16 at load) +**Config:** `--backend cuda --kv-cache paged --num-blocks 1024 --block-size 16 --max-batch-size 64` +**Build:** `cargo build --release` (CUDA on) +**Path under test:** paged attention decode, **no CUDA Graphs** (engine calls +the allocating `model.forward()`; `forward_into` + graph capture not yet wired). + +This is the reference point for the CUDA Graphs end-to-end work. Re-run the +same harness after graph capture lands to quantify the win. + +## Numbers + +| Metric | Value | +|---|---| +| **Single-stream decode** | **38.2 ms/token (26.2 tok/s)** | +| Concurrent throughput, C=4 | 179 tok/s | +| Concurrent throughput, C=16 | 531 tok/s | +| Concurrent throughput, C=32 | **730 tok/s** | +| TTFT (C=4, short prompt) | ~148 ms | + +Single-stream decode measured by the differential method: stream the same +prompt at `max_tokens` 32 and 256 (greedy), take min-of-3 each, subtract to +cancel prefill — `(256-32) tok / (t256 - t32)`. This avoids the urllib +line-buffering artifact that corrupts naive per-chunk TTFT/ITL timing on a +single fast stream (see "Measurement notes"). + +Concurrent throughput = total decoded tokens / wall-clock, firing C streams +simultaneously (Python threads), max_tokens=128 each. + +## Interpretation + +1. 
**38 ms/token single-stream is slow for a 1.1B model on Blackwell** + (should be sub-10 ms). The decode hot path issues ~220 kernel launches + per step (22 layers × ~10 kernels/layer: matmul ×4, rms_norm ×2, + silu_mul, paged_attention, rope, add, …) plus per-op output allocations. + Launch latency + allocation churn dominate at batch=1. **This is exactly + what CUDA Graphs targets** — collapse the ~220 launches into one graph + replay. + +2. **Concurrent throughput scales with batch size, though sub-linearly** (179 → 531 → 730 tok/s for + C = 4 → 16 → 32, i.e. ~3.0× for 4× the streams, then ~1.4× for 2×). Continuous batching + paged attention are working; + C=32 throughput is ~28× single-stream, so the GPU is well-utilized under + batching and the single-step bottleneck is launch overhead, not FLOPs. + +3. **Expected CUDA Graphs win**: single-stream TPOT should drop substantially + (launch overhead amortized into one replay). Concurrent throughput may + improve less (already GPU-bound) but should still benefit from removing + per-step CPU launch work. + +## Measurement notes / caveats + +- **urllib SSE buffering artifact**: a naive `for line in resp` over a single + fast SSE stream buffers the whole body, so per-chunk TTFT/ITL timing is + bogus (TTFT≈total, ITL≈0). Only wall-clock-total metrics are trustworthy + for single streams. The differential method sidesteps this. +- **Non-streaming long generations 500**: at measurement time, `stream:false` + `max_tokens=256` + returned HTTP 500 (`event channel full, cancelling slow consumer`). The + non-streaming handler does not drain the bounded engine event channel + concurrently, so long generations overflowed it. Streaming works fine. This + is a separate forge bug (fixed in the same commit that adds this doc); the benchmark uses + streaming throughout. +- Numbers are single-run (min-of-3 for the differential); not averaged over + many trials. Treat as order-of-magnitude baseline, not a precise figure. 
+ +## Repro + +```bash +# Start server +./target/release/forge-server \ + --model-path /path/to/tinyllama-1.1b-chat \ + --backend cuda --kv-cache paged --num-blocks 1024 --block-size 16 \ + --max-batch-size 64 --port 8110 + +# Differential single-stream decode + concurrent throughput +python bench.py http://127.0.0.1:8110 128 +``` diff --git a/forge-server/src/api/openai.rs b/forge-server/src/api/openai.rs index 2beecc1..27b1e95 100644 --- a/forge-server/src/api/openai.rs +++ b/forge-server/src/api/openai.rs @@ -186,6 +186,24 @@ pub async fn chat_completions( None }; + // Per-request event channel capacity. + // + // The engine emits Token events via `try_send` and cancels a sequence if + // the channel is full ("slow consumer" protection — one stuck client must + // not stall the shared engine). The streaming consumer drains + // incrementally as it writes SSE, so 256 of slack is plenty. The + // non-streaming consumer accumulates tokens and only returns at Finish; if + // the engine bursts faster than the consumer task gets scheduled, a + // bounded 256 channel overflows on long generations (observed: + // max_tokens=256 → HTTP 500 "event channel full"). Size the non-streaming + // channel to hold the whole generation plus slack so it can't drop a + // well-behaved consumer. + let channel_cap = if is_stream { + 256 + } else { + (params.max_tokens + 16).max(256) + }; + let inference_req = InferenceRequest { request_id: request_id.clone(), prompt_tokens, @@ -193,7 +211,7 @@ pub async fn chat_completions( }; // Create per-request event channel - let (event_tx, event_rx) = mpsc::channel(256); + let (event_tx, event_rx) = mpsc::channel(channel_cap); let engine_req = EngineRequest { inference_req,