trace replay H200 llama fp8 1 hr v1.1 #117

name: Multi-Turn Benchmark Sweep
run-name: "${{ inputs.run_name || format('Multi-Turn Sweep - tp={0} users={1} offload={2}', inputs.tp_values, inputs.user_values, inputs.offload_values) }}"

on:
  # push:
  #   branches:
  #     - experimental/multi-turn-benchmark
  #   paths:
  #     - .github/workflows/multiturn-sweep.yml
  workflow_dispatch:
    inputs:
      run_name:
        description: 'Custom run name (optional)'
        required: false
        default: ''
        type: string
      tp_values:
        description: 'TP sizes (JSON array). Ignored if config_file is set.'
        required: true
        default: '[1, 2, 4, 8]'
        type: string
      user_values:
        description: 'Concurrent user counts (JSON array). Ignored if config_file is set.'
        required: false
        default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]'
        type: string
      offload_values:
        description: 'Offload modes (JSON array: on/off/noprefix). Ignored if config_file is set.'
        required: false
        default: '["on", "off", "noprefix"]'
        type: string
      config_file:
        description: 'YAML config with per-TP sweep settings (e.g. .github/configs/multiturn-agentic-trace.yaml). Overrides tp/user/offload values.'
        required: false
        default: ''
        type: string
      config_key:
        description: 'Top-level key in config_file to use (e.g. h200-fp8-llama70b, b200-fp4-dsr1). Required if config_file has multiple entries.'
        required: false
        default: ''
        type: string
      duration:
        description: 'Benchmark duration in seconds (optional; runs to completion if omitted)'
        required: false
        default: ''
        type: string
      request_rate:
        description: 'Request rate per client (Poisson, req/s). 0 = no delay.'
        required: false
        default: '0'
        type: string
      total_cpu_dram_gb:
        description: 'Total CPU DRAM for KV offload (GB)'
        required: true
        default: '100'
        type: string
      image:
        description: 'Container image'
        required: true
        default: 'vllm/vllm-openai:v0.18.0'
        type: string
      model:
        description: 'Model name'
        required: true
        default: 'nvidia/Llama-3.3-70B-Instruct-FP4'
        type: string
      precision:
        description: 'Model precision (fp4, fp8, etc.); used to select the benchmark script'
        required: false
        default: 'fp4'
        type: string
      script_suffix:
        description: 'Suffix for the benchmark script (e.g. "_lmcache" → multiturn_fp4_b200_lmcache.sh)'
        required: false
        default: ''
        type: string
      runner:
        description: 'Runner label (e.g. b200, h200-dgxc-slurm)'
        required: false
        default: 'b200'
        type: string
      ep:
        description: 'Expert parallelism size for MoE models (default 0 = disabled)'
        required: false
        default: '0'
        type: string
      ref:
        description: 'Git ref (branch/sha) to checkout'
        required: false
        type: string
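
# Example dispatch (a sketch: it assumes this file is saved as
# .github/workflows/multiturn-sweep.yml, matching the commented push trigger
# above, and the input values shown are illustrative, not prescribed):
#
#   gh workflow run multiturn-sweep.yml \
#     -f config_file=.github/configs/multiturn-agentic-trace.yaml \
#     -f config_key=h200-fp8-llama70b \
#     -f precision=fp8 \
#     -f runner=h200-dgxc-slurm
#
# Inputs left unset fall back to the defaults declared above.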
jobs:
  # ---------------------------------------------------------------------------
  # Generate the matrix from the config file or the workflow_dispatch inputs
  # ---------------------------------------------------------------------------
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.gen.outputs.matrix }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        if: ${{ inputs.config_file != '' }}
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 1
          ref: ${{ inputs.ref || github.ref }}
          sparse-checkout: ${{ inputs.config_file }}
      - id: gen
        run: |
          pip install -q pyyaml
          python3 << 'PYEOF'
          import json, os, sys

          config_file = "${{ inputs.config_file }}".strip()
          if config_file:
              import yaml
              with open(config_file) as f:
                  full_config = yaml.safe_load(f)
              config_key = "${{ inputs.config_key }}".strip()
              # If config_key is specified, use that section; otherwise auto-detect.
              if config_key and config_key in full_config:
                  config = full_config[config_key]
              elif config_key:
                  print(f"ERROR: config_key '{config_key}' not found. Available: {list(full_config.keys())}")
                  sys.exit(1)
              elif len(full_config) == 1:
                  config = next(iter(full_config.values()))
              else:
                  # Check if the top-level keys look like tp entries (tp2, tp4, etc.)
                  if all(k.startswith("tp") for k in full_config):
                      config = full_config
                  else:
                      print(f"ERROR: Multiple entries in config; specify config_key. Available: {list(full_config.keys())}")
                      sys.exit(1)
              # Expand each tpN section into one matrix entry per (users, offload) pair.
              includes = []
              for key, settings in config.items():
                  tp = int(key.replace("tp", ""))
                  users = settings.get("users", [])
                  offloads = settings.get("offload", ["on", "off"])
                  ep = settings.get("ep", 0)
                  for u in users:
                      for o in offloads:
                          entry = {"tp": tp, "users": u, "offload": o}
                          if ep > 0:
                              entry["ep"] = ep
                          includes.append(entry)
          else:
              # No config file: take the full cross product of the dispatch inputs.
              tp_values = json.loads('${{ inputs.tp_values }}')
              user_values = json.loads('${{ inputs.user_values }}')
              offload_values = json.loads('${{ inputs.offload_values }}')
              includes = []
              for tp in tp_values:
                  for u in user_values:
                      for o in offload_values:
                          includes.append({"tp": tp, "users": u, "offload": o})

          matrix = {"include": includes}
          print(f"Generated {len(includes)} matrix entries")
          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
              f.write(f"matrix={json.dumps(matrix)}\n")
          PYEOF
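
  # The script above expects config_file to be shaped roughly like this
  # (a sketch; the top-level key and all values are illustrative, "offload"
  # defaults to ["on", "off"] and "ep" to 0 when omitted):
  #
  #   h200-fp8-llama70b:
  #     tp2:
  #       users: [8, 16, 32]
  #       offload: ["on", "off"]
  #     tp4:
  #       users: [64, 128]
  #       offload: ["on", "off", "noprefix"]
  #       ep: 4
  #
  # Each (tp, users, offload) combination becomes one matrix entry, e.g.
  # {"tp": 2, "users": 8, "offload": "on"}.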
  # ---------------------------------------------------------------------------
  # Matrix benchmark jobs: each cell calls the multiturn template
  # ---------------------------------------------------------------------------
  sweep:
    needs: generate-matrix
    uses: ./.github/workflows/benchmark-multiturn-tmpl.yml
    name: sweep /
    strategy:
      fail-fast: false
      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
    secrets: inherit
    with:
      runner: ${{ inputs.runner }}
      image: ${{ inputs.image }}
      model: ${{ inputs.model }}
      precision: ${{ inputs.precision }}
      exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}"
      tp: "${{ matrix.tp }}"
      users: "${{ matrix.users }}"
      offload-mode: ${{ matrix.offload }}
      duration: ${{ inputs.duration }}
      request-rate: ${{ inputs.request_rate }}
      total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }}
      script-suffix: ${{ inputs.script_suffix }}
      ep: "${{ matrix.ep || inputs.ep }}"
      ref: ${{ inputs.ref }}
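
  # Note: this assumes the template publishes each cell's results under an
  # artifact named after exp-name (e.g. multiturn_tp4_users64_offloadon),
  # which is what the 'multiturn_*' download pattern below matches.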
  # ---------------------------------------------------------------------------
  # Collect & aggregate results
  # ---------------------------------------------------------------------------
  collect:
    runs-on: ubuntu-latest
    needs: sweep
    if: always()
    name: Collect results
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 1
          ref: ${{ inputs.ref || github.ref }}
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install pandas matplotlib numpy
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: 'multiturn_*'
          path: results/
      - name: Run aggregation
        run: |
          python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/
      - name: Upload aggregated results
        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
        with:
          name: multiturn_aggregated
          path: aggregated/
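
# To re-run the aggregation locally on artifacts downloaded from a run (using
# the same script and dependencies as the collect job above):
#
#   pip install pandas matplotlib numpy
#   python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/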