From 071e07af9305708c6bc2ad6a4f9e2b880f1ec98d Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Mon, 1 Jun 2026 23:06:55 +0200
Subject: [PATCH] feat: add GPU workload sandbox images

Add three sandbox images used as GPU end-to-end workload fixtures:

- gpu-workload-cuda-basic: builds the deviceQuery and vectorAdd samples
  from NVIDIA/cuda-samples v12.8 and runs both as the workload.
- gpu-workload-smoke-pass: prints the success marker and exits 0.
- gpu-workload-smoke-fail: prints the failure marker and exits 42.

Move the "Set build platforms" workflow step ahead of the base and
parent image builds so that they use the same platform list as the
final build, and build gpu-workload-* images for linux/amd64 and
linux/arm64 on every ref.

Signed-off-by: Evan Lezar
---
 .github/workflows/build-sandboxes.yml         | 23 +++---
 README.md                                     |  3 +
 THIRD-PARTY-NOTICES                           | 12 ++++
 sandboxes/gpu-workload-cuda-basic/Dockerfile  | 72 +++++++++++++++++++
 sandboxes/gpu-workload-cuda-basic/README.md   | 52 ++++++++++++++
 sandboxes/gpu-workload-cuda-basic/workload.sh | 40 +++++++++++
 sandboxes/gpu-workload-smoke-fail/Dockerfile  | 15 ++++
 sandboxes/gpu-workload-smoke-fail/README.md   | 34 +++++++++
 sandboxes/gpu-workload-smoke-fail/workload.sh |  9 ++
 sandboxes/gpu-workload-smoke-pass/Dockerfile  | 15 ++++
 sandboxes/gpu-workload-smoke-pass/README.md   | 32 +++++++++
 sandboxes/gpu-workload-smoke-pass/workload.sh |  8 ++
 12 files changed, 305 insertions(+), 10 deletions(-)
 create mode 100644 sandboxes/gpu-workload-cuda-basic/Dockerfile
 create mode 100644 sandboxes/gpu-workload-cuda-basic/README.md
 create mode 100644 sandboxes/gpu-workload-cuda-basic/workload.sh
 create mode 100644 sandboxes/gpu-workload-smoke-fail/Dockerfile
 create mode 100644 sandboxes/gpu-workload-smoke-fail/README.md
 create mode 100644 sandboxes/gpu-workload-smoke-fail/workload.sh
 create mode 100644 sandboxes/gpu-workload-smoke-pass/Dockerfile
 create mode 100644 sandboxes/gpu-workload-smoke-pass/README.md
 create mode 100644 sandboxes/gpu-workload-smoke-pass/workload.sh

diff --git a/.github/workflows/build-sandboxes.yml b/.github/workflows/build-sandboxes.yml
index 48ddbd1..a5495ee 100644
--- a/.github/workflows/build-sandboxes.yml
+++ b/.github/workflows/build-sandboxes.yml
@@ -169,6 +169,17 @@ jobs:
         with:
           driver-opts: ${{ github.ref != 'refs/heads/main' && 'network=host' || '' }}
 
+      - name: Set build platforms
+        id: platforms
+        run: |
+          if [ "${{ github.ref }}" = "refs/heads/main" ]; then
+            echo "value=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
+          elif [[ "${{ matrix.sandbox }}" == "nvidia-gpu" || "${{ matrix.sandbox }}" == gpu-workload-* ]]; then
+            echo "value=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
+          else
+            echo "value=linux/amd64" >> "$GITHUB_OUTPUT"
+          fi
+
       - name: Log in to GHCR
         uses: docker/login-action@v3
         with:
@@ -209,6 +220,7 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: sandboxes/base
+          platforms: ${{ steps.platforms.outputs.value }}
           push: true
           tags: localhost:5000/sandboxes/base:latest
           cache-from: type=gha,scope=base
@@ -220,6 +232,7 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: sandboxes/${{ steps.parent.outputs.sandbox }}
+          platforms: ${{ steps.platforms.outputs.value }}
           push: true
           tags: localhost:5000/sandboxes/${{ steps.parent.outputs.sandbox }}:latest
           build-args: |
@@ -247,15 +260,6 @@ jobs:
             type=sha,prefix=
             type=raw,value=latest,enable={{is_default_branch}}
 
-      - name: Set build platforms
-        id: platforms
-        run: |
-          if [ "${{ github.ref }}" = "refs/heads/main" ] || [ "${{ matrix.sandbox }}" = "nvidia-gpu" ]; then
-            echo "value=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
-          else
-            echo "value=linux/amd64" >> "$GITHUB_OUTPUT"
-          fi
-
       - name: Build and push
         uses: docker/build-push-action@v6
         with:
@@ -268,4 +272,3 @@ jobs:
             BASE_IMAGE=${{ steps.base.outputs.image }}
           cache-from: type=gha,scope=${{ matrix.sandbox }}
           cache-to: type=gha,mode=max,scope=${{ matrix.sandbox }}
-
diff --git a/README.md b/README.md
index 150f822..c7b15ef 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,9 @@ This repo is the community ecosystem around OpenShell -- a hub for contributed s
 | `sandboxes/base/` | Foundational image with system tools, users, and dev environment |
 | `sandboxes/droid/` | Android automation and mobile testing workflows |
 | `sandboxes/gemini/` | Gemini CLI workflows |
+| `sandboxes/gpu-workload-cuda-basic/` | CUDA `deviceQuery` and `vectorAdd` GPU e2e validation workload |
+| `sandboxes/gpu-workload-smoke-fail/` | Intentional-failure GPU e2e workload fixture |
+| `sandboxes/gpu-workload-smoke-pass/` | Success-path GPU e2e workload fixture |
 | `sandboxes/nvidia-gpu/` | GPU-enabled VM sandbox image with NVIDIA userspace tooling |
 | `sandboxes/ollama/` | Ollama for local and cloud LLMs with Claude Code, Codex, OpenCode pre-installed |
 | `sandboxes/pi/` | [Pi](https://pi.dev) pre-installed |
diff --git a/THIRD-PARTY-NOTICES b/THIRD-PARTY-NOTICES
index fd58ef3..d548ba9 100644
--- a/THIRD-PARTY-NOTICES
+++ b/THIRD-PARTY-NOTICES
@@ -23,6 +23,18 @@ Image: nvidia/cuda:12.8.1-base-ubuntu22.04
 License: NVIDIA CUDA Toolkit End User License Agreement and Ubuntu component licenses
 URL: https://hub.docker.com/r/nvidia/cuda
 
+Image: nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04
+License: NVIDIA CUDA Toolkit End User License Agreement and Ubuntu component licenses
+URL: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda
+
+================================================================================
+Source Artifacts
+================================================================================
+
+Source: NVIDIA/cuda-samples v12.8
+License: NVIDIA CUDA Samples License
+URL: https://github.com/NVIDIA/cuda-samples
+
 ================================================================================
 System Packages (APT — Ubuntu 24.04)
 ================================================================================
diff --git a/sandboxes/gpu-workload-cuda-basic/Dockerfile b/sandboxes/gpu-workload-cuda-basic/Dockerfile
new file mode 100644
index 0000000..a156d95
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/Dockerfile
@@ -0,0 +1,72 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+ARG CUDA_BUILD_IMAGE=nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+
+FROM ${CUDA_BUILD_IMAGE} AS builder
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG CUDA_SAMPLES_REF=v12.8
+ARG CUDA_SAMPLES_REPO=https://github.com/NVIDIA/cuda-samples
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    cmake \
+    cuda-nvcc-12-8 \
+    curl \
+    g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build/cuda-samples
+
+RUN set -eux; \
+    curl -fsSL "${CUDA_SAMPLES_REPO}/archive/refs/tags/${CUDA_SAMPLES_REF}.tar.gz" \
+        -o /tmp/cuda-samples.tar.gz; \
+    tar -xzf /tmp/cuda-samples.tar.gz \
+        --strip-components=1 \
+        --wildcards \
+        '*/Common/*' \
+        '*/cmake/*' \
+        '*/Samples/0_Introduction/vectorAdd/*' \
+        '*/Samples/1_Utilities/deviceQuery/*' \
+        '*/LICENSE'; \
+    sed -i 's/CUDA::cudart/CUDA::cudart_static/g' \
+        Samples/1_Utilities/deviceQuery/CMakeLists.txt; \
+    cmake -S Samples/1_Utilities/deviceQuery -B /tmp/build-device-query \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
+    cmake --build /tmp/build-device-query --parallel; \
+    cmake -S Samples/0_Introduction/vectorAdd -B /tmp/build-vector-add \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
+    cmake --build /tmp/build-vector-add --parallel; \
+    mkdir -p /opt/openshell-gpu-workload; \
+    cp /tmp/build-device-query/deviceQuery /opt/openshell-gpu-workload/deviceQuery; \
+    cp /tmp/build-vector-add/vectorAdd /opt/openshell-gpu-workload/vectorAdd; \
+    cp LICENSE /opt/openshell-gpu-workload/cuda-samples.LICENSE; \
+    rm -f /tmp/cuda-samples.tar.gz
+
+FROM ${BASE_IMAGE}
+
+ARG CUDA_SAMPLES_REF=v12.8
+
+LABEL com.nvidia.openshell.gpu-workload.name="cuda-basic" \
+      com.nvidia.openshell.gpu-workload.cuda-samples-ref="${CUDA_SAMPLES_REF}"
+
+USER root
+RUN mkdir -p /usr/local/lib/openshell-gpu-workload \
+    /usr/local/share/doc/openshell-gpu-workload
+COPY --from=builder /opt/openshell-gpu-workload/deviceQuery /usr/local/lib/openshell-gpu-workload/deviceQuery
+COPY --from=builder /opt/openshell-gpu-workload/vectorAdd /usr/local/lib/openshell-gpu-workload/vectorAdd
+COPY --from=builder /opt/openshell-gpu-workload/cuda-samples.LICENSE /usr/local/share/doc/openshell-gpu-workload/cuda-samples.LICENSE
+COPY workload.sh /usr/local/bin/openshell-gpu-workload
+RUN chmod 0755 /usr/local/bin/openshell-gpu-workload \
+    /usr/local/lib/openshell-gpu-workload/deviceQuery \
+    /usr/local/lib/openshell-gpu-workload/vectorAdd
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-cuda-basic/README.md b/sandboxes/gpu-workload-cuda-basic/README.md
new file mode 100644
index 0000000..95887f1
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/README.md
@@ -0,0 +1,52 @@
+
+
+
+# GPU Workload CUDA Basic
+
+`gpu-workload-cuda-basic` validates that a GPU-enabled environment can run a
+basic CUDA runtime workload. It is a single image that runs two validation
+steps:
+
+1. `deviceQuery` checks CUDA runtime, driver, and device discovery.
+2. `vectorAdd` checks kernel launch, device memory allocation, host/device
+   copies, synchronization, and result validation.
+
+The image builds the samples from `NVIDIA/cuda-samples` tag `v12.8` with a CUDA
+12.8 builder image, then copies only the compiled binaries into the OpenShell
+community base final image. Published builds are multiarch for `linux/amd64`
+and `linux/arm64`.
+
+The workload prints `OPENSHELL_GPU_WORKLOAD_SUCCESS` only after both samples
+pass. On failure it prints `OPENSHELL_GPU_WORKLOAD_FAILURE` and exits non-zero.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled. It does not
+vendor GPU driver libraries such as `libcuda.so.1`; those libraries must be
+provided by the host GPU runtime or CDI injection.
+
+## Build
+
+```shell
+docker build -t gpu-workload-cuda-basic .
+```
+
+## Run
+
+Run it directly with Docker CDI:
+
+```shell
+docker run --rm --device nvidia.com/gpu=all gpu-workload-cuda-basic
+```
+
+Use `podman run` with the same `--device nvidia.com/gpu=all` option when Podman
+CDI is configured.
+
+The CUDA samples are redistributed under the NVIDIA CUDA samples license. The
+license text is copied into the image at
+`/usr/local/share/doc/openshell-gpu-workload/cuda-samples.LICENSE`.
diff --git a/sandboxes/gpu-workload-cuda-basic/workload.sh b/sandboxes/gpu-workload-cuda-basic/workload.sh
new file mode 100644
index 0000000..e20a67d
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/workload.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+readonly SUCCESS_MARKER="OPENSHELL_GPU_WORKLOAD_SUCCESS"
+readonly FAILURE_MARKER="OPENSHELL_GPU_WORKLOAD_FAILURE"
+readonly WORKLOAD_DIR="/usr/local/lib/openshell-gpu-workload"
+
+run_sample() {
+    local name=$1
+    local expected=$2
+    local binary="${WORKLOAD_DIR}/${name}"
+    local output
+
+    output="$(mktemp)"
+    echo "running CUDA sample: ${name}"
+    if ! "${binary}" >"${output}" 2>&1; then
+        cat "${output}"
+        echo "${FAILURE_MARKER} ${name} exited non-zero" >&2
+        rm -f "${output}"
+        exit 1
+    fi
+
+    cat "${output}"
+    if ! grep -Fq "${expected}" "${output}"; then
+        echo "${FAILURE_MARKER} ${name} did not print expected output: ${expected}" >&2
+        rm -f "${output}"
+        exit 1
+    fi
+
+    rm -f "${output}"
+}
+
+run_sample "deviceQuery" "Result = PASS"
+run_sample "vectorAdd" "Test PASSED"
+
+echo "${SUCCESS_MARKER} cuda-basic"
diff --git a/sandboxes/gpu-workload-smoke-fail/Dockerfile b/sandboxes/gpu-workload-smoke-fail/Dockerfile
new file mode 100644
index 0000000..27b5e33
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/Dockerfile
@@ -0,0 +1,15 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+
+FROM ${BASE_IMAGE}
+
+USER root
+COPY workload.sh /usr/local/bin/openshell-gpu-workload
+RUN chmod 0755 /usr/local/bin/openshell-gpu-workload
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-smoke-fail/README.md b/sandboxes/gpu-workload-smoke-fail/README.md
new file mode 100644
index 0000000..34afa84
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/README.md
@@ -0,0 +1,34 @@
+
+
+
+# GPU Workload Smoke Fail
+
+`gpu-workload-smoke-fail` validates negative-path diagnostics in e2e test
+plumbing.
+
+The workload does not perform GPU-specific work. It prints
+`OPENSHELL_GPU_WORKLOAD_FAILURE`, emits a stable diagnostic, and exits with
+status `42`.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled.
+
+## Build
+
+```shell
+docker build -t gpu-workload-smoke-fail .
+```
+
+## Run
+
+```shell
+docker run --rm gpu-workload-smoke-fail
+```
+
+The direct run should fail.
diff --git a/sandboxes/gpu-workload-smoke-fail/workload.sh b/sandboxes/gpu-workload-smoke-fail/workload.sh
new file mode 100644
index 0000000..8c57624
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/workload.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+echo "OPENSHELL_GPU_WORKLOAD_FAILURE smoke-fail intentional failure" >&2
+exit 42
diff --git a/sandboxes/gpu-workload-smoke-pass/Dockerfile b/sandboxes/gpu-workload-smoke-pass/Dockerfile
new file mode 100644
index 0000000..27b5e33
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/Dockerfile
@@ -0,0 +1,15 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+
+FROM ${BASE_IMAGE}
+
+USER root
+COPY workload.sh /usr/local/bin/openshell-gpu-workload
+RUN chmod 0755 /usr/local/bin/openshell-gpu-workload
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-smoke-pass/README.md b/sandboxes/gpu-workload-smoke-pass/README.md
new file mode 100644
index 0000000..4a5ee2d
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/README.md
@@ -0,0 +1,32 @@
+
+
+
+# GPU Workload Smoke Pass
+
+`gpu-workload-smoke-pass` validates image publishing, sandbox image
+compatibility, default entrypoint execution, and success-marker assertion
+plumbing.
+
+The workload does not perform GPU-specific work. It prints
+`OPENSHELL_GPU_WORKLOAD_SUCCESS` and exits `0`.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled.
+
+## Build
+
+```shell
+docker build -t gpu-workload-smoke-pass .
+```
+
+## Run
+
+```shell
+docker run --rm gpu-workload-smoke-pass
+```
diff --git a/sandboxes/gpu-workload-smoke-pass/workload.sh b/sandboxes/gpu-workload-smoke-pass/workload.sh
new file mode 100644
index 0000000..76f848f
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/workload.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+set -euo pipefail
+
+echo "OPENSHELL_GPU_WORKLOAD_SUCCESS smoke-pass"