From 071e07af9305708c6bc2ad6a4f9e2b880f1ec98d Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Mon, 1 Jun 2026 23:06:55 +0200
Subject: [PATCH] feat: add GPU workload sandbox images

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .github/workflows/build-sandboxes.yml         | 23 +++---
 README.md                                     |  3 +
 THIRD-PARTY-NOTICES                           | 12 ++++
 sandboxes/gpu-workload-cuda-basic/Dockerfile  | 72 +++++++++++++++++++
 sandboxes/gpu-workload-cuda-basic/README.md   | 52 ++++++++++++++
 sandboxes/gpu-workload-cuda-basic/workload.sh | 40 +++++++++++
 sandboxes/gpu-workload-smoke-fail/Dockerfile  | 15 ++++
 sandboxes/gpu-workload-smoke-fail/README.md   | 34 +++++++++
 sandboxes/gpu-workload-smoke-fail/workload.sh |  9 +++
 sandboxes/gpu-workload-smoke-pass/Dockerfile  | 15 ++++
 sandboxes/gpu-workload-smoke-pass/README.md   | 32 +++++++++
 sandboxes/gpu-workload-smoke-pass/workload.sh |  8 +++
 12 files changed, 305 insertions(+), 10 deletions(-)
 create mode 100644 sandboxes/gpu-workload-cuda-basic/Dockerfile
 create mode 100644 sandboxes/gpu-workload-cuda-basic/README.md
 create mode 100644 sandboxes/gpu-workload-cuda-basic/workload.sh
 create mode 100644 sandboxes/gpu-workload-smoke-fail/Dockerfile
 create mode 100644 sandboxes/gpu-workload-smoke-fail/README.md
 create mode 100644 sandboxes/gpu-workload-smoke-fail/workload.sh
 create mode 100644 sandboxes/gpu-workload-smoke-pass/Dockerfile
 create mode 100644 sandboxes/gpu-workload-smoke-pass/README.md
 create mode 100644 sandboxes/gpu-workload-smoke-pass/workload.sh

diff --git a/.github/workflows/build-sandboxes.yml b/.github/workflows/build-sandboxes.yml
index 48ddbd1..a5495ee 100644
--- a/.github/workflows/build-sandboxes.yml
+++ b/.github/workflows/build-sandboxes.yml
@@ -169,6 +169,17 @@ jobs:
         with:
           driver-opts: ${{ github.ref != 'refs/heads/main' && 'network=host' || '' }}
 
+      - name: Set build platforms
+        id: platforms
+        env:
+          GIT_REF: ${{ github.ref }}
+          SANDBOX: ${{ matrix.sandbox }}
+        run: |
+          # Expression values arrive via env so they are never parsed as shell code.
+          platforms="linux/amd64"
+          if [[ "$GIT_REF" == "refs/heads/main" || "$SANDBOX" == "nvidia-gpu" || "$SANDBOX" == gpu-workload-* ]]; then platforms="linux/amd64,linux/arm64"; fi
+          echo "value=${platforms}" >> "$GITHUB_OUTPUT"
+
       - name: Log in to GHCR
         uses: docker/login-action@v3
         with:
@@ -209,6 +220,7 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: sandboxes/base
+          platforms: ${{ steps.platforms.outputs.value }}
           push: true
           tags: localhost:5000/sandboxes/base:latest
           cache-from: type=gha,scope=base
@@ -220,6 +232,7 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: sandboxes/${{ steps.parent.outputs.sandbox }}
+          platforms: ${{ steps.platforms.outputs.value }}
           push: true
           tags: localhost:5000/sandboxes/${{ steps.parent.outputs.sandbox }}:latest
           build-args: |
@@ -247,15 +260,6 @@ jobs:
             type=sha,prefix=
             type=raw,value=latest,enable={{is_default_branch}}
 
-      - name: Set build platforms
-        id: platforms
-        run: |
-          if [ "${{ github.ref }}" = "refs/heads/main" ] || [ "${{ matrix.sandbox }}" = "nvidia-gpu" ]; then
-            echo "value=linux/amd64,linux/arm64" >> "$GITHUB_OUTPUT"
-          else
-            echo "value=linux/amd64" >> "$GITHUB_OUTPUT"
-          fi
-
       - name: Build and push
         uses: docker/build-push-action@v6
         with:
@@ -268,4 +272,3 @@ jobs:
             BASE_IMAGE=${{ steps.base.outputs.image }}
           cache-from: type=gha,scope=${{ matrix.sandbox }}
           cache-to: type=gha,mode=max,scope=${{ matrix.sandbox }}
-
diff --git a/README.md b/README.md
index 150f822..c7b15ef 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,9 @@ This repo is the community ecosystem around OpenShell -- a hub for contributed s
 | `sandboxes/base/`       | Foundational image with system tools, users, and dev environment |
 | `sandboxes/droid/`      | Android automation and mobile testing workflows              |
 | `sandboxes/gemini/`     | Gemini CLI workflows                                         |
+| `sandboxes/gpu-workload-cuda-basic/` | CUDA `deviceQuery` and `vectorAdd` GPU e2e validation workload |
+| `sandboxes/gpu-workload-smoke-fail/` | Intentional-failure GPU e2e workload fixture       |
+| `sandboxes/gpu-workload-smoke-pass/` | Success-path GPU e2e workload fixture              |
 | `sandboxes/nvidia-gpu/` | GPU-enabled VM sandbox image with NVIDIA userspace tooling   |
 | `sandboxes/ollama/`     | Ollama for local and cloud LLMs with Claude Code, Codex, OpenCode pre-installed |
 | `sandboxes/pi/`         | [Pi](https://pi.dev) pre-installed                           |
diff --git a/THIRD-PARTY-NOTICES b/THIRD-PARTY-NOTICES
index fd58ef3..d548ba9 100644
--- a/THIRD-PARTY-NOTICES
+++ b/THIRD-PARTY-NOTICES
@@ -23,6 +23,18 @@ Image: nvidia/cuda:12.8.1-base-ubuntu22.04
 License: NVIDIA CUDA Toolkit End User License Agreement and Ubuntu component licenses
 URL: https://hub.docker.com/r/nvidia/cuda
 
+Image: nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04
+License: NVIDIA CUDA Toolkit End User License Agreement and Ubuntu component licenses
+URL: https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda
+
+================================================================================
+Source Artifacts
+================================================================================
+
+Source: NVIDIA/cuda-samples v12.8
+License: NVIDIA CUDA Samples License
+URL: https://github.com/NVIDIA/cuda-samples
+
 ================================================================================
 System Packages (APT — Ubuntu 24.04)
 ================================================================================
diff --git a/sandboxes/gpu-workload-cuda-basic/Dockerfile b/sandboxes/gpu-workload-cuda-basic/Dockerfile
new file mode 100644
index 0000000..a156d95
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/Dockerfile
@@ -0,0 +1,82 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Builds the CUDA deviceQuery and vectorAdd samples in a CUDA builder stage,
+# then installs the binaries and the workload script on the sandbox base image.
+
+ARG CUDA_BUILD_IMAGE=nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+# Single source of truth for the samples tag: each stage re-declares this ARG
+# without a default so the built sources and the image label cannot drift apart.
+ARG CUDA_SAMPLES_REF=v12.8
+
+FROM ${CUDA_BUILD_IMAGE} AS builder
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG CUDA_SAMPLES_REF
+ARG CUDA_SAMPLES_REPO=https://github.com/NVIDIA/cuda-samples
+# Optional SHA-256 of the samples tarball; the download is verified when set.
+ARG CUDA_SAMPLES_SHA256=
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        cuda-nvcc-12-8 \
+        curl \
+        g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /build/cuda-samples
+
+# Fetch only the sources needed for the two samples, point deviceQuery at the
+# static CUDA runtime target, build both samples, and stage the binaries and
+# the samples license under /opt/openshell-gpu-workload for the final stage.
+RUN set -eux; \
+    curl -fsSL "${CUDA_SAMPLES_REPO}/archive/refs/tags/${CUDA_SAMPLES_REF}.tar.gz" \
+        -o /tmp/cuda-samples.tar.gz; \
+    if [ -n "${CUDA_SAMPLES_SHA256:-}" ]; then \
+        echo "${CUDA_SAMPLES_SHA256}  /tmp/cuda-samples.tar.gz" | sha256sum -c -; \
+    fi; \
+    tar -xzf /tmp/cuda-samples.tar.gz \
+        --strip-components=1 \
+        --wildcards \
+        '*/Common/*' \
+        '*/cmake/*' \
+        '*/Samples/0_Introduction/vectorAdd/*' \
+        '*/Samples/1_Utilities/deviceQuery/*' \
+        '*/LICENSE'; \
+    sed -i 's/CUDA::cudart/CUDA::cudart_static/g' \
+        Samples/1_Utilities/deviceQuery/CMakeLists.txt; \
+    cmake -S Samples/1_Utilities/deviceQuery -B /tmp/build-device-query \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
+    cmake --build /tmp/build-device-query --parallel; \
+    cmake -S Samples/0_Introduction/vectorAdd -B /tmp/build-vector-add \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
+    cmake --build /tmp/build-vector-add --parallel; \
+    mkdir -p /opt/openshell-gpu-workload; \
+    cp /tmp/build-device-query/deviceQuery /opt/openshell-gpu-workload/deviceQuery; \
+    cp /tmp/build-vector-add/vectorAdd /opt/openshell-gpu-workload/vectorAdd; \
+    cp LICENSE /opt/openshell-gpu-workload/cuda-samples.LICENSE; \
+    rm -f /tmp/cuda-samples.tar.gz
+
+FROM ${BASE_IMAGE}
+
+ARG CUDA_SAMPLES_REF
+
+LABEL com.nvidia.openshell.gpu-workload.name="cuda-basic" \
+      com.nvidia.openshell.gpu-workload.cuda-samples-ref="${CUDA_SAMPLES_REF}"
+
+# COPY creates missing destination directories and writes files as root
+# regardless of the active USER, so no mkdir or chmod RUN layers are needed.
+COPY --from=builder --chmod=0755 /opt/openshell-gpu-workload/deviceQuery /usr/local/lib/openshell-gpu-workload/deviceQuery
+COPY --from=builder --chmod=0755 /opt/openshell-gpu-workload/vectorAdd /usr/local/lib/openshell-gpu-workload/vectorAdd
+COPY --from=builder /opt/openshell-gpu-workload/cuda-samples.LICENSE /usr/local/share/doc/openshell-gpu-workload/cuda-samples.LICENSE
+COPY --chmod=0755 workload.sh /usr/local/bin/openshell-gpu-workload
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-cuda-basic/README.md b/sandboxes/gpu-workload-cuda-basic/README.md
new file mode 100644
index 0000000..95887f1
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/README.md
@@ -0,0 +1,52 @@
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
+# GPU Workload CUDA Basic
+
+`gpu-workload-cuda-basic` validates that a GPU-enabled environment can run a
+basic CUDA runtime workload. It is a single image that runs two validation
+steps:
+
+1. `deviceQuery` checks CUDA runtime, driver, and device discovery.
+2. `vectorAdd` checks kernel launch, device memory allocation, host/device
+   copies, synchronization, and result validation.
+
+The image builds the samples from `NVIDIA/cuda-samples` tag `v12.8` with a CUDA
+12.8 builder image, then copies only the compiled binaries and the samples
+license into a final image based on the OpenShell community base image.
+Published builds are multiarch for `linux/amd64` and `linux/arm64`.
+
+The workload prints `OPENSHELL_GPU_WORKLOAD_SUCCESS` only after both samples
+pass. On failure it prints `OPENSHELL_GPU_WORKLOAD_FAILURE` and exits non-zero.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled. It does not
+vendor GPU driver libraries such as `libcuda.so.1`; those libraries must be
+provided by the host GPU runtime or CDI injection.
+
+## Build
+
+```shell
+docker build -t gpu-workload-cuda-basic .
+```
+
+## Run
+
+Run it directly with Docker CDI:
+
+```shell
+docker run --rm --device nvidia.com/gpu=all gpu-workload-cuda-basic
+```
+
+Use `podman run` with the same `--device nvidia.com/gpu=all` option when Podman
+CDI is configured.
+
+The CUDA samples are redistributed under the NVIDIA CUDA samples license. The
+license text is copied into the image at
+`/usr/local/share/doc/openshell-gpu-workload/cuda-samples.LICENSE`.
diff --git a/sandboxes/gpu-workload-cuda-basic/workload.sh b/sandboxes/gpu-workload-cuda-basic/workload.sh
new file mode 100644
index 0000000..e20a67d
--- /dev/null
+++ b/sandboxes/gpu-workload-cuda-basic/workload.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Runs the bundled deviceQuery and vectorAdd CUDA samples in order and prints
+# the success marker once both have exited zero and printed their pass string.
+
+set -euo pipefail
+
+readonly SUCCESS_MARKER="OPENSHELL_GPU_WORKLOAD_SUCCESS"
+readonly FAILURE_MARKER="OPENSHELL_GPU_WORKLOAD_FAILURE"
+readonly WORKLOAD_DIR="/usr/local/lib/openshell-gpu-workload"
+
+# run_sample NAME EXPECTED
+#
+# Executes ${WORKLOAD_DIR}/NAME with stdout and stderr captured to a temporary
+# file, replays the captured output on stdout, and terminates the script with
+# status 1 and a FAILURE_MARKER line on stderr when the sample exits non-zero
+# or its output lacks the fixed string EXPECTED.
+run_sample() {
+  local name=$1
+  local expected=$2
+  local log
+  local status=0
+  local reason=""
+
+  log="$(mktemp)"
+  echo "running CUDA sample: ${name}"
+  "${WORKLOAD_DIR}/${name}" >"${log}" 2>&1 || status=$?
+
+  # The captured output is always replayed before any verdict is reported.
+  cat "${log}"
+  if (( status != 0 )); then
+    reason="exited non-zero"
+  elif ! grep -Fq "${expected}" "${log}"; then
+    reason="did not print expected output: ${expected}"
+  fi
+
+  if [[ -n "${reason}" ]]; then
+    echo "${FAILURE_MARKER} ${name} ${reason}" >&2
+  fi
+  rm -f "${log}"
+  [[ -z "${reason}" ]] || exit 1
+}
+
+run_sample "deviceQuery" "Result = PASS"
+run_sample "vectorAdd" "Test PASSED"
+
+echo "${SUCCESS_MARKER} cuda-basic"
diff --git a/sandboxes/gpu-workload-smoke-fail/Dockerfile b/sandboxes/gpu-workload-smoke-fail/Dockerfile
new file mode 100644
index 0000000..27b5e33
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/Dockerfile
@@ -0,0 +1,17 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Fixture image: the sandbox base image plus the smoke-fail workload script.
+
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+
+FROM ${BASE_IMAGE}
+
+# --chmod sets the executable bit at copy time, avoiding a separate RUN chmod
+# layer; COPY writes the file as root regardless of the active USER.
+COPY --chmod=0755 workload.sh /usr/local/bin/openshell-gpu-workload
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-smoke-fail/README.md b/sandboxes/gpu-workload-smoke-fail/README.md
new file mode 100644
index 0000000..34afa84
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/README.md
@@ -0,0 +1,34 @@
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
+# GPU Workload Smoke Fail
+
+`gpu-workload-smoke-fail` validates negative-path diagnostics in e2e test
+plumbing.
+
+The workload does not perform GPU-specific work. It prints
+`OPENSHELL_GPU_WORKLOAD_FAILURE`, emits a stable diagnostic, and exits with
+status `42`.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled.
+
+## Build
+
+```shell
+docker build -t gpu-workload-smoke-fail .
+```
+
+## Run
+
+```shell
+docker run --rm gpu-workload-smoke-fail
+```
+
+The direct run should fail with exit status `42`.
diff --git a/sandboxes/gpu-workload-smoke-fail/workload.sh b/sandboxes/gpu-workload-smoke-fail/workload.sh
new file mode 100644
index 0000000..8c57624
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-fail/workload.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Negative-path fixture: writes the failure marker to stderr and exits 42.
+
+set -euo pipefail
+
+printf '%s\n' "OPENSHELL_GPU_WORKLOAD_FAILURE smoke-fail intentional failure" >&2
+exit 42
diff --git a/sandboxes/gpu-workload-smoke-pass/Dockerfile b/sandboxes/gpu-workload-smoke-pass/Dockerfile
new file mode 100644
index 0000000..27b5e33
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/Dockerfile
@@ -0,0 +1,17 @@
+# syntax=docker/dockerfile:1
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Fixture image: the sandbox base image plus the smoke-pass workload script.
+
+ARG BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest
+
+FROM ${BASE_IMAGE}
+
+# --chmod sets the executable bit at copy time, avoiding a separate RUN chmod
+# layer; COPY writes the file as root regardless of the active USER.
+COPY --chmod=0755 workload.sh /usr/local/bin/openshell-gpu-workload
+
+USER sandbox
+ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
diff --git a/sandboxes/gpu-workload-smoke-pass/README.md b/sandboxes/gpu-workload-smoke-pass/README.md
new file mode 100644
index 0000000..4a5ee2d
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/README.md
@@ -0,0 +1,32 @@
+<!-- SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
+<!-- SPDX-License-Identifier: Apache-2.0 -->
+
+# GPU Workload Smoke Pass
+
+`gpu-workload-smoke-pass` validates image publishing, sandbox image
+compatibility, default entrypoint execution, and success-marker assertion
+plumbing.
+
+The workload does not perform GPU-specific work. It prints
+`OPENSHELL_GPU_WORKLOAD_SUCCESS` and exits `0`.
+
+## Contract
+
+The image installs the workload at `/usr/local/bin/openshell-gpu-workload`.
+Direct container execution runs the workload as the image entrypoint. OpenShell
+tests that create a sandbox from this image should run the workload path
+explicitly because sandbox creation replaces the OCI entrypoint.
+
+The workload requires no network access after the image is pulled.
+
+## Build
+
+```shell
+docker build -t gpu-workload-smoke-pass .
+```
+
+## Run
+
+```shell
+docker run --rm gpu-workload-smoke-pass
+```
diff --git a/sandboxes/gpu-workload-smoke-pass/workload.sh b/sandboxes/gpu-workload-smoke-pass/workload.sh
new file mode 100644
index 0000000..76f848f
--- /dev/null
+++ b/sandboxes/gpu-workload-smoke-pass/workload.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Success-path fixture: writes the success marker to stdout and exits zero.
+
+set -euo pipefail
+
+printf '%s\n' "OPENSHELL_GPU_WORKLOAD_SUCCESS smoke-pass"