From b60fa050e7f41bee5640c8c00c0cfd557bf38434 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 3 Mar 2026 05:55:13 +0000 Subject: [PATCH 1/5] Add GPU client kernels documentation Document how CUDA/ROCm kernels interact with the Chimaera runtime, including task definition, client API, host-side setup, configuration parameters, and troubleshooting. Co-Authored-By: Claude Sonnet 4.6 --- docs/sdk/context-runtime/6.gpu_clients.md | 399 ++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 docs/sdk/context-runtime/6.gpu_clients.md diff --git a/docs/sdk/context-runtime/6.gpu_clients.md b/docs/sdk/context-runtime/6.gpu_clients.md new file mode 100644 index 00000000..2abe9080 --- /dev/null +++ b/docs/sdk/context-runtime/6.gpu_clients.md @@ -0,0 +1,399 @@ +--- +sidebar_position: 6 +title: GPU Client Kernels +description: How CUDA/ROCm kernels submit tasks to the Chimaera runtime and receive results. +--- + +# GPU Client Kernels + +This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code. + +## Overview + +The Chimaera runtime supports **GPU-initiated task submission**: a CUDA/ROCm kernel running on the GPU can create a task, send it through a shared-memory ring buffer to a CPU worker, and wait for the result — all without returning to host code. + +The communication path is: + +``` +GPU Kernel CPU Worker + │ │ + │ 1. NewTask (GPU allocator) │ + │ 2. Send (SPSC ring buffer →) ───> dequeue + deserialize + │ │ 3. Execute task + │ 4. Wait (← SPSC ring buffer) <─── serialize output + FUTURE_COMPLETE + │ 5. Read result │ +``` + +Both directions use a **SPSC (single-producer single-consumer) ring buffer** embedded in a `FutureShm` structure that lives in GPU-registered pinned shared memory. System-scope atomics ensure visibility between GPU and CPU. 
+ +## Prerequisites + +### CMake flags + +Enable GPU support when configuring the build: + +```bash +# CUDA +cmake .. -DWRP_CORE_ENABLE_CUDA=ON + +# ROCm +cmake .. -DWRP_CORE_ENABLE_ROCM=ON +``` + +These set the `HSHM_ENABLE_CUDA` or `HSHM_ENABLE_ROCM` preprocessor macros, which gate all GPU code paths. + +### Required headers + +```cpp +#include +#include +#include +#include +#include + +// Your module's client and task headers +#include +#include +``` + +## Writing a GPU-Compatible Task + +A task that can be created and submitted from a GPU kernel must: + +1. Mark constructors and serialization methods with `HSHM_CROSS_FUN` +2. Provide `SerializeIn` and `SerializeOut` methods +3. Use only GPU-safe types (no `std::string`, `std::vector`, etc.) + +```cpp +struct MyGpuTask : public chi::Task { + IN chi::u32 input_value_; + INOUT chi::u32 result_; + + HSHM_CROSS_FUN MyGpuTask() : chi::Task(), input_value_(0), result_(0) {} + + HSHM_CROSS_FUN explicit MyGpuTask( + const chi::TaskId& task_id, + const chi::PoolId& pool_id, + const chi::PoolQuery& query, + chi::u32 input_value) + : chi::Task(task_id, pool_id, query, kMethodId), + input_value_(input_value), result_(0) { + task_id_ = task_id; + pool_id_ = pool_id; + method_ = kMethodId; + task_flags_.Clear(); + pool_query_ = query; + } + + // Serialization: fields sent GPU → CPU + template + HSHM_CROSS_FUN void SerializeIn(Archive& ar) { + Task::SerializeIn(ar); + ar(input_value_, result_); + } + + // Serialization: fields sent CPU → GPU (after execution) + template + HSHM_CROSS_FUN void SerializeOut(Archive& ar) { + Task::SerializeOut(ar); + ar(result_); + } +}; +``` + +**Important**: Every function called during serialization must be `HSHM_CROSS_FUN` (i.e., `__host__ __device__`). Without this annotation, NVCC's SFINAE-based dispatch silently skips serialization on the GPU, producing corrupted data with no compiler error. 
+## Writing a GPU-Compatible Client
+
+The client class needs `HSHM_CROSS_FUN` on the constructor and any methods called from GPU code:
+
+```cpp
+class Client : public chi::ContainerClient {
+ public:
+  HSHM_CROSS_FUN Client() = default;
+  HSHM_CROSS_FUN explicit Client(const chi::PoolId& pool_id) { Init(pool_id); }
+
+  // Host-only methods (AsyncCreate, etc.) stay as-is...
+
+  // GPU-callable task submission
+  HSHM_CROSS_FUN
+  chi::Future<MyGpuTask> AsyncMyTask(const chi::PoolQuery& query,
+                                     chi::u32 input_value) {
+    auto* ipc = CHI_IPC;
+    auto task = ipc->NewTask<MyGpuTask>(
+        chi::CreateTaskId(), pool_id_, query, input_value);
+    return ipc->Send(task);
+  }
+};
+```
+
+`CHI_IPC->Send()` automatically dispatches to the GPU ring-buffer path (`SendGpu`) when compiled for device code.
+
+## GPU Kernel Implementation
+
+### The CHIMAERA_GPU_INIT Macro
+
+Every GPU kernel that interacts with the runtime must call `CHIMAERA_GPU_INIT` at the top. This macro:
+
+1. Creates a per-block `IpcManager` singleton in `__shared__` memory
+2. Initializes per-thread `ArenaAllocator` instances from the GPU memory backend
+3. 
Synchronizes all threads in the block via `__syncthreads()` + +```cpp +__global__ void my_kernel(chi::IpcManagerGpu gpu_info, + chi::PoolId pool_id, + chi::u32 input, + int* d_result, + chi::u32* d_output) { + // Initialize per-block IPC manager (required) + CHIMAERA_GPU_INIT(gpu_info); + + // Use the client API — same code as host + chimaera::my_module::Client client(pool_id); + auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); + future.Wait(); + + // Read result + *d_output = future->result_; + *d_result = 1; // success +} +``` + +### Memory Layout + +`CHIMAERA_GPU_INIT` partitions the GPU memory backend as: + +``` + ┌──────────────────────────────────────────────────────┐ + │ Pointer table (num_threads * sizeof(ptr)) │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator 0 (per_thread_size bytes) │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator 1 (per_thread_size bytes) │ + ├──────────────────────────────────────────────────────┤ + │ ... │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator N-1 │ + └──────────────────────────────────────────────────────┘ +``` + +Each `ArenaAllocator` is a bump-pointer allocator that provides fast, lock-free allocation for its thread. All `ShmPtr` offsets are relative to the backend base so they can be resolved by the CPU worker. + +## Host-Side Setup + +The host code must: + +1. Start the Chimaera runtime +2. Create the module's container (pool) +3. Allocate GPU-registered pinned shared memory +4. Create a GPU task queue and register it with the runtime +5. Register the GPU memory backend for host-side ShmPtr resolution +6. Launch the kernel + +### Complete Host-Side Example + +```cpp +#include +#include +#include + +int run_gpu_test(chi::PoolId pool_id, chi::u32 test_value, + chi::u32* out_result) { + // Increase GPU stack for deep template call chains + cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB + + // 1. 
Create GPU memory backend (pinned shared memory) + hipc::MemoryBackendId backend_id(3, 0); + size_t gpu_memory_size = 10 * 1024 * 1024; // 10 MB + hipc::GpuShmMmap gpu_backend; + if (!gpu_backend.shm_init(backend_id, gpu_memory_size, "/gpu_test", 0)) + return -100; + + // 2. Create GPU task queue (pinned shared memory) + hipc::MemoryBackendId queue_backend_id(4, 0); + hipc::GpuShmMmap queue_backend; + if (!queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, + "/gpu_queue", 0)) + return -101; + + // 3. Create allocator and task queue in the queue backend + auto* queue_alloc = queue_backend.MakeAlloc>( + queue_backend.data_capacity_); + auto gpu_queue_ptr = queue_alloc->NewObj( + queue_alloc, 1, 2, 1024); // 1 lane, 2 priorities, depth 1024 + + // 4. Register queue and assign lanes to the GPU worker + CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); + CHI_IPC->AssignGpuLanesToWorker(); + + // 5. Register GPU backend for host-side ShmPtr resolution + CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + + // 6. 
Build IpcManagerGpu and launch kernel + chi::IpcManagerGpu gpu_info(gpu_backend, gpu_queue_ptr.ptr_); + + int* d_result = hshm::GpuApi::Malloc(sizeof(int)); + chi::u32* d_out = hshm::GpuApi::Malloc(sizeof(chi::u32)); + int h_result = 0; + chi::u32 h_out = 0; + hshm::GpuApi::Memcpy(d_result, &h_result, sizeof(int)); + hshm::GpuApi::Memcpy(d_out, &h_out, sizeof(chi::u32)); + + my_kernel<<<1, 1>>>(gpu_info, pool_id, test_value, d_result, d_out); + + cudaDeviceSynchronize(); + hshm::GpuApi::Memcpy(&h_result, d_result, sizeof(int)); + hshm::GpuApi::Memcpy(&h_out, d_out, sizeof(chi::u32)); + + *out_result = h_out; + hshm::GpuApi::Free(d_result); + hshm::GpuApi::Free(d_out); + return h_result; +} +``` + +### Key Registration Functions + +| Function | Purpose | +|---|---| +| `CHI_IPC->RegisterGpuQueue(queue)` | Registers a GPU task queue so the runtime can poll it | +| `CHI_IPC->AssignGpuLanesToWorker()` | Assigns all registered GPU queue lanes to the designated GPU worker thread | +| `CHI_IPC->RegisterGpuAllocator(id, data, capacity)` | Registers GPU backend memory so the host can resolve `ShmPtr`s allocated by the GPU kernel | + +## Configuration Parameters + +### GPU Stack Size + +GPU kernels that use the client API involve deep template instantiation chains (serialization, ring buffer, allocator). The default CUDA stack size (1024 bytes) is insufficient. + +```cpp +cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB recommended +``` + +Symptoms of stack overflow: silent kernel completion with corrupted or missing data (no error reported by `cudaGetLastError`). + +### Ring Buffer Size (Copy Space) + +The SPSC ring buffer size determines how much serialized data can be in-flight between GPU and CPU. This is set per-task: + +```cpp +// Inside your task class, override GetCopySpaceSize: +HSHM_CROSS_FUN size_t GetCopySpaceSize() const { return 8192; } +``` + +The default is **4096 bytes** if `GetCopySpaceSize()` returns 0. 
The `FutureShm` is allocated as `sizeof(FutureShm) + copy_space_size`, with separate `input_` and `output_` ring buffers sharing the same copy space. + +For tasks with large serialized payloads, increase this value. If the ring buffer is too small, the producer will spin-wait for the consumer to drain data. + +### GPU Memory Backend Size + +The GPU memory backend is partitioned equally among all threads in a block. Size it based on: + +- Number of threads per block +- Per-thread allocation needs (tasks, serialization buffers, FutureShm) + +```cpp +// Example: 10 MB for a single-thread kernel +size_t gpu_memory_size = 10 * 1024 * 1024; +``` + +A single task submission typically consumes: +- `sizeof(TaskT)` for the task object +- `sizeof(FutureShm) + copy_space_size` for the future +- Temporary serialization buffer (~copy_space_size) + +### GPU Task Queue Configuration + +```cpp +// Parameters: allocator, num_lanes, num_priorities, depth +queue_alloc->NewObj(queue_alloc, 1, 2, 1024); +``` + +| Parameter | Default | Description | +|---|---|---| +| `num_lanes` | 1 | Number of queue lanes (1 per GPU is typical) | +| `num_priorities` | 2 | Priority levels per lane | +| `depth` | 1024 | Maximum tasks per lane before blocking | + +### Queue Backend Size + +The queue backend holds the `TaskQueue` data structure and its internal lane storage: + +```cpp +size_t queue_memory_size = 2 * 1024 * 1024; // 2 MB is sufficient for most use cases +``` + +### Runtime Thread Configuration + +The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. The scheduler assigns a dedicated GPU worker thread to poll GPU lanes. + +In the runtime YAML configuration, ensure enough worker threads are configured: + +```yaml +# At minimum: N task workers + 1 network worker + 1 GPU worker +num_threads: 4 +``` + +The scheduler's `DivideWorkers` method partitions workers into task, network, and GPU roles. 
With `num_threads: 4`, you get 2 task workers, 1 network worker, and 1 GPU worker (the GPU worker is selected from the task worker pool by the scheduler). + +## Data Flow Details + +### Send Path (GPU → CPU) + +1. **NewTask**: Allocates the task from the per-thread `ArenaAllocator` +2. **SendGpu**: Allocates a `FutureShm` with embedded ring buffer copy space +3. **Enqueue**: Pushes the `Future` onto the GPU task queue lane (the CPU worker polls this) +4. **Serialize + Send**: Builds a `LocalSaveTaskArchive`, calls `task->SerializeIn(ar)`, then streams the archive through `ShmTransport::Send` into the input ring buffer + +### Receive Path (CPU → GPU) + +1. **CPU worker** dequeues the `FutureShm`, reads from the input ring buffer via `ShmTransport::Recv`, deserializes, executes the task, then writes output back through the output ring buffer +2. **RecvGpu**: On the GPU, `Future::Wait()` calls `RecvGpu` which reads from the output ring buffer via `ShmTransport::Recv` +3. **Deserialize**: A `LocalLoadTaskArchive` deserializes the output fields via `task->SerializeOut(ar)` +4. **Completion**: The GPU spins on `FUTURE_COMPLETE` flag (should already be set after Recv completes) + +### Wire Format + +GPU and CPU use compatible but different archive types: + +| Side | Serialization | Deserialization | +|---|---|---| +| GPU (send) | `LocalSaveTaskArchive` | `LocalLoadTaskArchive` | +| CPU (worker) | `SaveTaskArchive` | `LoadTaskArchive` | + +Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical — if the formats diverge, deserialization will produce corrupted data. + +## Troubleshooting + +### CUDA Error 700 (Illegal Memory Access) + +Usually caused by GPU stack overflow during deep serialization template chains. 
Increase the stack size: +```cpp +cudaDeviceSetLimit(cudaLimitStackSize, 131072); +``` + +### Silent data corruption (kernel completes but wrong values) + +Check that **all** serialization functions are marked `HSHM_CROSS_FUN`. Without `__device__` annotation, NVCC's SFINAE dispatch silently skips serialization, consuming 0 bytes and misaligning the stream. The compiler produces no error. + +### Ring buffer deadlock (kernel hangs) + +The GPU producer and CPU consumer share the ring buffer. If the buffer fills, the producer spins waiting for the consumer. Ensure: +- The CPU worker is running and polling the GPU queue lanes +- `RegisterGpuQueue` and `AssignGpuLanesToWorker` were called before the kernel launch +- The copy space is large enough for the serialized payload + +### Kernel launch failure (-201) + +Check `cudaGetLastError()` after the kernel launch. Common causes: +- Too many threads per block (ArenaAllocator table won't fit in memory) +- Missing `CHIMAERA_GPU_INIT` (uninitialized shared memory access) + +### ShmPtr resolution failure on host + +The CPU worker must be able to resolve `ShmPtr`s that point into GPU-allocated pinned memory. Call `RegisterGpuAllocator` before launching the kernel: +```cpp +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); +``` From c7d6c3fdccae87a2036418a8d420c4ba4377cab2 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 3 Mar 2026 05:56:26 +0000 Subject: [PATCH 2/5] Rename base-modules from sidebar position 6 to 20 Move base-modules section after the new GPU clients page to maintain logical sidebar ordering. 
Co-Authored-By: Claude Sonnet 4.6 --- .../{6.base-modules => 20.base-modules}/1.admin.md | 0 .../context-runtime/{6.base-modules => 20.base-modules}/2.bdev.md | 0 .../{6.base-modules => 20.base-modules}/3.MOD_NAME.md | 0 .../{6.base-modules => 20.base-modules}/_category_.json | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/1.admin.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/2.bdev.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/3.MOD_NAME.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/_category_.json (100%) diff --git a/docs/sdk/context-runtime/6.base-modules/1.admin.md b/docs/sdk/context-runtime/20.base-modules/1.admin.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/1.admin.md rename to docs/sdk/context-runtime/20.base-modules/1.admin.md diff --git a/docs/sdk/context-runtime/6.base-modules/2.bdev.md b/docs/sdk/context-runtime/20.base-modules/2.bdev.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/2.bdev.md rename to docs/sdk/context-runtime/20.base-modules/2.bdev.md diff --git a/docs/sdk/context-runtime/6.base-modules/3.MOD_NAME.md b/docs/sdk/context-runtime/20.base-modules/3.MOD_NAME.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/3.MOD_NAME.md rename to docs/sdk/context-runtime/20.base-modules/3.MOD_NAME.md diff --git a/docs/sdk/context-runtime/6.base-modules/_category_.json b/docs/sdk/context-runtime/20.base-modules/_category_.json similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/_category_.json rename to docs/sdk/context-runtime/20.base-modules/_category_.json From 72d7a1a70d65c28c89696b37fa74ebd7113326af Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Sun, 8 Mar 2026 02:06:41 +0000 Subject: [PATCH 3/5] Update GPU client docs: dual-allocator, corrected API names, performance 
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update all queue field names to current API (cpu2gpu_queue, gpu2cpu_queue,
  gpu2gpu_queue) throughout examples and narrative
- Update ClientConnectTask handshake table to new field names
- Add gpu_heap_backend (GpuMalloc, 9000+gpu_id) and gpu2gpu device-memory
  backend (3000+gpu_id) to server backend table and memory layout diagrams
- Replace single-ArenaAllocator description with dual-allocator architecture:
  ArenaAllocator (HSHM_DEFAULT_ALLOC_GPU_T, primary bump-pointer) +
  BuddyAllocator (CHI_GPU_HEAP_T, serialization heap with individual free)
- Document CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks) macro for
  multi-block client kernels; CHI_CLIENT_GPU_INIT alias
- Add GetClientGpuInfo(gpu_id) to host-side setup — fills all
  IpcManagerGpuInfo fields automatically for same-process kernel launches
- Add Performance section: ~200 µs BuddyAllocator vs ~400 µs device malloc,
  corrected latency measurement explanation, arena-reset semantics
- Update server phase-1 init sequence for new backends and queue layout
- Update GPU memory backend size guidance for primary arena vs heap backends

Co-Authored-By: Claude Sonnet 4.6
---
 docs/sdk/context-runtime/6.gpu_clients.md | 1160 ++++++++++++++++++---
 1 file changed, 1040 insertions(+), 120 deletions(-)

diff --git a/docs/sdk/context-runtime/6.gpu_clients.md b/docs/sdk/context-runtime/6.gpu_clients.md
index 2abe9080..1dafbf4c 100644
--- a/docs/sdk/context-runtime/6.gpu_clients.md
+++ b/docs/sdk/context-runtime/6.gpu_clients.md
@@ -6,25 +6,491 @@ description: How CUDA/ROCm kernels submit tasks to the Chimaera runtime and rece
 # GPU Client Kernels
 
-This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code.
+This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code. 
It also covers **cross-device routing**: CPU code sending tasks to run on the GPU, and GPU kernels sending tasks to run on the CPU. ## Overview -The Chimaera runtime supports **GPU-initiated task submission**: a CUDA/ROCm kernel running on the GPU can create a task, send it through a shared-memory ring buffer to a CPU worker, and wait for the result — all without returning to host code. +The Chimaera runtime supports four GPU task routing modes: -The communication path is: +| Routing Mode | Direction | Description | +|---|---|---| +| `PoolQuery::Local()` | GPU → GPU | GPU kernel submits a task processed by the GPU work orchestrator | +| `PoolQuery::LocalGpuBcast()` | CPU → GPU | CPU submits a task processed by the GPU work orchestrator | +| `PoolQuery::ToLocalGpu(gpu_id)` | CPU → GPU | CPU submits a task to a specific GPU device | +| `PoolQuery::ToLocalCpu()` | GPU → CPU | GPU kernel submits a task processed by a CPU worker | + +All paths use the same `client.AsyncMyTask(query, ...)` API. The `IpcManager::Send()` method detects the routing mode and dispatches to the correct transport automatically. 
+ +### Architecture + +``` +CPU Host GPU Work Orchestrator + | | + | AsyncGpuSubmit(LocalGpuBcast()) | + | -> Send() detects LocalGpuBcast | + | -> SendToGpu(): serialize + push cpu2gpu_queue ---> | pop + deserialize + | | dispatch to GpuRuntime + | Wait(): spin on FUTURE_COMPLETE | serialize output + | <- deserialize output from FutureShm <--- | threadfence_system + FUTURE_COMPLETE + | | +GPU Kernel CPU Worker + | | + | AsyncGpuSubmit(ToLocalCpu()) | + | -> SendGpu(): serialize + push gpu2cpu_queue ---> | dequeue + deserialize + | | execute task + | Wait(): RecvGpu reads output ring buffer <--- | serialize output + FUTURE_COMPLETE + | | +GPU Kernel GPU Work Orchestrator + | | + | AsyncGpuSubmit(Local()) | + | -> SendGpuLocal(): push gpu2gpu_queue ---> | pop + deserialize + | | dispatch to GpuRuntime + | Wait(): RecvGpu reads output ring buffer <--- | serialize output + FUTURE_COMPLETE +``` + +## Runtime vs. Client Process Execution Modes + +The Chimaera runtime supports two process configurations for GPU task submission: + +| Mode | Init Call | GPU Resources | Use Case | +|---|---|---|---| +| **In-process runtime** | `CHIMAERA_INIT(kClient, true)` | Full: all GPU queues, orchestrator, GPU memory backends | Single process with embedded server | +| **Separate client process** | `CHIMAERA_INIT(kClient, false)` | None: no GPU queues or orchestrator | Client connects to standalone server | + +### In-Process Runtime (Embedded Server) + +When `CHIMAERA_INIT(kClient, true)` is called, the process runs both client and server: + +```cpp +// One process does everything +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, true); + +chimaera::my_module::Client client(pool_id); +client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id).Wait(); + +// All GPU routing modes work: +// Local(), LocalGpuBcast(), ToLocalGpu(), ToLocalCpu() +auto future = client.AsyncGpuSubmit(chi::PoolQuery::LocalGpuBcast(), 0, value); +future.Wait(); +``` + +**What gets initialized:** + +1. 
**`ServerInit()`** runs in sequence: + - `IpcManager::ServerInit()` → `ServerInitGpuQueues()` creates all GPU queue infrastructure + - `WorkOrchestrator::Init()` + `StartWorkers()` launches worker threads + - `PoolManager::ServerInit()` initializes pool management + - `LaunchGpuOrchestrator()` launches persistent GPU kernel (deferred until after pools are composed) + - `StartLocalServer()` opens the SHM segment for external clients +2. **`ClientInit()`** initializes the client-side IPC connection + +All four GPU routing modes are available because the process owns the GPU queues and orchestrator. + +### Separate Client Process + +When `CHIMAERA_INIT(kClient, false)` is called, the process is a pure client. During the `ClientConnect` handshake, the server transmits GPU queue metadata (offsets, backend sizes, IPC handles) back to the client. If the client binary was compiled with CUDA/ROCm support, `ClientInitGpuQueues()` attaches to the server's GPU queue backends and reconstructs local queue pointers, enabling direct GPU task submission. + +```cpp +// Server is a separate process +// Client connects via SHM, TCP, or IPC transport +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, false); + +chimaera::my_module::Client client(pool_id); +client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id).Wait(); + +// PoolQuery::Local() always works (routed to server's CPU worker via SHM/TCP/IPC) +auto future = client.AsyncGpuSubmit(chi::PoolQuery::Local(), 0, value); +future.Wait(); +// Server's CPU handler executes: result = test_value * 2 + gpu_id + +// With GPU support compiled in, LocalGpuBcast also works (direct GPU queue push) +auto gpu_future = client.AsyncGpuSubmit(chi::PoolQuery::LocalGpuBcast(), 0, value); +gpu_future.Wait(); +// Server's GPU handler executes: result = test_value * 2 + gpu_id +``` + +**What gets initialized:** + +1. 
**`ClientInit()` only** — no `ServerInit()`: + - `ConfigManager::Init()` reads `chimaera.yaml` + - `IpcManager::ClientInit()` connects to the server via SHM, TCP, or IPC + - Admin client singleton is created +2. **GPU queue attachment** (if compiled with CUDA/ROCm): + - `ClientConnect` response carries GPU queue offsets, backend sizes, and IPC handles + - `ClientInitGpuQueues()` attaches to each GPU's `GpuShmMmap` queue backend via `shm_attach()` + - `cpu2gpu_queues_`, `gpu2gpu_queues_` are reconstructed from the received offsets + - `RegisterGpuAllocator()` is called so the client can resolve `ShmPtr`s into queue memory +3. **No GPU orchestrator**: `gpu_orchestrator_` is `nullptr` (the server runs the orchestrator) + +**Available routing:** + +| Routing Mode | Works? | Why | +|---|---|---| +| `PoolQuery::Local()` | Yes | Goes through `SendShm()` or `SendZmq()` to server's CPU worker | +| `PoolQuery::LocalGpuBcast()` | Yes (with GPU) | `SendToGpu()` pushes to attached `cpu2gpu_queue` → server's GPU orchestrator | +| `PoolQuery::ToLocalGpu(id)` | Yes (with GPU) | Same path, targeting a specific GPU device | +| `PoolQuery::ToLocalCpu()` | No | Only callable from GPU device code | + +Without GPU support compiled in, `LocalGpuBcast` and `ToLocalGpu` return empty futures (no `cpu2gpu_queues_` attached). Use `PoolQuery::Local()` as a fallback — the server's CPU handler executes the task instead. 
+ +#### ClientConnect GPU Queue Handshake + +The `ClientConnectTask` response includes these GPU-specific fields: + +| Field | Type | Description | +|---|---|---| +| `num_gpus_` | `u32` | Number of GPU devices on the server | +| `cpu2gpu_queue_off_[8]` | `u64[8]` | Byte offsets of `cpu2gpu_queue` within each GPU's `GpuShmMmap` | +| `gpu2cpu_queue_off_[8]` | `u64[8]` | Byte offsets of `gpu2cpu_queue` (GPU→CPU) | +| `gpu2gpu_queue_off_[8]` | `u64[8]` | Byte offsets of `gpu2gpu_queue` | +| `gpu_queue_backend_size_[8]` | `u64[8]` | Size of each GPU's `GpuShmMmap` backend | +| `gpu_queue_depth_` | `u32` | Queue depth (number of slots) | + +The client uses these offsets to reconstruct `TaskQueue` pointers. The `cpu2gpu` and `gpu2cpu` queues reside in the pinned host `GpuShmMmap` backend; the `gpu2gpu` queue resides in device memory. A single `shm_attach()` per GPU attaches the pinned backend; the `gpu2gpu` backend is attached separately via its IPC handle. + +#### Cross-Process GPU Memory Registration + +Clients can register GPU device memory backends with the server using the extended `RegisterMemoryTask`. This enables the server to resolve `ShmPtr`s that point into GPU memory allocated by a different process. 
+ +```cpp +// Client allocates a GpuMalloc backend +hipc::MemoryBackendId backend_id(static_cast(getpid()), 100); +hipc::GpuMalloc gpu_backend; +gpu_backend.shm_init(backend_id, data_size, "", gpu_id); + +// Get IPC handle from the backend's private header +hipc::GpuMallocPrivateHeader priv_header; +GpuApi::Memcpy(&priv_header, gpu_backend.GetPrivateHeader(), sizeof(priv_header)); + +// Register with server via RegisterMemory(kGpuDeviceMemory) +auto reg_task = ipc->NewTask( + CreateTaskId(), kAdminPoolId, PoolQuery::Local(), + backend_id, + admin::MemoryType::kGpuDeviceMemory, + gpu_id, data_size, &priv_header.ipc_handle_); +auto future = ipc->SendZmq(reg_task, IpcMode::kTcp); +future.Wait(); +// Server opens the IPC handle and registers the allocator in gpu_alloc_map_ +``` + +The `MemoryType` enum controls the registration path: + +| MemoryType | Value | Backend | Registration Path | +|---|---|---|---| +| `kCpuMemory` | 0 | POSIX shared memory | Existing `IpcManager::RegisterMemory()` | +| `kPinnedHostMemory` | 1 | `GpuShmMmap` (pinned) | Attach via `shm_attach()` + `RegisterGpuAllocator()` | +| `kGpuDeviceMemory` | 2 | `GpuMalloc` (device) | Open IPC handle via `shm_attach_ipc()` + `RegisterGpuAllocator()` | + +### Fork-Based Test Pattern + +For testing, a common pattern forks a child server and runs a pure client in the parent: + +```cpp +int main(int argc, char* argv[]) { + // Child process: server mode + if (argc > 1 && std::string(argv[1]) == "--server-mode") { + CHIMAERA_INIT(ChimaeraMode::kServer, true); + sleep(300); // Wait for parent to kill us + return 0; + } + + // Parent: fork server, then run as client + pid_t server = fork(); + if (server == 0) { + setpgid(0, 0); + execl("/proc/self/exe", argv[0], "--server-mode", nullptr); + _exit(1); + } + setpgid(server, server); + WaitForServer(); // Poll for SHM segment file + + // Pure client + setenv("CHI_WITH_RUNTIME", "0", 1); + CHIMAERA_INIT(ChimaeraMode::kClient, false); + + // Submit tasks via Local() 
routing + chimaera::my_module::Client client(pool_id); + client.AsyncCreate(PoolQuery::Dynamic(), name, pool_id).Wait(); + auto future = client.AsyncGpuSubmit(PoolQuery::Local(), 0, 42); + future.Wait(); + // result = 42 * 2 + 0 = 84 (CPU handler) + + // Cleanup + kill(-server, SIGKILL); + waitpid(server, &status, 0); +} +``` + +The `execl("/proc/self/exe", ...)` pattern avoids inheriting the parent's `CHIMAERA_INIT` static guard state. + +See `context-runtime/modules/MOD_NAME/test/test_gpu_client_process.cc` for a complete implementation of this pattern with 5 test cases covering CPU routing, GPU queue attachment, GPU routing, and cross-process memory registration. + +## Memory Allocation Architecture + +### GPU Queue Memory (Server Only) + +During `ServerInitGpuQueues()`, the server allocates **pinned host memory** backends for GPU communication. Each GPU device gets: + +| Backend | ID | Type | Size | Purpose | +|---|---|---|---|---| +| CPU→GPU queue backend | `1000 + gpu_id` | `GpuShmMmap` (pinned) | 64 MB | Holds `TaskQueue` structures (`cpu2gpu`, `gpu2cpu`) and `FutureShm` allocations for CPU→GPU tasks | +| GPU→GPU queue backend | `3000 + gpu_id` | Device memory | 32 MB | Holds `gpu2gpu_queue` and per-thread `ArenaAllocator` storage for GPU→GPU tasks | +| Orchestrator scratch | `2000 + gpu_id` | `GpuShmMmap` (pinned) | 64 MB | Per-block `ArenaAllocator` storage for the persistent orchestrator kernel | +| GPU heap | `9000 + gpu_id` | `GpuMalloc` (device) | 64 MB | Per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) for serialization scratch buffers | + +`GpuShmMmap` backends use `cudaHostAlloc` (pinned host memory) accessible from both CPU and GPU via UVA. `GpuMalloc` backends use `cudaMalloc` (device memory) for GPU-only allocations. 
+ +``` +Server Process Memory Layout (per GPU): + +-----------------------------------------------------------------+ + | GpuShmMmap (backend ID 1000+gpu_id, 64 MB pinned host memory) | + | URL: /chi_gpu_queue_{gpu_id} | + | | + | +-----------+ +-----------+ | + | | TaskQueue | | TaskQueue | | + | | gpu2cpu_q | | cpu2gpu_q | | + | | (GPU->CPU)| | (CPU->GPU)| | + | +-----------+ +-----------+ | + | | + | +-----------------------------+ | + | | FutureShm allocations | (allocated by SendToGpu) | + | | [FutureShm + copy_space] | | + | +-----------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | Device memory (backend ID 3000+gpu_id, 32 MB) | + | URL: /chi_gpu2gpu_queue_{gpu_id} | + | | + | +-----------+ | + | | TaskQueue | | + | | gpu2gpu_q | | + | | (GPU->GPU)| | + | +-----------+ | + | | + | +--------------------------------------------+ | + | | ArenaAllocator per-thread (GPU->GPU tasks) | | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | GpuShmMmap (backend ID 2000+gpu_id, 64 MB pinned host memory) | + | URL: /chi_gpu_orchestrator_{gpu_id} | + | | + | +--------------------------------------------+ | + | | Block 0: ArenaAllocator (per-thread bump) | | + | +--------------------------------------------+ | + | | Block 1: ArenaAllocator | | + | +--------------------------------------------+ | + | | ... 
| | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | GpuMalloc (backend ID 9000+gpu_id, 64 MB device memory) | + | URL: /chi_gpu_heap_{gpu_id} | + | | + | +--------------------------------------------+ | + | | Block 0: BuddyAllocator (CHI_GPU_HEAP_T) | | + | +--------------------------------------------+ | + | | Block 1: BuddyAllocator | | + | +--------------------------------------------+ | + | | ... | | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ +``` + +### Client Process Memory + +A pure client (`CHIMAERA_INIT(kClient, false)`) does not create GPU backends, but **attaches to the server's GPU queue backends** during `ClientInitGpuQueues()` (if compiled with GPU support). Tasks can be sent through: + +- **SHM mode**: `FutureShm` allocated in the server's shared memory segment (`chi_main_segment`). The client writes serialized task data into `FutureShm::copy_space` via `ShmTransport::Send()`. +- **TCP/IPC mode**: `FutureShm` allocated via `HSHM_MALLOC` (process-local heap). Task data serialized and sent through ZMQ DEALER socket. +- **Direct GPU queue push** (GPU-compiled clients): `SendToGpu()` serializes into the attached `GpuShmMmap` queue backend and pushes to `cpu2gpu_queue`. The server's GPU orchestrator dequeues and processes the task. 
+ +``` +Client Process (GPU-compiled) Memory Layout: + + Attached from server (read/write via pinned host memory): + +-----------------------------------------------------------------+ + | GpuShmMmap (backend 1000+gpu_id) — attached via shm_attach() | + | cpu2gpu_queue, gpu2gpu_queue pointers reconstructed | + | FutureShm allocations for SendToGpu live here | + +-----------------------------------------------------------------+ + + Optionally registered with server: + +-----------------------------------------------------------------+ + | GpuMalloc (client-owned, backend pid:unique_id) | + | Device memory allocated via cudaMalloc | + | IPC handle sent to server via RegisterMemory(kGpuDeviceMemory) | + | Server opens handle to resolve ShmPtrs from this backend | + +-----------------------------------------------------------------+ +``` + +### GPU Kernel Memory (User-Allocated) + +GPU kernels that use the client API require two memory backends passed in `IpcManagerGpuInfo`: + +**Primary backend** (`backend`): `GpuShmMmap` pinned host memory (or `GpuMalloc` device memory for GPU→GPU). Provides per-thread `ArenaAllocator` (`HSHM_DEFAULT_ALLOC_GPU_T`) for fast bump-pointer allocation of `FutureShm` and task objects. + +**Heap backend** (`gpu_heap_backend`): `GpuMalloc` device memory. Provides per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) for serialization scratch buffers in `LocalSaveTaskArchive` / `LocalLoadTaskArchive`. Unlike the arena, the buddy allocator supports individual `free()`, so scratch memory is reclaimed after each task without exhausting the primary arena. 
+ +Use `CHI_IPC->GetClientGpuInfo(gpu_id)` to build the `IpcManagerGpuInfo` for same-process kernel launches — it fills all fields automatically: + +```cpp +// Same-process launch (Chimaera running in-process): +chi::IpcManagerGpuInfo gpu_info = CHI_IPC->GetClientGpuInfo(0); +// gpu_info.backend, .gpu2gpu_queue, .gpu_heap_backend all set + +// Register primary backend so CPU can resolve GPU ShmPtrs +CHI_IPC->RegisterGpuAllocator(gpu_info.backend.id_, + gpu_info.backend.data_, + gpu_info.backend.data_capacity_); +``` + +The `CHIMAERA_GPU_INIT` macro partitions both backends per-thread: + +``` + backend (ArenaAllocator, HSHM_DEFAULT_ALLOC_GPU_T): + +------------------------------------------------------+ + | Pointer table (num_threads * sizeof(ptr)) | + +------------------------------------------------------+ + | ArenaAllocator 0 (per_thread_size bytes) | + +------------------------------------------------------+ + | ArenaAllocator 1 (per_thread_size bytes) | + +------------------------------------------------------+ + | ... | + +------------------------------------------------------+ + + gpu_heap_backend (BuddyAllocator, CHI_GPU_HEAP_T): + +------------------------------------------------------+ + | Pointer table (num_threads * sizeof(ptr)) | + +------------------------------------------------------+ + | BuddyAllocator 0 (per_thread_size bytes) | + +------------------------------------------------------+ + | BuddyAllocator 1 (per_thread_size bytes) | + +------------------------------------------------------+ + | ... | + +------------------------------------------------------+ +``` + +The `ArenaAllocator` is a bump-pointer allocator; it is reset (via `alloc->Reset()`) after each task completes — safe because only one task is in-flight per thread at a time. The `BuddyAllocator` reclaims memory individually as serialization vectors go out of scope. Access the per-thread heap via `CHI_GPU_HEAP` (expands to `CHI_IPC->GetGpuHeap()`). 
+ +## GPU Initialization Sequence + +### Server GPU Init (Full Sequence) + +The server initializes GPU resources in three phases during `ServerInit()`: + +**Phase 1: Queue creation** (`IpcManager::ServerInitGpuQueues()`) +1. Query `hshm::GpuApi::GetDeviceCount()` for number of GPU devices +2. For each GPU, create a 64 MB `GpuShmMmap` pinned host backend (ID `1000+gpu_id`) for `cpu2gpu` and `gpu2cpu` queues plus CPU→GPU `FutureShm` allocations +3. Create a device-memory backend (ID `3000+gpu_id`) for the `gpu2gpu_queue` and its per-thread `ArenaAllocator` +4. Create two `TaskQueue` objects in the pinned backend: + - `gpu2cpu_queues_[gpu_id]` — GPU→CPU queue (CPU worker polls this) + - `cpu2gpu_queues_[gpu_id]` — CPU→GPU queue (orchestrator polls this) +5. Create one `TaskQueue` in device memory: + - `gpu2gpu_queues_[gpu_id]` — GPU→GPU queue (orchestrator polls this) +6. Each queue has 1 lane, 2 priorities (normal + resumed), configurable depth +7. Create orchestrator scratch backends (64 MB `GpuShmMmap`, IDs `2000+gpu_id`) +8. Create GPU heap backends (64 MB `GpuMalloc`, IDs `9000+gpu_id`) for `CHI_GPU_HEAP_T = BuddyAllocator` +9. Register all backends with `RegisterGpuAllocator()` for host-side `ShmPtr` resolution +10. Populate `gpu_orchestrator_info_` struct for GPU 0 including `gpu_heap_backend` + +**Phase 2: Pool composition** (deferred) +- All compose-section pools are created before the orchestrator launches +- GPU containers are allocated during pool creation via autogenerated allocation kernels + +**Phase 3: Orchestrator launch** (`IpcManager::LaunchGpuOrchestrator()`) +1. Read `gpu_blocks` and `gpu_threads_per_block` from configuration +2. Set CUDA stack size to 131072 bytes (`cudaDeviceSetLimit`) +3. Allocate `WorkOrchestratorControl` in pinned host memory +4. Allocate `gpu::PoolManager` on device +5. Create dedicated CUDA stream +6. Launch persistent kernel: `chimaera_gpu_orchestrator<<>>()` +7. 
Only block 0, thread 0 runs the `gpu::Worker` poll loop; other threads spin-wait for exit + +**Why deferred?** The orchestrator occupies all SMs with persistent thread blocks. If launched before pool composition, `cudaMalloc` calls during GPU container allocation would deadlock against the persistent kernel. + +### Client GPU Init (None) + +`ClientInit()` performs no GPU initialization: +- No `ServerInitGpuQueues()` — `cpu2gpu_queues_` remains empty +- No `LaunchGpuOrchestrator()` — `gpu_orchestrator_` remains `nullptr` +- No GPU memory backends allocated + +The client can only submit tasks through CPU transport (SHM, TCP, IPC). + +## Queue Assignment Algorithm + +### Worker Thread Partitioning + +The `DefaultScheduler::DivideWorkers()` method partitions the configured worker threads into four roles: + +``` +Given N total worker threads (from num_threads in chimaera.yaml): + + Worker 0 -> Scheduler worker (handles task routing) + Workers 1..N-3 -> I/O workers (execute tasks) + Worker N-2 -> GPU worker (polls GPU->CPU lanes) + Worker N-1 -> Network worker (handles ZMQ send/recv) +``` + +Example with `num_threads: 4`: +- Worker 0: scheduler +- Worker 1: I/O worker (also used as GPU worker since N-2 = 2, overlaps I/O range) +- Worker 2: GPU worker +- Worker 3: network worker + +The GPU worker is assigned only when `total_workers > 2`. With 2 or fewer workers, there is no dedicated GPU worker and GPU→CPU tasks cannot be processed. 
+ +### GPU Lane Assignment + +After `ServerInitGpuQueues()` creates the per-GPU `TaskQueue` objects, `AssignGpuLanesToWorker()` connects them to the GPU worker: + +```cpp +void IpcManager::AssignGpuLanesToWorker() { + Worker *gpu_worker = scheduler_->GetGpuWorker(); + + // Collect lane 0 from each GPU's gpu_queue (GPU->CPU) + for (size_t gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { + TaskLane *gpu_lane = &gpu_queues_[gpu_id]->GetLane(0, 0); + gpu_lane->SetAssignedWorkerId(gpu_worker->GetId()); + gpu_lanes.push_back(gpu_lane); + } + + gpu_worker->SetGpuLanes(gpu_lanes); +} +``` + +**Result:** The single GPU worker polls **all** GPU→CPU lanes across all GPUs. The GPU work orchestrator (persistent kernel) polls the CPU→GPU and GPU→GPU queues independently. + +### Queue Routing Summary ``` -GPU Kernel CPU Worker - │ │ - │ 1. NewTask (GPU allocator) │ - │ 2. Send (SPSC ring buffer →) ───> dequeue + deserialize - │ │ 3. Execute task - │ 4. Wait (← SPSC ring buffer) <─── serialize output + FUTURE_COMPLETE - │ 5. Read result │ + Pure Client Process Server Process + +-----------------+ +------------------+ + | No GPU queues | | cpu2gpu_queues_[]|----> GPU Orchestrator + | No orchestrator| | gpu2gpu_queues_[]|----> GPU Orchestrator +AsyncGpuSubmit() ->-| SendShm() or |--SHM/TCP/-->| gpu_queues_[] |----> GPU Worker (CPU) +(PoolQuery::Local) | SendZmq() | IPC | | + +-----------------+ +------------------+ + | + CPU Workers execute + the task's CPU handler ``` -Both directions use a **SPSC (single-producer single-consumer) ring buffer** embedded in a `FutureShm` structure that lives in GPU-registered pinned shared memory. System-scope atomics ensure visibility between GPU and CPU. 
+### Client-to-Server Transport Modes + +The pure client's transport mode is determined by the `CHI_IPC_MODE` environment variable or `chimaera.yaml` configuration: + +| Mode | Transport | How FutureShm is Allocated | How Task Data is Sent | +|---|---|---|---| +| `SHM` | Shared memory | In server's SHM segment | Written to `FutureShm::copy_space` ring buffer | +| `TCP` | ZMQ DEALER/ROUTER | Process-local heap (`HSHM_MALLOC`) | Serialized through ZMQ TCP socket | +| `IPC` | ZMQ DEALER/ROUTER (Unix socket) | Process-local heap (`HSHM_MALLOC`) | Serialized through ZMQ IPC socket | + +For SHM mode, the server worker deserializes from the `FutureShm::copy_space` ring buffer using `ShmTransport::Recv()`. For TCP/IPC mode, the server receives the serialized data through the ZMQ ROUTER socket and deserializes on the network worker thread. ## Prerequisites @@ -85,14 +551,14 @@ struct MyGpuTask : public chi::Task { pool_query_ = query; } - // Serialization: fields sent GPU → CPU + // Serialization: fields sent GPU -> CPU template HSHM_CROSS_FUN void SerializeIn(Archive& ar) { Task::SerializeIn(ar); ar(input_value_, result_); } - // Serialization: fields sent CPU → GPU (after execution) + // Serialization: fields sent CPU -> GPU (after execution) template HSHM_CROSS_FUN void SerializeOut(Archive& ar) { Task::SerializeOut(ar); @@ -127,139 +593,378 @@ class Client : public chi::ContainerClient { }; ``` -`CHI_IPC->Send()` automatically dispatches to the GPU ring-buffer path (`SendGpu`) when compiled for device code. 
+`CHI_IPC->Send()` automatically dispatches based on compilation context and routing mode: + +- **GPU device code** (`HSHM_IS_GPU`): routes via `SendGpu` or `SendGpuLocal` +- **CPU host code** with `HSHM_ENABLE_CUDA=1`: detects `LocalGpuBcast`/`ToLocalGpu` and routes via `SendToGpu` +- **CPU host code** without GPU: routes through normal CPU routing (pool query → worker) + +## GPU Routing Modes + +### GPU → GPU (Local) + +A GPU kernel submits a task that is processed by the **GPU work orchestrator**. The task runs on the GPU itself. + +```cpp +// Inside a __global__ kernel +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); +future.Wait(); +chi::u32 result = future->result_; +``` + +The GPU orchestrator must be running (not paused). The task is dispatched to your module's `GpuRuntime::MyMethod()` handler. + +### CPU → GPU (LocalGpuBcast) + +CPU host code submits a task to be processed by the **GPU work orchestrator**. The task runs on the GPU. + +```cpp +// Host code -- must be compiled with HSHM_ENABLE_CUDA=1 +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); +future.Wait(); // blocks until GPU completes + deserializes output +chi::u32 result = future->result_; +``` + +**How it works:** + +1. `Send()` detects `LocalGpuBcast` and calls `SendToGpu()` instead of normal CPU routing +2. `SendToGpu()` allocates a `FutureShm` in GPU-accessible pinned host memory, serializes task input into its ring buffer, and pushes to the `cpu2gpu_queue` +3. The GPU orchestrator pops the task, deserializes, dispatches to `GpuRuntime`, serializes output, and sets `FUTURE_COMPLETE` +4. `Wait()` spins on `FUTURE_COMPLETE`, then deserializes output from the FutureShm ring buffer back into the task + +**Important -- HSHM_ENABLE_CUDA must be 1**: The `Send()` GPU routing interceptor is inside `#if HSHM_ENABLE_CUDA || HSHM_ENABLE_ROCM`. 
If your source file is compiled with `HSHM_ENABLE_CUDA=0` (e.g., a plain C++ file in a mixed CUDA/C++ target), the interceptor is compiled out and the task will go through normal CPU routing, which does not complete the future correctly for GPU tasks. + +**Solution**: Call `AsyncMyTask(LocalGpuBcast(), ...)` from a source file compiled as CUDA (`.cu` or with `LANGUAGE CUDA` property), or from a wrapper function defined in such a file. + +### CPU → Specific GPU (ToLocalGpu) + +Same as `LocalGpuBcast` but targets a specific GPU by device ID: + +```cpp +// Send to GPU device 1 +auto future = client.AsyncMyTask(chi::PoolQuery::ToLocalGpu(1), input); +future.Wait(); +``` + +### GPU → CPU (ToLocalCpu) + +A GPU kernel submits a task to be processed by a **CPU worker thread**. The task runs on the CPU. + +```cpp +// Inside a __global__ kernel +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::ToLocalCpu(), input); +future.Wait(); +chi::u32 result = future->result_; +``` + +**How it works:** + +1. `Send()` on the GPU calls `SendGpu()`, which serializes the task and pushes to the `gpu2cpu_queue` +2. The CPU GPU-worker thread polls this queue, deserializes, and routes to the module's CPU runtime handler +3. The CPU worker serializes output into the FutureShm ring buffer and sets `FUTURE_COMPLETE` +4. `Wait()` on the GPU calls `RecvGpu()` which reads from the output ring buffer + +**Host setup required**: The GPU kernel needs a `gpu2cpu_queue` registered with the runtime. See [Host-Side Setup for GPU → CPU](#host-side-setup-for-gpu--cpu) below. + +## GPU Work Orchestrator + +The GPU work orchestrator is a **persistent CUDA kernel** that processes tasks on the GPU. 
It runs on a dedicated CUDA stream and polls two queues: + +- `cpu2gpu_queue` -- tasks pushed by CPU via `SendToGpu()` +- `gpu2gpu_queue` -- tasks pushed by GPU kernels via `SendGpuLocal()` + +The runtime launches the orchestrator automatically during `ServerInit()` when GPU support is enabled. + +### Persistent Kernel Structure + +The orchestrator is launched with configurable blocks and threads per block. Only **block 0, thread 0** runs the worker poll loop; all other threads spin-wait for the exit signal: + +```cpp +__global__ void chimaera_gpu_orchestrator(gpu::PoolManager *pool_mgr, + gpu::WorkOrchestratorControl *control, + IpcManagerGpuInfo gpu_info, + u32 num_blocks) { + CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks); + + if (blockIdx.x == 0 && threadIdx.x == 0) { + control->running_flag = 1; + gpu::Worker worker; + worker.Init(0, gpu_info.cpu2gpu_queue, gpu_info.gpu2gpu_queue, + pool_mgr, gpu_info.cpu2gpu_queue_base); + + while (!control->exit_flag) { + worker.PollOnce(); + } + worker.Finalize(); + } + + // Other blocks/threads: wait for exit signal + if (blockIdx.x != 0 || threadIdx.x != 0) { + while (!control->exit_flag) { /* spin */ } + } +} +``` + +The extra blocks are launched to occupy all SMs, preventing other kernels from preempting the orchestrator. 
+ +### GPU Container (GpuRuntime) + +Each module that supports GPU execution defines a `GpuRuntime` class: + +```cpp +// my_module_gpu_runtime.h +class GpuRuntime : public chi::gpu::Container { + public: + HSHM_GPU_FUN GpuRuntime() = default; + HSHM_GPU_FUN ~GpuRuntime() override = default; + + /** GPU handler for MyMethod */ + HSHM_GPU_FUN void MyMethod(hipc::FullPtr task, + chi::gpu::GpuRunContext &rctx) { + task->result_ = task->input_value_ * 3; + } + + // Autogenerated virtual method overrides (switch-case dispatch) + #include "autogen/my_module_gpu_lib_exec.h" +}; +``` + +### Module YAML Configuration + +Enable GPU support in your module's `chimaera_mod.yaml`: + +```yaml +module_name: my_module +namespace: chimaera + +# ... method IDs ... +kMyMethod: 25 + +# GPU support +has_gpu: true +gpu_methods: + - kMyMethod +``` + +Run `chimaera repo refresh .` to regenerate the autogen dispatch code: +- Per-module: `autogen/my_module_gpu_lib_exec.h` (virtual method overrides) +- Repo-level: `src/autogen/gpu_work_orchestrator_modules.h` (container allocation + registry) + +### Pause/Resume + +The GPU orchestrator occupies all SMs with persistent thread blocks. To launch other GPU kernels (e.g., test kernels), you must pause the orchestrator first: + +```cpp +CHI_IPC->PauseGpuOrchestrator(); // signal exit + synchronize stream +// ... launch your kernel on a separate stream ... +CHI_IPC->ResumeGpuOrchestrator(); // relaunch persistent kernel +``` + +**Important**: Use stream-based synchronization (`hshm::GpuApi::Synchronize(stream)`) for your own kernels, not `cudaDeviceSynchronize()`. The latter would block on the orchestrator's stream. + +### Container Registration + +GPU containers are automatically allocated and registered when a pool with `has_gpu: true` is created. The allocation kernel runs in the orchestrator's CUDA module context so vtables are correct for virtual dispatch. 
## GPU Kernel Implementation -### The CHIMAERA_GPU_INIT Macro +### Initialization Macros + +Three macros are available depending on the kernel's role: -Every GPU kernel that interacts with the runtime must call `CHIMAERA_GPU_INIT` at the top. This macro: +#### `CHIMAERA_GPU_INIT(gpu_info)` -1. Creates a per-block `IpcManager` singleton in `__shared__` memory -2. Initializes per-thread `ArenaAllocator` instances from the GPU memory backend -3. Synchronizes all threads in the block via `__syncthreads()` +For regular client kernels (not the orchestrator). Initializes the per-block `IpcManager` from a fully-populated `IpcManagerGpuInfo`. All threads in a block share one `IpcManager` instance (in `__shared__` memory); thread 0 runs `ClientInitGpu`, then `__syncthreads()` makes it visible to all threads. ```cpp __global__ void my_kernel(chi::IpcManagerGpu gpu_info, chi::PoolId pool_id, chi::u32 input, - int* d_result, chi::u32* d_output) { - // Initialize per-block IPC manager (required) CHIMAERA_GPU_INIT(gpu_info); - // Use the client API — same code as host chimaera::my_module::Client client(pool_id); auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); future.Wait(); - - // Read result *d_output = future->result_; - *d_result = 1; // success } ``` -### Memory Layout +#### `CHI_CLIENT_GPU_INIT(gpu_info)` + +Alias for `CHIMAERA_GPU_INIT`. Use this in client-process kernels for clarity: + +```cpp +__global__ void client_kernel(chi::IpcManagerGpu gpu_info, ...) { + CHI_CLIENT_GPU_INIT(gpu_info); + // identical behavior +} +``` + +#### `CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks)` -`CHIMAERA_GPU_INIT` partitions the GPU memory backend as: +For kernels that partition their memory across multiple blocks (e.g., benchmark client kernels and the runtime orchestrator itself). 
Before calling `ClientInitGpu`, this macro splits all three backends (`backend`, `gpu2cpu_backend`, `gpu_heap_backend`) so each block gets its own non-overlapping slice: ``` - ┌──────────────────────────────────────────────────────┐ - │ Pointer table (num_threads * sizeof(ptr)) │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator 0 (per_thread_size bytes) │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator 1 (per_thread_size bytes) │ - ├──────────────────────────────────────────────────────┤ - │ ... │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator N-1 │ - └──────────────────────────────────────────────────────┘ +per_block = data_capacity / num_blocks +block N slice starts at: data_ + N * per_block ``` -Each `ArenaAllocator` is a bump-pointer allocator that provides fast, lock-free allocation for its thread. All `ShmPtr` offsets are relative to the backend base so they can be resolved by the CPU worker. +Use this when launching multiple blocks where each block submits tasks independently: + +```cpp +__global__ void bench_kernel(chi::IpcManagerGpu gpu_info, + chi::PoolId pool_id, + chi::u32 num_blocks, + chi::u32 total_tasks, + int* d_done) { + CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks); + + if (threadIdx.x != 0) return; // only thread 0 per block submits + + chimaera::my_module::Client client(pool_id); + for (chi::u32 i = 0; i < total_tasks; ++i) { + auto future = client.AsyncMyTask(chi::PoolQuery::Local(), i); + future.Wait(); + } + + if (blockIdx.x == 0) { + __threadfence_system(); + *d_done = 1; + } +} +``` ## Host-Side Setup -The host code must: +### Host-Side Setup for GPU → GPU + +For `PoolQuery::Local()` from a GPU kernel, the task goes through the `gpu2gpu_queue` which is managed by the orchestrator. 
The simplest approach is `GetClientGpuInfo()`, which fills all `IpcManagerGpuInfo` fields automatically from the running runtime: + +```cpp +// Build IpcManagerGpuInfo — fills backend, gpu2gpu_queue, gpu_heap_backend, etc. +chi::IpcManagerGpuInfo gpu_info = CHI_IPC->GetClientGpuInfo(0); -1. Start the Chimaera runtime -2. Create the module's container (pool) -3. Allocate GPU-registered pinned shared memory -4. Create a GPU task queue and register it with the runtime -5. Register the GPU memory backend for host-side ShmPtr resolution -6. Launch the kernel +// Register primary backend so CPU can resolve GPU ShmPtrs +CHI_IPC->RegisterGpuAllocator(gpu_info.backend.id_, + gpu_info.backend.data_, + gpu_info.backend.data_capacity_); +``` -### Complete Host-Side Example +For custom backends (e.g., benchmark kernels with a dedicated allocation region): ```cpp -#include -#include -#include +// Custom primary backend (pinned host, for ArenaAllocator) +hipc::MemoryBackendId backend_id(100, 0); +hipc::GpuShmMmap gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/gpu_test", 0); +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + +// Custom heap backend (device memory, for BuddyAllocator / CHI_GPU_HEAP_T) +hipc::MemoryBackendId heap_id(101, 0); +hipc::GpuMalloc gpu_heap; +gpu_heap.shm_init(heap_id, 4 * 1024 * 1024, "/gpu_heap", 0); + +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = gpu_backend; +gpu_info.gpu_heap_backend = gpu_heap; +gpu_info.gpu2cu_queue = nullptr; // not using GPU->CPU +gpu_info.cpu2gpu_queue = nullptr; // not receiving CPU->GPU +gpu_info.gpu2gpu_queue = CHI_IPC->GetGpuToGpuQueue(0); +``` -int run_gpu_test(chi::PoolId pool_id, chi::u32 test_value, - chi::u32* out_result) { - // Increase GPU stack for deep template call chains - cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB - - // 1. 
Create GPU memory backend (pinned shared memory) - hipc::MemoryBackendId backend_id(3, 0); - size_t gpu_memory_size = 10 * 1024 * 1024; // 10 MB - hipc::GpuShmMmap gpu_backend; - if (!gpu_backend.shm_init(backend_id, gpu_memory_size, "/gpu_test", 0)) - return -100; - - // 2. Create GPU task queue (pinned shared memory) - hipc::MemoryBackendId queue_backend_id(4, 0); - hipc::GpuShmMmap queue_backend; - if (!queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, - "/gpu_queue", 0)) - return -101; - - // 3. Create allocator and task queue in the queue backend - auto* queue_alloc = queue_backend.MakeAlloc>( - queue_backend.data_capacity_); - auto gpu_queue_ptr = queue_alloc->NewObj( - queue_alloc, 1, 2, 1024); // 1 lane, 2 priorities, depth 1024 - - // 4. Register queue and assign lanes to the GPU worker - CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); - CHI_IPC->AssignGpuLanesToWorker(); - - // 5. Register GPU backend for host-side ShmPtr resolution - CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, - gpu_backend.data_capacity_); - - // 6. 
Build IpcManagerGpu and launch kernel - chi::IpcManagerGpu gpu_info(gpu_backend, gpu_queue_ptr.ptr_); - - int* d_result = hshm::GpuApi::Malloc(sizeof(int)); - chi::u32* d_out = hshm::GpuApi::Malloc(sizeof(chi::u32)); - int h_result = 0; - chi::u32 h_out = 0; - hshm::GpuApi::Memcpy(d_result, &h_result, sizeof(int)); - hshm::GpuApi::Memcpy(d_out, &h_out, sizeof(chi::u32)); - - my_kernel<<<1, 1>>>(gpu_info, pool_id, test_value, d_result, d_out); - - cudaDeviceSynchronize(); - hshm::GpuApi::Memcpy(&h_result, d_result, sizeof(int)); - hshm::GpuApi::Memcpy(&h_out, d_out, sizeof(chi::u32)); - - *out_result = h_out; - hshm::GpuApi::Free(d_result); - hshm::GpuApi::Free(d_out); - return h_result; +Use pinned host memory for result polling (not `cudaDeviceSynchronize`, which hangs with the persistent orchestrator): + +```cpp +int *d_result; +cudaMallocHost(&d_result, sizeof(int)); +*d_result = 0; + +// Pause orchestrator, launch kernel, resume orchestrator +CHI_IPC->PauseGpuOrchestrator(); +my_kernel<<<1, 1, 0, stream>>>(gpu_info, pool_id, input, d_result, d_out); +CHI_IPC->ResumeGpuOrchestrator(); + +// Poll pinned memory for completion +while (*d_result == 0) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); } ``` +### Host-Side Setup for GPU → CPU + +For `PoolQuery::ToLocalCpu()` from a GPU kernel, the task goes through the `gpu2cpu_queue` that a CPU worker thread polls. The host must create and register this queue, and also provide a pinned-host `gpu2cpu_backend` so the GPU kernel can allocate `FutureShm` objects: + +```cpp +// 1. Primary backend (device memory, for ArenaAllocator) +hipc::MemoryBackendId backend_id(5, 0); +hipc::GpuMalloc gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/gpu_to_cpu", 0); + +// 2. 
GPU->CPU copy-space backend (pinned host, for FutureShm allocation) +hipc::MemoryBackendId copy_backend_id(6, 0); +hipc::GpuShmMmap copy_backend; +copy_backend.shm_init(copy_backend_id, 4 * 1024 * 1024, "/gpu_to_cpu_copy", 0); +CHI_IPC->RegisterGpuAllocator(copy_backend_id, copy_backend.data_, + copy_backend.data_capacity_); + +// 3. GPU task queue (pinned shared memory) +hipc::MemoryBackendId queue_backend_id(7, 0); +hipc::GpuShmMmap queue_backend; +queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, "/gpu_to_cpu_q", 0); + +auto *queue_alloc = queue_backend.MakeAlloc>( + queue_backend.data_capacity_); +auto gpu_queue_ptr = queue_alloc->NewObj( + queue_alloc, 1, 2, 1024); + +// 4. Register queue and assign to CPU GPU-worker +CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); +CHI_IPC->AssignGpuLanesToWorker(); +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + +// 5. Build IpcManagerGpuInfo and launch kernel +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = gpu_backend; +gpu_info.gpu2cpu_queue = gpu_queue_ptr.ptr_; +gpu_info.gpu2cpu_backend = copy_backend; +``` + +### CPU → GPU (No Extra Host Setup) + +For `PoolQuery::LocalGpuBcast()` from CPU host code, no additional host setup is needed beyond creating the pool. The runtime creates the `cpu2gpu_queue` and GPU orchestrator during `ServerInit()`. 
+ +```cpp +// Compile this in a CUDA source file (HSHM_ENABLE_CUDA=1) +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, true); + +chimaera::my_module::Client client(pool_id); +auto create = client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id); +create.Wait(); + +// Submit task to GPU -- Send() detects LocalGpuBcast -> SendToGpu() +auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); +future.Wait(); // spins on FUTURE_COMPLETE, deserializes GPU output +chi::u32 result = future->result_; +``` + ### Key Registration Functions | Function | Purpose | |---|---| +| `CHI_IPC->GetClientGpuInfo(gpu_id)` | Build a fully-populated `IpcManagerGpuInfo` for same-process kernel launches | | `CHI_IPC->RegisterGpuQueue(queue)` | Registers a GPU task queue so the runtime can poll it | | `CHI_IPC->AssignGpuLanesToWorker()` | Assigns all registered GPU queue lanes to the designated GPU worker thread | | `CHI_IPC->RegisterGpuAllocator(id, data, capacity)` | Registers GPU backend memory so the host can resolve `ShmPtr`s allocated by the GPU kernel | +| `CHI_IPC->PauseGpuOrchestrator()` | Pauses the persistent GPU orchestrator to free SMs | +| `CHI_IPC->ResumeGpuOrchestrator()` | Resumes the GPU orchestrator | +| `CHI_IPC->GetGpuToGpuQueue(gpu_id)` | Returns the orchestrator's GPU→GPU queue pointer for a given device | +| `CHI_GPU_HEAP` | Per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) from the GPU heap table; valid in device code after `CHIMAERA_GPU_INIT` | ## Configuration Parameters @@ -286,22 +991,31 @@ The default is **4096 bytes** if `GetCopySpaceSize()` returns 0. The `FutureShm` For tasks with large serialized payloads, increase this value. If the ring buffer is too small, the producer will spin-wait for the consumer to drain data. -### GPU Memory Backend Size +### GPU Memory Backend Sizes -The GPU memory backend is partitioned equally among all threads in a block. 
Size it based on: +Two backends must be sized independently: -- Number of threads per block -- Per-thread allocation needs (tasks, serialization buffers, FutureShm) +**Primary backend** (`backend`, `ArenaAllocator`): holds `FutureShm` + task objects. Reset after each task so size only needs to cover one task at a time: +```cpp +// Per-block: sizeof(FutureShm) + copy_space + sizeof(TaskT) + overhead +// 10 MB per block is sufficient for typical tasks +size_t backend_size = num_blocks * 10 * 1024 * 1024; +``` +**Heap backend** (`gpu_heap_backend`, `BuddyAllocator`): holds serialization scratch vectors. Individual `free()` reclaims memory, so size covers peak concurrent scratch usage (~2× copy_space per task): ```cpp -// Example: 10 MB for a single-thread kernel -size_t gpu_memory_size = 10 * 1024 * 1024; +// Per-block: ~2 * copy_space_size + BuddyAllocator overhead +// 4 MB per block is sufficient for typical tasks +size_t heap_backend_size = num_blocks * 4 * 1024 * 1024; ``` -A single task submission typically consumes: -- `sizeof(TaskT)` for the task object +A single task submission typically consumes from the primary arena: - `sizeof(FutureShm) + copy_space_size` for the future -- Temporary serialization buffer (~copy_space_size) +- `sizeof(TaskT)` for the deserialized task object (orchestrator side) + +And from the heap: +- Input serialization vector (~`copy_space_size`) +- Output serialization vector (~`copy_space_size`) ### GPU Task Queue Configuration @@ -326,33 +1040,113 @@ size_t queue_memory_size = 2 * 1024 * 1024; // 2 MB is sufficient for most use ### Runtime Thread Configuration -The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. The scheduler assigns a dedicated GPU worker thread to poll GPU lanes. - -In the runtime YAML configuration, ensure enough worker threads are configured: +The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. 
The default scheduler (`DefaultScheduler`) partitions workers as follows:

```yaml
-# At minimum: N task workers + 1 network worker + 1 GPU worker
+# chimaera.yaml
num_threads: 4
```

-The scheduler's `DivideWorkers` method partitions workers into task, network, and GPU roles. With `num_threads: 4`, you get 2 task workers, 1 network worker, and 1 GPU worker (the GPU worker is selected from the task worker pool by the scheduler).
+With `num_threads: N`:
+
+| Worker ID | Role | Responsibility |
+|---|---|---|
+| 0 | Scheduler | Task routing and scheduling |
+| 1 to N-3 | I/O workers | Execute tasks from CPU queues |
+| N-2 | GPU worker | Polls GPU→CPU queue lanes |
+| N-1 | Network worker | Handles ZMQ send/recv |
+
+The GPU worker polls **all** GPU→CPU lanes across all GPU devices. With `num_threads: 4`, this yields 1 scheduler (worker 0), 1 I/O worker (worker 1), 1 GPU worker (worker 2), and 1 network worker (worker 3), matching the table above.
+
+## CMake Integration
+
+### Mixed CUDA/C++ Targets
+
+When building a test or application that has both CUDA and C++ source files, keep GPU-routing calls in the CUDA source file. The typical CMake pattern:
+
+```cmake
+# GPU kernels compiled as CUDA
+set_source_files_properties(my_gpu_kernels.cc PROPERTIES LANGUAGE CUDA)
+add_library(my_gpu_kernels OBJECT my_gpu_kernels.cc)
+target_link_libraries(my_gpu_kernels PRIVATE hshm::cuda_cxx)
+
+# Main executable with CPU sources
+add_executable(my_app my_main.cc $<TARGET_OBJECTS:my_gpu_kernels>)
+
+# CRITICAL: CPU sources are compiled with HSHM_ENABLE_CUDA=0 to avoid
+# __device__ errors. GPU routing in Send() is compiled out.
+set_source_files_properties(my_main.cc PROPERTIES
+  COMPILE_OPTIONS "-UHSHM_ENABLE_CUDA;-DHSHM_ENABLE_CUDA=0"
+)
+```
+
+**Consequence**: `client.AsyncMyTask(LocalGpuBcast(), ...)` called from `my_main.cc` will NOT route to the GPU because the `Send()` interceptor is compiled out. 
Instead, define a wrapper in the CUDA source file: + +```cpp +// my_gpu_kernels.cc (compiled as CUDA, HSHM_ENABLE_CUDA=1) +extern "C" int run_cpu_to_gpu(chi::PoolId pool_id, chi::u32 input, + chi::u32 *out_result) { + chimaera::my_module::Client client(pool_id); + auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); + if (!future.Wait(10.0f)) return -3; // timeout + *out_result = future->result_; + return 1; +} +``` + +### GPU Module Include Directories + +GPU targets need `$` for autogen headers: + +```cmake +target_include_directories(my_gpu_target PRIVATE + $ +) +``` ## Data Flow Details -### Send Path (GPU → CPU) +### Send Path (GPU → CPU) 1. **NewTask**: Allocates the task from the per-thread `ArenaAllocator` 2. **SendGpu**: Allocates a `FutureShm` with embedded ring buffer copy space 3. **Enqueue**: Pushes the `Future` onto the GPU task queue lane (the CPU worker polls this) 4. **Serialize + Send**: Builds a `LocalSaveTaskArchive`, calls `task->SerializeIn(ar)`, then streams the archive through `ShmTransport::Send` into the input ring buffer -### Receive Path (CPU → GPU) +### Receive Path (CPU → GPU) 1. **CPU worker** dequeues the `FutureShm`, reads from the input ring buffer via `ShmTransport::Recv`, deserializes, executes the task, then writes output back through the output ring buffer 2. **RecvGpu**: On the GPU, `Future::Wait()` calls `RecvGpu` which reads from the output ring buffer via `ShmTransport::Recv` 3. **Deserialize**: A `LocalLoadTaskArchive` deserializes the output fields via `task->SerializeOut(ar)` 4. **Completion**: The GPU spins on `FUTURE_COMPLETE` flag (should already be set after Recv completes) +### CPU → GPU Send Path (SendToGpu) + +1. **Send()** detects `LocalGpuBcast`/`ToLocalGpu` routing and calls `SendToGpu()` +2. **SendToGpu** allocates a `FutureShm` in GPU-accessible pinned host memory via `AllocateGpuBuffer()` +3. 
Serializes task input via `task->SerializeIn(ar)` + `ShmTransport::Send` into the input ring buffer +4. Pushes a `Future` onto `cpu2gpu_queues_[gpu_id]` +5. Returns a `Future` with the pinned-memory FutureShm + +### CPU → GPU Receive Path (Wait runtime path) + +1. **Wait()** enters the runtime spin-wait loop on `FUTURE_COMPLETE` +2. The GPU orchestrator pops the task, dispatches to `GpuRuntime`, serializes output into the FutureShm's output ring buffer, calls `__threadfence_system()`, and sets `FUTURE_COMPLETE` +3. **Wait()** detects `FUTURE_COPY_FROM_CLIENT` flag and non-zero `output_.copy_space_size_`, then deserializes output via `ShmTransport::Recv` + `task->SerializeOut(ar)` + +### Client Process → Server CPU Path (SendShm/SendZmq) + +When a pure client uses `PoolQuery::Local()`, the task bypasses all GPU queues and goes through the standard client transport: + +1. **SendShm**: Allocates `FutureShm` in server's SHM segment with `copy_space` ring buffer. Enqueues to `worker_queues_` lane (mapped by `ClientMapTask`). Serializes task via `SaveTaskArchive` + `ShmTransport::Send` into the ring buffer. +2. **Server worker**: Dequeues `FutureShm`, checks `FUTURE_COPY_FROM_CLIENT` flag, reads from `copy_space` ring buffer via `ShmTransport::Recv`, deserializes, executes the **CPU handler** (e.g., `result = test_value * 2 + gpu_id`). +3. **Response**: Worker serializes output back through the ring buffer, sets `FUTURE_COMPLETE`. +4. **Client Wait()**: Spins on `FUTURE_COMPLETE`, deserializes output from `copy_space`. + +This path uses the CPU runtime handler, not the GPU handler. 
The computation differs: +- CPU handler: `result_value = test_value * 2 + gpu_id` +- GPU handler: `result_value = test_value * 3 + gpu_id` + ### Wire Format GPU and CPU use compatible but different archive types: @@ -362,7 +1156,105 @@ GPU and CPU use compatible but different archive types: | GPU (send) | `LocalSaveTaskArchive` | `LocalLoadTaskArchive` | | CPU (worker) | `SaveTaskArchive` | `LoadTaskArchive` | -Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical — if the formats diverge, deserialization will produce corrupted data. +Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical -- if the formats diverge, deserialization will produce corrupted data. + +## Client Process GPU Tests + +The test file `context-runtime/modules/MOD_NAME/test/test_gpu_client_process.cc` verifies cross-process GPU task submission using the fork-based test pattern. A child process runs as a standalone Chimaera server (with `--server-mode`), while the parent connects as a pure client. + +### Test Infrastructure + +All tests share the same server/client lifecycle managed by `EnsureInitialized()`: + +1. **`StartServerProcess()`**: Forks a child, sets `CHI_WITH_RUNTIME=1`, and `execl`s back into the same binary with `--server-mode`. The child calls `CHIMAERA_INIT(kServer, true)` and sleeps until killed. +2. **`WaitForServer()`**: Polls for the SHM segment file (`/tmp/chimaera_/chi_main_segment_`) up to 50 times at 200ms intervals. +3. **Parent client init**: Sets `CHI_WITH_RUNTIME=0` and calls `CHIMAERA_INIT(kClient, false)`. During `ClientConnect`, the client receives GPU queue metadata from the server. +4. **Cleanup**: `CleanupServer()` sends `SIGKILL` to the server process group and removes SHM files. + +### Test Cases + +#### `client_process_gpu_submit_local` + +Submits a `GpuSubmitTask` with `PoolQuery::Local()` routing from the client process. 
The task travels via SHM or TCP transport to the server's CPU worker, which executes the CPU handler (`result = test_value * 2 + gpu_id`). Verifies the result matches the expected CPU computation. + +This test works regardless of GPU support — `Local()` routing always goes through the CPU worker. + +#### `client_process_gpu_submit_multiple` + +Submits 5 sequential `GpuSubmitTask`s with different `test_value` parameters (100–104), each via `PoolQuery::Local()`. Verifies that all return correct results, confirming the transport handles repeated task round-trips without state corruption. + +#### `client_process_gpu_queue_attachment` + +Verifies that `ClientInitGpuQueues()` successfully attached to the server's GPU queues during `ClientConnect`. Checks that `GetToGpuQueueCount() > 0` and that `GetToGpuQueue(0)` and `GetGpuToGpuQueue(0)` return non-null pointers. + +Requires `HSHM_ENABLE_CUDA` or `HSHM_ENABLE_ROCM` at compile time. Without GPU support, prints "GPU support not compiled, skipping" and passes. + +#### `client_process_gpu_submit_to_gpu` + +Submits a `GpuSubmitTask` with `PoolQuery::LocalGpuBcast()` routing. This exercises the `SendToGpu()` path: the client serializes the task into the attached `GpuShmMmap` queue backend and pushes it onto `cpu2gpu_queue`. The server's GPU orchestrator dequeues and dispatches to the module's GPU handler. + +Requires GPU support compiled in and a GPU device on the server. Verifies that `result = test_value * 2 + gpu_id` (GPU handler uses the same formula in this test module). + +#### `client_process_register_gpu_memory` + +Tests cross-process GPU memory registration. The client: +1. Allocates a 4 MB `GpuMalloc` backend (device memory via `cudaMalloc`) +2. Reads the IPC handle from the backend's private header (GPU→host memcpy) +3. Sends a `RegisterMemoryTask` with `MemoryType::kGpuDeviceMemory` to the server +4. 
The server opens the IPC handle via `GpuMalloc::shm_attach_ipc()` and registers the allocator in `gpu_alloc_map_` + +After registration, the server can resolve `ShmPtr`s pointing into this client-owned device memory. + +### Building and Running + +```bash +cd /workspace/build +cmake /workspace && cmake --build . -j$(nproc) + +# CRITICAL: install ALL .so files (stale modules cause serialization mismatches) +sudo cp bin/lib*.so /usr/local/lib/ && sudo ldconfig + +# Run all client process GPU tests +./bin/chimaera_gpu_client_process_tests + +# Run a specific test +./bin/chimaera_gpu_client_process_tests client_process_gpu_submit_local +``` + +**Note:** Tests 3–5 (`gpu_queue_attachment`, `gpu_submit_to_gpu`, `register_gpu_memory`) require the test binary to be compiled with CUDA/ROCm support. Without it, they skip gracefully. Tests 1–2 work with any build configuration. + +## Performance + +### GPU→GPU Task Round-Trip Latency + +The following latency was measured using the in-process benchmark (`bench_gpu_runtime`) with 1 client block × 1 active thread (sequential `AsyncGpuSubmit` + `Wait()` loop): + +| Allocator (`CHI_GPU_HEAP_T`) | Avg Latency | Throughput (1 thread) | Notes | +|---|---|---|---| +| `BuddyAllocator` (default) | ~200 µs | ~5,000 tasks/sec | Managed pool; per-allocation free; no exhaustion | +| `MallocAllocator` (device heap) | ~400 µs | ~2,500 tasks/sec | CUDA device `malloc()`/`free()`; serialized across warps | + +`BuddyAllocator` backed by `GpuMalloc` device memory is the default because it is 2× faster than CUDA device heap `malloc()` and does not exhaust memory across arbitrarily many tasks. + +### How Latency Is Measured + +The benchmark timer starts on the CPU immediately after `ResumeGpuOrchestrator()` and ends when the CPU polls `*d_done == 1` (set by the GPU after the last task completes). The formula is: + +``` +avg_latency = elapsed_ms * 1000 / total_tasks (µs per task) +``` + +This captures the full GPU→GPU round-trip per task: +1. 
`AsyncGpuSubmit`: serialize inputs into `FutureShm` ring buffer, push to `gpu2gpu_queue` +2. Orchestrator polls, dispatches `DispatchTask`: deserialize → `Run()` → serialize output +3. `FUTURE_COMPLETE` written with `__threadfence_system()` + system-scope atomic +4. `future.Wait()` spin exits, `RecvGpu` deserializes output + +**Note:** the first task's queue-push overhead is excluded because the GPU kernel starts before the CPU timer. CPU polling overhead (100 µs sleep per iteration) adds at most ~0.1 µs per task across 1,000 tasks and is negligible. + +### Memory Limits + +The default `GpuShmMmap` orchestrator scratch (`HSHM_DEFAULT_ALLOC_GPU_T = ArenaAllocator`) is reset after each completed task (`alloc->Reset()`), so it never exhausts regardless of task count. The `BuddyAllocator` heap (`CHI_GPU_HEAP_T`) reclaims memory individually as serialization vectors go out of scope. ## Troubleshooting @@ -373,6 +1265,12 @@ Usually caused by GPU stack overflow during deep serialization template chains. cudaDeviceSetLimit(cudaLimitStackSize, 131072); ``` +### CUDA Error 715 (Illegal Instruction) + +Caused by using system-scope atomics on pinned host memory, or by cross-library virtual function calls. Solutions: +- Use device-scope atomics (`atomicAdd`, `atomicOr`) with explicit `__threadfence_system()` at strategic points +- Ensure all GPU containers are allocated within the orchestrator's CUDA module context (the autogen system handles this) + ### Silent data corruption (kernel completes but wrong values) Check that **all** serialization functions are marked `HSHM_CROSS_FUN`. Without `__device__` annotation, NVCC's SFINAE dispatch silently skips serialization, consuming 0 bytes and misaligning the stream. The compiler produces no error. @@ -384,6 +1282,20 @@ The GPU producer and CPU consumer share the ring buffer. 
If the buffer fills, th - `RegisterGpuQueue` and `AssignGpuLanesToWorker` were called before the kernel launch - The copy space is large enough for the serialized payload +### CPU→GPU Wait() timeout + +If `future.Wait()` times out after `SendToGpu()`: +- Verify the GPU orchestrator is running (not paused) +- Verify the module's GPU container is registered (check for "Registered GPU container" log message) +- Verify the call site is compiled with `HSHM_ENABLE_CUDA=1` -- if compiled with `HSHM_ENABLE_CUDA=0`, the `Send()` GPU interceptor is absent and the task goes through normal CPU routing, which cannot complete a GPU-bound future + +### Client process GPU routing fails + +If a pure client process (`CHIMAERA_INIT(kClient, false)`) tries to use `LocalGpuBcast` or `ToLocalGpu` and gets empty futures: +- **Binary not compiled with GPU support**: `ClientInitGpuQueues()` is skipped, `cpu2gpu_queues_` is empty. Use `PoolQuery::Local()` as a fallback (task goes to the server's CPU handler). +- **Stale installed libraries**: After rebuilding with GPU queue extensions, **all** `.so` files must be reinstalled — not just `libchimaera_cxx.so`. Stale module `.so` files (e.g., `libchimaera_admin_runtime.so`) cause `ClientConnect` serialization mismatches because the server loads modules from `/usr/local/lib/`. Fix: `sudo cp /workspace/build/bin/lib*.so /usr/local/lib/ && sudo ldconfig` +- **Server has no GPUs**: `ClientConnect` returns `num_gpus_ == 0`, so no queues are attached. This is expected behavior. + ### Kernel launch failure (-201) Check `cudaGetLastError()` after the kernel launch. Common causes: @@ -397,3 +1309,11 @@ The CPU worker must be able to resolve `ShmPtr`s that point into GPU-allocated p CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, gpu_backend.data_capacity_); ``` + +### GPU orchestrator SM starvation + +The persistent orchestrator occupies all SMs (32 blocks by default). To launch other GPU kernels: +1. 
Call `CHI_IPC->PauseGpuOrchestrator()` +2. Launch your kernel on a separate stream +3. Synchronize your stream (not `cudaDeviceSynchronize`) +4. Call `CHI_IPC->ResumeGpuOrchestrator()` From 39947f31db4ff6bada880c46edfc194a6f2d08cf Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 9 Mar 2026 23:07:19 +0000 Subject: [PATCH 4/5] Add GPU CTE and GPU Infinite Memory (UVM) docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpu-cte.md: documents GPU kernel integration with CTE — CPU fork-client setup, three-backend memory layout, IpcManagerGpuInfo, kernel-side AsyncPutBlob/GetBlob, routing table, and stop-flag polling pattern. gpu-inf-mem.md: documents the GpuShmMmap UVM backend — why UVM is chosen over pinned memory (no HostNativeAtomicSupported required), shm_init API, memory layout, kernel passing, IPC manager registration, per-block allocator construction, destruction rules, and comparison table against MallocBackend / PosixShmMmap / GpuMalloc. Co-Authored-By: Claude Sonnet 4.6 --- docs/sdk/context-transfer-engine/gpu-cte.md | 325 ++++++++++++++++++ .../context-transfer-engine/gpu-inf-mem.md | 161 +++++++++ 2 files changed, 486 insertions(+) create mode 100644 docs/sdk/context-transfer-engine/gpu-cte.md create mode 100644 docs/sdk/context-transfer-engine/gpu-inf-mem.md diff --git a/docs/sdk/context-transfer-engine/gpu-cte.md b/docs/sdk/context-transfer-engine/gpu-cte.md new file mode 100644 index 00000000..b62dbc9d --- /dev/null +++ b/docs/sdk/context-transfer-engine/gpu-cte.md @@ -0,0 +1,325 @@ +# GPU CTE API + +This guide describes how to call `AsyncPutBlob`, `AsyncGetBlob`, and +`AsyncGetOrCreateTag` from CUDA GPU kernels using the CTE (Context Transfer +Engine) client API. Two usage patterns are covered: + +1. **CPU-side with fork client** -- the simpler path where a CPU test process + launches a background Chimaera runtime and issues CTE calls from host code + using shared-memory buffers. +2. 
**GPU kernel-side** -- the advanced path where a CUDA `__global__` kernel + allocates tasks, submits them through the GPU-to-CPU queue, and waits for + completion entirely on-device. + +## Headers + +```cpp +#include +#include +#include +#include +#include + +// GPU memory backends (GPU kernel path only) +#include +#include +#include +``` + +## CPU-Side Usage (Fork Client) + +This is the recommended starting point. A fork client spawns a background +Chimaera runtime in a child process, then the parent issues CTE calls from +normal host code. Blob data is allocated in shared memory so the runtime +workers can access it directly. + +### 1. Initialize Chimaera + +```cpp +bool ok = chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, /*fork=*/true); +``` + +The second argument `true` forks a background runtime. Allow ~500 ms for the +child process to start before issuing tasks. + +### 2. Create the CTE Pool + +```cpp +chi::PoolId core_pool_id = wrp_cte::core::kCtePoolId; +wrp_cte::core::Client core_client(core_pool_id); + +wrp_cte::core::CreateParams params; +auto create_task = core_client.AsyncCreate( + chi::PoolQuery::Dynamic(), + wrp_cte::core::kCtePoolName, core_pool_id, params); +create_task.Wait(); +``` + +### 3. Register a Storage Target + +CTE needs at least one storage backend for blob data. + +```cpp +auto reg_task = core_client.AsyncRegisterTarget( + "/tmp/cte_gpu_test.dat", // path on disk + chimaera::bdev::BdevType::kFile, // file-backed target + 16 * 1024 * 1024, // 16 MB capacity + chi::PoolQuery::Local(), + chi::PoolId(700, 0)); // target pool ID +reg_task.Wait(); +``` + +### 4. Create a Tag + +Tags group blobs logically. The returned `tag_id_` is required by PutBlob +and GetBlob. + +```cpp +auto tag_task = core_client.AsyncGetOrCreateTag("my_gpu_tag"); +tag_task.Wait(); +wrp_cte::core::TagId tag_id = tag_task->tag_id_; +``` + +### 5. AsyncPutBlob + +Allocate a shared-memory buffer, fill it, then submit a PutBlob task. 
+ +```cpp +const size_t blob_size = 4096; + +// Allocate in shared memory (accessible to runtime workers) +hipc::FullPtr buf = CHI_IPC->AllocateBuffer(blob_size); +std::memset(buf.ptr_, 0xAB, blob_size); + +// Cast to a void ShmPtr for the CTE API +hipc::ShmPtr<> blob_data = buf.shm_.Cast(); + +auto put = core_client.AsyncPutBlob( + tag_id, + "my_blob", // blob name + /*offset=*/0, + /*size=*/blob_size, + blob_data, + /*score=*/-1.0f, // -1 = auto-place + wrp_cte::core::Context(), + /*flags=*/0, + chi::PoolQuery::Local()); +put.Wait(); +assert(put->GetReturnCode() == 0); +``` + +### 6. AsyncGetBlob + +Allocate an output buffer in shared memory and submit a GetBlob task. + +```cpp +hipc::FullPtr out_buf = CHI_IPC->AllocateBuffer(blob_size); +std::memset(out_buf.ptr_, 0, blob_size); +hipc::ShmPtr<> out_data = out_buf.shm_.Cast(); + +auto get = core_client.AsyncGetBlob( + tag_id, + "my_blob", + /*offset=*/0, + /*size=*/blob_size, + /*flags=*/0, + out_data, + chi::PoolQuery::Local()); +get.Wait(); +assert(get->GetReturnCode() == 0); +// out_buf.ptr_ now contains the retrieved blob data +``` + +## GPU Kernel-Side Usage + +For submitting CTE tasks directly from a CUDA `__global__` kernel, additional +setup is required. The GPU kernel uses `CHIMAERA_GPU_INIT` to set up per-thread +allocators, then calls `CHI_IPC->NewTask()` and `CHI_IPC->Send()` exactly +like host code. + +### Memory Backend Setup (Host Side) + +Three GPU memory backends must be initialized before launching the kernel: + +```cpp +// 1. Primary backend -- task object allocation (GpuShmMmap = UVM) +hipc::MemoryBackendId backend_id(20, 0); +hipc::GpuShmMmap gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/my_gpu_cte", 0); + +// Register with the IPC manager so the GPU kernel can allocate from it +CHI_IPC->RegisterGpuAllocator( + backend_id, gpu_backend.data_, gpu_backend.data_capacity_); + +// 2. 
GPU-to-CPU backend -- FutureShm lives here (UVM, CPU+GPU visible)
hipc::MemoryBackendId g2c_backend_id(21, 0);
hipc::GpuShmMmap g2c_backend;
g2c_backend.shm_init(g2c_backend_id, 4 * 1024 * 1024, "/my_gpu_g2c", 0);

// 3. GPU heap backend -- device memory for serialization scratch
hipc::MemoryBackendId heap_backend_id(22, 0);
hipc::GpuMalloc gpu_heap_backend;
gpu_heap_backend.shm_init(heap_backend_id, 4 * 1024 * 1024, "/my_gpu_heap", 0);
```

### IpcManagerGpuInfo Setup (Host Side)

Pack the backends and queue pointer into a struct that the kernel receives
by value:

```cpp
chi::IpcManagerGpuInfo gpu_info;
gpu_info.backend =
    static_cast<hipc::MemoryBackend>(gpu_backend);
gpu_info.gpu2cpu_queue =
    CHI_IPC->GetGpuQueue(0); // pre-existing GPU-to-CPU queue
gpu_info.gpu2cpu_backend =
    static_cast<hipc::MemoryBackend>(g2c_backend);
gpu_info.gpu_heap_backend =
    static_cast<hipc::MemoryBackend>(gpu_heap_backend);
```

### Kernel Launch (Host Side)

Use a non-blocking CUDA stream to avoid serializing with the default stream
(important when a persistent GPU orchestrator kernel is also running):

```cpp
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

my_cte_kernel<<<1, 1, 0, stream>>>(gpu_info, pool_id, tag_id);

// Poll or synchronize as needed (see polling pattern below)
```

### GPU Kernel Implementation

```cpp
__global__ void my_cte_kernel(chi::IpcManagerGpuInfo gpu_info,
                              chi::PoolId pool_id,
                              wrp_cte::core::TagId tag_id) {
  // Initialize per-thread GPU allocators and IPC context
  CHIMAERA_GPU_INIT(gpu_info);

  // ---- GetOrCreateTag ----
  auto tag_task = CHI_IPC->NewTask<wrp_cte::core::GetOrCreateTagTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(), // route to CPU worker
      "my_gpu_tag",
      wrp_cte::core::TagId::GetNull());
  auto tag_future = CHI_IPC->Send(tag_task);
  tag_future.Wait();
  wrp_cte::core::TagId result_tag_id = tag_future->tag_id_;

  // ---- PutBlob ----
  const size_t blob_size = 4096;
  hipc::FullPtr<char> buf = CHI_IPC->AllocateBuffer(blob_size);
  
// Fill buffer on device...

  auto put_task = CHI_IPC->NewTask<wrp_cte::core::PutBlobTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(),
      result_tag_id,
      "my_blob",
      /*offset=*/0ULL,
      /*size=*/(chi::u64)blob_size,
      buf.shm_.Cast(),
      /*score=*/-1.0f,
      wrp_cte::core::Context(),
      /*flags=*/0U);
  auto put_future = CHI_IPC->Send(put_task);
  put_future.Wait();

  // ---- GetBlob ----
  hipc::FullPtr<char> out = CHI_IPC->AllocateBuffer(blob_size);
  auto get_task = CHI_IPC->NewTask<wrp_cte::core::GetBlobTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(),
      result_tag_id,
      "my_blob",
      /*offset=*/0ULL,
      /*size=*/(chi::u64)blob_size,
      /*flags=*/0U,
      out.shm_.Cast());
  auto get_future = CHI_IPC->Send(get_task);
  get_future.Wait();
  // out.ptr_ now contains the retrieved data
}
```

### Routing from GPU Kernels

| PoolQuery | Direction | Description |
|-----------|-----------|-------------|
| `PoolQuery::ToLocalCpu()` | GPU to CPU | Task enters the `gpu2cpu_queue` and is processed by a CPU worker. Use this for CTE blob operations. |
| `PoolQuery::Local()` | GPU to GPU | Task stays on the GPU orchestrator. Use for GPU-native module methods. |
| `PoolQuery::LocalGpuBcast()` | CPU to GPU | CPU pushes a task to GPU orchestrator. Used from host code, not inside kernels. |

For CTE blob I/O, always use `PoolQuery::ToLocalCpu()` because the CTE
runtime (storage targets, metadata) runs on CPU workers.

### GPU Wait with Stop Flag

`future.Wait()` spins on the GPU until the CPU worker marks the future
complete. For robustness, use a manual poll loop with a CPU-controlled stop
flag so the host can terminate a stuck kernel:

```cpp
__global__ void safe_kernel(chi::IpcManagerGpuInfo gpu_info,
                            /* ... */,
                            volatile int *d_stop) {
  CHIMAERA_GPU_INIT(gpu_info);
  // ... NewTask + Send ... 
+ auto fshm_full = future.GetFutureShm(); + chi::FutureShm *fshm = fshm_full.ptr_; + while (fshm && !fshm->flags_.AnySystem(chi::FutureShm::FUTURE_COMPLETE)) { + // Bypass GPU L2 cache to see CPU-written stop flag + int stop = atomicAdd_system(const_cast(d_stop), 0); + if (stop) return; + HSHM_THREAD_MODEL->Yield(); + } +} +``` + +On the host side, allocate `d_stop` with `cudaMallocManaged` and set it to 1 +if the kernel times out. + +## Context Structure + +`wrp_cte::core::Context` controls compression and placement behavior. Pass a +default-constructed `Context()` for standard uncompressed I/O. + +Key fields: + +| Field | Default | Description | +|-------|---------|-------------| +| `dynamic_compress_` | 0 | 0=skip, 1=static, 2=dynamic compression | +| `compress_lib_` | 0 | Compression library index (0-10) | +| `compress_preset_` | 2 | 1=FAST, 2=BALANCED, 3=BEST | +| `min_persistence_level_` | 0 | 0=volatile, 1=temp-nonvolatile, 2=long-term | +| `score` (PutBlob param) | -1.0 | -1=auto, 0.0-1.0=explicit tier (higher=faster) | + +## CMake Integration + +GPU CTE kernels must be compiled with CUDA (`*.cu` files or +`set_source_files_properties(... LANGUAGE CUDA)`). Link against: + +```cmake +target_link_libraries(my_target + wrp_cte_core_client # CTE client library + chimaera_cxx # Chimaera runtime + hermes_shm_host # Shared memory primitives +) +``` + +Enable CUDA in the project: + +```cmake +enable_language(CUDA) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_ARCHITECTURES 70 80 90) +``` diff --git a/docs/sdk/context-transfer-engine/gpu-inf-mem.md b/docs/sdk/context-transfer-engine/gpu-inf-mem.md new file mode 100644 index 00000000..faf44158 --- /dev/null +++ b/docs/sdk/context-transfer-engine/gpu-inf-mem.md @@ -0,0 +1,161 @@ +# GPU Infinite Memory (UVM Backend) + +`GpuShmMmap` is IOWarp's **Unified Virtual Memory (UVM)** memory backend. 
+It allocates a single region with `cudaMallocManaged` so that both CPU +threads and GPU kernels can read and write the same bytes without any +explicit copy or synchronization primitive beyond a memory fence. + +This is the backbone of the GPU transport layer: `FutureShm` ring buffers, +task queues, and ArenaAllocator scratch regions all live in UVM memory so +a CPU worker and a GPU orchestrator kernel can exchange data at cache speed +rather than PCIe bandwidth. + +## Why UVM Instead of Pinned Memory + +Pinned host memory (`cudaMallocHost`) requires **system-scope atomics** +(`atomicAdd_system`, `atomicExch_system`) for GPU→CPU visibility. These +atomics in turn require `cudaDevAttrHostNativeAtomicSupported`, which is +`0` on most discrete/PCIe GPUs (e.g. RTX 4070, A100 over PCIe). Without +hardware support the operations silently fall back to slower paths and can +exhibit stale-read races. + +UVM with `cudaDevAttrConcurrentManagedAccess` (available on SM 6.0+ GPUs) +uses the GPU's hardware page-migration engine to maintain coherence, so: + +- **Standard device-scope atomics** on the GPU are visible to the CPU. +- **`std::atomic` on the CPU** is visible to the GPU. +- No `clflush`, write-combining flags, or `__threadfence_system` needed. + +| Feature | `cudaMallocHost` (pinned) | `cudaMallocManaged` (UVM) | +|---------|--------------------------|--------------------------| +| GPU→CPU atomics | `atomicAdd_system` required | device-scope sufficient | +| Hardware requirement | `HostNativeAtomicSupported` | `ConcurrentManagedAccess` | +| Cross-process sharing | Yes (via IPC handles) | No (single process only) | +| Free with | `cudaFreeHost` | `cudaFree` | +| Typical use | small control flags | ring buffers, allocators | + +## Header + +```cpp +#include +``` + +Requires `HSHM_ENABLE_CUDA=1` or `HSHM_ENABLE_ROCM=1`. 
+ +## Allocation + +```cpp +hipc::MemoryBackendId backend_id(100, 0); // (major, minor) — must be unique +hipc::GpuShmMmap backend; + +bool ok = backend.shm_init( + backend_id, + 32 * 1024 * 1024, // 32 MB total (headers + data) + "/my_uvm_region", // informational name — not a file path + 0); // GPU device ID (informational) + +assert(ok); +char *data = backend.data_; // usable data region +size_t capacity = backend.data_capacity_; // usable bytes (total − 8 KB headers) +``` + +`shm_init` calls `cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)` and +lays out the region as: + +``` +[ 4 KB backend header ][ 4 KB shared header ][ data ... ] +``` + +The two header pages are reserved for `MemoryBackendHeader` metadata; your +allocator or ring buffer should start from `data_`. + +## Passing to a GPU Kernel + +`GpuShmMmap` is a plain struct (no vtable, no host-only members) and can +be passed to a `__global__` kernel **by value** as part of +`IpcManagerGpuInfo`. The `data_` pointer is a CUDA-managed virtual +address valid on both host and device. + +```cpp +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = static_cast(backend); +gpu_info.gpu2cpu_backend = static_cast(g2c_backend); +// ... set queue pointers ... + +my_kernel<<>>(gpu_info, ...); +``` + +Inside the kernel, `CHIMAERA_GPU_ORCHESTRATOR_INIT` (or `CHIMAERA_GPU_INIT` +for client kernels) reconstructs the per-block ArenaAllocator from +`gpu_info.backend`, partitioning the UVM region evenly across blocks. + +## Registering with the IPC Manager + +On the host side, register the backend so the CPU-side IPC manager can +resolve allocator IDs returned by the GPU: + +```cpp +CHI_IPC->RegisterGpuAllocator( + backend_id, + backend.data_, + backend.data_capacity_); +``` + +This records a mapping from `backend_id` → `(data_, capacity)` so that +`FutureShm` offsets written by a GPU thread can be dereferenced by a CPU +worker without an extra copy. 
+ +## Memory Layout Inside the Region + +After calling `shm_init`, build an ArenaAllocator (or any other +`HSHM_DEFAULT_ALLOC_GPU_T`) directly on top of the UVM region: + +```cpp +// Host side — initialize allocator +auto *alloc = reinterpret_cast(backend.data_); +new (alloc) HSHM_DEFAULT_ALLOC_GPU_T(); +alloc->shm_init(backend_id, nullptr, backend.data_, backend.data_capacity_); +``` + +The same bytes are then accessible from the GPU kernel as the +`gpu_alloc_table_[thread_id]` entry set up by `CHIMAERA_GPU_ORCHESTRATOR_INIT`. + +## Destruction + +```cpp +backend.shm_destroy(); // calls cudaFree internally +``` + +Do **not** call `cudaFreeHost` — the backing memory was allocated with +`cudaMallocManaged`, not `cudaMallocHost`. + +`shm_attach` is intentionally unsupported; UVM allocations are +process-local and cannot be imported by another PID. + +## Limitations + +| Limitation | Detail | +|-----------|--------| +| Single-process only | No `shm_attach`; cannot share across PIDs | +| Minimum size | Enforced to 1 MB (`kMinBackendSize`) | +| Requires CUDA/ROCm | Compiled out when neither flag is set | +| SM requirement | `ConcurrentManagedAccess` needed for coherent access; available on SM 6.0+ (Pascal and newer) | + +## Relation to Other Backends + +| Backend | Memory type | GPU accessible | Cross-process | +|---------|------------|---------------|--------------| +| `MallocBackend` | Host heap | No | No | +| `PosixShmMmap` | POSIX shm | No | Yes | +| `GpuShmMmap` | UVM (managed) | Yes (coherent) | No | +| `GpuMalloc` | Device DRAM | Yes (device-scope) | Yes (IPC handles) | + +For tasks that must be visible to CPU workers, use `GpuShmMmap`. For +opaque device-side scratch memory shared across processes, use `GpuMalloc`. 
+ +## CMake + +```cmake +target_link_libraries(my_target hermes_shm_cuda) +target_compile_definitions(my_target PRIVATE HSHM_ENABLE_CUDA=1) +``` From 16fcae76bb6604ecc75728243d62c9f5cc358938 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 9 Mar 2026 23:09:30 +0000 Subject: [PATCH 5/5] Fix gpu-inf-mem.md: document wrp_cte_uvm demand-paging module Replace placeholder UVM backend doc with accurate documentation of the GpuVirtualMemoryManager class in context-transfer-engine/uvm. Covers: - GpuVmmConfig fields and defaults - init/destroy lifecycle - touchPage / touchRange / touchPageAsync demand page-in - evictPage / evictPageAsync page-out to host RAM or CTE blob store - prefetch_window auto-prefetch - state queries (isMapped, isEvictedToHost, getMappedPageCount, ...) - separate transfer/compute stream model - CTE blob store backing option - full end-to-end example - CMake integration and hardware requirements Co-Authored-By: Claude Sonnet 4.6 --- .../context-transfer-engine/gpu-inf-mem.md | 254 ++++++++++-------- 1 file changed, 144 insertions(+), 110 deletions(-) diff --git a/docs/sdk/context-transfer-engine/gpu-inf-mem.md b/docs/sdk/context-transfer-engine/gpu-inf-mem.md index faf44158..4494f031 100644 --- a/docs/sdk/context-transfer-engine/gpu-inf-mem.md +++ b/docs/sdk/context-transfer-engine/gpu-inf-mem.md @@ -1,161 +1,195 @@ -# GPU Infinite Memory (UVM Backend) +# GPU Infinite Memory (UVM) -`GpuShmMmap` is IOWarp's **Unified Virtual Memory (UVM)** memory backend. -It allocates a single region with `cudaMallocManaged` so that both CPU -threads and GPU kernels can read and write the same bytes without any -explicit copy or synchronization primitive beyond a memory fence. +The `wrp_cte_uvm` module provides a **software-managed GPU demand-paging +system** built on the CUDA Driver API's virtual memory management (VMM) +primitives. 
It lets you reserve an enormous virtual address space (up to +512 GB by default) on the GPU while backing only the pages that are +actually touched with physical device memory. Pages that have not been +accessed yet consume no physical memory; pages that are evicted are saved to +host RAM (or to a CTE blob store) and transparently restored on next access. -This is the backbone of the GPU transport layer: `FutureShm` ring buffers, -task queues, and ArenaAllocator scratch regions all live in UVM memory so -a CPU worker and a GPU orchestrator kernel can exchange data at cache speed -rather than PCIe bandwidth. +## Headers -## Why UVM Instead of Pinned Memory - -Pinned host memory (`cudaMallocHost`) requires **system-scope atomics** -(`atomicAdd_system`, `atomicExch_system`) for GPU→CPU visibility. These -atomics in turn require `cudaDevAttrHostNativeAtomicSupported`, which is -`0` on most discrete/PCIe GPUs (e.g. RTX 4070, A100 over PCIe). Without -hardware support the operations silently fall back to slower paths and can -exhibit stale-read races. +```cpp +#include +``` -UVM with `cudaDevAttrConcurrentManagedAccess` (available on SM 6.0+ GPUs) -uses the GPU's hardware page-migration engine to maintain coherence, so: +Link against `wrp_cte_uvm`. Requires `WRP_CORE_ENABLE_CUDA=ON`. -- **Standard device-scope atomics** on the GPU are visible to the CPU. -- **`std::atomic` on the CPU** is visible to the GPU. -- No `clflush`, write-combining flags, or `__threadfence_system` needed. 
+## Core Concepts -| Feature | `cudaMallocHost` (pinned) | `cudaMallocManaged` (UVM) | -|---------|--------------------------|--------------------------| -| GPU→CPU atomics | `atomicAdd_system` required | device-scope sufficient | -| Hardware requirement | `HostNativeAtomicSupported` | `ConcurrentManagedAccess` | -| Cross-process sharing | Yes (via IPC handles) | No (single process only) | -| Free with | `cudaFreeHost` | `cudaFree` | -| Typical use | small control flags | ring buffers, allocators | +| Concept | Detail | +|---------|--------| +| Virtual address space | Reserved once with `cuMemAddressReserve`; no physical cost | +| Page size | 2 MB default, auto-aligned to GPU hardware granularity | +| Physical backing | Allocated on-demand per page with `cuMemCreate` + `cuMemMap` | +| Eviction target | Host pinned RAM (`cudaMallocHost`) or CTE blob store | +| Thread safety | All public methods protected by `std::mutex` | +| Async support | Separate transfer and compute CUDA streams | -## Header +## Configuration ```cpp -#include +wrp_cte::uvm::GpuVmmConfig cfg; +cfg.va_size_bytes = 512ULL * 1024 * 1024 * 1024; // 512 GB virtual space +cfg.page_size = 2 * 1024 * 1024; // 2 MB pages +cfg.fill_value = 0; // newly-mapped page fill +cfg.device = 0; // CUDA device ordinal +cfg.prefetch_window = 0; // pages to auto-touch ahead +cfg.use_cte = false; // use CTE blob store for eviction ``` -Requires `HSHM_ENABLE_CUDA=1` or `HSHM_ENABLE_ROCM=1`. +All fields have defaults; a zero-initialized `GpuVmmConfig` is valid and +uses 512 GB / 2 MB pages / device 0 / host-RAM backing. 
-## Allocation +## Initialization and Teardown ```cpp -hipc::MemoryBackendId backend_id(100, 0); // (major, minor) — must be unique -hipc::GpuShmMmap backend; - -bool ok = backend.shm_init( - backend_id, - 32 * 1024 * 1024, // 32 MB total (headers + data) - "/my_uvm_region", // informational name — not a file path - 0); // GPU device ID (informational) - -assert(ok); -char *data = backend.data_; // usable data region -size_t capacity = backend.data_capacity_; // usable bytes (total − 8 KB headers) +wrp_cte::uvm::GpuVirtualMemoryManager vmm; + +vmm.init(cfg); // reserve VA, create streams, verify hardware granularity +// ... use vmm ... +vmm.destroy(); // unmap all pages, free host backing, release VA range ``` -`shm_init` calls `cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)` and -lays out the region as: +`init` validates that `page_size` is a multiple of the GPU's hardware +allocation granularity (queried with `cuMemGetAllocationGranularity`). It +fails if the device does not support virtual memory management. -``` -[ 4 KB backend header ][ 4 KB shared header ][ data ... ] -``` +## Demand Paging — Page In -The two header pages are reserved for `MemoryBackendHeader` metadata; your -allocator or ring buffer should start from `data_`. +```cpp +// Touch a single 2 MB page (zero-based page index) +vmm.touchPage(page_idx); -## Passing to a GPU Kernel +// Touch all pages that cover a byte range +vmm.touchRange(byte_offset, byte_length); -`GpuShmMmap` is a plain struct (no vtable, no host-only members) and can -be passed to a `__global__` kernel **by value** as part of -`IpcManagerGpuInfo`. The `data_` pointer is a CUDA-managed virtual -address valid on both host and device. 
+// Non-blocking touch on the internal transfer stream +vmm.touchPageAsync(page_idx); +vmm.syncTransfer(); // wait for async touches +``` + +On first touch, `touchPage` calls `cuMemCreate` to allocate a 2 MB physical +chunk, maps it into the reserved VA with `cuMemMap` + `cuMemSetAccess`, then +launches a fill kernel. If the page was previously evicted, the saved host +buffer (or CTE blob) is copied back to the device instead. + +## Eviction — Page Out ```cpp -chi::IpcManagerGpuInfo gpu_info; -gpu_info.backend = static_cast(backend); -gpu_info.gpu2cpu_backend = static_cast(g2c_backend); -// ... set queue pointers ... +// Evict a single page to host RAM (synchronous D2H copy) +vmm.evictPage(page_idx); -my_kernel<<>>(gpu_info, ...); +// Async eviction — D2H copy queued on transfer stream +vmm.evictPageAsync(page_idx); +vmm.syncTransfer(); ``` -Inside the kernel, `CHIMAERA_GPU_ORCHESTRATOR_INIT` (or `CHIMAERA_GPU_INIT` -for client kernels) reconstructs the per-block ArenaAllocator from -`gpu_info.backend`, partitioning the UVM region evenly across blocks. +Eviction: +1. Copies the 2 MB page to a `cudaMallocHost` buffer (or `AsyncPutBlob` + when `use_cte=true`). +2. Unmaps the page from the VA range (`cuMemUnmap`). +3. Releases the physical allocation (`cuMemRelease`). -## Registering with the IPC Manager +After eviction the virtual address is still valid but accessing it from a +GPU kernel will fault. The next `touchPage` call restores the data. 
-On the host side, register the backend so the CPU-side IPC manager can -resolve allocator IDs returned by the GPU: +## Prefetching + +Set `cfg.prefetch_window = N` to automatically touch the next `N` pages +whenever a page is touched: ```cpp -CHI_IPC->RegisterGpuAllocator( - backend_id, - backend.data_, - backend.data_capacity_); +cfg.prefetch_window = 3; // touching page P also touches P+1, P+2, P+3 async ``` -This records a mapping from `backend_id` → `(data_, capacity)` so that -`FutureShm` offsets written by a GPU thread can be dereferenced by a CPU -worker without an extra copy. - -## Memory Layout Inside the Region +Prefetch touches are issued on the transfer stream and do not block the +caller. -After calling `shm_init`, build an ArenaAllocator (or any other -`HSHM_DEFAULT_ALLOC_GPU_T`) directly on top of the UVM region: +## Querying State ```cpp -// Host side — initialize allocator -auto *alloc = reinterpret_cast(backend.data_); -new (alloc) HSHM_DEFAULT_ALLOC_GPU_T(); -alloc->shm_init(backend_id, nullptr, backend.data_, backend.data_capacity_); +void *base = vmm.getBasePtr(); // VA range start +size_t page_size = vmm.getPageSize(); +size_t total = vmm.getTotalPages(); +size_t mapped = vmm.getMappedPageCount(); // pages with physical backing +size_t evicted = vmm.getEvictedPageCount(); // pages saved to host RAM + +bool is_mapped = vmm.isMapped(page_idx); +bool is_evicted = vmm.isEvictedToHost(page_idx); + +// Device pointer to the start of page N +void *page_ptr = vmm.getPagePtr(page_idx); ``` -The same bytes are then accessible from the GPU kernel as the -`gpu_alloc_table_[thread_id]` entry set up by `CHIMAERA_GPU_ORCHESTRATOR_INIT`. +## CTE Blob Store Backing + +When `cfg.use_cte = true` the module uses CTE `AsyncPutBlob` / `AsyncGetBlob` +for eviction instead of host pinned RAM. This lets evicted pages survive +process restart and be loaded from a persistent storage tier. + +Requires the CTE pool to be initialized before calling `vmm.init()`. 
The +module creates one blob per page named by its index. -## Destruction +## Stream Management ```cpp -backend.shm_destroy(); // calls cudaFree internally +cudaStream_t xfer = vmm.getTransferStream(); // D2H / H2D copies +cudaStream_t comp = vmm.getComputeStream(); // kernel launches + +vmm.syncTransfer(); // cudaStreamSynchronize(transfer_stream_) +vmm.syncCompute(); // cudaStreamSynchronize(compute_stream_) ``` -Do **not** call `cudaFreeHost` — the backing memory was allocated with -`cudaMallocManaged`, not `cudaMallocHost`. +Use `getComputeStream()` for kernels that read or write demand-paged memory +so they are ordered after any in-flight page-in operations on the transfer +stream. + +## Full Example + +```cpp +#include + +// 1. Configure a 4 GB virtual address space with 2 MB pages +wrp_cte::uvm::GpuVmmConfig cfg; +cfg.va_size_bytes = 4ULL * 1024 * 1024 * 1024; +cfg.prefetch_window = 2; -`shm_attach` is intentionally unsupported; UVM allocations are -process-local and cannot be imported by another PID. +wrp_cte::uvm::GpuVirtualMemoryManager vmm; +vmm.init(cfg); -## Limitations +// 2. Touch page 0 — physical memory allocated, filled with cfg.fill_value +vmm.touchPage(0); -| Limitation | Detail | -|-----------|--------| -| Single-process only | No `shm_attach`; cannot share across PIDs | -| Minimum size | Enforced to 1 MB (`kMinBackendSize`) | -| Requires CUDA/ROCm | Compiled out when neither flag is set | -| SM requirement | `ConcurrentManagedAccess` needed for coherent access; available on SM 6.0+ (Pascal and newer) | +// 3. Write custom data via a GPU kernel +int *page0 = reinterpret_cast(vmm.getPagePtr(0)); +writeKernel<<<1, 256, 0, vmm.getComputeStream()>>>(page0, 42, vmm.getPageSize()); +vmm.syncCompute(); -## Relation to Other Backends +// 4. 
Evict page 0 to free device memory +vmm.evictPage(0); +// physical memory is now released; VA still valid -| Backend | Memory type | GPU accessible | Cross-process | -|---------|------------|---------------|--------------| -| `MallocBackend` | Host heap | No | No | -| `PosixShmMmap` | POSIX shm | No | Yes | -| `GpuShmMmap` | UVM (managed) | Yes (coherent) | No | -| `GpuMalloc` | Device DRAM | Yes (device-scope) | Yes (IPC handles) | +// 5. Re-touch page 0 — data restored from host backing +vmm.touchPage(0); +// page0 now contains the data written in step 3 -For tasks that must be visible to CPU workers, use `GpuShmMmap`. For -opaque device-side scratch memory shared across processes, use `GpuMalloc`. +vmm.destroy(); +``` -## CMake +## CMake Integration ```cmake -target_link_libraries(my_target hermes_shm_cuda) -target_compile_definitions(my_target PRIVATE HSHM_ENABLE_CUDA=1) +find_package(wrp_cte REQUIRED) + +target_link_libraries(my_target PRIVATE wrp_cte_uvm) +enable_language(CUDA) +set_target_properties(my_target PROPERTIES CUDA_SEPARABLE_COMPILATION ON) ``` + +## Hardware Requirements + +- CUDA 10.2+ (virtual memory management API) +- GPU with driver-side virtual memory support (`CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`) +- SM 7.0+ (Volta) recommended for best performance