From b60fa050e7f41bee5640c8c00c0cfd557bf38434 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 3 Mar 2026 05:55:13 +0000 Subject: [PATCH 1/5] Add GPU client kernels documentation Document how CUDA/ROCm kernels interact with the Chimaera runtime, including task definition, client API, host-side setup, configuration parameters, and troubleshooting. Co-Authored-By: Claude Sonnet 4.6 --- docs/sdk/context-runtime/6.gpu_clients.md | 399 ++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 docs/sdk/context-runtime/6.gpu_clients.md diff --git a/docs/sdk/context-runtime/6.gpu_clients.md b/docs/sdk/context-runtime/6.gpu_clients.md new file mode 100644 index 00000000..2abe9080 --- /dev/null +++ b/docs/sdk/context-runtime/6.gpu_clients.md @@ -0,0 +1,399 @@ +--- +sidebar_position: 6 +title: GPU Client Kernels +description: How CUDA/ROCm kernels submit tasks to the Chimaera runtime and receive results. +--- + +# GPU Client Kernels + +This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code. + +## Overview + +The Chimaera runtime supports **GPU-initiated task submission**: a CUDA/ROCm kernel running on the GPU can create a task, send it through a shared-memory ring buffer to a CPU worker, and wait for the result — all without returning to host code. + +The communication path is: + +``` +GPU Kernel CPU Worker + │ │ + │ 1. NewTask (GPU allocator) │ + │ 2. Send (SPSC ring buffer →) ───> dequeue + deserialize + │ │ 3. Execute task + │ 4. Wait (← SPSC ring buffer) <─── serialize output + FUTURE_COMPLETE + │ 5. Read result │ +``` + +Both directions use a **SPSC (single-producer single-consumer) ring buffer** embedded in a `FutureShm` structure that lives in GPU-registered pinned shared memory. System-scope atomics ensure visibility between GPU and CPU. 
+ +## Prerequisites + +### CMake flags + +Enable GPU support when configuring the build: + +```bash +# CUDA +cmake .. -DWRP_CORE_ENABLE_CUDA=ON + +# ROCm +cmake .. -DWRP_CORE_ENABLE_ROCM=ON +``` + +These set the `HSHM_ENABLE_CUDA` or `HSHM_ENABLE_ROCM` preprocessor macros, which gate all GPU code paths. + +### Required headers + +```cpp +#include +#include +#include +#include +#include + +// Your module's client and task headers +#include +#include +``` + +## Writing a GPU-Compatible Task + +A task that can be created and submitted from a GPU kernel must: + +1. Mark constructors and serialization methods with `HSHM_CROSS_FUN` +2. Provide `SerializeIn` and `SerializeOut` methods +3. Use only GPU-safe types (no `std::string`, `std::vector`, etc.) + +```cpp +struct MyGpuTask : public chi::Task { + IN chi::u32 input_value_; + INOUT chi::u32 result_; + + HSHM_CROSS_FUN MyGpuTask() : chi::Task(), input_value_(0), result_(0) {} + + HSHM_CROSS_FUN explicit MyGpuTask( + const chi::TaskId& task_id, + const chi::PoolId& pool_id, + const chi::PoolQuery& query, + chi::u32 input_value) + : chi::Task(task_id, pool_id, query, kMethodId), + input_value_(input_value), result_(0) { + task_id_ = task_id; + pool_id_ = pool_id; + method_ = kMethodId; + task_flags_.Clear(); + pool_query_ = query; + } + + // Serialization: fields sent GPU → CPU + template + HSHM_CROSS_FUN void SerializeIn(Archive& ar) { + Task::SerializeIn(ar); + ar(input_value_, result_); + } + + // Serialization: fields sent CPU → GPU (after execution) + template + HSHM_CROSS_FUN void SerializeOut(Archive& ar) { + Task::SerializeOut(ar); + ar(result_); + } +}; +``` + +**Important**: Every function called during serialization must be `HSHM_CROSS_FUN` (i.e., `__host__ __device__`). Without this annotation, NVCC's SFINAE-based dispatch silently skips serialization on the GPU, producing corrupted data with no compiler error. 
+## Writing a GPU-Compatible Client
+
+The client class needs `HSHM_CROSS_FUN` on the constructor and any methods called from GPU code:
+
+```cpp
+class Client : public chi::ContainerClient {
+ public:
+  HSHM_CROSS_FUN Client() = default;
+  HSHM_CROSS_FUN explicit Client(const chi::PoolId& pool_id) { Init(pool_id); }
+
+  // Host-only methods (AsyncCreate, etc.) stay as-is...
+
+  // GPU-callable task submission
+  HSHM_CROSS_FUN
+  chi::Future<MyGpuTask> AsyncMyTask(const chi::PoolQuery& query,
+                                     chi::u32 input_value) {
+    auto* ipc = CHI_IPC;
+    auto task = ipc->NewTask<MyGpuTask>(
+        chi::CreateTaskId(), pool_id_, query, input_value);
+    return ipc->Send(task);
+  }
+};
+```
+
+`CHI_IPC->Send()` automatically dispatches to the GPU ring-buffer path (`SendGpu`) when compiled for device code.
+
+## GPU Kernel Implementation
+
+### The CHIMAERA_GPU_INIT Macro
+
+Every GPU kernel that interacts with the runtime must call `CHIMAERA_GPU_INIT` at the top. This macro:
+
+1. Creates a per-block `IpcManager` singleton in `__shared__` memory
+2. Initializes per-thread `ArenaAllocator` instances from the GPU memory backend
+3. 
Synchronizes all threads in the block via `__syncthreads()` + +```cpp +__global__ void my_kernel(chi::IpcManagerGpu gpu_info, + chi::PoolId pool_id, + chi::u32 input, + int* d_result, + chi::u32* d_output) { + // Initialize per-block IPC manager (required) + CHIMAERA_GPU_INIT(gpu_info); + + // Use the client API — same code as host + chimaera::my_module::Client client(pool_id); + auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); + future.Wait(); + + // Read result + *d_output = future->result_; + *d_result = 1; // success +} +``` + +### Memory Layout + +`CHIMAERA_GPU_INIT` partitions the GPU memory backend as: + +``` + ┌──────────────────────────────────────────────────────┐ + │ Pointer table (num_threads * sizeof(ptr)) │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator 0 (per_thread_size bytes) │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator 1 (per_thread_size bytes) │ + ├──────────────────────────────────────────────────────┤ + │ ... │ + ├──────────────────────────────────────────────────────┤ + │ ArenaAllocator N-1 │ + └──────────────────────────────────────────────────────┘ +``` + +Each `ArenaAllocator` is a bump-pointer allocator that provides fast, lock-free allocation for its thread. All `ShmPtr` offsets are relative to the backend base so they can be resolved by the CPU worker. + +## Host-Side Setup + +The host code must: + +1. Start the Chimaera runtime +2. Create the module's container (pool) +3. Allocate GPU-registered pinned shared memory +4. Create a GPU task queue and register it with the runtime +5. Register the GPU memory backend for host-side ShmPtr resolution +6. Launch the kernel + +### Complete Host-Side Example + +```cpp +#include +#include +#include + +int run_gpu_test(chi::PoolId pool_id, chi::u32 test_value, + chi::u32* out_result) { + // Increase GPU stack for deep template call chains + cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB + + // 1. 
Create GPU memory backend (pinned shared memory) + hipc::MemoryBackendId backend_id(3, 0); + size_t gpu_memory_size = 10 * 1024 * 1024; // 10 MB + hipc::GpuShmMmap gpu_backend; + if (!gpu_backend.shm_init(backend_id, gpu_memory_size, "/gpu_test", 0)) + return -100; + + // 2. Create GPU task queue (pinned shared memory) + hipc::MemoryBackendId queue_backend_id(4, 0); + hipc::GpuShmMmap queue_backend; + if (!queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, + "/gpu_queue", 0)) + return -101; + + // 3. Create allocator and task queue in the queue backend + auto* queue_alloc = queue_backend.MakeAlloc>( + queue_backend.data_capacity_); + auto gpu_queue_ptr = queue_alloc->NewObj( + queue_alloc, 1, 2, 1024); // 1 lane, 2 priorities, depth 1024 + + // 4. Register queue and assign lanes to the GPU worker + CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); + CHI_IPC->AssignGpuLanesToWorker(); + + // 5. Register GPU backend for host-side ShmPtr resolution + CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + + // 6. 
Build IpcManagerGpu and launch kernel + chi::IpcManagerGpu gpu_info(gpu_backend, gpu_queue_ptr.ptr_); + + int* d_result = hshm::GpuApi::Malloc(sizeof(int)); + chi::u32* d_out = hshm::GpuApi::Malloc(sizeof(chi::u32)); + int h_result = 0; + chi::u32 h_out = 0; + hshm::GpuApi::Memcpy(d_result, &h_result, sizeof(int)); + hshm::GpuApi::Memcpy(d_out, &h_out, sizeof(chi::u32)); + + my_kernel<<<1, 1>>>(gpu_info, pool_id, test_value, d_result, d_out); + + cudaDeviceSynchronize(); + hshm::GpuApi::Memcpy(&h_result, d_result, sizeof(int)); + hshm::GpuApi::Memcpy(&h_out, d_out, sizeof(chi::u32)); + + *out_result = h_out; + hshm::GpuApi::Free(d_result); + hshm::GpuApi::Free(d_out); + return h_result; +} +``` + +### Key Registration Functions + +| Function | Purpose | +|---|---| +| `CHI_IPC->RegisterGpuQueue(queue)` | Registers a GPU task queue so the runtime can poll it | +| `CHI_IPC->AssignGpuLanesToWorker()` | Assigns all registered GPU queue lanes to the designated GPU worker thread | +| `CHI_IPC->RegisterGpuAllocator(id, data, capacity)` | Registers GPU backend memory so the host can resolve `ShmPtr`s allocated by the GPU kernel | + +## Configuration Parameters + +### GPU Stack Size + +GPU kernels that use the client API involve deep template instantiation chains (serialization, ring buffer, allocator). The default CUDA stack size (1024 bytes) is insufficient. + +```cpp +cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB recommended +``` + +Symptoms of stack overflow: silent kernel completion with corrupted or missing data (no error reported by `cudaGetLastError`). + +### Ring Buffer Size (Copy Space) + +The SPSC ring buffer size determines how much serialized data can be in-flight between GPU and CPU. This is set per-task: + +```cpp +// Inside your task class, override GetCopySpaceSize: +HSHM_CROSS_FUN size_t GetCopySpaceSize() const { return 8192; } +``` + +The default is **4096 bytes** if `GetCopySpaceSize()` returns 0. 
The `FutureShm` is allocated as `sizeof(FutureShm) + copy_space_size`, with separate `input_` and `output_` ring buffers sharing the same copy space. + +For tasks with large serialized payloads, increase this value. If the ring buffer is too small, the producer will spin-wait for the consumer to drain data. + +### GPU Memory Backend Size + +The GPU memory backend is partitioned equally among all threads in a block. Size it based on: + +- Number of threads per block +- Per-thread allocation needs (tasks, serialization buffers, FutureShm) + +```cpp +// Example: 10 MB for a single-thread kernel +size_t gpu_memory_size = 10 * 1024 * 1024; +``` + +A single task submission typically consumes: +- `sizeof(TaskT)` for the task object +- `sizeof(FutureShm) + copy_space_size` for the future +- Temporary serialization buffer (~copy_space_size) + +### GPU Task Queue Configuration + +```cpp +// Parameters: allocator, num_lanes, num_priorities, depth +queue_alloc->NewObj(queue_alloc, 1, 2, 1024); +``` + +| Parameter | Default | Description | +|---|---|---| +| `num_lanes` | 1 | Number of queue lanes (1 per GPU is typical) | +| `num_priorities` | 2 | Priority levels per lane | +| `depth` | 1024 | Maximum tasks per lane before blocking | + +### Queue Backend Size + +The queue backend holds the `TaskQueue` data structure and its internal lane storage: + +```cpp +size_t queue_memory_size = 2 * 1024 * 1024; // 2 MB is sufficient for most use cases +``` + +### Runtime Thread Configuration + +The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. The scheduler assigns a dedicated GPU worker thread to poll GPU lanes. + +In the runtime YAML configuration, ensure enough worker threads are configured: + +```yaml +# At minimum: N task workers + 1 network worker + 1 GPU worker +num_threads: 4 +``` + +The scheduler's `DivideWorkers` method partitions workers into task, network, and GPU roles. 
With `num_threads: 4`, you get 2 task workers, 1 network worker, and 1 GPU worker (the GPU worker is selected from the task worker pool by the scheduler). + +## Data Flow Details + +### Send Path (GPU → CPU) + +1. **NewTask**: Allocates the task from the per-thread `ArenaAllocator` +2. **SendGpu**: Allocates a `FutureShm` with embedded ring buffer copy space +3. **Enqueue**: Pushes the `Future` onto the GPU task queue lane (the CPU worker polls this) +4. **Serialize + Send**: Builds a `LocalSaveTaskArchive`, calls `task->SerializeIn(ar)`, then streams the archive through `ShmTransport::Send` into the input ring buffer + +### Receive Path (CPU → GPU) + +1. **CPU worker** dequeues the `FutureShm`, reads from the input ring buffer via `ShmTransport::Recv`, deserializes, executes the task, then writes output back through the output ring buffer +2. **RecvGpu**: On the GPU, `Future::Wait()` calls `RecvGpu` which reads from the output ring buffer via `ShmTransport::Recv` +3. **Deserialize**: A `LocalLoadTaskArchive` deserializes the output fields via `task->SerializeOut(ar)` +4. **Completion**: The GPU spins on `FUTURE_COMPLETE` flag (should already be set after Recv completes) + +### Wire Format + +GPU and CPU use compatible but different archive types: + +| Side | Serialization | Deserialization | +|---|---|---| +| GPU (send) | `LocalSaveTaskArchive` | `LocalLoadTaskArchive` | +| CPU (worker) | `SaveTaskArchive` | `LoadTaskArchive` | + +Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical — if the formats diverge, deserialization will produce corrupted data. + +## Troubleshooting + +### CUDA Error 700 (Illegal Memory Access) + +Usually caused by GPU stack overflow during deep serialization template chains. 
Increase the stack size: +```cpp +cudaDeviceSetLimit(cudaLimitStackSize, 131072); +``` + +### Silent data corruption (kernel completes but wrong values) + +Check that **all** serialization functions are marked `HSHM_CROSS_FUN`. Without `__device__` annotation, NVCC's SFINAE dispatch silently skips serialization, consuming 0 bytes and misaligning the stream. The compiler produces no error. + +### Ring buffer deadlock (kernel hangs) + +The GPU producer and CPU consumer share the ring buffer. If the buffer fills, the producer spins waiting for the consumer. Ensure: +- The CPU worker is running and polling the GPU queue lanes +- `RegisterGpuQueue` and `AssignGpuLanesToWorker` were called before the kernel launch +- The copy space is large enough for the serialized payload + +### Kernel launch failure (-201) + +Check `cudaGetLastError()` after the kernel launch. Common causes: +- Too many threads per block (ArenaAllocator table won't fit in memory) +- Missing `CHIMAERA_GPU_INIT` (uninitialized shared memory access) + +### ShmPtr resolution failure on host + +The CPU worker must be able to resolve `ShmPtr`s that point into GPU-allocated pinned memory. Call `RegisterGpuAllocator` before launching the kernel: +```cpp +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); +``` From c7d6c3fdccae87a2036418a8d420c4ba4377cab2 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Tue, 3 Mar 2026 05:56:26 +0000 Subject: [PATCH 2/5] Rename base-modules from sidebar position 6 to 20 Move base-modules section after the new GPU clients page to maintain logical sidebar ordering. 
Co-Authored-By: Claude Sonnet 4.6 --- .../{6.base-modules => 20.base-modules}/1.admin.md | 0 .../context-runtime/{6.base-modules => 20.base-modules}/2.bdev.md | 0 .../{6.base-modules => 20.base-modules}/3.MOD_NAME.md | 0 .../{6.base-modules => 20.base-modules}/_category_.json | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/1.admin.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/2.bdev.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/3.MOD_NAME.md (100%) rename docs/sdk/context-runtime/{6.base-modules => 20.base-modules}/_category_.json (100%) diff --git a/docs/sdk/context-runtime/6.base-modules/1.admin.md b/docs/sdk/context-runtime/20.base-modules/1.admin.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/1.admin.md rename to docs/sdk/context-runtime/20.base-modules/1.admin.md diff --git a/docs/sdk/context-runtime/6.base-modules/2.bdev.md b/docs/sdk/context-runtime/20.base-modules/2.bdev.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/2.bdev.md rename to docs/sdk/context-runtime/20.base-modules/2.bdev.md diff --git a/docs/sdk/context-runtime/6.base-modules/3.MOD_NAME.md b/docs/sdk/context-runtime/20.base-modules/3.MOD_NAME.md similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/3.MOD_NAME.md rename to docs/sdk/context-runtime/20.base-modules/3.MOD_NAME.md diff --git a/docs/sdk/context-runtime/6.base-modules/_category_.json b/docs/sdk/context-runtime/20.base-modules/_category_.json similarity index 100% rename from docs/sdk/context-runtime/6.base-modules/_category_.json rename to docs/sdk/context-runtime/20.base-modules/_category_.json From 72d7a1a70d65c28c89696b37fa74ebd7113326af Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Sun, 8 Mar 2026 02:06:41 +0000 Subject: [PATCH 3/5] Update GPU client docs: dual-allocator, corrected API names, performance 
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Update all queue field names to current API (cpu2gpu_queue, gpu2cpu_queue,
  gpu2gpu_queue) throughout examples and narrative
- Update ClientConnectTask handshake table to new field names
- Add gpu_heap_backend (GpuMalloc, 9000+gpu_id) and gpu2gpu device-memory
  backend (3000+gpu_id) to server backend table and memory layout diagrams
- Replace single-ArenaAllocator description with dual-allocator architecture:
  ArenaAllocator (HSHM_DEFAULT_ALLOC_GPU_T, primary bump-pointer) +
  BuddyAllocator (CHI_GPU_HEAP_T, serialization heap with individual free)
- Document CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks) macro for
  multi-block client kernels; CHI_CLIENT_GPU_INIT alias
- Add GetClientGpuInfo(gpu_id) to host-side setup — fills all
  IpcManagerGpuInfo fields automatically for same-process kernel launches
- Add Performance section: ~200 µs BuddyAllocator vs ~400 µs device malloc,
  corrected latency measurement explanation, arena-reset semantics
- Update server phase-1 init sequence for new backends and queue layout
- Update GPU memory backend size guidance for primary arena vs heap backends

Co-Authored-By: Claude Sonnet 4.6
---
 docs/sdk/context-runtime/6.gpu_clients.md | 1160 ++++++++++++++++++---
 1 file changed, 1040 insertions(+), 120 deletions(-)

diff --git a/docs/sdk/context-runtime/6.gpu_clients.md b/docs/sdk/context-runtime/6.gpu_clients.md
index 2abe9080..1dafbf4c 100644
--- a/docs/sdk/context-runtime/6.gpu_clients.md
+++ b/docs/sdk/context-runtime/6.gpu_clients.md
@@ -6,25 +6,491 @@ description: How CUDA/ROCm kernels submit tasks to the Chimaera runtime and rece
 # GPU Client Kernels
 
-This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code.
+This guide explains how CUDA or ROCm kernels can submit tasks to the Chimaera runtime and receive results, using the same client API as host code. 
It also covers **cross-device routing**: CPU code sending tasks to run on the GPU, and GPU kernels sending tasks to run on the CPU. ## Overview -The Chimaera runtime supports **GPU-initiated task submission**: a CUDA/ROCm kernel running on the GPU can create a task, send it through a shared-memory ring buffer to a CPU worker, and wait for the result — all without returning to host code. +The Chimaera runtime supports four GPU task routing modes: -The communication path is: +| Routing Mode | Direction | Description | +|---|---|---| +| `PoolQuery::Local()` | GPU → GPU | GPU kernel submits a task processed by the GPU work orchestrator | +| `PoolQuery::LocalGpuBcast()` | CPU → GPU | CPU submits a task processed by the GPU work orchestrator | +| `PoolQuery::ToLocalGpu(gpu_id)` | CPU → GPU | CPU submits a task to a specific GPU device | +| `PoolQuery::ToLocalCpu()` | GPU → CPU | GPU kernel submits a task processed by a CPU worker | + +All paths use the same `client.AsyncMyTask(query, ...)` API. The `IpcManager::Send()` method detects the routing mode and dispatches to the correct transport automatically. 
+ +### Architecture + +``` +CPU Host GPU Work Orchestrator + | | + | AsyncGpuSubmit(LocalGpuBcast()) | + | -> Send() detects LocalGpuBcast | + | -> SendToGpu(): serialize + push cpu2gpu_queue ---> | pop + deserialize + | | dispatch to GpuRuntime + | Wait(): spin on FUTURE_COMPLETE | serialize output + | <- deserialize output from FutureShm <--- | threadfence_system + FUTURE_COMPLETE + | | +GPU Kernel CPU Worker + | | + | AsyncGpuSubmit(ToLocalCpu()) | + | -> SendGpu(): serialize + push gpu2cpu_queue ---> | dequeue + deserialize + | | execute task + | Wait(): RecvGpu reads output ring buffer <--- | serialize output + FUTURE_COMPLETE + | | +GPU Kernel GPU Work Orchestrator + | | + | AsyncGpuSubmit(Local()) | + | -> SendGpuLocal(): push gpu2gpu_queue ---> | pop + deserialize + | | dispatch to GpuRuntime + | Wait(): RecvGpu reads output ring buffer <--- | serialize output + FUTURE_COMPLETE +``` + +## Runtime vs. Client Process Execution Modes + +The Chimaera runtime supports two process configurations for GPU task submission: + +| Mode | Init Call | GPU Resources | Use Case | +|---|---|---|---| +| **In-process runtime** | `CHIMAERA_INIT(kClient, true)` | Full: all GPU queues, orchestrator, GPU memory backends | Single process with embedded server | +| **Separate client process** | `CHIMAERA_INIT(kClient, false)` | None: no GPU queues or orchestrator | Client connects to standalone server | + +### In-Process Runtime (Embedded Server) + +When `CHIMAERA_INIT(kClient, true)` is called, the process runs both client and server: + +```cpp +// One process does everything +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, true); + +chimaera::my_module::Client client(pool_id); +client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id).Wait(); + +// All GPU routing modes work: +// Local(), LocalGpuBcast(), ToLocalGpu(), ToLocalCpu() +auto future = client.AsyncGpuSubmit(chi::PoolQuery::LocalGpuBcast(), 0, value); +future.Wait(); +``` + +**What gets initialized:** + +1. 
**`ServerInit()`** runs in sequence: + - `IpcManager::ServerInit()` → `ServerInitGpuQueues()` creates all GPU queue infrastructure + - `WorkOrchestrator::Init()` + `StartWorkers()` launches worker threads + - `PoolManager::ServerInit()` initializes pool management + - `LaunchGpuOrchestrator()` launches persistent GPU kernel (deferred until after pools are composed) + - `StartLocalServer()` opens the SHM segment for external clients +2. **`ClientInit()`** initializes the client-side IPC connection + +All four GPU routing modes are available because the process owns the GPU queues and orchestrator. + +### Separate Client Process + +When `CHIMAERA_INIT(kClient, false)` is called, the process is a pure client. During the `ClientConnect` handshake, the server transmits GPU queue metadata (offsets, backend sizes, IPC handles) back to the client. If the client binary was compiled with CUDA/ROCm support, `ClientInitGpuQueues()` attaches to the server's GPU queue backends and reconstructs local queue pointers, enabling direct GPU task submission. + +```cpp +// Server is a separate process +// Client connects via SHM, TCP, or IPC transport +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, false); + +chimaera::my_module::Client client(pool_id); +client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id).Wait(); + +// PoolQuery::Local() always works (routed to server's CPU worker via SHM/TCP/IPC) +auto future = client.AsyncGpuSubmit(chi::PoolQuery::Local(), 0, value); +future.Wait(); +// Server's CPU handler executes: result = test_value * 2 + gpu_id + +// With GPU support compiled in, LocalGpuBcast also works (direct GPU queue push) +auto gpu_future = client.AsyncGpuSubmit(chi::PoolQuery::LocalGpuBcast(), 0, value); +gpu_future.Wait(); +// Server's GPU handler executes: result = test_value * 2 + gpu_id +``` + +**What gets initialized:** + +1. 
**`ClientInit()` only** — no `ServerInit()`: + - `ConfigManager::Init()` reads `chimaera.yaml` + - `IpcManager::ClientInit()` connects to the server via SHM, TCP, or IPC + - Admin client singleton is created +2. **GPU queue attachment** (if compiled with CUDA/ROCm): + - `ClientConnect` response carries GPU queue offsets, backend sizes, and IPC handles + - `ClientInitGpuQueues()` attaches to each GPU's `GpuShmMmap` queue backend via `shm_attach()` + - `cpu2gpu_queues_`, `gpu2gpu_queues_` are reconstructed from the received offsets + - `RegisterGpuAllocator()` is called so the client can resolve `ShmPtr`s into queue memory +3. **No GPU orchestrator**: `gpu_orchestrator_` is `nullptr` (the server runs the orchestrator) + +**Available routing:** + +| Routing Mode | Works? | Why | +|---|---|---| +| `PoolQuery::Local()` | Yes | Goes through `SendShm()` or `SendZmq()` to server's CPU worker | +| `PoolQuery::LocalGpuBcast()` | Yes (with GPU) | `SendToGpu()` pushes to attached `cpu2gpu_queue` → server's GPU orchestrator | +| `PoolQuery::ToLocalGpu(id)` | Yes (with GPU) | Same path, targeting a specific GPU device | +| `PoolQuery::ToLocalCpu()` | No | Only callable from GPU device code | + +Without GPU support compiled in, `LocalGpuBcast` and `ToLocalGpu` return empty futures (no `cpu2gpu_queues_` attached). Use `PoolQuery::Local()` as a fallback — the server's CPU handler executes the task instead. 
+ +#### ClientConnect GPU Queue Handshake + +The `ClientConnectTask` response includes these GPU-specific fields: + +| Field | Type | Description | +|---|---|---| +| `num_gpus_` | `u32` | Number of GPU devices on the server | +| `cpu2gpu_queue_off_[8]` | `u64[8]` | Byte offsets of `cpu2gpu_queue` within each GPU's `GpuShmMmap` | +| `gpu2cpu_queue_off_[8]` | `u64[8]` | Byte offsets of `gpu2cpu_queue` (GPU→CPU) | +| `gpu2gpu_queue_off_[8]` | `u64[8]` | Byte offsets of `gpu2gpu_queue` | +| `gpu_queue_backend_size_[8]` | `u64[8]` | Size of each GPU's `GpuShmMmap` backend | +| `gpu_queue_depth_` | `u32` | Queue depth (number of slots) | + +The client uses these offsets to reconstruct `TaskQueue` pointers. The `cpu2gpu` and `gpu2cpu` queues reside in the pinned host `GpuShmMmap` backend; the `gpu2gpu` queue resides in device memory. A single `shm_attach()` per GPU attaches the pinned backend; the `gpu2gpu` backend is attached separately via its IPC handle. + +#### Cross-Process GPU Memory Registration + +Clients can register GPU device memory backends with the server using the extended `RegisterMemoryTask`. This enables the server to resolve `ShmPtr`s that point into GPU memory allocated by a different process. 
+ +```cpp +// Client allocates a GpuMalloc backend +hipc::MemoryBackendId backend_id(static_cast(getpid()), 100); +hipc::GpuMalloc gpu_backend; +gpu_backend.shm_init(backend_id, data_size, "", gpu_id); + +// Get IPC handle from the backend's private header +hipc::GpuMallocPrivateHeader priv_header; +GpuApi::Memcpy(&priv_header, gpu_backend.GetPrivateHeader(), sizeof(priv_header)); + +// Register with server via RegisterMemory(kGpuDeviceMemory) +auto reg_task = ipc->NewTask( + CreateTaskId(), kAdminPoolId, PoolQuery::Local(), + backend_id, + admin::MemoryType::kGpuDeviceMemory, + gpu_id, data_size, &priv_header.ipc_handle_); +auto future = ipc->SendZmq(reg_task, IpcMode::kTcp); +future.Wait(); +// Server opens the IPC handle and registers the allocator in gpu_alloc_map_ +``` + +The `MemoryType` enum controls the registration path: + +| MemoryType | Value | Backend | Registration Path | +|---|---|---|---| +| `kCpuMemory` | 0 | POSIX shared memory | Existing `IpcManager::RegisterMemory()` | +| `kPinnedHostMemory` | 1 | `GpuShmMmap` (pinned) | Attach via `shm_attach()` + `RegisterGpuAllocator()` | +| `kGpuDeviceMemory` | 2 | `GpuMalloc` (device) | Open IPC handle via `shm_attach_ipc()` + `RegisterGpuAllocator()` | + +### Fork-Based Test Pattern + +For testing, a common pattern forks a child server and runs a pure client in the parent: + +```cpp +int main(int argc, char* argv[]) { + // Child process: server mode + if (argc > 1 && std::string(argv[1]) == "--server-mode") { + CHIMAERA_INIT(ChimaeraMode::kServer, true); + sleep(300); // Wait for parent to kill us + return 0; + } + + // Parent: fork server, then run as client + pid_t server = fork(); + if (server == 0) { + setpgid(0, 0); + execl("/proc/self/exe", argv[0], "--server-mode", nullptr); + _exit(1); + } + setpgid(server, server); + WaitForServer(); // Poll for SHM segment file + + // Pure client + setenv("CHI_WITH_RUNTIME", "0", 1); + CHIMAERA_INIT(ChimaeraMode::kClient, false); + + // Submit tasks via Local() 
routing + chimaera::my_module::Client client(pool_id); + client.AsyncCreate(PoolQuery::Dynamic(), name, pool_id).Wait(); + auto future = client.AsyncGpuSubmit(PoolQuery::Local(), 0, 42); + future.Wait(); + // result = 42 * 2 + 0 = 84 (CPU handler) + + // Cleanup + kill(-server, SIGKILL); + waitpid(server, &status, 0); +} +``` + +The `execl("/proc/self/exe", ...)` pattern avoids inheriting the parent's `CHIMAERA_INIT` static guard state. + +See `context-runtime/modules/MOD_NAME/test/test_gpu_client_process.cc` for a complete implementation of this pattern with 5 test cases covering CPU routing, GPU queue attachment, GPU routing, and cross-process memory registration. + +## Memory Allocation Architecture + +### GPU Queue Memory (Server Only) + +During `ServerInitGpuQueues()`, the server allocates **pinned host memory** backends for GPU communication. Each GPU device gets: + +| Backend | ID | Type | Size | Purpose | +|---|---|---|---|---| +| CPU→GPU queue backend | `1000 + gpu_id` | `GpuShmMmap` (pinned) | 64 MB | Holds `TaskQueue` structures (`cpu2gpu`, `gpu2cpu`) and `FutureShm` allocations for CPU→GPU tasks | +| GPU→GPU queue backend | `3000 + gpu_id` | Device memory | 32 MB | Holds `gpu2gpu_queue` and per-thread `ArenaAllocator` storage for GPU→GPU tasks | +| Orchestrator scratch | `2000 + gpu_id` | `GpuShmMmap` (pinned) | 64 MB | Per-block `ArenaAllocator` storage for the persistent orchestrator kernel | +| GPU heap | `9000 + gpu_id` | `GpuMalloc` (device) | 64 MB | Per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) for serialization scratch buffers | + +`GpuShmMmap` backends use `cudaHostAlloc` (pinned host memory) accessible from both CPU and GPU via UVA. `GpuMalloc` backends use `cudaMalloc` (device memory) for GPU-only allocations. 
+ +``` +Server Process Memory Layout (per GPU): + +-----------------------------------------------------------------+ + | GpuShmMmap (backend ID 1000+gpu_id, 64 MB pinned host memory) | + | URL: /chi_gpu_queue_{gpu_id} | + | | + | +-----------+ +-----------+ | + | | TaskQueue | | TaskQueue | | + | | gpu2cpu_q | | cpu2gpu_q | | + | | (GPU->CPU)| | (CPU->GPU)| | + | +-----------+ +-----------+ | + | | + | +-----------------------------+ | + | | FutureShm allocations | (allocated by SendToGpu) | + | | [FutureShm + copy_space] | | + | +-----------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | Device memory (backend ID 3000+gpu_id, 32 MB) | + | URL: /chi_gpu2gpu_queue_{gpu_id} | + | | + | +-----------+ | + | | TaskQueue | | + | | gpu2gpu_q | | + | | (GPU->GPU)| | + | +-----------+ | + | | + | +--------------------------------------------+ | + | | ArenaAllocator per-thread (GPU->GPU tasks) | | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | GpuShmMmap (backend ID 2000+gpu_id, 64 MB pinned host memory) | + | URL: /chi_gpu_orchestrator_{gpu_id} | + | | + | +--------------------------------------------+ | + | | Block 0: ArenaAllocator (per-thread bump) | | + | +--------------------------------------------+ | + | | Block 1: ArenaAllocator | | + | +--------------------------------------------+ | + | | ... 
| | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ + + +-----------------------------------------------------------------+ + | GpuMalloc (backend ID 9000+gpu_id, 64 MB device memory) | + | URL: /chi_gpu_heap_{gpu_id} | + | | + | +--------------------------------------------+ | + | | Block 0: BuddyAllocator (CHI_GPU_HEAP_T) | | + | +--------------------------------------------+ | + | | Block 1: BuddyAllocator | | + | +--------------------------------------------+ | + | | ... | | + | +--------------------------------------------+ | + +-----------------------------------------------------------------+ +``` + +### Client Process Memory + +A pure client (`CHIMAERA_INIT(kClient, false)`) does not create GPU backends, but **attaches to the server's GPU queue backends** during `ClientInitGpuQueues()` (if compiled with GPU support). Tasks can be sent through: + +- **SHM mode**: `FutureShm` allocated in the server's shared memory segment (`chi_main_segment`). The client writes serialized task data into `FutureShm::copy_space` via `ShmTransport::Send()`. +- **TCP/IPC mode**: `FutureShm` allocated via `HSHM_MALLOC` (process-local heap). Task data serialized and sent through ZMQ DEALER socket. +- **Direct GPU queue push** (GPU-compiled clients): `SendToGpu()` serializes into the attached `GpuShmMmap` queue backend and pushes to `cpu2gpu_queue`. The server's GPU orchestrator dequeues and processes the task. 
+ +``` +Client Process (GPU-compiled) Memory Layout: + + Attached from server (read/write via pinned host memory): + +-----------------------------------------------------------------+ + | GpuShmMmap (backend 1000+gpu_id) — attached via shm_attach() | + | cpu2gpu_queue, gpu2gpu_queue pointers reconstructed | + | FutureShm allocations for SendToGpu live here | + +-----------------------------------------------------------------+ + + Optionally registered with server: + +-----------------------------------------------------------------+ + | GpuMalloc (client-owned, backend pid:unique_id) | + | Device memory allocated via cudaMalloc | + | IPC handle sent to server via RegisterMemory(kGpuDeviceMemory) | + | Server opens handle to resolve ShmPtrs from this backend | + +-----------------------------------------------------------------+ +``` + +### GPU Kernel Memory (User-Allocated) + +GPU kernels that use the client API require two memory backends passed in `IpcManagerGpuInfo`: + +**Primary backend** (`backend`): `GpuShmMmap` pinned host memory (or `GpuMalloc` device memory for GPU→GPU). Provides per-thread `ArenaAllocator` (`HSHM_DEFAULT_ALLOC_GPU_T`) for fast bump-pointer allocation of `FutureShm` and task objects. + +**Heap backend** (`gpu_heap_backend`): `GpuMalloc` device memory. Provides per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) for serialization scratch buffers in `LocalSaveTaskArchive` / `LocalLoadTaskArchive`. Unlike the arena, the buddy allocator supports individual `free()`, so scratch memory is reclaimed after each task without exhausting the primary arena. 
+ +Use `CHI_IPC->GetClientGpuInfo(gpu_id)` to build the `IpcManagerGpuInfo` for same-process kernel launches — it fills all fields automatically: + +```cpp +// Same-process launch (Chimaera running in-process): +chi::IpcManagerGpuInfo gpu_info = CHI_IPC->GetClientGpuInfo(0); +// gpu_info.backend, .gpu2gpu_queue, .gpu_heap_backend all set + +// Register primary backend so CPU can resolve GPU ShmPtrs +CHI_IPC->RegisterGpuAllocator(gpu_info.backend.id_, + gpu_info.backend.data_, + gpu_info.backend.data_capacity_); +``` + +The `CHIMAERA_GPU_INIT` macro partitions both backends per-thread: + +``` + backend (ArenaAllocator, HSHM_DEFAULT_ALLOC_GPU_T): + +------------------------------------------------------+ + | Pointer table (num_threads * sizeof(ptr)) | + +------------------------------------------------------+ + | ArenaAllocator 0 (per_thread_size bytes) | + +------------------------------------------------------+ + | ArenaAllocator 1 (per_thread_size bytes) | + +------------------------------------------------------+ + | ... | + +------------------------------------------------------+ + + gpu_heap_backend (BuddyAllocator, CHI_GPU_HEAP_T): + +------------------------------------------------------+ + | Pointer table (num_threads * sizeof(ptr)) | + +------------------------------------------------------+ + | BuddyAllocator 0 (per_thread_size bytes) | + +------------------------------------------------------+ + | BuddyAllocator 1 (per_thread_size bytes) | + +------------------------------------------------------+ + | ... | + +------------------------------------------------------+ +``` + +The `ArenaAllocator` is a bump-pointer allocator; it is reset (via `alloc->Reset()`) after each task completes — safe because only one task is in-flight per thread at a time. The `BuddyAllocator` reclaims memory individually as serialization vectors go out of scope. Access the per-thread heap via `CHI_GPU_HEAP` (expands to `CHI_IPC->GetGpuHeap()`). 
+ +## GPU Initialization Sequence + +### Server GPU Init (Full Sequence) + +The server initializes GPU resources in three phases during `ServerInit()`: + +**Phase 1: Queue creation** (`IpcManager::ServerInitGpuQueues()`) +1. Query `hshm::GpuApi::GetDeviceCount()` for number of GPU devices +2. For each GPU, create a 64 MB `GpuShmMmap` pinned host backend (ID `1000+gpu_id`) for `cpu2gpu` and `gpu2cpu` queues plus CPU→GPU `FutureShm` allocations +3. Create a device-memory backend (ID `3000+gpu_id`) for the `gpu2gpu_queue` and its per-thread `ArenaAllocator` +4. Create two `TaskQueue` objects in the pinned backend: + - `gpu2cpu_queues_[gpu_id]` — GPU→CPU queue (CPU worker polls this) + - `cpu2gpu_queues_[gpu_id]` — CPU→GPU queue (orchestrator polls this) +5. Create one `TaskQueue` in device memory: + - `gpu2gpu_queues_[gpu_id]` — GPU→GPU queue (orchestrator polls this) +6. Each queue has 1 lane, 2 priorities (normal + resumed), configurable depth +7. Create orchestrator scratch backends (64 MB `GpuShmMmap`, IDs `2000+gpu_id`) +8. Create GPU heap backends (64 MB `GpuMalloc`, IDs `9000+gpu_id`) for `CHI_GPU_HEAP_T = BuddyAllocator` +9. Register all backends with `RegisterGpuAllocator()` for host-side `ShmPtr` resolution +10. Populate `gpu_orchestrator_info_` struct for GPU 0 including `gpu_heap_backend` + +**Phase 2: Pool composition** (deferred) +- All compose-section pools are created before the orchestrator launches +- GPU containers are allocated during pool creation via autogenerated allocation kernels + +**Phase 3: Orchestrator launch** (`IpcManager::LaunchGpuOrchestrator()`) +1. Read `gpu_blocks` and `gpu_threads_per_block` from configuration +2. Set CUDA stack size to 131072 bytes (`cudaDeviceSetLimit`) +3. Allocate `WorkOrchestratorControl` in pinned host memory +4. Allocate `gpu::PoolManager` on device +5. Create dedicated CUDA stream +6. Launch persistent kernel: `chimaera_gpu_orchestrator<<>>()` +7. 
Only block 0, thread 0 runs the `gpu::Worker` poll loop; other threads spin-wait for exit + +**Why deferred?** The orchestrator occupies all SMs with persistent thread blocks. If launched before pool composition, `cudaMalloc` calls during GPU container allocation would deadlock against the persistent kernel. + +### Client GPU Init (None) + +`ClientInit()` performs no GPU initialization: +- No `ServerInitGpuQueues()` — `cpu2gpu_queues_` remains empty +- No `LaunchGpuOrchestrator()` — `gpu_orchestrator_` remains `nullptr` +- No GPU memory backends allocated + +The client can only submit tasks through CPU transport (SHM, TCP, IPC). + +## Queue Assignment Algorithm + +### Worker Thread Partitioning + +The `DefaultScheduler::DivideWorkers()` method partitions the configured worker threads into four roles: + +``` +Given N total worker threads (from num_threads in chimaera.yaml): + + Worker 0 -> Scheduler worker (handles task routing) + Workers 1..N-3 -> I/O workers (execute tasks) + Worker N-2 -> GPU worker (polls GPU->CPU lanes) + Worker N-1 -> Network worker (handles ZMQ send/recv) +``` + +Example with `num_threads: 4`: +- Worker 0: scheduler +- Worker 1: I/O worker (also used as GPU worker since N-2 = 2, overlaps I/O range) +- Worker 2: GPU worker +- Worker 3: network worker + +The GPU worker is assigned only when `total_workers > 2`. With 2 or fewer workers, there is no dedicated GPU worker and GPU→CPU tasks cannot be processed. 
+ +### GPU Lane Assignment + +After `ServerInitGpuQueues()` creates the per-GPU `TaskQueue` objects, `AssignGpuLanesToWorker()` connects them to the GPU worker: + +```cpp +void IpcManager::AssignGpuLanesToWorker() { + Worker *gpu_worker = scheduler_->GetGpuWorker(); + + // Collect lane 0 from each GPU's gpu_queue (GPU->CPU) + for (size_t gpu_id = 0; gpu_id < num_gpus; ++gpu_id) { + TaskLane *gpu_lane = &gpu_queues_[gpu_id]->GetLane(0, 0); + gpu_lane->SetAssignedWorkerId(gpu_worker->GetId()); + gpu_lanes.push_back(gpu_lane); + } + + gpu_worker->SetGpuLanes(gpu_lanes); +} +``` + +**Result:** The single GPU worker polls **all** GPU→CPU lanes across all GPUs. The GPU work orchestrator (persistent kernel) polls the CPU→GPU and GPU→GPU queues independently. + +### Queue Routing Summary ``` -GPU Kernel CPU Worker - │ │ - │ 1. NewTask (GPU allocator) │ - │ 2. Send (SPSC ring buffer →) ───> dequeue + deserialize - │ │ 3. Execute task - │ 4. Wait (← SPSC ring buffer) <─── serialize output + FUTURE_COMPLETE - │ 5. Read result │ + Pure Client Process Server Process + +-----------------+ +------------------+ + | No GPU queues | | cpu2gpu_queues_[]|----> GPU Orchestrator + | No orchestrator| | gpu2gpu_queues_[]|----> GPU Orchestrator +AsyncGpuSubmit() ->-| SendShm() or |--SHM/TCP/-->| gpu_queues_[] |----> GPU Worker (CPU) +(PoolQuery::Local) | SendZmq() | IPC | | + +-----------------+ +------------------+ + | + CPU Workers execute + the task's CPU handler ``` -Both directions use a **SPSC (single-producer single-consumer) ring buffer** embedded in a `FutureShm` structure that lives in GPU-registered pinned shared memory. System-scope atomics ensure visibility between GPU and CPU. 
+### Client-to-Server Transport Modes + +The pure client's transport mode is determined by the `CHI_IPC_MODE` environment variable or `chimaera.yaml` configuration: + +| Mode | Transport | How FutureShm is Allocated | How Task Data is Sent | +|---|---|---|---| +| `SHM` | Shared memory | In server's SHM segment | Written to `FutureShm::copy_space` ring buffer | +| `TCP` | ZMQ DEALER/ROUTER | Process-local heap (`HSHM_MALLOC`) | Serialized through ZMQ TCP socket | +| `IPC` | ZMQ DEALER/ROUTER (Unix socket) | Process-local heap (`HSHM_MALLOC`) | Serialized through ZMQ IPC socket | + +For SHM mode, the server worker deserializes from the `FutureShm::copy_space` ring buffer using `ShmTransport::Recv()`. For TCP/IPC mode, the server receives the serialized data through the ZMQ ROUTER socket and deserializes on the network worker thread. ## Prerequisites @@ -85,14 +551,14 @@ struct MyGpuTask : public chi::Task { pool_query_ = query; } - // Serialization: fields sent GPU → CPU + // Serialization: fields sent GPU -> CPU template HSHM_CROSS_FUN void SerializeIn(Archive& ar) { Task::SerializeIn(ar); ar(input_value_, result_); } - // Serialization: fields sent CPU → GPU (after execution) + // Serialization: fields sent CPU -> GPU (after execution) template HSHM_CROSS_FUN void SerializeOut(Archive& ar) { Task::SerializeOut(ar); @@ -127,139 +593,378 @@ class Client : public chi::ContainerClient { }; ``` -`CHI_IPC->Send()` automatically dispatches to the GPU ring-buffer path (`SendGpu`) when compiled for device code. 
+`CHI_IPC->Send()` automatically dispatches based on compilation context and routing mode: + +- **GPU device code** (`HSHM_IS_GPU`): routes via `SendGpu` or `SendGpuLocal` +- **CPU host code** with `HSHM_ENABLE_CUDA=1`: detects `LocalGpuBcast`/`ToLocalGpu` and routes via `SendToGpu` +- **CPU host code** without GPU: routes through normal CPU routing (pool query → worker) + +## GPU Routing Modes + +### GPU → GPU (Local) + +A GPU kernel submits a task that is processed by the **GPU work orchestrator**. The task runs on the GPU itself. + +```cpp +// Inside a __global__ kernel +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); +future.Wait(); +chi::u32 result = future->result_; +``` + +The GPU orchestrator must be running (not paused). The task is dispatched to your module's `GpuRuntime::MyMethod()` handler. + +### CPU → GPU (LocalGpuBcast) + +CPU host code submits a task to be processed by the **GPU work orchestrator**. The task runs on the GPU. + +```cpp +// Host code -- must be compiled with HSHM_ENABLE_CUDA=1 +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); +future.Wait(); // blocks until GPU completes + deserializes output +chi::u32 result = future->result_; +``` + +**How it works:** + +1. `Send()` detects `LocalGpuBcast` and calls `SendToGpu()` instead of normal CPU routing +2. `SendToGpu()` allocates a `FutureShm` in GPU-accessible pinned host memory, serializes task input into its ring buffer, and pushes to the `cpu2gpu_queue` +3. The GPU orchestrator pops the task, deserializes, dispatches to `GpuRuntime`, serializes output, and sets `FUTURE_COMPLETE` +4. `Wait()` spins on `FUTURE_COMPLETE`, then deserializes output from the FutureShm ring buffer back into the task + +**Important -- HSHM_ENABLE_CUDA must be 1**: The `Send()` GPU routing interceptor is inside `#if HSHM_ENABLE_CUDA || HSHM_ENABLE_ROCM`. 
If your source file is compiled with `HSHM_ENABLE_CUDA=0` (e.g., a plain C++ file in a mixed CUDA/C++ target), the interceptor is compiled out and the task will go through normal CPU routing, which does not complete the future correctly for GPU tasks. + +**Solution**: Call `AsyncMyTask(LocalGpuBcast(), ...)` from a source file compiled as CUDA (`.cu` or with `LANGUAGE CUDA` property), or from a wrapper function defined in such a file. + +### CPU → Specific GPU (ToLocalGpu) + +Same as `LocalGpuBcast` but targets a specific GPU by device ID: + +```cpp +// Send to GPU device 1 +auto future = client.AsyncMyTask(chi::PoolQuery::ToLocalGpu(1), input); +future.Wait(); +``` + +### GPU → CPU (ToLocalCpu) + +A GPU kernel submits a task to be processed by a **CPU worker thread**. The task runs on the CPU. + +```cpp +// Inside a __global__ kernel +chimaera::my_module::Client client(pool_id); +auto future = client.AsyncMyTask(chi::PoolQuery::ToLocalCpu(), input); +future.Wait(); +chi::u32 result = future->result_; +``` + +**How it works:** + +1. `Send()` on the GPU calls `SendGpu()`, which serializes the task and pushes to the `gpu2cpu_queue` +2. The CPU GPU-worker thread polls this queue, deserializes, and routes to the module's CPU runtime handler +3. The CPU worker serializes output into the FutureShm ring buffer and sets `FUTURE_COMPLETE` +4. `Wait()` on the GPU calls `RecvGpu()` which reads from the output ring buffer + +**Host setup required**: The GPU kernel needs a `gpu2cpu_queue` registered with the runtime. See [Host-Side Setup for GPU → CPU](#host-side-setup-for-gpu--cpu) below. + +## GPU Work Orchestrator + +The GPU work orchestrator is a **persistent CUDA kernel** that processes tasks on the GPU. 
It runs on a dedicated CUDA stream and polls two queues: + +- `cpu2gpu_queue` -- tasks pushed by CPU via `SendToGpu()` +- `gpu2gpu_queue` -- tasks pushed by GPU kernels via `SendGpuLocal()` + +The runtime launches the orchestrator automatically during `ServerInit()` when GPU support is enabled. + +### Persistent Kernel Structure + +The orchestrator is launched with configurable blocks and threads per block. Only **block 0, thread 0** runs the worker poll loop; all other threads spin-wait for the exit signal: + +```cpp +__global__ void chimaera_gpu_orchestrator(gpu::PoolManager *pool_mgr, + gpu::WorkOrchestratorControl *control, + IpcManagerGpuInfo gpu_info, + u32 num_blocks) { + CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks); + + if (blockIdx.x == 0 && threadIdx.x == 0) { + control->running_flag = 1; + gpu::Worker worker; + worker.Init(0, gpu_info.cpu2gpu_queue, gpu_info.gpu2gpu_queue, + pool_mgr, gpu_info.cpu2gpu_queue_base); + + while (!control->exit_flag) { + worker.PollOnce(); + } + worker.Finalize(); + } + + // Other blocks/threads: wait for exit signal + if (blockIdx.x != 0 || threadIdx.x != 0) { + while (!control->exit_flag) { /* spin */ } + } +} +``` + +The extra blocks are launched to occupy all SMs, preventing other kernels from preempting the orchestrator. 
+ +### GPU Container (GpuRuntime) + +Each module that supports GPU execution defines a `GpuRuntime` class: + +```cpp +// my_module_gpu_runtime.h +class GpuRuntime : public chi::gpu::Container { + public: + HSHM_GPU_FUN GpuRuntime() = default; + HSHM_GPU_FUN ~GpuRuntime() override = default; + + /** GPU handler for MyMethod */ + HSHM_GPU_FUN void MyMethod(hipc::FullPtr task, + chi::gpu::GpuRunContext &rctx) { + task->result_ = task->input_value_ * 3; + } + + // Autogenerated virtual method overrides (switch-case dispatch) + #include "autogen/my_module_gpu_lib_exec.h" +}; +``` + +### Module YAML Configuration + +Enable GPU support in your module's `chimaera_mod.yaml`: + +```yaml +module_name: my_module +namespace: chimaera + +# ... method IDs ... +kMyMethod: 25 + +# GPU support +has_gpu: true +gpu_methods: + - kMyMethod +``` + +Run `chimaera repo refresh .` to regenerate the autogen dispatch code: +- Per-module: `autogen/my_module_gpu_lib_exec.h` (virtual method overrides) +- Repo-level: `src/autogen/gpu_work_orchestrator_modules.h` (container allocation + registry) + +### Pause/Resume + +The GPU orchestrator occupies all SMs with persistent thread blocks. To launch other GPU kernels (e.g., test kernels), you must pause the orchestrator first: + +```cpp +CHI_IPC->PauseGpuOrchestrator(); // signal exit + synchronize stream +// ... launch your kernel on a separate stream ... +CHI_IPC->ResumeGpuOrchestrator(); // relaunch persistent kernel +``` + +**Important**: Use stream-based synchronization (`hshm::GpuApi::Synchronize(stream)`) for your own kernels, not `cudaDeviceSynchronize()`. The latter would block on the orchestrator's stream. + +### Container Registration + +GPU containers are automatically allocated and registered when a pool with `has_gpu: true` is created. The allocation kernel runs in the orchestrator's CUDA module context so vtables are correct for virtual dispatch. 
## GPU Kernel Implementation -### The CHIMAERA_GPU_INIT Macro +### Initialization Macros + +Three macros are available depending on the kernel's role: -Every GPU kernel that interacts with the runtime must call `CHIMAERA_GPU_INIT` at the top. This macro: +#### `CHIMAERA_GPU_INIT(gpu_info)` -1. Creates a per-block `IpcManager` singleton in `__shared__` memory -2. Initializes per-thread `ArenaAllocator` instances from the GPU memory backend -3. Synchronizes all threads in the block via `__syncthreads()` +For regular client kernels (not the orchestrator). Initializes the per-block `IpcManager` from a fully-populated `IpcManagerGpuInfo`. All threads in a block share one `IpcManager` instance (in `__shared__` memory); thread 0 runs `ClientInitGpu`, then `__syncthreads()` makes it visible to all threads. ```cpp __global__ void my_kernel(chi::IpcManagerGpu gpu_info, chi::PoolId pool_id, chi::u32 input, - int* d_result, chi::u32* d_output) { - // Initialize per-block IPC manager (required) CHIMAERA_GPU_INIT(gpu_info); - // Use the client API — same code as host chimaera::my_module::Client client(pool_id); auto future = client.AsyncMyTask(chi::PoolQuery::Local(), input); future.Wait(); - - // Read result *d_output = future->result_; - *d_result = 1; // success } ``` -### Memory Layout +#### `CHI_CLIENT_GPU_INIT(gpu_info)` + +Alias for `CHIMAERA_GPU_INIT`. Use this in client-process kernels for clarity: + +```cpp +__global__ void client_kernel(chi::IpcManagerGpu gpu_info, ...) { + CHI_CLIENT_GPU_INIT(gpu_info); + // identical behavior +} +``` + +#### `CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks)` -`CHIMAERA_GPU_INIT` partitions the GPU memory backend as: +For kernels that partition their memory across multiple blocks (e.g., benchmark client kernels and the runtime orchestrator itself). 
Before calling `ClientInitGpu`, this macro splits all three backends (`backend`, `gpu2cpu_backend`, `gpu_heap_backend`) so each block gets its own non-overlapping slice: ``` - ┌──────────────────────────────────────────────────────┐ - │ Pointer table (num_threads * sizeof(ptr)) │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator 0 (per_thread_size bytes) │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator 1 (per_thread_size bytes) │ - ├──────────────────────────────────────────────────────┤ - │ ... │ - ├──────────────────────────────────────────────────────┤ - │ ArenaAllocator N-1 │ - └──────────────────────────────────────────────────────┘ +per_block = data_capacity / num_blocks +block N slice starts at: data_ + N * per_block ``` -Each `ArenaAllocator` is a bump-pointer allocator that provides fast, lock-free allocation for its thread. All `ShmPtr` offsets are relative to the backend base so they can be resolved by the CPU worker. +Use this when launching multiple blocks where each block submits tasks independently: + +```cpp +__global__ void bench_kernel(chi::IpcManagerGpu gpu_info, + chi::PoolId pool_id, + chi::u32 num_blocks, + chi::u32 total_tasks, + int* d_done) { + CHIMAERA_GPU_ORCHESTRATOR_INIT(gpu_info, num_blocks); + + if (threadIdx.x != 0) return; // only thread 0 per block submits + + chimaera::my_module::Client client(pool_id); + for (chi::u32 i = 0; i < total_tasks; ++i) { + auto future = client.AsyncMyTask(chi::PoolQuery::Local(), i); + future.Wait(); + } + + if (blockIdx.x == 0) { + __threadfence_system(); + *d_done = 1; + } +} +``` ## Host-Side Setup -The host code must: +### Host-Side Setup for GPU → GPU + +For `PoolQuery::Local()` from a GPU kernel, the task goes through the `gpu2gpu_queue` which is managed by the orchestrator. 
The simplest approach is `GetClientGpuInfo()`, which fills all `IpcManagerGpuInfo` fields automatically from the running runtime: + +```cpp +// Build IpcManagerGpuInfo — fills backend, gpu2gpu_queue, gpu_heap_backend, etc. +chi::IpcManagerGpuInfo gpu_info = CHI_IPC->GetClientGpuInfo(0); -1. Start the Chimaera runtime -2. Create the module's container (pool) -3. Allocate GPU-registered pinned shared memory -4. Create a GPU task queue and register it with the runtime -5. Register the GPU memory backend for host-side ShmPtr resolution -6. Launch the kernel +// Register primary backend so CPU can resolve GPU ShmPtrs +CHI_IPC->RegisterGpuAllocator(gpu_info.backend.id_, + gpu_info.backend.data_, + gpu_info.backend.data_capacity_); +``` -### Complete Host-Side Example +For custom backends (e.g., benchmark kernels with a dedicated allocation region): ```cpp -#include -#include -#include +// Custom primary backend (pinned host, for ArenaAllocator) +hipc::MemoryBackendId backend_id(100, 0); +hipc::GpuShmMmap gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/gpu_test", 0); +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + +// Custom heap backend (device memory, for BuddyAllocator / CHI_GPU_HEAP_T) +hipc::MemoryBackendId heap_id(101, 0); +hipc::GpuMalloc gpu_heap; +gpu_heap.shm_init(heap_id, 4 * 1024 * 1024, "/gpu_heap", 0); + +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = gpu_backend; +gpu_info.gpu_heap_backend = gpu_heap; +gpu_info.gpu2cu_queue = nullptr; // not using GPU->CPU +gpu_info.cpu2gpu_queue = nullptr; // not receiving CPU->GPU +gpu_info.gpu2gpu_queue = CHI_IPC->GetGpuToGpuQueue(0); +``` -int run_gpu_test(chi::PoolId pool_id, chi::u32 test_value, - chi::u32* out_result) { - // Increase GPU stack for deep template call chains - cudaDeviceSetLimit(cudaLimitStackSize, 131072); // 128 KB - - // 1. 
Create GPU memory backend (pinned shared memory) - hipc::MemoryBackendId backend_id(3, 0); - size_t gpu_memory_size = 10 * 1024 * 1024; // 10 MB - hipc::GpuShmMmap gpu_backend; - if (!gpu_backend.shm_init(backend_id, gpu_memory_size, "/gpu_test", 0)) - return -100; - - // 2. Create GPU task queue (pinned shared memory) - hipc::MemoryBackendId queue_backend_id(4, 0); - hipc::GpuShmMmap queue_backend; - if (!queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, - "/gpu_queue", 0)) - return -101; - - // 3. Create allocator and task queue in the queue backend - auto* queue_alloc = queue_backend.MakeAlloc>( - queue_backend.data_capacity_); - auto gpu_queue_ptr = queue_alloc->NewObj( - queue_alloc, 1, 2, 1024); // 1 lane, 2 priorities, depth 1024 - - // 4. Register queue and assign lanes to the GPU worker - CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); - CHI_IPC->AssignGpuLanesToWorker(); - - // 5. Register GPU backend for host-side ShmPtr resolution - CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, - gpu_backend.data_capacity_); - - // 6. 
Build IpcManagerGpu and launch kernel - chi::IpcManagerGpu gpu_info(gpu_backend, gpu_queue_ptr.ptr_); - - int* d_result = hshm::GpuApi::Malloc(sizeof(int)); - chi::u32* d_out = hshm::GpuApi::Malloc(sizeof(chi::u32)); - int h_result = 0; - chi::u32 h_out = 0; - hshm::GpuApi::Memcpy(d_result, &h_result, sizeof(int)); - hshm::GpuApi::Memcpy(d_out, &h_out, sizeof(chi::u32)); - - my_kernel<<<1, 1>>>(gpu_info, pool_id, test_value, d_result, d_out); - - cudaDeviceSynchronize(); - hshm::GpuApi::Memcpy(&h_result, d_result, sizeof(int)); - hshm::GpuApi::Memcpy(&h_out, d_out, sizeof(chi::u32)); - - *out_result = h_out; - hshm::GpuApi::Free(d_result); - hshm::GpuApi::Free(d_out); - return h_result; +Use pinned host memory for result polling (not `cudaDeviceSynchronize`, which hangs with the persistent orchestrator): + +```cpp +int *d_result; +cudaMallocHost(&d_result, sizeof(int)); +*d_result = 0; + +// Pause orchestrator, launch kernel, resume orchestrator +CHI_IPC->PauseGpuOrchestrator(); +my_kernel<<<1, 1, 0, stream>>>(gpu_info, pool_id, input, d_result, d_out); +CHI_IPC->ResumeGpuOrchestrator(); + +// Poll pinned memory for completion +while (*d_result == 0) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); } ``` +### Host-Side Setup for GPU → CPU + +For `PoolQuery::ToLocalCpu()` from a GPU kernel, the task goes through the `gpu2cpu_queue` that a CPU worker thread polls. The host must create and register this queue, and also provide a pinned-host `gpu2cpu_backend` so the GPU kernel can allocate `FutureShm` objects: + +```cpp +// 1. Primary backend (device memory, for ArenaAllocator) +hipc::MemoryBackendId backend_id(5, 0); +hipc::GpuMalloc gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/gpu_to_cpu", 0); + +// 2. 
GPU->CPU copy-space backend (pinned host, for FutureShm allocation) +hipc::MemoryBackendId copy_backend_id(6, 0); +hipc::GpuShmMmap copy_backend; +copy_backend.shm_init(copy_backend_id, 4 * 1024 * 1024, "/gpu_to_cpu_copy", 0); +CHI_IPC->RegisterGpuAllocator(copy_backend_id, copy_backend.data_, + copy_backend.data_capacity_); + +// 3. GPU task queue (pinned shared memory) +hipc::MemoryBackendId queue_backend_id(7, 0); +hipc::GpuShmMmap queue_backend; +queue_backend.shm_init(queue_backend_id, 2 * 1024 * 1024, "/gpu_to_cpu_q", 0); + +auto *queue_alloc = queue_backend.MakeAlloc>( + queue_backend.data_capacity_); +auto gpu_queue_ptr = queue_alloc->NewObj( + queue_alloc, 1, 2, 1024); + +// 4. Register queue and assign to CPU GPU-worker +CHI_IPC->RegisterGpuQueue(gpu_queue_ptr); +CHI_IPC->AssignGpuLanesToWorker(); +CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, + gpu_backend.data_capacity_); + +// 5. Build IpcManagerGpuInfo and launch kernel +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = gpu_backend; +gpu_info.gpu2cpu_queue = gpu_queue_ptr.ptr_; +gpu_info.gpu2cpu_backend = copy_backend; +``` + +### CPU → GPU (No Extra Host Setup) + +For `PoolQuery::LocalGpuBcast()` from CPU host code, no additional host setup is needed beyond creating the pool. The runtime creates the `cpu2gpu_queue` and GPU orchestrator during `ServerInit()`. 
+ +```cpp +// Compile this in a CUDA source file (HSHM_ENABLE_CUDA=1) +chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, true); + +chimaera::my_module::Client client(pool_id); +auto create = client.AsyncCreate(chi::PoolQuery::Dynamic(), name, pool_id); +create.Wait(); + +// Submit task to GPU -- Send() detects LocalGpuBcast -> SendToGpu() +auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); +future.Wait(); // spins on FUTURE_COMPLETE, deserializes GPU output +chi::u32 result = future->result_; +``` + ### Key Registration Functions | Function | Purpose | |---|---| +| `CHI_IPC->GetClientGpuInfo(gpu_id)` | Build a fully-populated `IpcManagerGpuInfo` for same-process kernel launches | | `CHI_IPC->RegisterGpuQueue(queue)` | Registers a GPU task queue so the runtime can poll it | | `CHI_IPC->AssignGpuLanesToWorker()` | Assigns all registered GPU queue lanes to the designated GPU worker thread | | `CHI_IPC->RegisterGpuAllocator(id, data, capacity)` | Registers GPU backend memory so the host can resolve `ShmPtr`s allocated by the GPU kernel | +| `CHI_IPC->PauseGpuOrchestrator()` | Pauses the persistent GPU orchestrator to free SMs | +| `CHI_IPC->ResumeGpuOrchestrator()` | Resumes the GPU orchestrator | +| `CHI_IPC->GetGpuToGpuQueue(gpu_id)` | Returns the orchestrator's GPU→GPU queue pointer for a given device | +| `CHI_GPU_HEAP` | Per-thread `BuddyAllocator` (`CHI_GPU_HEAP_T`) from the GPU heap table; valid in device code after `CHIMAERA_GPU_INIT` | ## Configuration Parameters @@ -286,22 +991,31 @@ The default is **4096 bytes** if `GetCopySpaceSize()` returns 0. The `FutureShm` For tasks with large serialized payloads, increase this value. If the ring buffer is too small, the producer will spin-wait for the consumer to drain data. -### GPU Memory Backend Size +### GPU Memory Backend Sizes -The GPU memory backend is partitioned equally among all threads in a block. 
Size it based on: +Two backends must be sized independently: -- Number of threads per block -- Per-thread allocation needs (tasks, serialization buffers, FutureShm) +**Primary backend** (`backend`, `ArenaAllocator`): holds `FutureShm` + task objects. Reset after each task so size only needs to cover one task at a time: +```cpp +// Per-block: sizeof(FutureShm) + copy_space + sizeof(TaskT) + overhead +// 10 MB per block is sufficient for typical tasks +size_t backend_size = num_blocks * 10 * 1024 * 1024; +``` +**Heap backend** (`gpu_heap_backend`, `BuddyAllocator`): holds serialization scratch vectors. Individual `free()` reclaims memory, so size covers peak concurrent scratch usage (~2× copy_space per task): ```cpp -// Example: 10 MB for a single-thread kernel -size_t gpu_memory_size = 10 * 1024 * 1024; +// Per-block: ~2 * copy_space_size + BuddyAllocator overhead +// 4 MB per block is sufficient for typical tasks +size_t heap_backend_size = num_blocks * 4 * 1024 * 1024; ``` -A single task submission typically consumes: -- `sizeof(TaskT)` for the task object +A single task submission typically consumes from the primary arena: - `sizeof(FutureShm) + copy_space_size` for the future -- Temporary serialization buffer (~copy_space_size) +- `sizeof(TaskT)` for the deserialized task object (orchestrator side) + +And from the heap: +- Input serialization vector (~`copy_space_size`) +- Output serialization vector (~`copy_space_size`) ### GPU Task Queue Configuration @@ -326,33 +1040,113 @@ size_t queue_memory_size = 2 * 1024 * 1024; // 2 MB is sufficient for most use ### Runtime Thread Configuration -The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. The scheduler assigns a dedicated GPU worker thread to poll GPU lanes. - -In the runtime YAML configuration, ensure enough worker threads are configured: +The Chimaera runtime automatically detects GPU devices and creates GPU queues during `ServerInit`. 
The default scheduler (`DefaultScheduler`) partitions workers as follows:

```yaml
-# At minimum: N task workers + 1 network worker + 1 GPU worker
+# chimaera.yaml
num_threads: 4
```

-The scheduler's `DivideWorkers` method partitions workers into task, network, and GPU roles. With `num_threads: 4`, you get 2 task workers, 1 network worker, and 1 GPU worker (the GPU worker is selected from the task worker pool by the scheduler).
+With `num_threads: N`:
+
+| Worker ID | Role | Responsibility |
+|---|---|---|
+| 0 | Scheduler | Task routing and scheduling |
+| 1 to N-3 | I/O workers | Execute tasks from CPU queues |
+| N-2 | GPU worker | Polls GPU→CPU queue lanes |
+| N-1 | Network worker | Handles ZMQ send/recv |
+
+The GPU worker polls **all** GPU→CPU lanes across all GPU devices. With `num_threads: 4`, this yields 1 scheduler (worker 0), 1 I/O worker (worker 1), 1 GPU worker (worker 2), and 1 network worker (worker 3), matching the table above.
+
+## CMake Integration
+
+### Mixed CUDA/C++ Targets
+
+When building a test or application that has both CUDA and C++ source files, keep GPU-routing calls in the CUDA source file. The typical CMake pattern:
+
+```cmake
+# GPU kernels compiled as CUDA
+set_source_files_properties(my_gpu_kernels.cc PROPERTIES LANGUAGE CUDA)
+add_library(my_gpu_kernels OBJECT my_gpu_kernels.cc)
+target_link_libraries(my_gpu_kernels PRIVATE hshm::cuda_cxx)
+
+# Main executable with CPU sources
+add_executable(my_app my_main.cc $<TARGET_OBJECTS:my_gpu_kernels>)
+
+# CRITICAL: CPU sources are compiled with HSHM_ENABLE_CUDA=0 to avoid
+# __device__ errors. GPU routing in Send() is compiled out.
+set_source_files_properties(my_main.cc PROPERTIES
+  COMPILE_OPTIONS "-UHSHM_ENABLE_CUDA;-DHSHM_ENABLE_CUDA=0"
+)
+```
+
+**Consequence**: `client.AsyncMyTask(LocalGpuBcast(), ...)` called from `my_main.cc` will NOT route to the GPU because the `Send()` interceptor is compiled out. 
Instead, define a wrapper in the CUDA source file: + +```cpp +// my_gpu_kernels.cc (compiled as CUDA, HSHM_ENABLE_CUDA=1) +extern "C" int run_cpu_to_gpu(chi::PoolId pool_id, chi::u32 input, + chi::u32 *out_result) { + chimaera::my_module::Client client(pool_id); + auto future = client.AsyncMyTask(chi::PoolQuery::LocalGpuBcast(), input); + if (!future.Wait(10.0f)) return -3; // timeout + *out_result = future->result_; + return 1; +} +``` + +### GPU Module Include Directories + +GPU targets need `$` for autogen headers: + +```cmake +target_include_directories(my_gpu_target PRIVATE + $ +) +``` ## Data Flow Details -### Send Path (GPU → CPU) +### Send Path (GPU → CPU) 1. **NewTask**: Allocates the task from the per-thread `ArenaAllocator` 2. **SendGpu**: Allocates a `FutureShm` with embedded ring buffer copy space 3. **Enqueue**: Pushes the `Future` onto the GPU task queue lane (the CPU worker polls this) 4. **Serialize + Send**: Builds a `LocalSaveTaskArchive`, calls `task->SerializeIn(ar)`, then streams the archive through `ShmTransport::Send` into the input ring buffer -### Receive Path (CPU → GPU) +### Receive Path (CPU → GPU) 1. **CPU worker** dequeues the `FutureShm`, reads from the input ring buffer via `ShmTransport::Recv`, deserializes, executes the task, then writes output back through the output ring buffer 2. **RecvGpu**: On the GPU, `Future::Wait()` calls `RecvGpu` which reads from the output ring buffer via `ShmTransport::Recv` 3. **Deserialize**: A `LocalLoadTaskArchive` deserializes the output fields via `task->SerializeOut(ar)` 4. **Completion**: The GPU spins on `FUTURE_COMPLETE` flag (should already be set after Recv completes) +### CPU → GPU Send Path (SendToGpu) + +1. **Send()** detects `LocalGpuBcast`/`ToLocalGpu` routing and calls `SendToGpu()` +2. **SendToGpu** allocates a `FutureShm` in GPU-accessible pinned host memory via `AllocateGpuBuffer()` +3. 
Serializes task input via `task->SerializeIn(ar)` + `ShmTransport::Send` into the input ring buffer +4. Pushes a `Future` onto `cpu2gpu_queues_[gpu_id]` +5. Returns a `Future` with the pinned-memory FutureShm + +### CPU → GPU Receive Path (Wait runtime path) + +1. **Wait()** enters the runtime spin-wait loop on `FUTURE_COMPLETE` +2. The GPU orchestrator pops the task, dispatches to `GpuRuntime`, serializes output into the FutureShm's output ring buffer, calls `__threadfence_system()`, and sets `FUTURE_COMPLETE` +3. **Wait()** detects `FUTURE_COPY_FROM_CLIENT` flag and non-zero `output_.copy_space_size_`, then deserializes output via `ShmTransport::Recv` + `task->SerializeOut(ar)` + +### Client Process → Server CPU Path (SendShm/SendZmq) + +When a pure client uses `PoolQuery::Local()`, the task bypasses all GPU queues and goes through the standard client transport: + +1. **SendShm**: Allocates `FutureShm` in server's SHM segment with `copy_space` ring buffer. Enqueues to `worker_queues_` lane (mapped by `ClientMapTask`). Serializes task via `SaveTaskArchive` + `ShmTransport::Send` into the ring buffer. +2. **Server worker**: Dequeues `FutureShm`, checks `FUTURE_COPY_FROM_CLIENT` flag, reads from `copy_space` ring buffer via `ShmTransport::Recv`, deserializes, executes the **CPU handler** (e.g., `result = test_value * 2 + gpu_id`). +3. **Response**: Worker serializes output back through the ring buffer, sets `FUTURE_COMPLETE`. +4. **Client Wait()**: Spins on `FUTURE_COMPLETE`, deserializes output from `copy_space`. + +This path uses the CPU runtime handler, not the GPU handler. 
The computation differs: +- CPU handler: `result_value = test_value * 2 + gpu_id` +- GPU handler: `result_value = test_value * 3 + gpu_id` + ### Wire Format GPU and CPU use compatible but different archive types: @@ -362,7 +1156,105 @@ GPU and CPU use compatible but different archive types: | GPU (send) | `LocalSaveTaskArchive` | `LocalLoadTaskArchive` | | CPU (worker) | `SaveTaskArchive` | `LoadTaskArchive` | -Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical — if the formats diverge, deserialization will produce corrupted data. +Both produce the same binary wire format: `[vector_size][elements...][msg_type][stream_data]`. This is critical -- if the formats diverge, deserialization will produce corrupted data. + +## Client Process GPU Tests + +The test file `context-runtime/modules/MOD_NAME/test/test_gpu_client_process.cc` verifies cross-process GPU task submission using the fork-based test pattern. A child process runs as a standalone Chimaera server (with `--server-mode`), while the parent connects as a pure client. + +### Test Infrastructure + +All tests share the same server/client lifecycle managed by `EnsureInitialized()`: + +1. **`StartServerProcess()`**: Forks a child, sets `CHI_WITH_RUNTIME=1`, and `execl`s back into the same binary with `--server-mode`. The child calls `CHIMAERA_INIT(kServer, true)` and sleeps until killed. +2. **`WaitForServer()`**: Polls for the SHM segment file (`/tmp/chimaera_/chi_main_segment_`) up to 50 times at 200ms intervals. +3. **Parent client init**: Sets `CHI_WITH_RUNTIME=0` and calls `CHIMAERA_INIT(kClient, false)`. During `ClientConnect`, the client receives GPU queue metadata from the server. +4. **Cleanup**: `CleanupServer()` sends `SIGKILL` to the server process group and removes SHM files. + +### Test Cases + +#### `client_process_gpu_submit_local` + +Submits a `GpuSubmitTask` with `PoolQuery::Local()` routing from the client process. 
The task travels via SHM or TCP transport to the server's CPU worker, which executes the CPU handler (`result = test_value * 2 + gpu_id`). Verifies the result matches the expected CPU computation. + +This test works regardless of GPU support — `Local()` routing always goes through the CPU worker. + +#### `client_process_gpu_submit_multiple` + +Submits 5 sequential `GpuSubmitTask`s with different `test_value` parameters (100–104), each via `PoolQuery::Local()`. Verifies that all return correct results, confirming the transport handles repeated task round-trips without state corruption. + +#### `client_process_gpu_queue_attachment` + +Verifies that `ClientInitGpuQueues()` successfully attached to the server's GPU queues during `ClientConnect`. Checks that `GetToGpuQueueCount() > 0` and that `GetToGpuQueue(0)` and `GetGpuToGpuQueue(0)` return non-null pointers. + +Requires `HSHM_ENABLE_CUDA` or `HSHM_ENABLE_ROCM` at compile time. Without GPU support, prints "GPU support not compiled, skipping" and passes. + +#### `client_process_gpu_submit_to_gpu` + +Submits a `GpuSubmitTask` with `PoolQuery::LocalGpuBcast()` routing. This exercises the `SendToGpu()` path: the client serializes the task into the attached `GpuShmMmap` queue backend and pushes it onto `cpu2gpu_queue`. The server's GPU orchestrator dequeues and dispatches to the module's GPU handler. + +Requires GPU support compiled in and a GPU device on the server. Verifies that `result = test_value * 2 + gpu_id` (GPU handler uses the same formula in this test module). + +#### `client_process_register_gpu_memory` + +Tests cross-process GPU memory registration. The client: +1. Allocates a 4 MB `GpuMalloc` backend (device memory via `cudaMalloc`) +2. Reads the IPC handle from the backend's private header (GPU→host memcpy) +3. Sends a `RegisterMemoryTask` with `MemoryType::kGpuDeviceMemory` to the server +4. 
The server opens the IPC handle via `GpuMalloc::shm_attach_ipc()` and registers the allocator in `gpu_alloc_map_` + +After registration, the server can resolve `ShmPtr`s pointing into this client-owned device memory. + +### Building and Running + +```bash +cd /workspace/build +cmake /workspace && cmake --build . -j$(nproc) + +# CRITICAL: install ALL .so files (stale modules cause serialization mismatches) +sudo cp bin/lib*.so /usr/local/lib/ && sudo ldconfig + +# Run all client process GPU tests +./bin/chimaera_gpu_client_process_tests + +# Run a specific test +./bin/chimaera_gpu_client_process_tests client_process_gpu_submit_local +``` + +**Note:** Tests 3–5 (`gpu_queue_attachment`, `gpu_submit_to_gpu`, `register_gpu_memory`) require the test binary to be compiled with CUDA/ROCm support. Without it, they skip gracefully. Tests 1–2 work with any build configuration. + +## Performance + +### GPU→GPU Task Round-Trip Latency + +The following latency was measured using the in-process benchmark (`bench_gpu_runtime`) with 1 client block × 1 active thread (sequential `AsyncGpuSubmit` + `Wait()` loop): + +| Allocator (`CHI_GPU_HEAP_T`) | Avg Latency | Throughput (1 thread) | Notes | +|---|---|---|---| +| `BuddyAllocator` (default) | ~200 µs | ~5,000 tasks/sec | Managed pool; per-allocation free; no exhaustion | +| `MallocAllocator` (device heap) | ~400 µs | ~2,500 tasks/sec | CUDA device `malloc()`/`free()`; serialized across warps | + +`BuddyAllocator` backed by `GpuMalloc` device memory is the default because it is 2× faster than CUDA device heap `malloc()` and does not exhaust memory across arbitrarily many tasks. + +### How Latency Is Measured + +The benchmark timer starts on the CPU immediately after `ResumeGpuOrchestrator()` and ends when the CPU polls `*d_done == 1` (set by the GPU after the last task completes). The formula is: + +``` +avg_latency = elapsed_ms * 1000 / total_tasks (µs per task) +``` + +This captures the full GPU→GPU round-trip per task: +1. 
`AsyncGpuSubmit`: serialize inputs into `FutureShm` ring buffer, push to `gpu2gpu_queue` +2. Orchestrator polls, dispatches `DispatchTask`: deserialize → `Run()` → serialize output +3. `FUTURE_COMPLETE` written with `__threadfence_system()` + system-scope atomic +4. `future.Wait()` spin exits, `RecvGpu` deserializes output + +**Note:** the first task's queue-push overhead is excluded because the GPU kernel starts before the CPU timer. CPU polling overhead (100 µs sleep per iteration) adds at most ~0.1 µs per task across 1,000 tasks and is negligible. + +### Memory Limits + +The default `GpuShmMmap` orchestrator scratch (`HSHM_DEFAULT_ALLOC_GPU_T = ArenaAllocator`) is reset after each completed task (`alloc->Reset()`), so it never exhausts regardless of task count. The `BuddyAllocator` heap (`CHI_GPU_HEAP_T`) reclaims memory individually as serialization vectors go out of scope. ## Troubleshooting @@ -373,6 +1265,12 @@ Usually caused by GPU stack overflow during deep serialization template chains. cudaDeviceSetLimit(cudaLimitStackSize, 131072); ``` +### CUDA Error 715 (Illegal Instruction) + +Caused by using system-scope atomics on pinned host memory, or by cross-library virtual function calls. Solutions: +- Use device-scope atomics (`atomicAdd`, `atomicOr`) with explicit `__threadfence_system()` at strategic points +- Ensure all GPU containers are allocated within the orchestrator's CUDA module context (the autogen system handles this) + ### Silent data corruption (kernel completes but wrong values) Check that **all** serialization functions are marked `HSHM_CROSS_FUN`. Without `__device__` annotation, NVCC's SFINAE dispatch silently skips serialization, consuming 0 bytes and misaligning the stream. The compiler produces no error. @@ -384,6 +1282,20 @@ The GPU producer and CPU consumer share the ring buffer. 
If the buffer fills, th - `RegisterGpuQueue` and `AssignGpuLanesToWorker` were called before the kernel launch - The copy space is large enough for the serialized payload +### CPU→GPU Wait() timeout + +If `future.Wait()` times out after `SendToGpu()`: +- Verify the GPU orchestrator is running (not paused) +- Verify the module's GPU container is registered (check for "Registered GPU container" log message) +- Verify the call site is compiled with `HSHM_ENABLE_CUDA=1` -- if compiled with `HSHM_ENABLE_CUDA=0`, the `Send()` GPU interceptor is absent and the task goes through normal CPU routing, which cannot complete a GPU-bound future + +### Client process GPU routing fails + +If a pure client process (`CHIMAERA_INIT(kClient, false)`) tries to use `LocalGpuBcast` or `ToLocalGpu` and gets empty futures: +- **Binary not compiled with GPU support**: `ClientInitGpuQueues()` is skipped, `cpu2gpu_queues_` is empty. Use `PoolQuery::Local()` as a fallback (task goes to the server's CPU handler). +- **Stale installed libraries**: After rebuilding with GPU queue extensions, **all** `.so` files must be reinstalled — not just `libchimaera_cxx.so`. Stale module `.so` files (e.g., `libchimaera_admin_runtime.so`) cause `ClientConnect` serialization mismatches because the server loads modules from `/usr/local/lib/`. Fix: `sudo cp /workspace/build/bin/lib*.so /usr/local/lib/ && sudo ldconfig` +- **Server has no GPUs**: `ClientConnect` returns `num_gpus_ == 0`, so no queues are attached. This is expected behavior. + ### Kernel launch failure (-201) Check `cudaGetLastError()` after the kernel launch. Common causes: @@ -397,3 +1309,11 @@ The CPU worker must be able to resolve `ShmPtr`s that point into GPU-allocated p CHI_IPC->RegisterGpuAllocator(backend_id, gpu_backend.data_, gpu_backend.data_capacity_); ``` + +### GPU orchestrator SM starvation + +The persistent orchestrator occupies all SMs (32 blocks by default). To launch other GPU kernels: +1. 
Call `CHI_IPC->PauseGpuOrchestrator()` +2. Launch your kernel on a separate stream +3. Synchronize your stream (not `cudaDeviceSynchronize`) +4. Call `CHI_IPC->ResumeGpuOrchestrator()` From 39947f31db4ff6bada880c46edfc194a6f2d08cf Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 9 Mar 2026 23:07:19 +0000 Subject: [PATCH 4/5] Add GPU CTE and GPU Infinite Memory (UVM) docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gpu-cte.md: documents GPU kernel integration with CTE — CPU fork-client setup, three-backend memory layout, IpcManagerGpuInfo, kernel-side AsyncPutBlob/GetBlob, routing table, and stop-flag polling pattern. gpu-inf-mem.md: documents the GpuShmMmap UVM backend — why UVM is chosen over pinned memory (no HostNativeAtomicSupported required), shm_init API, memory layout, kernel passing, IPC manager registration, per-block allocator construction, destruction rules, and comparison table against MallocBackend / PosixShmMmap / GpuMalloc. Co-Authored-By: Claude Sonnet 4.6 --- docs/sdk/context-transfer-engine/gpu-cte.md | 325 ++++++++++++++++++ .../context-transfer-engine/gpu-inf-mem.md | 161 +++++++++ 2 files changed, 486 insertions(+) create mode 100644 docs/sdk/context-transfer-engine/gpu-cte.md create mode 100644 docs/sdk/context-transfer-engine/gpu-inf-mem.md diff --git a/docs/sdk/context-transfer-engine/gpu-cte.md b/docs/sdk/context-transfer-engine/gpu-cte.md new file mode 100644 index 00000000..b62dbc9d --- /dev/null +++ b/docs/sdk/context-transfer-engine/gpu-cte.md @@ -0,0 +1,325 @@ +# GPU CTE API + +This guide describes how to call `AsyncPutBlob`, `AsyncGetBlob`, and +`AsyncGetOrCreateTag` from CUDA GPU kernels using the CTE (Context Transfer +Engine) client API. Two usage patterns are covered: + +1. **CPU-side with fork client** -- the simpler path where a CPU test process + launches a background Chimaera runtime and issues CTE calls from host code + using shared-memory buffers. +2. 
**GPU kernel-side** -- the advanced path where a CUDA `__global__` kernel + allocates tasks, submits them through the GPU-to-CPU queue, and waits for + completion entirely on-device. + +## Headers + +```cpp +#include +#include +#include +#include +#include + +// GPU memory backends (GPU kernel path only) +#include +#include +#include +``` + +## CPU-Side Usage (Fork Client) + +This is the recommended starting point. A fork client spawns a background +Chimaera runtime in a child process, then the parent issues CTE calls from +normal host code. Blob data is allocated in shared memory so the runtime +workers can access it directly. + +### 1. Initialize Chimaera + +```cpp +bool ok = chi::CHIMAERA_INIT(chi::ChimaeraMode::kClient, /*fork=*/true); +``` + +The second argument `true` forks a background runtime. Allow ~500 ms for the +child process to start before issuing tasks. + +### 2. Create the CTE Pool + +```cpp +chi::PoolId core_pool_id = wrp_cte::core::kCtePoolId; +wrp_cte::core::Client core_client(core_pool_id); + +wrp_cte::core::CreateParams params; +auto create_task = core_client.AsyncCreate( + chi::PoolQuery::Dynamic(), + wrp_cte::core::kCtePoolName, core_pool_id, params); +create_task.Wait(); +``` + +### 3. Register a Storage Target + +CTE needs at least one storage backend for blob data. + +```cpp +auto reg_task = core_client.AsyncRegisterTarget( + "/tmp/cte_gpu_test.dat", // path on disk + chimaera::bdev::BdevType::kFile, // file-backed target + 16 * 1024 * 1024, // 16 MB capacity + chi::PoolQuery::Local(), + chi::PoolId(700, 0)); // target pool ID +reg_task.Wait(); +``` + +### 4. Create a Tag + +Tags group blobs logically. The returned `tag_id_` is required by PutBlob +and GetBlob. + +```cpp +auto tag_task = core_client.AsyncGetOrCreateTag("my_gpu_tag"); +tag_task.Wait(); +wrp_cte::core::TagId tag_id = tag_task->tag_id_; +``` + +### 5. AsyncPutBlob + +Allocate a shared-memory buffer, fill it, then submit a PutBlob task. 
+ +```cpp +const size_t blob_size = 4096; + +// Allocate in shared memory (accessible to runtime workers) +hipc::FullPtr buf = CHI_IPC->AllocateBuffer(blob_size); +std::memset(buf.ptr_, 0xAB, blob_size); + +// Cast to a void ShmPtr for the CTE API +hipc::ShmPtr<> blob_data = buf.shm_.Cast(); + +auto put = core_client.AsyncPutBlob( + tag_id, + "my_blob", // blob name + /*offset=*/0, + /*size=*/blob_size, + blob_data, + /*score=*/-1.0f, // -1 = auto-place + wrp_cte::core::Context(), + /*flags=*/0, + chi::PoolQuery::Local()); +put.Wait(); +assert(put->GetReturnCode() == 0); +``` + +### 6. AsyncGetBlob + +Allocate an output buffer in shared memory and submit a GetBlob task. + +```cpp +hipc::FullPtr out_buf = CHI_IPC->AllocateBuffer(blob_size); +std::memset(out_buf.ptr_, 0, blob_size); +hipc::ShmPtr<> out_data = out_buf.shm_.Cast(); + +auto get = core_client.AsyncGetBlob( + tag_id, + "my_blob", + /*offset=*/0, + /*size=*/blob_size, + /*flags=*/0, + out_data, + chi::PoolQuery::Local()); +get.Wait(); +assert(get->GetReturnCode() == 0); +// out_buf.ptr_ now contains the retrieved blob data +``` + +## GPU Kernel-Side Usage + +For submitting CTE tasks directly from a CUDA `__global__` kernel, additional +setup is required. The GPU kernel uses `CHIMAERA_GPU_INIT` to set up per-thread +allocators, then calls `CHI_IPC->NewTask()` and `CHI_IPC->Send()` exactly +like host code. + +### Memory Backend Setup (Host Side) + +Three GPU memory backends must be initialized before launching the kernel: + +```cpp +// 1. Primary backend -- task object allocation (GpuShmMmap = UVM) +hipc::MemoryBackendId backend_id(20, 0); +hipc::GpuShmMmap gpu_backend; +gpu_backend.shm_init(backend_id, 10 * 1024 * 1024, "/my_gpu_cte", 0); + +// Register with the IPC manager so the GPU kernel can allocate from it +CHI_IPC->RegisterGpuAllocator( + backend_id, gpu_backend.data_, gpu_backend.data_capacity_); + +// 2. 
GPU-to-CPU backend -- FutureShm lives here (UVM, CPU+GPU visible)
hipc::MemoryBackendId g2c_backend_id(21, 0);
hipc::GpuShmMmap g2c_backend;
g2c_backend.shm_init(g2c_backend_id, 4 * 1024 * 1024, "/my_gpu_g2c", 0);

// 3. GPU heap backend -- device memory for serialization scratch
hipc::MemoryBackendId heap_backend_id(22, 0);
hipc::GpuMalloc gpu_heap_backend;
gpu_heap_backend.shm_init(heap_backend_id, 4 * 1024 * 1024, "/my_gpu_heap", 0);
```

### IpcManagerGpuInfo Setup (Host Side)

Pack the backends and queue pointer into a struct that the kernel receives
by value:

```cpp
chi::IpcManagerGpuInfo gpu_info;
gpu_info.backend =
    static_cast<hipc::MemoryBackend>(gpu_backend);
gpu_info.gpu2cpu_queue =
    CHI_IPC->GetGpuQueue(0); // pre-existing GPU-to-CPU queue
gpu_info.gpu2cpu_backend =
    static_cast<hipc::MemoryBackend>(g2c_backend);
gpu_info.gpu_heap_backend =
    static_cast<hipc::MemoryBackend>(gpu_heap_backend);
```

### Kernel Launch (Host Side)

Use a non-blocking CUDA stream to avoid serializing with the default stream
(important when a persistent GPU orchestrator kernel is also running):

```cpp
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

my_cte_kernel<<<1, 1, 0, stream>>>(gpu_info, pool_id, tag_id);

// Poll or synchronize as needed (see polling pattern below)
```

### GPU Kernel Implementation

```cpp
__global__ void my_cte_kernel(chi::IpcManagerGpuInfo gpu_info,
                              chi::PoolId pool_id,
                              wrp_cte::core::TagId tag_id) {
  // Initialize per-thread GPU allocators and IPC context
  CHIMAERA_GPU_INIT(gpu_info);

  // ---- GetOrCreateTag ----
  auto tag_task = CHI_IPC->NewTask<wrp_cte::core::GetOrCreateTagTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(), // route to CPU worker
      "my_gpu_tag",
      wrp_cte::core::TagId::GetNull());
  auto tag_future = CHI_IPC->Send(tag_task);
  tag_future.Wait();
  wrp_cte::core::TagId result_tag_id = tag_future->tag_id_;

  // ---- PutBlob ----
  const size_t blob_size = 4096;
  hipc::FullPtr<char> buf = CHI_IPC->AllocateBuffer(blob_size);
  
// Fill buffer on device...

  auto put_task = CHI_IPC->NewTask<wrp_cte::core::PutBlobTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(),
      result_tag_id,
      "my_blob",
      /*offset=*/0ULL,
      /*size=*/(chi::u64)blob_size,
      buf.shm_.Cast(),
      /*score=*/-1.0f,
      wrp_cte::core::Context(),
      /*flags=*/0U);
  auto put_future = CHI_IPC->Send(put_task);
  put_future.Wait();

  // ---- GetBlob ----
  hipc::FullPtr<char> out = CHI_IPC->AllocateBuffer(blob_size);
  auto get_task = CHI_IPC->NewTask<wrp_cte::core::GetBlobTask>(
      chi::CreateTaskId(),
      pool_id,
      chi::PoolQuery::ToLocalCpu(),
      result_tag_id,
      "my_blob",
      /*offset=*/0ULL,
      /*size=*/(chi::u64)blob_size,
      /*flags=*/0U,
      out.shm_.Cast());
  auto get_future = CHI_IPC->Send(get_task);
  get_future.Wait();
  // out.ptr_ now contains the retrieved data
}
```

### Routing from GPU Kernels

| PoolQuery | Direction | Description |
|-----------|-----------|-------------|
| `PoolQuery::ToLocalCpu()` | GPU to CPU | Task enters the `gpu2cpu_queue` and is processed by a CPU worker. Use this for CTE blob operations. |
| `PoolQuery::Local()` | GPU to GPU | Task stays on the GPU orchestrator. Use for GPU-native module methods. |
| `PoolQuery::LocalGpuBcast()` | CPU to GPU | CPU pushes a task to GPU orchestrator. Used from host code, not inside kernels. |

For CTE blob I/O, always use `PoolQuery::ToLocalCpu()` because the CTE
runtime (storage targets, metadata) runs on CPU workers.

### GPU Wait with Stop Flag

`future.Wait()` spins on the GPU until the CPU worker marks the future
complete. For robustness, use a manual poll loop with a CPU-controlled stop
flag so the host can terminate a stuck kernel:

```cpp
__global__ void safe_kernel(chi::IpcManagerGpuInfo gpu_info,
                            /* ... */,
                            volatile int *d_stop) {
  CHIMAERA_GPU_INIT(gpu_info);
  // ... NewTask + Send ... 
+ auto fshm_full = future.GetFutureShm(); + chi::FutureShm *fshm = fshm_full.ptr_; + while (fshm && !fshm->flags_.AnySystem(chi::FutureShm::FUTURE_COMPLETE)) { + // Bypass GPU L2 cache to see CPU-written stop flag + int stop = atomicAdd_system(const_cast(d_stop), 0); + if (stop) return; + HSHM_THREAD_MODEL->Yield(); + } +} +``` + +On the host side, allocate `d_stop` with `cudaMallocManaged` and set it to 1 +if the kernel times out. + +## Context Structure + +`wrp_cte::core::Context` controls compression and placement behavior. Pass a +default-constructed `Context()` for standard uncompressed I/O. + +Key fields: + +| Field | Default | Description | +|-------|---------|-------------| +| `dynamic_compress_` | 0 | 0=skip, 1=static, 2=dynamic compression | +| `compress_lib_` | 0 | Compression library index (0-10) | +| `compress_preset_` | 2 | 1=FAST, 2=BALANCED, 3=BEST | +| `min_persistence_level_` | 0 | 0=volatile, 1=temp-nonvolatile, 2=long-term | +| `score` (PutBlob param) | -1.0 | -1=auto, 0.0-1.0=explicit tier (higher=faster) | + +## CMake Integration + +GPU CTE kernels must be compiled with CUDA (`*.cu` files or +`set_source_files_properties(... LANGUAGE CUDA)`). Link against: + +```cmake +target_link_libraries(my_target + wrp_cte_core_client # CTE client library + chimaera_cxx # Chimaera runtime + hermes_shm_host # Shared memory primitives +) +``` + +Enable CUDA in the project: + +```cmake +enable_language(CUDA) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_ARCHITECTURES 70 80 90) +``` diff --git a/docs/sdk/context-transfer-engine/gpu-inf-mem.md b/docs/sdk/context-transfer-engine/gpu-inf-mem.md new file mode 100644 index 00000000..faf44158 --- /dev/null +++ b/docs/sdk/context-transfer-engine/gpu-inf-mem.md @@ -0,0 +1,161 @@ +# GPU Infinite Memory (UVM Backend) + +`GpuShmMmap` is IOWarp's **Unified Virtual Memory (UVM)** memory backend. 
+It allocates a single region with `cudaMallocManaged` so that both CPU +threads and GPU kernels can read and write the same bytes without any +explicit copy or synchronization primitive beyond a memory fence. + +This is the backbone of the GPU transport layer: `FutureShm` ring buffers, +task queues, and ArenaAllocator scratch regions all live in UVM memory so +a CPU worker and a GPU orchestrator kernel can exchange data at cache speed +rather than PCIe bandwidth. + +## Why UVM Instead of Pinned Memory + +Pinned host memory (`cudaMallocHost`) requires **system-scope atomics** +(`atomicAdd_system`, `atomicExch_system`) for GPU→CPU visibility. These +atomics in turn require `cudaDevAttrHostNativeAtomicSupported`, which is +`0` on most discrete/PCIe GPUs (e.g. RTX 4070, A100 over PCIe). Without +hardware support the operations silently fall back to slower paths and can +exhibit stale-read races. + +UVM with `cudaDevAttrConcurrentManagedAccess` (available on SM 6.0+ GPUs) +uses the GPU's hardware page-migration engine to maintain coherence, so: + +- **Standard device-scope atomics** on the GPU are visible to the CPU. +- **`std::atomic` on the CPU** is visible to the GPU. +- No `clflush`, write-combining flags, or `__threadfence_system` needed. + +| Feature | `cudaMallocHost` (pinned) | `cudaMallocManaged` (UVM) | +|---------|--------------------------|--------------------------| +| GPU→CPU atomics | `atomicAdd_system` required | device-scope sufficient | +| Hardware requirement | `HostNativeAtomicSupported` | `ConcurrentManagedAccess` | +| Cross-process sharing | Yes (via IPC handles) | No (single process only) | +| Free with | `cudaFreeHost` | `cudaFree` | +| Typical use | small control flags | ring buffers, allocators | + +## Header + +```cpp +#include +``` + +Requires `HSHM_ENABLE_CUDA=1` or `HSHM_ENABLE_ROCM=1`. 
+ +## Allocation + +```cpp +hipc::MemoryBackendId backend_id(100, 0); // (major, minor) — must be unique +hipc::GpuShmMmap backend; + +bool ok = backend.shm_init( + backend_id, + 32 * 1024 * 1024, // 32 MB total (headers + data) + "/my_uvm_region", // informational name — not a file path + 0); // GPU device ID (informational) + +assert(ok); +char *data = backend.data_; // usable data region +size_t capacity = backend.data_capacity_; // usable bytes (total − 8 KB headers) +``` + +`shm_init` calls `cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)` and +lays out the region as: + +``` +[ 4 KB backend header ][ 4 KB shared header ][ data ... ] +``` + +The two header pages are reserved for `MemoryBackendHeader` metadata; your +allocator or ring buffer should start from `data_`. + +## Passing to a GPU Kernel + +`GpuShmMmap` is a plain struct (no vtable, no host-only members) and can +be passed to a `__global__` kernel **by value** as part of +`IpcManagerGpuInfo`. The `data_` pointer is a CUDA-managed virtual +address valid on both host and device. + +```cpp +chi::IpcManagerGpuInfo gpu_info; +gpu_info.backend = static_cast(backend); +gpu_info.gpu2cpu_backend = static_cast(g2c_backend); +// ... set queue pointers ... + +my_kernel<<>>(gpu_info, ...); +``` + +Inside the kernel, `CHIMAERA_GPU_ORCHESTRATOR_INIT` (or `CHIMAERA_GPU_INIT` +for client kernels) reconstructs the per-block ArenaAllocator from +`gpu_info.backend`, partitioning the UVM region evenly across blocks. + +## Registering with the IPC Manager + +On the host side, register the backend so the CPU-side IPC manager can +resolve allocator IDs returned by the GPU: + +```cpp +CHI_IPC->RegisterGpuAllocator( + backend_id, + backend.data_, + backend.data_capacity_); +``` + +This records a mapping from `backend_id` → `(data_, capacity)` so that +`FutureShm` offsets written by a GPU thread can be dereferenced by a CPU +worker without an extra copy. 
+ +## Memory Layout Inside the Region + +After calling `shm_init`, build an ArenaAllocator (or any other +`HSHM_DEFAULT_ALLOC_GPU_T`) directly on top of the UVM region: + +```cpp +// Host side — initialize allocator +auto *alloc = reinterpret_cast(backend.data_); +new (alloc) HSHM_DEFAULT_ALLOC_GPU_T(); +alloc->shm_init(backend_id, nullptr, backend.data_, backend.data_capacity_); +``` + +The same bytes are then accessible from the GPU kernel as the +`gpu_alloc_table_[thread_id]` entry set up by `CHIMAERA_GPU_ORCHESTRATOR_INIT`. + +## Destruction + +```cpp +backend.shm_destroy(); // calls cudaFree internally +``` + +Do **not** call `cudaFreeHost` — the backing memory was allocated with +`cudaMallocManaged`, not `cudaMallocHost`. + +`shm_attach` is intentionally unsupported; UVM allocations are +process-local and cannot be imported by another PID. + +## Limitations + +| Limitation | Detail | +|-----------|--------| +| Single-process only | No `shm_attach`; cannot share across PIDs | +| Minimum size | Enforced to 1 MB (`kMinBackendSize`) | +| Requires CUDA/ROCm | Compiled out when neither flag is set | +| SM requirement | `ConcurrentManagedAccess` needed for coherent access; available on SM 6.0+ (Pascal and newer) | + +## Relation to Other Backends + +| Backend | Memory type | GPU accessible | Cross-process | +|---------|------------|---------------|--------------| +| `MallocBackend` | Host heap | No | No | +| `PosixShmMmap` | POSIX shm | No | Yes | +| `GpuShmMmap` | UVM (managed) | Yes (coherent) | No | +| `GpuMalloc` | Device DRAM | Yes (device-scope) | Yes (IPC handles) | + +For tasks that must be visible to CPU workers, use `GpuShmMmap`. For +opaque device-side scratch memory shared across processes, use `GpuMalloc`. 
+ +## CMake + +```cmake +target_link_libraries(my_target hermes_shm_cuda) +target_compile_definitions(my_target PRIVATE HSHM_ENABLE_CUDA=1) +``` From 16fcae76bb6604ecc75728243d62c9f5cc358938 Mon Sep 17 00:00:00 2001 From: lukemartinlogan Date: Mon, 9 Mar 2026 23:09:30 +0000 Subject: [PATCH 5/5] Fix gpu-inf-mem.md: document wrp_cte_uvm demand-paging module Replace placeholder UVM backend doc with accurate documentation of the GpuVirtualMemoryManager class in context-transfer-engine/uvm. Covers: - GpuVmmConfig fields and defaults - init/destroy lifecycle - touchPage / touchRange / touchPageAsync demand page-in - evictPage / evictPageAsync page-out to host RAM or CTE blob store - prefetch_window auto-prefetch - state queries (isMapped, isEvictedToHost, getMappedPageCount, ...) - separate transfer/compute stream model - CTE blob store backing option - full end-to-end example - CMake integration and hardware requirements Co-Authored-By: Claude Sonnet 4.6 --- .../context-transfer-engine/gpu-inf-mem.md | 254 ++++++++++-------- 1 file changed, 144 insertions(+), 110 deletions(-) diff --git a/docs/sdk/context-transfer-engine/gpu-inf-mem.md b/docs/sdk/context-transfer-engine/gpu-inf-mem.md index faf44158..4494f031 100644 --- a/docs/sdk/context-transfer-engine/gpu-inf-mem.md +++ b/docs/sdk/context-transfer-engine/gpu-inf-mem.md @@ -1,161 +1,195 @@ -# GPU Infinite Memory (UVM Backend) +# GPU Infinite Memory (UVM) -`GpuShmMmap` is IOWarp's **Unified Virtual Memory (UVM)** memory backend. -It allocates a single region with `cudaMallocManaged` so that both CPU -threads and GPU kernels can read and write the same bytes without any -explicit copy or synchronization primitive beyond a memory fence. +The `wrp_cte_uvm` module provides a **software-managed GPU demand-paging +system** built on the CUDA Driver API's virtual memory management (VMM) +primitives. 
It lets you reserve an enormous virtual address space (up to +512 GB by default) on the GPU while backing only the pages that are +actually touched with physical device memory. Pages that have not been +accessed yet consume no physical memory; pages that are evicted are saved to +host RAM (or to a CTE blob store) and transparently restored on next access. -This is the backbone of the GPU transport layer: `FutureShm` ring buffers, -task queues, and ArenaAllocator scratch regions all live in UVM memory so -a CPU worker and a GPU orchestrator kernel can exchange data at cache speed -rather than PCIe bandwidth. +## Headers -## Why UVM Instead of Pinned Memory - -Pinned host memory (`cudaMallocHost`) requires **system-scope atomics** -(`atomicAdd_system`, `atomicExch_system`) for GPU→CPU visibility. These -atomics in turn require `cudaDevAttrHostNativeAtomicSupported`, which is -`0` on most discrete/PCIe GPUs (e.g. RTX 4070, A100 over PCIe). Without -hardware support the operations silently fall back to slower paths and can -exhibit stale-read races. +```cpp +#include +``` -UVM with `cudaDevAttrConcurrentManagedAccess` (available on SM 6.0+ GPUs) -uses the GPU's hardware page-migration engine to maintain coherence, so: +Link against `wrp_cte_uvm`. Requires `WRP_CORE_ENABLE_CUDA=ON`. -- **Standard device-scope atomics** on the GPU are visible to the CPU. -- **`std::atomic` on the CPU** is visible to the GPU. -- No `clflush`, write-combining flags, or `__threadfence_system` needed. 
+## Core Concepts -| Feature | `cudaMallocHost` (pinned) | `cudaMallocManaged` (UVM) | -|---------|--------------------------|--------------------------| -| GPU→CPU atomics | `atomicAdd_system` required | device-scope sufficient | -| Hardware requirement | `HostNativeAtomicSupported` | `ConcurrentManagedAccess` | -| Cross-process sharing | Yes (via IPC handles) | No (single process only) | -| Free with | `cudaFreeHost` | `cudaFree` | -| Typical use | small control flags | ring buffers, allocators | +| Concept | Detail | +|---------|--------| +| Virtual address space | Reserved once with `cuMemAddressReserve`; no physical cost | +| Page size | 2 MB default, auto-aligned to GPU hardware granularity | +| Physical backing | Allocated on-demand per page with `cuMemCreate` + `cuMemMap` | +| Eviction target | Host pinned RAM (`cudaMallocHost`) or CTE blob store | +| Thread safety | All public methods protected by `std::mutex` | +| Async support | Separate transfer and compute CUDA streams | -## Header +## Configuration ```cpp -#include +wrp_cte::uvm::GpuVmmConfig cfg; +cfg.va_size_bytes = 512ULL * 1024 * 1024 * 1024; // 512 GB virtual space +cfg.page_size = 2 * 1024 * 1024; // 2 MB pages +cfg.fill_value = 0; // newly-mapped page fill +cfg.device = 0; // CUDA device ordinal +cfg.prefetch_window = 0; // pages to auto-touch ahead +cfg.use_cte = false; // use CTE blob store for eviction ``` -Requires `HSHM_ENABLE_CUDA=1` or `HSHM_ENABLE_ROCM=1`. +All fields have defaults; a zero-initialized `GpuVmmConfig` is valid and +uses 512 GB / 2 MB pages / device 0 / host-RAM backing. 
-## Allocation +## Initialization and Teardown ```cpp -hipc::MemoryBackendId backend_id(100, 0); // (major, minor) — must be unique -hipc::GpuShmMmap backend; - -bool ok = backend.shm_init( - backend_id, - 32 * 1024 * 1024, // 32 MB total (headers + data) - "/my_uvm_region", // informational name — not a file path - 0); // GPU device ID (informational) - -assert(ok); -char *data = backend.data_; // usable data region -size_t capacity = backend.data_capacity_; // usable bytes (total − 8 KB headers) +wrp_cte::uvm::GpuVirtualMemoryManager vmm; + +vmm.init(cfg); // reserve VA, create streams, verify hardware granularity +// ... use vmm ... +vmm.destroy(); // unmap all pages, free host backing, release VA range ``` -`shm_init` calls `cudaMallocManaged(&ptr, size, cudaMemAttachGlobal)` and -lays out the region as: +`init` validates that `page_size` is a multiple of the GPU's hardware +allocation granularity (queried with `cuMemGetAllocationGranularity`). It +fails if the device does not support virtual memory management. -``` -[ 4 KB backend header ][ 4 KB shared header ][ data ... ] -``` +## Demand Paging — Page In -The two header pages are reserved for `MemoryBackendHeader` metadata; your -allocator or ring buffer should start from `data_`. +```cpp +// Touch a single 2 MB page (zero-based page index) +vmm.touchPage(page_idx); -## Passing to a GPU Kernel +// Touch all pages that cover a byte range +vmm.touchRange(byte_offset, byte_length); -`GpuShmMmap` is a plain struct (no vtable, no host-only members) and can -be passed to a `__global__` kernel **by value** as part of -`IpcManagerGpuInfo`. The `data_` pointer is a CUDA-managed virtual -address valid on both host and device. 
+// Non-blocking touch on the internal transfer stream +vmm.touchPageAsync(page_idx); +vmm.syncTransfer(); // wait for async touches +``` + +On first touch, `touchPage` calls `cuMemCreate` to allocate a 2 MB physical +chunk, maps it into the reserved VA with `cuMemMap` + `cuMemSetAccess`, then +launches a fill kernel. If the page was previously evicted, the saved host +buffer (or CTE blob) is copied back to the device instead. + +## Eviction — Page Out ```cpp -chi::IpcManagerGpuInfo gpu_info; -gpu_info.backend = static_cast(backend); -gpu_info.gpu2cpu_backend = static_cast(g2c_backend); -// ... set queue pointers ... +// Evict a single page to host RAM (synchronous D2H copy) +vmm.evictPage(page_idx); -my_kernel<<>>(gpu_info, ...); +// Async eviction — D2H copy queued on transfer stream +vmm.evictPageAsync(page_idx); +vmm.syncTransfer(); ``` -Inside the kernel, `CHIMAERA_GPU_ORCHESTRATOR_INIT` (or `CHIMAERA_GPU_INIT` -for client kernels) reconstructs the per-block ArenaAllocator from -`gpu_info.backend`, partitioning the UVM region evenly across blocks. +Eviction: +1. Copies the 2 MB page to a `cudaMallocHost` buffer (or `AsyncPutBlob` + when `use_cte=true`). +2. Unmaps the page from the VA range (`cuMemUnmap`). +3. Releases the physical allocation (`cuMemRelease`). -## Registering with the IPC Manager +After eviction the virtual address is still valid but accessing it from a +GPU kernel will fault. The next `touchPage` call restores the data. 
-On the host side, register the backend so the CPU-side IPC manager can -resolve allocator IDs returned by the GPU: +## Prefetching + +Set `cfg.prefetch_window = N` to automatically touch the next `N` pages +whenever a page is touched: ```cpp -CHI_IPC->RegisterGpuAllocator( - backend_id, - backend.data_, - backend.data_capacity_); +cfg.prefetch_window = 3; // touching page P also touches P+1, P+2, P+3 async ``` -This records a mapping from `backend_id` → `(data_, capacity)` so that -`FutureShm` offsets written by a GPU thread can be dereferenced by a CPU -worker without an extra copy. - -## Memory Layout Inside the Region +Prefetch touches are issued on the transfer stream and do not block the +caller. -After calling `shm_init`, build an ArenaAllocator (or any other -`HSHM_DEFAULT_ALLOC_GPU_T`) directly on top of the UVM region: +## Querying State ```cpp -// Host side — initialize allocator -auto *alloc = reinterpret_cast(backend.data_); -new (alloc) HSHM_DEFAULT_ALLOC_GPU_T(); -alloc->shm_init(backend_id, nullptr, backend.data_, backend.data_capacity_); +void *base = vmm.getBasePtr(); // VA range start +size_t page_size = vmm.getPageSize(); +size_t total = vmm.getTotalPages(); +size_t mapped = vmm.getMappedPageCount(); // pages with physical backing +size_t evicted = vmm.getEvictedPageCount(); // pages saved to host RAM + +bool is_mapped = vmm.isMapped(page_idx); +bool is_evicted = vmm.isEvictedToHost(page_idx); + +// Device pointer to the start of page N +void *page_ptr = vmm.getPagePtr(page_idx); ``` -The same bytes are then accessible from the GPU kernel as the -`gpu_alloc_table_[thread_id]` entry set up by `CHIMAERA_GPU_ORCHESTRATOR_INIT`. +## CTE Blob Store Backing + +When `cfg.use_cte = true` the module uses CTE `AsyncPutBlob` / `AsyncGetBlob` +for eviction instead of host pinned RAM. This lets evicted pages survive +process restart and be loaded from a persistent storage tier. + +Requires the CTE pool to be initialized before calling `vmm.init()`. 
The +module creates one blob per page named by its index. -## Destruction +## Stream Management ```cpp -backend.shm_destroy(); // calls cudaFree internally +cudaStream_t xfer = vmm.getTransferStream(); // D2H / H2D copies +cudaStream_t comp = vmm.getComputeStream(); // kernel launches + +vmm.syncTransfer(); // cudaStreamSynchronize(transfer_stream_) +vmm.syncCompute(); // cudaStreamSynchronize(compute_stream_) ``` -Do **not** call `cudaFreeHost` — the backing memory was allocated with -`cudaMallocManaged`, not `cudaMallocHost`. +Use `getComputeStream()` for kernels that read or write demand-paged memory +so they are ordered after any in-flight page-in operations on the transfer +stream. + +## Full Example + +```cpp +#include + +// 1. Configure a 4 GB virtual address space with 2 MB pages +wrp_cte::uvm::GpuVmmConfig cfg; +cfg.va_size_bytes = 4ULL * 1024 * 1024 * 1024; +cfg.prefetch_window = 2; -`shm_attach` is intentionally unsupported; UVM allocations are -process-local and cannot be imported by another PID. +wrp_cte::uvm::GpuVirtualMemoryManager vmm; +vmm.init(cfg); -## Limitations +// 2. Touch page 0 — physical memory allocated, filled with cfg.fill_value +vmm.touchPage(0); -| Limitation | Detail | -|-----------|--------| -| Single-process only | No `shm_attach`; cannot share across PIDs | -| Minimum size | Enforced to 1 MB (`kMinBackendSize`) | -| Requires CUDA/ROCm | Compiled out when neither flag is set | -| SM requirement | `ConcurrentManagedAccess` needed for coherent access; available on SM 6.0+ (Pascal and newer) | +// 3. Write custom data via a GPU kernel +int *page0 = reinterpret_cast(vmm.getPagePtr(0)); +writeKernel<<<1, 256, 0, vmm.getComputeStream()>>>(page0, 42, vmm.getPageSize()); +vmm.syncCompute(); -## Relation to Other Backends +// 4. 
Evict page 0 to free device memory +vmm.evictPage(0); +// physical memory is now released; VA still valid -| Backend | Memory type | GPU accessible | Cross-process | -|---------|------------|---------------|--------------| -| `MallocBackend` | Host heap | No | No | -| `PosixShmMmap` | POSIX shm | No | Yes | -| `GpuShmMmap` | UVM (managed) | Yes (coherent) | No | -| `GpuMalloc` | Device DRAM | Yes (device-scope) | Yes (IPC handles) | +// 5. Re-touch page 0 — data restored from host backing +vmm.touchPage(0); +// page0 now contains the data written in step 3 -For tasks that must be visible to CPU workers, use `GpuShmMmap`. For -opaque device-side scratch memory shared across processes, use `GpuMalloc`. +vmm.destroy(); +``` -## CMake +## CMake Integration ```cmake -target_link_libraries(my_target hermes_shm_cuda) -target_compile_definitions(my_target PRIVATE HSHM_ENABLE_CUDA=1) +find_package(wrp_cte REQUIRED) + +target_link_libraries(my_target PRIVATE wrp_cte_uvm) +enable_language(CUDA) +set_target_properties(my_target PROPERTIES CUDA_SEPARABLE_COMPILATION ON) ``` + +## Hardware Requirements + +- CUDA 10.2+ (virtual memory management API) +- GPU with driver-side virtual memory support (`CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`) +- SM 7.0+ (Volta) recommended for best performance