rwilliamspbg-ops · rwilliamspbg-ops · May 30, 2026 · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
@@ -0,0 +1,31 @@
+FROM ubuntu:24.04
+
+# Keep apt non-interactive during the build and in post-create scripts.
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Toolchain and Python headers needed to build liboqs and its Python wrapper.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential ca-certificates cmake curl git libffi-dev libssl-dev \
+    pkg-config python3 python3-dev python3-pip python3-venv wget && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install liboqs from source as a shared library: the Python wrapper loads
+# liboqs dynamically, and liboqs builds a static archive by default.
+WORKDIR /opt
+RUN git clone --depth 1 https://github.com/open-quantum-safe/liboqs.git && \
+    cmake -S liboqs -B liboqs/build -DCMAKE_BUILD_TYPE=Release \
+      -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=/usr/local && \
+    cmake --build liboqs/build --parallel "$(nproc)" && \
+    cmake --install liboqs/build && \
+    ldconfig
+
+# Ubuntu 24.04 marks the system interpreter as externally managed (PEP 668),
+# so pip refuses to install into it. Use a virtualenv that is first on PATH.
+ENV VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:$PATH \
+    OQS_INSTALL_PATH=/usr/local
+RUN python3 -m venv "$VIRTUAL_ENV" && \
+    python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    python3 -m pip install --no-cache-dir liboqs-python
+
+WORKDIR /workspace
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -0,0 +1,11 @@
+{
+  "name": "Mohawk Inference Devcontainer",
+  "build": {
+    "dockerfile": "Dockerfile"
+  },
+  "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind",
+  "workspaceFolder": "/workspace",
+  "customizations": { "vscode": { "settings": {}, "extensions": [] } },
+  "forwardPorts": [8003],
+  "postCreateCommand": "bash .devcontainer/post_create.sh"
+}
diff --git a/.devcontainer/post_create.sh b/.devcontainer/post_create.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+echo "Running devcontainer post-create: install build deps and liboqs"
+# install system deps (attempt apt, then apk)
+if command -v apt-get >/dev/null 2>&1; then
+  sudo apt-get update
+  sudo apt-get install -y build-essential cmake git python3-dev python3-pip pkg-config
+elif command -v apk >/dev/null 2>&1; then
+  sudo apk add --no-cache build-base cmake git python3 python3-dev py3-pip pkgconfig
+else
+  echo "Unknown package manager; please install build tools (cmake, make, git, python3-dev) manually"
+fi
+
+CACHE_DIR="$HOME/.cache/liboqs"
+mkdir -p "$CACHE_DIR"
+if [ ! -d "$CACHE_DIR/liboqs" ]; then
+  git clone --depth 1 https://github.com/open-quantum-safe/liboqs.git "$CACHE_DIR/liboqs"
+fi
+
+pushd "$CACHE_DIR/liboqs"
+mkdir -p build && cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make -j"$(nproc)"
+if command -v sudo >/dev/null 2>&1; then
+  sudo make install
+else
+  make install
+fi
+popd
+
+# ensure pip and install oqs python package
+python3 -m pip install --upgrade pip || true
+python3 -m pip install oqs || true
+
+echo "post-create complete"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,53 @@
+name: CI
+
+on:
+  push:
+    branches: [ main, feat/* ]
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:
+    inputs:
+      build_liboqs:
+        description: Build liboqs from source before running tests
+        required: false
+        default: false
+        type: boolean
+
+# The job only needs to read the repository; keep GITHUB_TOKEN read-only.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    env:
+      # Install prefix the Python bindings use to locate the liboqs shared library.
+      OQS_INSTALL_PATH: /usr/local
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+      - name: Install system deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential cmake libssl-dev pkg-config
+      # Runs on a manual dispatch with build_liboqs enabled, or whenever the BUILD_LIBOQS variable is 'true'.
+      - name: Optionally build liboqs
+        if: (github.event_name == 'workflow_dispatch' && inputs.build_liboqs) || vars.BUILD_LIBOQS == 'true'
+        run: |
+          git clone --depth 1 https://github.com/open-quantum-safe/liboqs.git /tmp/liboqs
+          mkdir -p /tmp/liboqs/build && cd /tmp/liboqs/build
+          cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=/usr/local ..
+          make -j"$(nproc)"
+          sudo make install
+          sudo ldconfig
+          python -m pip install liboqs-python
+      - name: Install Python deps
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r prototype/requirements.txt
+      - name: Run tests
+        run: |
+          pytest -q
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,14 @@
+# Python bytecode and packaging output
+__pycache__/
+*.pyc
+*.egg-info/
+build/
+dist/
+
+# Virtualenvs and local environment files
+.venv/
+.env
+
+# Tool and editor state (.pytest_cache/ matches at any depth, including the root)
+.pytest_cache/
+.vscode/
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
@@ -0,0 +1,125 @@
+Mohawk Inference Engine — Architecture Spec
+
+Overview
+
+Goal: provide a production-grade inference engine that enables capabilities LM Studio does not: multi-device layer splitting, PQC-secured edge offload, and high-concurrency session management. This document describes the core subsystems, dataflows, APIs, security model, and implementation priorities for an MVP.
+
+1. Core concepts
+
+- Layer-splitting: partitioning a neural network at layer boundaries (or sub-layer blocks) so different partitions (slices) execute on different devices (GPU/NPU/CPU/edge). Each slice exposes a small runtime ABI for input/output activation tensors and metadata.
+- Offload: the act of sending one or more slices to a remote device for execution. Offloads must preserve confidentiality/integrity of model IP (weights) and activations as required by policy.
+- PQC-secured channel: post-quantum cryptography handshake + authenticated encryption for slice packages and RPC traffic.
+- Session manager: long-lived controller that maps client sessions to slice placements, manages QoS, adaptive batching, autoscaling, and failure recovery.
+
+2. High-level architecture
+
+Components:
+- Controller (central or local): plans partitioning, placement, and routes requests to workers.
+- Worker runtime: lightweight process on each device that accepts slice packages, registers capabilities (memory, device type), and executes slices.
+- Offload transport: secure RPC over TCP/QUIC with PQC handshake and integrity checks.
+- Session Manager: receives client requests, handles session state, batching, and QoS rules.
+- Scheduler: maps slices to workers, performs placement decisions using cost model and current telemetry.
+- Persistence: key/value store for slice metadata, session state, and logs (can be local filesystem or etcd for distributed setups).
+
+3. Layer-splitting design
+
+3.1 Partitioning model
+- Static split: for MVP, support deterministic splits at transformer block or attention/MLP block granularity. Input: model graph (ONNX, TorchScript), cost model, device inventory. Output: ordered list of slices with boundary tensor shapes and serialization descriptors.
+- Dynamic split (future): runtime re-partitioning based on latency/throughput signals.
+
+3.2 Slice format
+- Metadata: slice id, inputs/outputs shapes, parameter size, expected memory footprint, device hints, version, policy tags (private/public).
+- Artifact: serialized weights in compact format (FP16/int8 quantized optional) + small runtime glue to map tensor ops.
+- Transport container: authenticated envelope (PQC AEAD) + optional compression.
+
+3.3 Runtime ABI
+- Execute(slice_id, input_tensor, trace_id) -> output_tensor, metrics
+- Health(check) -> status
+- Preload(slice_id) -> ack
+
+3.4 Scheduling and placement
+- Cost model inputs: parameter size, compute FLOPs per-token, estimated activation sizes, device throughput and free memory, network latency.
+- Heuristics for MVP: place compute-heavy contiguous slices on GPU if available; place small parameter slices on CPU to lower memory duplication; prefer colocated slices to reduce network hops.
+- Backpressure: if a worker is loaded, controller routes slice to alternate worker or falls back to local execution.
+
+4. PQC-secured edge offload
+
+4.1 Security goals
+- Confidentiality of slice weights when policy requires (IP protection).
+- Integrity of slice artifacts and runtime RPCs.
+- Forward-secure key exchange resistant to quantum-capable adversaries.
+
+4.2 Keyflows and handshakes
+- Root authority: operator provides long-term signing key (classical/ECDSA) for worker identity; optionally use hardware TPM for key storage.
+- Session handshake: use a PQC KEM (e.g., Kyber or later NIST standard) to establish ephemeral symmetric AEAD keys per connection. Steps:
+  1. Controller/worker exchange identity-signed certificates (classical) and PQC KEM public values.
+  2. Both sides derive AEAD keys via HKDF over KEM shared secret and transcript.
+  3. Optionally request a remote attestation token before accepting slices (attestation hooks, e.g., Intel SGX, AMD SEV, or MDS attestation APIs).
+
+4.3 Slice packaging & integrity
+- Each slice package: {manifest, weights.blob, signature, version}
+- Manifest contains policy tags; controller encrypts package with AEAD key and includes HMAC/signature for extra assurance.
+- Workers verify signature + AEAD before load.
+
+4.4 Performance considerations
+- PQC KEM handshake cost is paid per long-lived connection; reuse AEAD keys for multiple RPCs.
+- For high-throughput edge fleets, pre-provision slice packages to workers via provisioning channel to avoid repeated KEM costs.
+
+5. Session manager
+
+5.1 API (gRPC/HTTP)
+- StartSession(request {model, routingHints, qos, tenant}) -> session_id
+- Infer(session_id, input, options {sync|async}) -> response stream or token
+- EndSession(session_id)
+- GetSessionStats(session_id) -> metrics
+
+5.2 Session lifecycle
+- Session creation: controller allocates slices, populates placement plan, preloads prioritized slices on workers, returns session token.
+- Execution path: client -> session manager -> controller splits request across slices -> workers execute in pipeline -> session manager aggregates outputs.
+- Adaptive batching: session manager groups small inferences into micro-batches per slice based on configured latency budgets.
+
+5.3 QoS and isolation
+- Per-session resource caps (max concurrency, token rate).
+- Tenant isolation: per-tenant slice caching and optional model duplication flags.
+- Fair queuing or priority queues for low-latency sessions.
+
+6. Telemetry & metrics
+- Per-slice metrics: exec latency, memory usage, throughput, error rate.
+- Per-worker metrics: GPU util, free memory, network RTT, connection counts.
+- Per-session metrics: p50/p95/p99 latencies, batch sizes, tokens/sec.
+- Emit via Prometheus metrics endpoint and structured traces (OpenTelemetry) for tracing across slices.
+
+7. Failure modes and fallbacks
+- Worker failure: controller reroutes to alternate worker or triggers local fallback (single-node execution). Evict/restore policy for preloaded slices.
+- Network partition: fall back to local execution when possible; if offload required, return graceful degradation messages to client.
+- Mismatched versions: use manifest version checks to prevent executing incompatible slices.
+
+8. Interfaces & data formats
+- Model ingestion: accept ONNX and TorchScript (MVP) with translator that enumerates layer boundaries.
+- Slice artifact: gzipped protobuf or tar with manifest.json and weights.bin.
+- RPC: gRPC over QUIC (preferred) or HTTP/2 with AEAD wrapper.
+
+9. Testing & benchmarks
+- Unit tests: correctness of slice outputs vs baseline single-node for a suite of models.
+- Integration tests: end-to-end run across two devices (GPU + CPU) validating activations and outputs.
+- Load tests: simulate 1k concurrent sessions with synthetic clients, measure p95 latency and throughput.
+- Security tests: verify PQC handshake, replay protection, and attestation flows.
+
+10. MVP milestones and deliverables
+- Week 0–1: architecture doc, slice format, and prototype plan. (this doc)
+- Week 1–2: implement controller + worker minimal runtime and static partitioner that accepts a small transformer and emits slices.
+- Week 2–3: add PQC handshake, encrypted slice transport, and pre-provisioning flow.
+- Week 3–4: session manager with adaptive batching and basic QoS; run 1k simulated sessions.
+- Week 4–5: integration tests, telemetry dashboard, readme hero docs, and release prep.
+
+11. Open questions
+- Target PQC primitives (CRYSTALS-Kyber, standardized by NIST as ML-KEM in FIPS 203; choose the current NIST-recommended parameter set). Decide whether to include hybrid classical+PQC key exchange.
+- Attestation strategy for diverse edge hardware — what minimal attestation APIs should we support for MVP?
+- Benchmark targets: supply representative hardware profiles to set realistic throughput/latency goals.
+
+Appendix: quick dataflow
+1. `StartSession` -> controller computes split plan -> preloads slices to assigned workers (encrypted transfer).
+2. Client sends `Infer` -> session manager pipelines activations across workers over secure channels.
+3. Workers return outputs and metrics -> session manager aggregates and returns response.
+
+Next steps: implement the static partitioner and minimal worker runtime (Week 1 task).
diff --git a/docs/PQC_INTEGRATION.md b/docs/PQC_INTEGRATION.md
@@ -0,0 +1,45 @@
+liboqs (pyOQS) integration notes
+
+Goal: Replace the placeholder X25519-only `PQCAdapter` with a hybrid KEM based on liboqs (e.g., Kyber) + X25519.
+
+High level steps:
+
+1. Install native liboqs and Python bindings (pyOQS).
+   - On Ubuntu (example):
+     ```bash
+     sudo apt-get update
+     sudo apt-get install -y build-essential cmake libssl-dev pkg-config
+     # Build and install liboqs from source (follow liboqs README)
+     git clone --branch main https://github.com/open-quantum-safe/liboqs.git
+     cd liboqs
+     mkdir build && cd build
+     cmake -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX=/usr/local ..
+     make -j$(nproc)
+     sudo make install && sudo ldconfig
+
+     # Install the Python bindings that import as `oqs`
+     pip install liboqs-python
+     ```
+   - Alternatively use your distribution's packages or a prepared devcontainer that installs liboqs.
+   - Set `OQS_INSTALL_PATH=/usr/local` when using a local source install so the binding can find the shared library.
+
+2. Update `prototype/crypto.py` to perform a proper KEM exchange during handshake:
+   - Controller: send X25519 pub + OQS pub to worker.
+   - Worker: encapsulate to controller's OQS pub -> return encapsulation ciphertext + worker OQS pub.
+   - Controller: decapsulate ciphertext to obtain OQS shared secret.
+   - Final symmetric AEAD key = HKDF(X25519_shared || OQS_shared)
+   - The current binding in this workspace exposes `oqs.KeyEncapsulation`, `generate_keypair()`, `encap_secret()`, and `decap_secret()`.
+
+3. Tests & validation:
+   - Run `pytest -q prototype/test_oqs_hybrid.py prototype/test_secure_hybrid_integration.py prototype/test_concurrency_smoke.py`.
+   - Use `prototype/test_secure_run.py` as a quick smoke script when you want a single-session end-to-end check.
+   - Ensure the worker `/handshake` returns `worker_oqs_pub_b64` and `worker_pub_b64` when liboqs is available.
+
+Notes:
+- The repository already contains scaffolding in `prototype/crypto.py` to detect pyOQS at runtime and expose `get_oqs_public()`; complete integration requires invoking `kem.encap_secret()` and `kem.decap_secret()` where appropriate.
+- Building liboqs on CI requires adding native build steps in the pipeline; consider a GitHub Actions matrix job with a prebuilt liboqs artifact or using a self-hosted runner.
+- The CI workflow includes a manual `workflow_dispatch` trigger that can build liboqs from source when `build_liboqs` is enabled.
+
+Possible follow-ups:
+- Implement the full handshake KEM flow (controller encapsulate/decapsulate and worker encapsulate) once installing `pyOQS` in the devcontainer/CI is confirmed to be acceptable, or
+- Prepare a PR that adds devcontainer Dockerfile steps to install liboqs so the full integration can run in the devcontainer.
diff --git a/docs/SCOPE.md b/docs/SCOPE.md
@@ -0,0 +1,18 @@
+Scope & Success Criteria
+
+Target: platform and infrastructure engineers, MLOps teams, and edge fleet operators who need production-grade inference beyond single-node setups.
+
+MVP capabilities:
+- Multi-device layer splitting: demonstrate partitioning a medium-sized transformer across GPU and CPU with deterministic correctness and end-to-end inference.
+- Secure edge offload: implement PQC-based encryption and integrity checks for offloaded model slices and communications.
+- High-concurrency session management: support 1k+ concurrent lightweight sessions with per-session QoS and adaptive batching.
+
+Success metrics:
+- Correctness: identical outputs (within numerical tolerance) compared to single-node baseline for partitioned runs.
+- Performance: 2× throughput improvement for target hardware when split across devices (measured on prototype hardware), and p95 latency within the target SLA for at least 95% of sessions.
+- Security: PQC handshake and slice integrity checks complete within acceptable overhead (<20% added latency in offload path) and keys/telemetry never expose raw weights.
+
+Out of scope for MVP:
+- Full production orchestration (K8s operators) and UI consoles — focus is on core engine, APIs, and integrations.
+
+Next: architecture spec covering layer-splitting algorithm, PQC keyflows, and session manager APIs.
diff --git a/prototype/README_PROTOTYPE.md b/prototype/README_PROTOTYPE.md
@@ -0,0 +1,31 @@
+Prototype demo
+
+This prototype demonstrates a minimal multi-device layer-splitting demo using a toy model. It simulates two workers (FastAPI) that accept slice preload and execution.
+
+Quickstart:
+
+1. Install dependencies:
+
+```bash
+python -m pip install -r prototype/requirements.txt
+```
+
+2. Start two workers in separate terminals (secure worker available):
+
+```bash
+# insecure worker (no encryption)
+python prototype/worker.py --port 8001
+# secure worker (handshake + AEAD) listens on a separate port
+python prototype/worker_secure.py --port 8003
+```
+
+3. Run the demo:
+
+```bash
+python prototype/run_demo.py
+```
+
+Notes:
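+- This is a functional prototype illustrating partitioning, preload, and remote execution. It uses pickle-serialized weights and inputs for simplicity; because unpickling untrusted data can execute arbitrary code, run the workers only on trusted networks.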
+- A secure path using X25519 + optional liboqs hybrid KEM is scaffolded in [prototype/crypto.py](crypto.py) and [prototype/worker_secure.py](worker_secure.py). To enable full hybrid PQC tests, install native liboqs plus the Python binding and set `OQS_INSTALL_PATH=/usr/local` (see [docs/PQC_INTEGRATION.md](../docs/PQC_INTEGRATION.md)).
+- The in-process integration tests can be run with `pytest -q prototype/test_secure_hybrid_integration.py prototype/test_concurrency_smoke.py` once the environment is prepared.
diff --git a/prototype/__init__.py b/prototype/__init__.py