Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion gpu_test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ def _cleanup_orphans(self) -> None:

def _launch(self) -> None:
"""Find the cheapest suitable offer and launch an instance."""
query = f"num_gpus=1 rentable=True rented=False compute_cap>=700 dph<={MAX_COST_PER_HOUR}"
query = (
f"num_gpus=1 rentable=True rented=False compute_cap>=700"
f" reliability2>=0.95 inet_up>=100 dph<={MAX_COST_PER_HOUR}"
)
offers = self.sdk.search_offers(query=query, order="dph", limit=5)

if not offers:
Expand Down
155 changes: 155 additions & 0 deletions gpu_test/test_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import TYPE_CHECKING

import numpy as np
import pytest

if TYPE_CHECKING:
Expand Down Expand Up @@ -583,3 +584,157 @@ def test_float_to_int_conversion(kernel_runner: KernelRunner) -> None:
forth_source=("\\! kernel main\n\\! param DATA i64[256]\n7.9 F>S\n0 CELLS DATA + !"),
)
assert result[0] == 7


# --- Attention ---

# Forth source for a naive causally-masked scaled-dot-product-attention
# kernel.  Templated via str.format: {n} is the element count of each of the
# Q/K/V/O buffers and {seq_len} sizes the shared-memory SCORES/SCRATCH arrays.
# Launch layout (see the tests below): one block per query row (BID-X), one
# thread per key position (TID-X).  The softmax max- and sum-reductions are
# done serially by thread 0 between BARRIERs, and the output row is
# accumulated with a thread stride of BDIM-X over the head_dim columns.
_ATTENTION_KERNEL = """\
\\! kernel attention
\\! param Q f64[{n}]
\\! param K f64[{n}]
\\! param V f64[{n}]
\\! param O f64[{n}]
\\! param SEQ_LEN i64
\\! param HEAD_DIM i64
\\! shared SCORES f64[{seq_len}]
\\! shared SCRATCH f64[{seq_len}]
BID-X
TID-X
0.0
HEAD_DIM 0 DO
2 PICK HEAD_DIM * I + CELLS Q + F@
2 PICK HEAD_DIM * I + CELLS K + F@
F* F+
LOOP
HEAD_DIM S>F FSQRT F/
OVER 3 PICK >
IF DROP -1.0e30 THEN
OVER CELLS SCORES + SF!
BARRIER
TID-X 0= IF
0 CELLS SCORES + SF@
SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F- FEXP
OVER CELLS SCORES + SF!
BARRIER
TID-X 0= IF
0.0
SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F/
OVER CELLS SCORES + SF!
BARRIER
DUP BEGIN DUP HEAD_DIM < WHILE
0.0
SEQ_LEN 0 DO
I CELLS SCORES + SF@
I HEAD_DIM * 3 PICK + CELLS V + F@
F* F+
LOOP
OVER 4 PICK HEAD_DIM * + CELLS O + F!
BDIM-X +
REPEAT
DROP DROP DROP
"""


def _attention_reference(q: np.ndarray, k: np.ndarray, v: np.ndarray, seq_len: int) -> list[float]:
"""Compute scaled dot-product attention with causal mask (NumPy reference)."""
head_dim = q.shape[1]
scores = q @ k.T / np.sqrt(head_dim)
causal_mask = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)
scores[causal_mask] = -1e30
exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
attn = exp_scores / exp_scores.sum(axis=1, keepdims=True)
return (attn @ v).flatten().tolist()


def test_naive_attention_f64(kernel_runner: KernelRunner) -> None:
    """Naive scaled dot-product attention with causal mask.

    O = softmax(Q @ K^T / sqrt(d_k)) @ V, seq_len=4, head_dim=4.
    One block per query row, one thread per key position.
    """
    seq_len, head_dim = 4, 4

    q = np.array(
        [
            [1.0, 0.0, 1.0, 0.0],
            [0.0, 1.0, 0.0, 1.0],
            [1.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0],
        ]
    )
    k = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 1.0, 0.0],
            [1.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0],
        ]
    )
    v = np.array(
        [
            [1.0, 2.0, 3.0, 4.0],
            [5.0, 6.0, 7.0, 8.0],
            [9.0, 10.0, 11.0, 12.0],
            [13.0, 14.0, 15.0, 16.0],
        ]
    )

    expected = _attention_reference(q, k, v, seq_len)
    n = seq_len * head_dim

    result = kernel_runner.run(
        forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len),
        params={
            "Q": q.flatten().tolist(),
            "K": k.flatten().tolist(),
            "V": v.flatten().tolist(),
            "SEQ_LEN": seq_len,
            "HEAD_DIM": head_dim,
        },
        grid=(seq_len, 1, 1),
        block=(seq_len, 1, 1),
        output_param=3,  # index of O in the kernel's param list
        output_count=n,
    )
    # approx() on a sequence compares elementwise with default tolerances;
    # a per-element listcomp here would also shadow the V matrix `v`.
    assert result == pytest.approx(expected)


def test_naive_attention_f64_16x64(kernel_runner: KernelRunner) -> None:
    """Naive scaled dot-product attention, seq_len=16, head_dim=64.

    Same kernel as the 4x4 test, but with random inputs and head_dim larger
    than the block size so the BDIM-X column stride in the kernel is exercised.
    """
    seq_len, head_dim = 16, 64

    # Fixed seed keeps the test deterministic across runs.
    rng = np.random.default_rng(42)
    q = rng.standard_normal((seq_len, head_dim))
    k = rng.standard_normal((seq_len, head_dim))
    v = rng.standard_normal((seq_len, head_dim))

    expected = _attention_reference(q, k, v, seq_len)
    n = seq_len * head_dim

    result = kernel_runner.run(
        forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len),
        params={
            "Q": q.flatten().tolist(),
            "K": k.flatten().tolist(),
            "V": v.flatten().tolist(),
            "SEQ_LEN": seq_len,
            "HEAD_DIM": head_dim,
        },
        grid=(seq_len, 1, 1),
        block=(seq_len, 1, 1),
        output_param=3,  # index of O in the kernel's param list
        output_count=n,
    )
    # approx() on a sequence compares elementwise with default tolerances;
    # a per-element listcomp here would also shadow the V matrix `v`.
    assert result == pytest.approx(expected)
5 changes: 5 additions & 0 deletions lib/Bitcode/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Publish the build-tree location of the NVVM libdevice bitcode to the parent
# scope so sibling directories (lib/Conversion) can bake it into a compile
# definition.
set(WARPFORTH_LIBDEVICE_PATH
    "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" PARENT_SCOPE)

# Copy the checked-in bitcode verbatim into the build directory so the
# exported path is valid without reaching back into the source tree.
configure_file(libdevice.10.bc
    "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" COPYONLY)
Binary file added lib/Bitcode/libdevice.10.bc
Binary file not shown.
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(Dialect)
# Bitcode must precede Conversion: it sets WARPFORTH_LIBDEVICE_PATH into this
# scope (PARENT_SCOPE), which Conversion consumes as a compile definition.
add_subdirectory(Bitcode)
add_subdirectory(Conversion)
add_subdirectory(Translation)
4 changes: 4 additions & 0 deletions lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,9 @@ add_mlir_library(MLIRConversionPasses
MLIRTransforms
)

target_compile_definitions(obj.MLIRConversionPasses PRIVATE
WARPFORTH_LIBDEVICE_PATH="${WARPFORTH_LIBDEVICE_PATH}"
)

add_subdirectory(ForthToMemRef)
add_subdirectory(ForthToGPU)
5 changes: 4 additions & 1 deletion lib/Conversion/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ void buildWarpForthPipeline(OpPassManager &pm) {
pm.addPass(createCanonicalizerPass());

// Stage 4: Attach NVVM target to GPU modules (sm_70 = Volta architecture)
pm.addPass(createGpuNVVMAttachTarget());
GpuNVVMAttachTargetOptions nvvmOptions;
nvvmOptions.chip = "sm_70";
nvvmOptions.linkLibs.push_back(WARPFORTH_LIBDEVICE_PATH);
pm.addPass(createGpuNVVMAttachTarget(nvvmOptions));

// Stage 5: Lower GPU to NVVM with bare pointers
ConvertGpuOpsToNVVMOpsOptions gpuToNVVMOptions;
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"lit>=18.1.0",
"numpy",
"pytest",
"vastai-sdk",
]
Expand Down
80 changes: 80 additions & 0 deletions test/Pipeline/attention.forth
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s

\ Verify that a naive attention kernel with shared memory and float intrinsics
\ survives the full pipeline to gpu.binary.
\ CHECK: gpu.binary @warpforth_module

\! kernel attention
\! param Q f64[16]
\! param K f64[16]
\! param V f64[16]
\! param O f64[16]
\! param SEQ_LEN i64
\! param HEAD_DIM i64
\! shared SCORES f64[4]
\! shared SCRATCH f64[4]

\ Buffer sizes assume seq_len=4, head_dim=4 (16 = 4*4 f64 elements per matrix).
\ row = BID-X, t = TID-X
BID-X
TID-X

\ --- Dot product: Q[row,:] . K[t,:] ---
0.0
HEAD_DIM 0 DO
2 PICK HEAD_DIM * I + CELLS Q + F@
2 PICK HEAD_DIM * I + CELLS K + F@
F* F+
LOOP
HEAD_DIM S>F FSQRT F/
\ Stack is now ( row t score ).

\ --- Causal mask: if t > row, score = -inf ---
OVER 3 PICK >
IF DROP -1.0e30 THEN

\ --- Store score to shared memory ---
OVER CELLS SCORES + SF!
BARRIER

\ --- Softmax: max reduction (thread 0) ---
TID-X 0= IF
0 CELLS SCORES + SF@
SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER

\ --- Softmax: exp(score - max) ---
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F- FEXP
OVER CELLS SCORES + SF!
BARRIER

\ --- Softmax: sum reduction (thread 0) ---
TID-X 0= IF
0.0
SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER

\ --- Softmax: normalize ---
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F/
OVER CELLS SCORES + SF!
BARRIER

\ --- V accumulation: O[row,col] = sum_j SCORES[j] * V[j*HD + col] ---
\ Stride over head_dim columns: col = t, t+BDIM-X, t+2*BDIM-X, ...
DUP BEGIN DUP HEAD_DIM < WHILE
0.0
SEQ_LEN 0 DO
I CELLS SCORES + SF@
I HEAD_DIM * 3 PICK + CELLS V + F@
F* F+
LOOP
OVER 4 PICK HEAD_DIM * + CELLS O + F!
BDIM-X +
REPEAT
\ Clear the remaining ( row t col ) entries from the data stack.
DROP DROP DROP
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.