Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion gpu_test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ def _cleanup_orphans(self) -> None:

def _launch(self) -> None:
"""Find the cheapest suitable offer and launch an instance."""
query = f"num_gpus=1 rentable=True rented=False compute_cap>=700 dph<={MAX_COST_PER_HOUR}"
query = (
f"num_gpus=1 rentable=True rented=False compute_cap>=700"
f" reliability2>=0.95 inet_up>=100 dph<={MAX_COST_PER_HOUR}"
)
offers = self.sdk.search_offers(query=query, order="dph", limit=5)

if not offers:
Expand Down
155 changes: 155 additions & 0 deletions gpu_test/test_kernels.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import TYPE_CHECKING

import numpy as np
import pytest

if TYPE_CHECKING:
Expand Down Expand Up @@ -583,3 +584,157 @@ def test_float_to_int_conversion(kernel_runner: KernelRunner) -> None:
forth_source=("\\! kernel main\n\\! param DATA i64[256]\n7.9 F>S\n0 CELLS DATA + !"),
)
assert result[0] == 7


# --- Attention ---

# Forth source for a naive causally-masked scaled-dot-product-attention
# kernel.  Templated via str.format: {n} is the element count of each of the
# Q/K/V/O buffers and {seq_len} sizes the shared-memory SCORES/SCRATCH arrays.
# Launch layout (see the tests below): one block per query row (BID-X), one
# thread per key position (TID-X).  The softmax max- and sum-reductions are
# done serially by thread 0 between BARRIERs, and the output row is
# accumulated with a thread stride of BDIM-X over the head_dim columns.
_ATTENTION_KERNEL = """\
\\! kernel attention
\\! param Q f64[{n}]
\\! param K f64[{n}]
\\! param V f64[{n}]
\\! param O f64[{n}]
\\! param SEQ_LEN i64
\\! param HEAD_DIM i64
\\! shared SCORES f64[{seq_len}]
\\! shared SCRATCH f64[{seq_len}]
BID-X
TID-X
0.0
HEAD_DIM 0 DO
2 PICK HEAD_DIM * I + CELLS Q + F@
2 PICK HEAD_DIM * I + CELLS K + F@
F* F+
LOOP
HEAD_DIM S>F FSQRT F/
OVER 3 PICK >
IF DROP -1.0e30 THEN
OVER CELLS SCORES + SF!
BARRIER
TID-X 0= IF
0 CELLS SCORES + SF@
SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F- FEXP
OVER CELLS SCORES + SF!
BARRIER
TID-X 0= IF
0.0
SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F/
OVER CELLS SCORES + SF!
BARRIER
DUP BEGIN DUP HEAD_DIM < WHILE
0.0
SEQ_LEN 0 DO
I CELLS SCORES + SF@
I HEAD_DIM * 3 PICK + CELLS V + F@
F* F+
LOOP
OVER 4 PICK HEAD_DIM * + CELLS O + F!
BDIM-X +
REPEAT
DROP DROP DROP
"""


def _attention_reference(q: np.ndarray, k: np.ndarray, v: np.ndarray, seq_len: int) -> list[float]:
"""Compute scaled dot-product attention with causal mask (NumPy reference)."""
head_dim = q.shape[1]
scores = q @ k.T / np.sqrt(head_dim)
causal_mask = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)
scores[causal_mask] = -1e30
exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
attn = exp_scores / exp_scores.sum(axis=1, keepdims=True)
return (attn @ v).flatten().tolist()


def test_naive_attention_f64(kernel_runner: KernelRunner) -> None:
    """Naive scaled dot-product attention with causal mask.

    O = softmax(Q @ K^T / sqrt(d_k)) @ V, seq_len=4, head_dim=4.
    One block per query row, one thread per key position.
    """
    seq_len, head_dim = 4, 4

    q = np.array(
        [
            [1.0, 0.0, 1.0, 0.0],
            [0.0, 1.0, 0.0, 1.0],
            [1.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0],
        ]
    )
    k = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 1.0, 0.0],
            [1.0, 1.0, 0.0, 0.0],
            [0.0, 0.0, 1.0, 1.0],
        ]
    )
    v = np.array(
        [
            [1.0, 2.0, 3.0, 4.0],
            [5.0, 6.0, 7.0, 8.0],
            [9.0, 10.0, 11.0, 12.0],
            [13.0, 14.0, 15.0, 16.0],
        ]
    )

    expected = _attention_reference(q, k, v, seq_len)
    n = seq_len * head_dim

    result = kernel_runner.run(
        forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len),
        params={
            "Q": q.flatten().tolist(),
            "K": k.flatten().tolist(),
            "V": v.flatten().tolist(),
            "SEQ_LEN": seq_len,
            "HEAD_DIM": head_dim,
        },
        grid=(seq_len, 1, 1),
        block=(seq_len, 1, 1),
        output_param=3,  # index of O in the kernel's param list
        output_count=n,
    )
    # approx() on a sequence compares elementwise with default tolerances;
    # a per-element listcomp here would also shadow the V matrix `v`.
    assert result == pytest.approx(expected)


def test_naive_attention_f64_16x64(kernel_runner: KernelRunner) -> None:
    """Naive scaled dot-product attention, seq_len=16, head_dim=64.

    Same kernel as the 4x4 test, but with random inputs and head_dim larger
    than the block size so the BDIM-X column stride in the kernel is exercised.
    """
    seq_len, head_dim = 16, 64

    # Fixed seed keeps the test deterministic across runs.
    rng = np.random.default_rng(42)
    q = rng.standard_normal((seq_len, head_dim))
    k = rng.standard_normal((seq_len, head_dim))
    v = rng.standard_normal((seq_len, head_dim))

    expected = _attention_reference(q, k, v, seq_len)
    n = seq_len * head_dim

    result = kernel_runner.run(
        forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len),
        params={
            "Q": q.flatten().tolist(),
            "K": k.flatten().tolist(),
            "V": v.flatten().tolist(),
            "SEQ_LEN": seq_len,
            "HEAD_DIM": head_dim,
        },
        grid=(seq_len, 1, 1),
        block=(seq_len, 1, 1),
        output_param=3,  # index of O in the kernel's param list
        output_count=n,
    )
    # approx() on a sequence compares elementwise with default tolerances;
    # a per-element listcomp here would also shadow the V matrix `v`.
    assert result == pytest.approx(expected)
5 changes: 5 additions & 0 deletions lib/Bitcode/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Publish the build-tree location of the NVVM libdevice bitcode to the parent
# scope so sibling directories (lib/Conversion) can bake it into a compile
# definition.
set(WARPFORTH_LIBDEVICE_PATH
    "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" PARENT_SCOPE)

# Copy the checked-in bitcode verbatim into the build directory so the
# exported path is valid without reaching back into the source tree.
configure_file(libdevice.10.bc
    "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" COPYONLY)
Binary file added lib/Bitcode/libdevice.10.bc
Binary file not shown.
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_subdirectory(Dialect)
# Bitcode must precede Conversion: it sets WARPFORTH_LIBDEVICE_PATH into this
# scope (PARENT_SCOPE), which Conversion consumes as a compile definition.
add_subdirectory(Bitcode)
add_subdirectory(Conversion)
add_subdirectory(Translation)
4 changes: 4 additions & 0 deletions lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,9 @@ add_mlir_library(MLIRConversionPasses
MLIRTransforms
)

target_compile_definitions(obj.MLIRConversionPasses PRIVATE
WARPFORTH_LIBDEVICE_PATH="${WARPFORTH_LIBDEVICE_PATH}"
)

add_subdirectory(ForthToMemRef)
add_subdirectory(ForthToGPU)
5 changes: 4 additions & 1 deletion lib/Conversion/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ void buildWarpForthPipeline(OpPassManager &pm) {
pm.addPass(createCanonicalizerPass());

// Stage 4: Attach NVVM target to GPU modules (sm_70 = Volta architecture)
pm.addPass(createGpuNVVMAttachTarget());
GpuNVVMAttachTargetOptions nvvmOptions;
nvvmOptions.chip = "sm_70";
nvvmOptions.linkLibs.push_back(WARPFORTH_LIBDEVICE_PATH);
pm.addPass(createGpuNVVMAttachTarget(nvvmOptions));

// Stage 5: Lower GPU to NVVM with bare pointers
ConvertGpuOpsToNVVMOpsOptions gpuToNVVMOptions;
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ version = "0.1.0"
requires-python = ">=3.11"
dependencies = [
"lit>=18.1.0",
"numpy",
"pytest",
"vastai-sdk",
]
Expand Down
80 changes: 80 additions & 0 deletions test/Pipeline/attention.forth
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s

\ Verify that a naive attention kernel with shared memory and float intrinsics
\ survives the full pipeline to gpu.binary.
\ CHECK: gpu.binary @warpforth_module

\! kernel attention
\! param Q f64[16]
\! param K f64[16]
\! param V f64[16]
\! param O f64[16]
\! param SEQ_LEN i64
\! param HEAD_DIM i64
\! shared SCORES f64[4]
\! shared SCRATCH f64[4]

\ Buffer sizes assume seq_len=4, head_dim=4 (16 = 4*4 f64 elements per matrix).
\ row = BID-X, t = TID-X
BID-X
TID-X

\ --- Dot product: Q[row,:] . K[t,:] ---
0.0
HEAD_DIM 0 DO
2 PICK HEAD_DIM * I + CELLS Q + F@
2 PICK HEAD_DIM * I + CELLS K + F@
F* F+
LOOP
HEAD_DIM S>F FSQRT F/
\ Stack is now ( row t score ).

\ --- Causal mask: if t > row, score = -inf ---
OVER 3 PICK >
IF DROP -1.0e30 THEN

\ --- Store score to shared memory ---
OVER CELLS SCORES + SF!
BARRIER

\ --- Softmax: max reduction (thread 0) ---
TID-X 0= IF
0 CELLS SCORES + SF@
SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER

\ --- Softmax: exp(score - max) ---
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F- FEXP
OVER CELLS SCORES + SF!
BARRIER

\ --- Softmax: sum reduction (thread 0) ---
TID-X 0= IF
0.0
SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP
0 CELLS SCRATCH + SF!
THEN
BARRIER

\ --- Softmax: normalize ---
DUP CELLS SCORES + SF@
0 CELLS SCRATCH + SF@
F/
OVER CELLS SCORES + SF!
BARRIER

\ --- V accumulation: O[row,col] = sum_j SCORES[j] * V[j*HD + col] ---
\ Stride over head_dim columns: col = t, t+BDIM-X, t+2*BDIM-X, ...
DUP BEGIN DUP HEAD_DIM < WHILE
0.0
SEQ_LEN 0 DO
I CELLS SCORES + SF@
I HEAD_DIM * 3 PICK + CELLS V + F@
F* F+
LOOP
OVER 4 PICK HEAD_DIM * + CELLS O + F!
BDIM-X +
REPEAT
\ Clear the remaining ( row t col ) entries from the data stack.
DROP DROP DROP
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.