diff --git a/gpu_test/conftest.py b/gpu_test/conftest.py index fabfa0f..1fc20e4 100644 --- a/gpu_test/conftest.py +++ b/gpu_test/conftest.py @@ -118,7 +118,10 @@ def _cleanup_orphans(self) -> None: def _launch(self) -> None: """Find the cheapest suitable offer and launch an instance.""" - query = f"num_gpus=1 rentable=True rented=False compute_cap>=700 dph<={MAX_COST_PER_HOUR}" + query = ( + f"num_gpus=1 rentable=True rented=False compute_cap>=700" + f" reliability2>=0.95 inet_up>=100 dph<={MAX_COST_PER_HOUR}" + ) offers = self.sdk.search_offers(query=query, order="dph", limit=5) if not offers: diff --git a/gpu_test/test_kernels.py b/gpu_test/test_kernels.py index f7b64e6..e6e9376 100644 --- a/gpu_test/test_kernels.py +++ b/gpu_test/test_kernels.py @@ -4,6 +4,7 @@ from typing import TYPE_CHECKING +import numpy as np import pytest if TYPE_CHECKING: @@ -583,3 +584,157 @@ def test_float_to_int_conversion(kernel_runner: KernelRunner) -> None: forth_source=("\\! kernel main\n\\! param DATA i64[256]\n7.9 F>S\n0 CELLS DATA + !"), ) assert result[0] == 7 + + +# --- Attention --- + +_ATTENTION_KERNEL = """\ +\\! kernel attention +\\! param Q f64[{n}] +\\! param K f64[{n}] +\\! param V f64[{n}] +\\! param O f64[{n}] +\\! param SEQ_LEN i64 +\\! param HEAD_DIM i64 +\\! shared SCORES f64[{seq_len}] +\\! shared SCRATCH f64[{seq_len}] +BID-X +TID-X +0.0 +HEAD_DIM 0 DO + 2 PICK HEAD_DIM * I + CELLS Q + F@ + 2 PICK HEAD_DIM * I + CELLS K + F@ + F* F+ +LOOP +HEAD_DIM S>F FSQRT F/ +OVER 3 PICK > +IF DROP -1.0e30 THEN +OVER CELLS SCORES + SF! +BARRIER +TID-X 0= IF + 0 CELLS SCORES + SF@ + SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP + 0 CELLS SCRATCH + SF! +THEN +BARRIER +DUP CELLS SCORES + SF@ +0 CELLS SCRATCH + SF@ +F- FEXP +OVER CELLS SCORES + SF! +BARRIER +TID-X 0= IF + 0.0 + SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP + 0 CELLS SCRATCH + SF! +THEN +BARRIER +DUP CELLS SCORES + SF@ +0 CELLS SCRATCH + SF@ +F/ +OVER CELLS SCORES + SF! +BARRIER +DUP BEGIN DUP HEAD_DIM < WHILE + 0.0 + SEQ_LEN 0 DO + I CELLS SCORES + SF@ + I HEAD_DIM * 3 PICK + CELLS V + F@ + F* F+ + LOOP + OVER 4 PICK HEAD_DIM * + CELLS O + F! + BDIM-X + +REPEAT +DROP DROP DROP +""" + + +def _attention_reference(q: np.ndarray, k: np.ndarray, v: np.ndarray, seq_len: int) -> list[float]: + """Compute scaled dot-product attention with causal mask (NumPy reference).""" + head_dim = q.shape[1] + scores = q @ k.T / np.sqrt(head_dim) + causal_mask = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1) + scores[causal_mask] = -1e30 + exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True)) + attn = exp_scores / exp_scores.sum(axis=1, keepdims=True) + return (attn @ v).flatten().tolist() + + +def test_naive_attention_f64(kernel_runner: KernelRunner) -> None: + """Naive scaled dot-product attention with causal mask. + + O = softmax(Q @ K^T / sqrt(d_k)) @ V, seq_len=4, head_dim=4. + One block per query row, one thread per key position. + """ + seq_len, head_dim = 4, 4 + + q = np.array( + [ + [1.0, 0.0, 1.0, 0.0], + [0.0, 1.0, 0.0, 1.0], + [1.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 1.0], + ] + ) + k = np.array( + [ + [1.0, 0.0, 0.0, 1.0], + [0.0, 1.0, 1.0, 0.0], + [1.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 1.0], + ] + ) + v = np.array( + [ + [1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0], + [13.0, 14.0, 15.0, 16.0], + ] + ) + + expected = _attention_reference(q, k, v, seq_len) + n = seq_len * head_dim + + result = kernel_runner.run( + forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len), + params={ + "Q": q.flatten().tolist(), + "K": k.flatten().tolist(), + "V": v.flatten().tolist(), + "SEQ_LEN": seq_len, + "HEAD_DIM": head_dim, + }, + grid=(seq_len, 1, 1), + block=(seq_len, 1, 1), + output_param=3, + output_count=n, + ) + assert result == [pytest.approx(v) for v in expected] + + +def test_naive_attention_f64_16x64(kernel_runner: KernelRunner) -> None: + """Naive scaled dot-product attention, seq_len=16, head_dim=64.""" + seq_len, head_dim = 16, 64 + + rng = np.random.default_rng(42) + q = rng.standard_normal((seq_len, head_dim)) + k = rng.standard_normal((seq_len, head_dim)) + v = rng.standard_normal((seq_len, head_dim)) + + expected = _attention_reference(q, k, v, seq_len) + n = seq_len * head_dim + + result = kernel_runner.run( + forth_source=_ATTENTION_KERNEL.format(n=n, seq_len=seq_len), + params={ + "Q": q.flatten().tolist(), + "K": k.flatten().tolist(), + "V": v.flatten().tolist(), + "SEQ_LEN": seq_len, + "HEAD_DIM": head_dim, + }, + grid=(seq_len, 1, 1), + block=(seq_len, 1, 1), + output_param=3, + output_count=n, + ) + assert result == [pytest.approx(v) for v in expected] diff --git a/lib/Bitcode/CMakeLists.txt b/lib/Bitcode/CMakeLists.txt new file mode 100644 index 0000000..d970bf2 --- /dev/null +++ b/lib/Bitcode/CMakeLists.txt @@ -0,0 +1,5 @@ +set(WARPFORTH_LIBDEVICE_PATH + "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" PARENT_SCOPE) + +configure_file(libdevice.10.bc + "${CMAKE_CURRENT_BINARY_DIR}/libdevice.10.bc" COPYONLY) diff --git a/lib/Bitcode/libdevice.10.bc b/lib/Bitcode/libdevice.10.bc new file mode 100644 index 0000000..157bbca Binary files /dev/null and b/lib/Bitcode/libdevice.10.bc differ diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index bde68b6..94e9652 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(Dialect) +add_subdirectory(Bitcode) add_subdirectory(Conversion) add_subdirectory(Translation) diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 646cdb2..5c0be66 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -16,5 +16,9 @@ add_mlir_library(MLIRConversionPasses MLIRTransforms ) +target_compile_definitions(obj.MLIRConversionPasses PRIVATE + WARPFORTH_LIBDEVICE_PATH="${WARPFORTH_LIBDEVICE_PATH}" +) + add_subdirectory(ForthToMemRef) add_subdirectory(ForthToGPU) diff --git a/lib/Conversion/Passes.cpp b/lib/Conversion/Passes.cpp index ad944eb..263bbc9 100644 --- a/lib/Conversion/Passes.cpp +++ b/lib/Conversion/Passes.cpp @@ -33,7 +33,10 @@ void buildWarpForthPipeline(OpPassManager &pm) { pm.addPass(createCanonicalizerPass()); // Stage 4: Attach NVVM target to GPU modules (sm_70 = Volta architecture) - pm.addPass(createGpuNVVMAttachTarget()); + GpuNVVMAttachTargetOptions nvvmOptions; + nvvmOptions.chip = "sm_70"; + nvvmOptions.linkLibs.push_back(WARPFORTH_LIBDEVICE_PATH); + pm.addPass(createGpuNVVMAttachTarget(nvvmOptions)); // Stage 5: Lower GPU to NVVM with bare pointers ConvertGpuOpsToNVVMOpsOptions gpuToNVVMOptions; diff --git a/pyproject.toml b/pyproject.toml index 9b2f64a..ef4c35d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.1.0" requires-python = ">=3.11" dependencies = [ "lit>=18.1.0", + "numpy", "pytest", "vastai-sdk", ] diff --git a/test/Pipeline/attention.forth b/test/Pipeline/attention.forth new file mode 100644 index 0000000..094df16 --- /dev/null +++ b/test/Pipeline/attention.forth @@ -0,0 +1,80 @@ +\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s + +\ Verify that a naive attention kernel with shared memory and float intrinsics +\ survives the full pipeline to gpu.binary. +\ CHECK: gpu.binary @warpforth_module + +\! kernel attention +\! param Q f64[16] +\! param K f64[16] +\! param V f64[16] +\! param O f64[16] +\! param SEQ_LEN i64 +\! param HEAD_DIM i64 +\! shared SCORES f64[4] +\! shared SCRATCH f64[4] + +\ row = BID-X, t = TID-X +BID-X +TID-X + +\ --- Dot product: Q[row,:] . K[t,:] --- +0.0 +HEAD_DIM 0 DO + 2 PICK HEAD_DIM * I + CELLS Q + F@ + 2 PICK HEAD_DIM * I + CELLS K + F@ + F* F+ +LOOP +HEAD_DIM S>F FSQRT F/ + +\ --- Causal mask: if t > row, score = -inf --- +OVER 3 PICK > +IF DROP -1.0e30 THEN + +\ --- Store score to shared memory --- +OVER CELLS SCORES + SF! +BARRIER + +\ --- Softmax: max reduction (thread 0) --- +TID-X 0= IF + 0 CELLS SCORES + SF@ + SEQ_LEN 1 DO I CELLS SCORES + SF@ FMAX LOOP + 0 CELLS SCRATCH + SF! +THEN +BARRIER + +\ --- Softmax: exp(score - max) --- +DUP CELLS SCORES + SF@ +0 CELLS SCRATCH + SF@ +F- FEXP +OVER CELLS SCORES + SF! +BARRIER + +\ --- Softmax: sum reduction (thread 0) --- +TID-X 0= IF + 0.0 + SEQ_LEN 0 DO I CELLS SCORES + SF@ F+ LOOP + 0 CELLS SCRATCH + SF! +THEN +BARRIER + +\ --- Softmax: normalize --- +DUP CELLS SCORES + SF@ +0 CELLS SCRATCH + SF@ +F/ +OVER CELLS SCORES + SF! +BARRIER + +\ --- V accumulation: O[row,col] = sum_j SCORES[j] * V[j*HD + col] --- +\ Stride over head_dim columns: col = t, t+BDIM-X, t+2*BDIM-X, ... +DUP BEGIN DUP HEAD_DIM < WHILE + 0.0 + SEQ_LEN 0 DO + I CELLS SCORES + SF@ + I HEAD_DIM * 3 PICK + CELLS V + F@ + F* F+ + LOOP + OVER 4 PICK HEAD_DIM * + CELLS O + F! + BDIM-X + +REPEAT +DROP DROP DROP diff --git a/uv.lock b/uv.lock index 4dc6b59..33aec72 100644 --- a/uv.lock +++ b/uv.lock @@ -2227,6 +2227,7 @@ version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "lit" }, + { name = "numpy" }, { name = "pytest" }, { name = "vastai-sdk" }, ] @@ -2240,6 +2241,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "lit", specifier = ">=18.1.0" }, + { name = "numpy" }, { name = "pytest" }, { name = "vastai-sdk" }, ]