From e5fd00a15d5cc5f8dac1402258347b66d0847868 Mon Sep 17 00:00:00 2001 From: Alex Cameron Date: Sun, 22 Feb 2026 22:37:45 +0900 Subject: [PATCH 1/2] feat: add reduced-width memory load/store words (i8, i16, i32, f16, bf16, f32) --- CLAUDE.md | 3 +- include/warpforth/Dialect/Forth/ForthOps.td | 184 ++++++++++++++++++ .../ForthToMemRef/ForthToMemRef.cpp | 177 ++++++++++++++--- lib/Translation/ForthToMLIR/ForthToMLIR.cpp | 72 +++++++ .../ForthToMemRef/reduced-width-memory.mlir | 118 +++++++++++ test/Pipeline/reduced-width-memory.forth | 15 ++ .../Forth/reduced-width-memory.forth | 63 ++++++ 7 files changed, 607 insertions(+), 25 deletions(-) create mode 100644 test/Conversion/ForthToMemRef/reduced-width-memory.mlir create mode 100644 test/Pipeline/reduced-width-memory.forth create mode 100644 test/Translation/Forth/reduced-width-memory.forth diff --git a/CLAUDE.md b/CLAUDE.md index 2472d89..e034783 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -87,10 +87,11 @@ uv run ruff format gpu_test/ - **Stack Type**: `!forth.stack` - untyped stack, programmer ensures type safety - **Operations**: All take stack as input and produce stack as output (except `forth.stack`) -- **Supported Words**: literals (integer `42` and float `3.14`), `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `F+ F- F* F/` (float arithmetic), `FEXP FSQRT FLOG FABS FNEG` (float math intrinsics), `FMAX FMIN` (float min/max), `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `F= F< F> F<> F<= F>=` (float comparison), `S>F F>S` (int/float conversion), `@ !` (global memory), `F@ F!` (float global memory), `S@ S!` (shared memory), `SF@ SF!` (float shared memory), `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP +LOOP I J K`, `LEAVE UNLOOP EXIT`, `{ a b -- }` (local variables in word definitions), `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing). 
+- **Supported Words**: literals (integer `42` and float `3.14`), `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `F+ F- F* F/` (float arithmetic), `FEXP FSQRT FLOG FABS FNEG` (float math intrinsics), `FMAX FMIN` (float min/max), `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `F= F< F> F<> F<= F>=` (float comparison), `S>F F>S` (int/float conversion), `@ !` (global memory), `F@ F!` (float global memory), `S@ S!` (shared memory), `SF@ SF!` (float shared memory), `I8@ I8! SI8@ SI8!` (i8 memory), `I16@ I16! SI16@ SI16!` (i16 memory), `I32@ I32! SI32@ SI32!` (i32 memory), `HF@ HF! SHF@ SHF!` (f16 memory), `BF@ BF! SBF@ SBF!` (bf16 memory), `F32@ F32! SF32@ SF32!` (f32 memory), `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP +LOOP I J K`, `LEAVE UNLOOP EXIT`, `{ a b -- }` (local variables in word definitions), `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing). - **Float Literals**: Numbers containing `.` or `e`/`E` are parsed as f64 (e.g. `3.14`, `-2.0`, `1.0e-5`, `1e3`). Stored on the stack as i64 bit patterns; F-prefixed words perform bitcast before/after operations. - **Kernel Parameters**: Declared in the `\!` header. `\! kernel <name>` is required and must appear first. `\! param <name> i64[<N>]` becomes a `memref<Nxi64>` argument; `\! param <name> i64` becomes an `i64` argument. `\! param <name> f64[<N>]` becomes a `memref<Nxf64>` argument; `\! param <name> f64` becomes an `f64` argument (bitcast to i64 when pushed to stack). Using a param name in code emits `forth.param_ref` (arrays push address; scalars push value). - **Shared Memory**: `\! shared <name> i64[<N>]` or `\! shared <name> f64[<N>]` declares GPU shared (workgroup) memory. Emits a tagged `memref.alloca` at kernel entry; ForthToGPU converts it to a `gpu.func` workgroup attribution. Using the shared name in code pushes its base address onto the stack. Use `S@`/`S!` for i64 or `SF@`/`SF!` for f64 shared accesses. Cannot be referenced inside word definitions. 
+- **Reduced-Width Memory**: `I8@ I16@ I32@` load a narrow integer, sign-extend to i64. `I8! I16! I32!` truncate i64 to narrow integer, store. `HF@ BF@ F32@` load a narrow float, extend to f64, bitcast to i64. `HF! BF! F32!` bitcast i64 to f64, truncate to narrow float, store. `S`-prefixed variants (`SI8@`, `SHF!`, etc.) use shared memory (address space 3). - **Conversion**: `!forth.stack` → `memref<256xi64>` with explicit stack pointer - **GPU**: Functions wrapped in `gpu.module`, `main` gets `gpu.kernel` attribute, configured with bare pointers for NVVM conversion - **Local Variables**: `{ a b c -- }` at the start of a word definition binds read-only locals. Pops values from the stack in reverse name order (c, b, a) using `forth.pop`, stores SSA values. Referencing a local emits `forth.push_value`. SSA values from the entry block dominate all control flow, so locals work across IF/ELSE/THEN, loops, etc. On GPU, locals map directly to registers. diff --git a/include/warpforth/Dialect/Forth/ForthOps.td b/include/warpforth/Dialect/Forth/ForthOps.td index e1b0fa1..246fcf8 100644 --- a/include/warpforth/Dialect/Forth/ForthOps.td +++ b/include/warpforth/Dialect/Forth/ForthOps.td @@ -404,6 +404,190 @@ def Forth_SharedStoreFOp : Forth_StackOpBase<"shared_storef"> { }]; } +//===----------------------------------------------------------------------===// +// Reduced-width memory operations. +//===----------------------------------------------------------------------===// + +// --- i8 --- +def Forth_LoadI8Op : Forth_StackOpBase<"load_i8"> { + let summary = "Load i8 value from memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i8, sign-extends to i64, pushes result. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreI8Op : Forth_StackOpBase<"store_i8"> { + let summary = "Truncate i64 to i8 and store to memory"; + let description = [{ + Pops address and value, truncates i64 to i8, stores to memory. 
+ Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadI8Op : Forth_StackOpBase<"shared_load_i8"> { + let summary = "Load i8 from shared memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i8 from shared memory, sign-extends to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreI8Op : Forth_StackOpBase<"shared_store_i8"> { + let summary = "Truncate i64 to i8 and store to shared memory"; + let description = [{ + Pops address and value, truncates i64 to i8, stores to shared memory. + Forth semantics: ( x addr -- ) + }]; +} + +// --- i16 --- +def Forth_LoadI16Op : Forth_StackOpBase<"load_i16"> { + let summary = "Load i16 value from memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i16, sign-extends to i64, pushes result. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreI16Op : Forth_StackOpBase<"store_i16"> { + let summary = "Truncate i64 to i16 and store to memory"; + let description = [{ + Pops address and value, truncates i64 to i16, stores to memory. + Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadI16Op : Forth_StackOpBase<"shared_load_i16"> { + let summary = "Load i16 from shared memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i16 from shared memory, sign-extends to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreI16Op : Forth_StackOpBase<"shared_store_i16"> { + let summary = "Truncate i64 to i16 and store to shared memory"; + let description = [{ + Pops address and value, truncates i64 to i16, stores to shared memory. + Forth semantics: ( x addr -- ) + }]; +} + +// --- i32 --- +def Forth_LoadI32Op : Forth_StackOpBase<"load_i32"> { + let summary = "Load i32 value from memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i32, sign-extends to i64, pushes result. 
+ Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreI32Op : Forth_StackOpBase<"store_i32"> { + let summary = "Truncate i64 to i32 and store to memory"; + let description = [{ + Pops address and value, truncates i64 to i32, stores to memory. + Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadI32Op : Forth_StackOpBase<"shared_load_i32"> { + let summary = "Load i32 from shared memory, zero-extend to i64"; + let description = [{ + Pops an address, loads an i32 from shared memory, sign-extends to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreI32Op : Forth_StackOpBase<"shared_store_i32"> { + let summary = "Truncate i64 to i32 and store to shared memory"; + let description = [{ + Pops address and value, truncates i64 to i32, stores to shared memory. + Forth semantics: ( x addr -- ) + }]; +} + +// --- f16 --- +def Forth_LoadF16Op : Forth_StackOpBase<"load_f16"> { + let summary = "Load f16 from memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads f16, extends to f64, bitcasts to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreF16Op : Forth_StackOpBase<"store_f16"> { + let summary = "Bitcast i64 to f64, truncate to f16, store to memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to f16, stores. + Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadF16Op : Forth_StackOpBase<"shared_load_f16"> { + let summary = "Load f16 from shared memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads f16 from shared memory, extends to f64, bitcasts to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreF16Op : Forth_StackOpBase<"shared_store_f16"> { + let summary = "Bitcast i64 to f64, truncate to f16, store to shared memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to f16, stores to shared memory. 
+ Forth semantics: ( x addr -- ) + }]; +} + +// --- bf16 --- +def Forth_LoadBF16Op : Forth_StackOpBase<"load_bf16"> { + let summary = "Load bf16 from memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads bf16, extends to f64, bitcasts to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreBF16Op : Forth_StackOpBase<"store_bf16"> { + let summary = "Bitcast i64 to f64, truncate to bf16, store to memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to bf16, stores. + Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadBF16Op : Forth_StackOpBase<"shared_load_bf16"> { + let summary = "Load bf16 from shared memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads bf16 from shared memory, extends to f64, bitcasts to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreBF16Op : Forth_StackOpBase<"shared_store_bf16"> { + let summary = "Bitcast i64 to f64, truncate to bf16, store to shared memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to bf16, stores to shared memory. + Forth semantics: ( x addr -- ) + }]; +} + +// --- f32 --- +def Forth_LoadF32Op : Forth_StackOpBase<"load_f32"> { + let summary = "Load f32 from memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads f32, extends to f64, bitcasts to i64. + Forth semantics: ( addr -- value ) + }]; +} +def Forth_StoreF32Op : Forth_StackOpBase<"store_f32"> { + let summary = "Bitcast i64 to f64, truncate to f32, store to memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to f32, stores. + Forth semantics: ( x addr -- ) + }]; +} +def Forth_SharedLoadF32Op : Forth_StackOpBase<"shared_load_f32"> { + let summary = "Load f32 from shared memory, extend to f64, bitcast to i64"; + let description = [{ + Pops an address, loads f32 from shared memory, extends to f64, bitcasts to i64. 
+ Forth semantics: ( addr -- value ) + }]; +} +def Forth_SharedStoreF32Op : Forth_StackOpBase<"shared_store_f32"> { + let summary = "Bitcast i64 to f64, truncate to f32, store to shared memory"; + let description = [{ + Pops address and value, bitcasts i64 to f64, truncates to f32, stores to shared memory. + Forth semantics: ( x addr -- ) + }]; +} + def Forth_ParamRefOp : Forth_Op<"param_ref", [Pure]> { let summary = "Push kernel parameter address onto stack"; let description = [{ diff --git a/lib/Conversion/ForthToMemRef/ForthToMemRef.cpp b/lib/Conversion/ForthToMemRef/ForthToMemRef.cpp index 550b77e..7c2725a 100644 --- a/lib/Conversion/ForthToMemRef/ForthToMemRef.cpp +++ b/lib/Conversion/ForthToMemRef/ForthToMemRef.cpp @@ -775,11 +775,46 @@ struct ParamRefOpConversion : public OpConversionPattern { } }; +/// Memory element type tag for generalized load/store templates. +enum class MemType { I64, F64, I8, I16, I32, F16, BF16, F32 }; + +/// Return the MLIR element type for a given MemType. +static Type getMemElemType(MLIRContext *ctx, MemType mt) { + switch (mt) { + case MemType::I64: + return IntegerType::get(ctx, 64); + case MemType::I32: + return IntegerType::get(ctx, 32); + case MemType::I16: + return IntegerType::get(ctx, 16); + case MemType::I8: + return IntegerType::get(ctx, 8); + case MemType::F64: + return Float64Type::get(ctx); + case MemType::F32: + return Float32Type::get(ctx); + case MemType::F16: + return Float16Type::get(ctx); + case MemType::BF16: + return BFloat16Type::get(ctx); + } + llvm_unreachable("unhandled MemType"); +} + +template +inline constexpr bool isReducedInt = + (MT == MemType::I8 || MT == MemType::I16 || MT == MemType::I32); + +template +inline constexpr bool isReducedFloat = + (MT == MemType::F16 || MT == MemType::BF16 || MT == MemType::F32); + /// Generalized memory load template. /// Pops address from stack, loads value via pointer, pushes value. -/// When IsFloat=true, loads f64 from memory and bitcasts to i64 for stack. 
+/// MemType selects the memory element type and the widening strategy. /// AddressSpace selects global (0) or workgroup memory. -template +template struct MemoryLoadOpConversion : public OpConversionPattern { MemoryLoadOpConversion(const TypeConverter &typeConverter, MLIRContext *context) @@ -804,16 +839,23 @@ struct MemoryLoadOpConversion : public OpConversionPattern { // Load value from memory via pointer Value ptr = rewriter.create(loc, ptrType, addrValue); + Type elemType = getMemElemType(rewriter.getContext(), MT); + Value loaded = rewriter.create(loc, elemType, ptr); + Value valueToPush; - if constexpr (IsFloat) { - // Load f64 from memory, then bitcast to i64 for stack storage - Value loadedF64 = - rewriter.create(loc, rewriter.getF64Type(), ptr); - valueToPush = rewriter.create( - loc, rewriter.getI64Type(), loadedF64); - } else { + if constexpr (MT == MemType::I64) { + valueToPush = loaded; + } else if constexpr (MT == MemType::F64) { valueToPush = - rewriter.create(loc, rewriter.getI64Type(), ptr); + rewriter.create(loc, rewriter.getI64Type(), loaded); + } else if constexpr (isReducedInt) { + valueToPush = + rewriter.create(loc, rewriter.getI64Type(), loaded); + } else if constexpr (isReducedFloat) { + Value extended = + rewriter.create(loc, rewriter.getF64Type(), loaded); + valueToPush = rewriter.create( + loc, rewriter.getI64Type(), extended); } // Store loaded value back at same position (replaces address) @@ -824,18 +866,52 @@ struct MemoryLoadOpConversion : public OpConversionPattern { } }; -// Memory load instantiations +// Memory load instantiations — full-width using LoadIOpConversion = MemoryLoadOpConversion; -using LoadFOpConversion = MemoryLoadOpConversion; +using LoadFOpConversion = MemoryLoadOpConversion; using SharedLoadIOpConversion = - MemoryLoadOpConversion; + MemoryLoadOpConversion; using SharedLoadFOpConversion = - MemoryLoadOpConversion; + MemoryLoadOpConversion; + +// Memory load instantiations — reduced-width +using 
LoadI8OpConversion = MemoryLoadOpConversion; +using SharedLoadI8OpConversion = + MemoryLoadOpConversion; +using LoadI16OpConversion = + MemoryLoadOpConversion; +using SharedLoadI16OpConversion = + MemoryLoadOpConversion; +using LoadI32OpConversion = + MemoryLoadOpConversion; +using SharedLoadI32OpConversion = + MemoryLoadOpConversion; +using LoadF16OpConversion = + MemoryLoadOpConversion; +using SharedLoadF16OpConversion = + MemoryLoadOpConversion; +using LoadBF16OpConversion = + MemoryLoadOpConversion; +using SharedLoadBF16OpConversion = + MemoryLoadOpConversion; +using LoadF32OpConversion = + MemoryLoadOpConversion; +using SharedLoadF32OpConversion = + MemoryLoadOpConversion; /// Generalized memory store template. /// Pops address and value from stack, stores value to memory. -/// When IsFloat=true, bitcasts i64->f64 before storing. -template +/// MemType selects the memory element type and the narrowing strategy. +template struct MemoryStoreOpConversion : public OpConversionPattern { MemoryStoreOpConversion(const TypeConverter &typeConverter, MLIRContext *context) @@ -865,13 +941,23 @@ struct MemoryStoreOpConversion : public OpConversionPattern { // Store value to memory via pointer Value ptr = rewriter.create(loc, ptrType, addrValue); - if constexpr (IsFloat) { - // Bitcast i64 -> f64 before storing + if constexpr (MT == MemType::I64) { + rewriter.create(loc, value, ptr); + } else if constexpr (MT == MemType::F64) { Value f64Value = rewriter.create(loc, rewriter.getF64Type(), value); rewriter.create(loc, f64Value, ptr); - } else { - rewriter.create(loc, value, ptr); + } else if constexpr (isReducedInt) { + Type elemType = getMemElemType(rewriter.getContext(), MT); + Value truncated = rewriter.create(loc, elemType, value); + rewriter.create(loc, truncated, ptr); + } else if constexpr (isReducedFloat) { + Type elemType = getMemElemType(rewriter.getContext(), MT); + Value f64Value = + rewriter.create(loc, rewriter.getF64Type(), value); + Value truncated = + 
rewriter.create(loc, elemType, f64Value); + rewriter.create(loc, truncated, ptr); } // New stack pointer is SP-2 (popped both address and value) @@ -881,14 +967,47 @@ struct MemoryStoreOpConversion : public OpConversionPattern { } }; -// Memory store instantiations +// Memory store instantiations — full-width using StoreIOpConversion = MemoryStoreOpConversion; -using StoreFOpConversion = MemoryStoreOpConversion; +using StoreFOpConversion = + MemoryStoreOpConversion; using SharedStoreIOpConversion = - MemoryStoreOpConversion; using SharedStoreFOpConversion = - MemoryStoreOpConversion; + +// Memory store instantiations — reduced-width +using StoreI8OpConversion = + MemoryStoreOpConversion; +using SharedStoreI8OpConversion = + MemoryStoreOpConversion; +using StoreI16OpConversion = + MemoryStoreOpConversion; +using SharedStoreI16OpConversion = + MemoryStoreOpConversion; +using StoreI32OpConversion = + MemoryStoreOpConversion; +using SharedStoreI32OpConversion = + MemoryStoreOpConversion; +using StoreF16OpConversion = + MemoryStoreOpConversion; +using SharedStoreF16OpConversion = + MemoryStoreOpConversion; +using StoreBF16OpConversion = + MemoryStoreOpConversion; +using SharedStoreBF16OpConversion = + MemoryStoreOpConversion; +using StoreF32OpConversion = + MemoryStoreOpConversion; +using SharedStoreF32OpConversion = + MemoryStoreOpConversion; /// Conversion pattern for forth.itof (S>F). 
@@ -1279,6 +1398,16 @@ struct ConvertForthToMemRefPass LoadIOpConversion, StoreIOpConversion, LoadFOpConversion, StoreFOpConversion, SharedLoadIOpConversion, SharedStoreIOpConversion, SharedLoadFOpConversion, SharedStoreFOpConversion, + // Reduced-width memory ops + LoadI8OpConversion, StoreI8OpConversion, SharedLoadI8OpConversion, + SharedStoreI8OpConversion, LoadI16OpConversion, StoreI16OpConversion, + SharedLoadI16OpConversion, SharedStoreI16OpConversion, + LoadI32OpConversion, StoreI32OpConversion, SharedLoadI32OpConversion, + SharedStoreI32OpConversion, LoadF16OpConversion, StoreF16OpConversion, + SharedLoadF16OpConversion, SharedStoreF16OpConversion, + LoadBF16OpConversion, StoreBF16OpConversion, SharedLoadBF16OpConversion, + SharedStoreBF16OpConversion, LoadF32OpConversion, StoreF32OpConversion, + SharedLoadF32OpConversion, SharedStoreF32OpConversion, // Type conversions IToFOpConversion, FToIOpConversion, // Control flow diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp index 409e737..0a732b5 100644 --- a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp +++ b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp @@ -592,6 +592,78 @@ Value ForthParser::emitOperation(StringRef word, Value inputStack, } else if (word == "SF!") { return builder.create(loc, stackType, inputStack) .getResult(); + } else if (word == "I8@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "I8!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI8@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI8!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "I16@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "I16!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI16@") { + return 
builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI16!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "I32@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "I32!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI32@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SI32!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "HF@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "HF!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SHF@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SHF!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "BF@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "BF!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SBF@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SBF!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "F32@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "F32!") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SF32@") { + return builder.create(loc, stackType, inputStack) + .getResult(); + } else if (word == "SF32!") { + return builder.create(loc, stackType, inputStack) + .getResult(); } else if (word == "TID-X") { return builder.create(loc, stackType, inputStack) .getResult(); diff --git a/test/Conversion/ForthToMemRef/reduced-width-memory.mlir b/test/Conversion/ForthToMemRef/reduced-width-memory.mlir new file mode 100644 index 
0000000..7aec896 --- /dev/null +++ b/test/Conversion/ForthToMemRef/reduced-width-memory.mlir @@ -0,0 +1,118 @@ +// RUN: %warpforth-opt --convert-forth-to-memref %s | %FileCheck %s + +// CHECK-LABEL: func.func private @main + +// --- i8 load: llvm.load i8, extsi to i64 --- +// CHECK: llvm.inttoptr %{{.*}} : i64 to !llvm.ptr +// CHECK-NEXT: llvm.load %{{.*}} : !llvm.ptr -> i8 +// CHECK-NEXT: arith.extsi %{{.*}} : i8 to i64 + +// --- i8 store: trunci i64 to i8, llvm.store --- +// CHECK: arith.trunci %{{.*}} : i64 to i8 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : i8, !llvm.ptr + +// --- shared i8 load: ptr<3> --- +// CHECK: llvm.inttoptr %{{.*}} : i64 to !llvm.ptr<3> +// CHECK-NEXT: llvm.load %{{.*}} : !llvm.ptr<3> -> i8 +// CHECK-NEXT: arith.extsi %{{.*}} : i8 to i64 + +// --- shared i8 store: ptr<3> --- +// CHECK: arith.trunci %{{.*}} : i64 to i8 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : i8, !llvm.ptr<3> + +// --- i16 load --- +// CHECK: llvm.load %{{.*}} : !llvm.ptr -> i16 +// CHECK-NEXT: arith.extsi %{{.*}} : i16 to i64 + +// --- i16 store --- +// CHECK: arith.trunci %{{.*}} : i64 to i16 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : i16, !llvm.ptr + +// --- i32 load --- +// CHECK: llvm.load %{{.*}} : !llvm.ptr -> i32 +// CHECK-NEXT: arith.extsi %{{.*}} : i32 to i64 + +// --- i32 store --- +// CHECK: arith.trunci %{{.*}} : i64 to i32 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : i32, !llvm.ptr + +// --- f16 load: llvm.load f16, extf to f64, bitcast to i64 --- +// CHECK: llvm.load %{{.*}} : !llvm.ptr -> f16 +// CHECK-NEXT: arith.extf %{{.*}} : f16 to f64 +// CHECK-NEXT: arith.bitcast %{{.*}} : f64 to i64 + +// --- f16 store: bitcast i64 to f64, truncf to f16, llvm.store --- +// CHECK: arith.bitcast %{{.*}} : i64 to f64 +// CHECK-NEXT: arith.truncf %{{.*}} : f64 to f16 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : f16, !llvm.ptr + +// --- bf16 load --- +// CHECK: llvm.load %{{.*}} : !llvm.ptr -> bf16 +// CHECK-NEXT: arith.extf %{{.*}} : bf16 to f64 +// 
CHECK-NEXT: arith.bitcast %{{.*}} : f64 to i64 + +// --- bf16 store --- +// CHECK: arith.bitcast %{{.*}} : i64 to f64 +// CHECK-NEXT: arith.truncf %{{.*}} : f64 to bf16 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : bf16, !llvm.ptr + +// --- f32 load --- +// CHECK: llvm.load %{{.*}} : !llvm.ptr -> f32 +// CHECK-NEXT: arith.extf %{{.*}} : f32 to f64 +// CHECK-NEXT: arith.bitcast %{{.*}} : f64 to i64 + +// --- f32 store --- +// CHECK: arith.bitcast %{{.*}} : i64 to f64 +// CHECK-NEXT: arith.truncf %{{.*}} : f64 to f32 +// CHECK-NEXT: llvm.store %{{.*}}, %{{.*}} : f32, !llvm.ptr + +module { + func.func private @main() { + %0 = forth.stack !forth.stack + // i8 load + %1 = forth.constant %0(1 : i64) : !forth.stack -> !forth.stack + %2 = forth.load_i8 %1 : !forth.stack -> !forth.stack + // i8 store + %3 = forth.constant %2(42 : i64) : !forth.stack -> !forth.stack + %4 = forth.constant %3(100 : i64) : !forth.stack -> !forth.stack + %5 = forth.store_i8 %4 : !forth.stack -> !forth.stack + // shared i8 load + %6 = forth.constant %5(2 : i64) : !forth.stack -> !forth.stack + %7 = forth.shared_load_i8 %6 : !forth.stack -> !forth.stack + // shared i8 store + %8 = forth.constant %7(9 : i64) : !forth.stack -> !forth.stack + %9 = forth.constant %8(3 : i64) : !forth.stack -> !forth.stack + %10 = forth.shared_store_i8 %9 : !forth.stack -> !forth.stack + // i16 + %11 = forth.constant %10(1 : i64) : !forth.stack -> !forth.stack + %12 = forth.load_i16 %11 : !forth.stack -> !forth.stack + %13 = forth.constant %12(42 : i64) : !forth.stack -> !forth.stack + %14 = forth.constant %13(100 : i64) : !forth.stack -> !forth.stack + %15 = forth.store_i16 %14 : !forth.stack -> !forth.stack + // i32 + %16 = forth.constant %15(1 : i64) : !forth.stack -> !forth.stack + %17 = forth.load_i32 %16 : !forth.stack -> !forth.stack + %18 = forth.constant %17(42 : i64) : !forth.stack -> !forth.stack + %19 = forth.constant %18(100 : i64) : !forth.stack -> !forth.stack + %20 = forth.store_i32 %19 : !forth.stack 
-> !forth.stack + // f16 + %21 = forth.constant %20(1 : i64) : !forth.stack -> !forth.stack + %22 = forth.load_f16 %21 : !forth.stack -> !forth.stack + %23 = forth.constant %22(42 : i64) : !forth.stack -> !forth.stack + %24 = forth.constant %23(100 : i64) : !forth.stack -> !forth.stack + %25 = forth.store_f16 %24 : !forth.stack -> !forth.stack + // bf16 + %26 = forth.constant %25(1 : i64) : !forth.stack -> !forth.stack + %27 = forth.load_bf16 %26 : !forth.stack -> !forth.stack + %28 = forth.constant %27(42 : i64) : !forth.stack -> !forth.stack + %29 = forth.constant %28(100 : i64) : !forth.stack -> !forth.stack + %30 = forth.store_bf16 %29 : !forth.stack -> !forth.stack + // f32 + %31 = forth.constant %30(1 : i64) : !forth.stack -> !forth.stack + %32 = forth.load_f32 %31 : !forth.stack -> !forth.stack + %33 = forth.constant %32(42 : i64) : !forth.stack -> !forth.stack + %34 = forth.constant %33(100 : i64) : !forth.stack -> !forth.stack + %35 = forth.store_f32 %34 : !forth.stack -> !forth.stack + return + } +} diff --git a/test/Pipeline/reduced-width-memory.forth b/test/Pipeline/reduced-width-memory.forth new file mode 100644 index 0000000..786b7f3 --- /dev/null +++ b/test/Pipeline/reduced-width-memory.forth @@ -0,0 +1,15 @@ +\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s + +\ Verify reduced-width memory ops compile through the full pipeline to gpu.binary +\ CHECK: gpu.binary @warpforth_module + +\! kernel main +\! param DATA i64[256] +GLOBAL-ID CELLS DATA + I8@ +GLOBAL-ID CELLS DATA + I8! +GLOBAL-ID CELLS DATA + I32@ +GLOBAL-ID CELLS DATA + I32! +GLOBAL-ID CELLS DATA + HF@ +GLOBAL-ID CELLS DATA + HF! +GLOBAL-ID CELLS DATA + F32@ +GLOBAL-ID CELLS DATA + F32! 
diff --git a/test/Translation/Forth/reduced-width-memory.forth b/test/Translation/Forth/reduced-width-memory.forth new file mode 100644 index 0000000..4ebc8c1 --- /dev/null +++ b/test/Translation/Forth/reduced-width-memory.forth @@ -0,0 +1,63 @@ +\ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s + +\ Test I8@ / I8! +\ CHECK: forth.load_i8 +\ CHECK: forth.store_i8 + +\ Test SI8@ / SI8! +\ CHECK: forth.shared_load_i8 +\ CHECK: forth.shared_store_i8 + +\ Test I16@ / I16! +\ CHECK: forth.load_i16 +\ CHECK: forth.store_i16 + +\ Test SI16@ / SI16! +\ CHECK: forth.shared_load_i16 +\ CHECK: forth.shared_store_i16 + +\ Test I32@ / I32! +\ CHECK: forth.load_i32 +\ CHECK: forth.store_i32 + +\ Test SI32@ / SI32! +\ CHECK: forth.shared_load_i32 +\ CHECK: forth.shared_store_i32 + +\ Test HF@ / HF! +\ CHECK: forth.load_f16 +\ CHECK: forth.store_f16 + +\ Test SHF@ / SHF! +\ CHECK: forth.shared_load_f16 +\ CHECK: forth.shared_store_f16 + +\ Test BF@ / BF! +\ CHECK: forth.load_bf16 +\ CHECK: forth.store_bf16 + +\ Test SBF@ / SBF! +\ CHECK: forth.shared_load_bf16 +\ CHECK: forth.shared_store_bf16 + +\ Test F32@ / F32! +\ CHECK: forth.load_f32 +\ CHECK: forth.store_f32 + +\ Test SF32@ / SF32! +\ CHECK: forth.shared_load_f32 +\ CHECK: forth.shared_store_f32 + +\! kernel main +1 I8@ 2 3 I8! +4 SI8@ 5 6 SI8! +1 I16@ 2 3 I16! +4 SI16@ 5 6 SI16! +1 I32@ 2 3 I32! +4 SI32@ 5 6 SI32! +1 HF@ 2 3 HF! +4 SHF@ 5 6 SHF! +1 BF@ 2 3 BF! +4 SBF@ 5 6 SBF! +1 F32@ 2 3 F32! +4 SF32@ 5 6 SF32! 
From a18024780a9be3162643035703b68f0583bb4e08 Mon Sep 17 00:00:00 2001 From: Alex Cameron Date: Sun, 22 Feb 2026 22:41:43 +0900 Subject: [PATCH 2/2] fix(dialect): correct zero-extend to sign-extend in reduced-width load op summaries --- include/warpforth/Dialect/Forth/ForthOps.td | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/warpforth/Dialect/Forth/ForthOps.td b/include/warpforth/Dialect/Forth/ForthOps.td index 246fcf8..13a2a33 100644 --- a/include/warpforth/Dialect/Forth/ForthOps.td +++ b/include/warpforth/Dialect/Forth/ForthOps.td @@ -410,7 +410,7 @@ def Forth_SharedStoreFOp : Forth_StackOpBase<"shared_storef"> { // --- i8 --- def Forth_LoadI8Op : Forth_StackOpBase<"load_i8"> { - let summary = "Load i8 value from memory, zero-extend to i64"; + let summary = "Load i8 value from memory, sign-extend to i64"; let description = [{ Pops an address, loads an i8, sign-extends to i64, pushes result. Forth semantics: ( addr -- value ) @@ -424,7 +424,7 @@ def Forth_StoreI8Op : Forth_StackOpBase<"store_i8"> { }]; } def Forth_SharedLoadI8Op : Forth_StackOpBase<"shared_load_i8"> { - let summary = "Load i8 from shared memory, zero-extend to i64"; + let summary = "Load i8 from shared memory, sign-extend to i64"; let description = [{ Pops an address, loads an i8 from shared memory, sign-extends to i64. Forth semantics: ( addr -- value ) @@ -440,7 +440,7 @@ def Forth_SharedStoreI8Op : Forth_StackOpBase<"shared_store_i8"> { // --- i16 --- def Forth_LoadI16Op : Forth_StackOpBase<"load_i16"> { - let summary = "Load i16 value from memory, zero-extend to i64"; + let summary = "Load i16 value from memory, sign-extend to i64"; let description = [{ Pops an address, loads an i16, sign-extends to i64, pushes result. 
Forth semantics: ( addr -- value ) @@ -454,7 +454,7 @@ def Forth_StoreI16Op : Forth_StackOpBase<"store_i16"> { }]; } def Forth_SharedLoadI16Op : Forth_StackOpBase<"shared_load_i16"> { - let summary = "Load i16 from shared memory, zero-extend to i64"; + let summary = "Load i16 from shared memory, sign-extend to i64"; let description = [{ Pops an address, loads an i16 from shared memory, sign-extends to i64. Forth semantics: ( addr -- value ) @@ -470,7 +470,7 @@ def Forth_SharedStoreI16Op : Forth_StackOpBase<"shared_store_i16"> { // --- i32 --- def Forth_LoadI32Op : Forth_StackOpBase<"load_i32"> { - let summary = "Load i32 value from memory, zero-extend to i64"; + let summary = "Load i32 value from memory, sign-extend to i64"; let description = [{ Pops an address, loads an i32, sign-extends to i64, pushes result. Forth semantics: ( addr -- value ) @@ -484,7 +484,7 @@ def Forth_StoreI32Op : Forth_StackOpBase<"store_i32"> { }]; } def Forth_SharedLoadI32Op : Forth_StackOpBase<"shared_load_i32"> { - let summary = "Load i32 from shared memory, zero-extend to i64"; + let summary = "Load i32 from shared memory, sign-extend to i64"; let description = [{ Pops an address, loads an i32 from shared memory, sign-extends to i64. Forth semantics: ( addr -- value )