diff --git a/.dev/checklist.md b/.dev/checklist.md index 5981ee41a..47f3102db 100644 --- a/.dev/checklist.md +++ b/.dev/checklist.md @@ -74,6 +74,27 @@ Prefix: W## (to distinguish from CW's F## items). still 25/25 because it uses per-job `Setup Rust` and does not install Go / TinyGo; CI adoption tracked separately under W50. +- [ ] W55-watchos-ilp32: arm64_32-apple-watchos (ILP32) best-effort + support landed in PR #97 (D139). CI runs only a build-only smoke + job for `zig build static-lib -Dtarget=aarch64-watchos-ilp32 + -Djit=false -Dcomponent=false -Dwat=false` because GitHub Actions + runners have no watchOS SDK to link / run the resulting archive. + Open follow-ups: + (a) Zig 0.16 archive packing bug — `aarch64-watchos-ilp32` + static-lib produces a working `.o` but the `.a` archive is + 88 bytes (SYMDEF only). Embedders (e.g. wasm-benchmark's + `scripts/build-zwasm.sh`) work around with `ar rcs` + post-build. File upstream and remove the workaround once + fixed. + (b) `error.MissingIo` semantics on ILP32 are documented in D139 + but not surfaced in `include/zwasm.h` or the C API reference. + Add a "Caveats — ILP32" subsection if the target gains + traction. + (c) `memory.atomic.wait/notify` correctness on ILP32 with + `single_threaded = true`. Not a regression vs the initial PR + (atomics never worked under ILP32 anyway) but worth a one-line + test once the watchOS SDK becomes available in CI. + - [ ] W45: SIMD loop persistence — Skip Q-cache eviction at loop headers. Requires back-edge detection in scanBranchTargets. diff --git a/.dev/decisions.md b/.dev/decisions.md index c51bb0d49..401f25169 100644 --- a/.dev/decisions.md +++ b/.dev/decisions.md @@ -351,11 +351,11 @@ this is unaffected by host allocator choice. 
**Usage matrix**: -| Caller | Allocator source | -|---------------------|-------------------------------------------------| -| Zig host (CW/cw-new) | Host's `std.mem.Allocator` (GC-managed) | -| C host (via C API) | `malloc/free` function pointers or default | -| Standalone CLI | Internal `page_allocator` or `GeneralPurposeAllocator` | +| Caller | Allocator source | +|----------------------|--------------------------------------------------------| +| Zig host (CW/cw-new) | Host's `std.mem.Allocator` (GC-managed) | +| C host (via C API) | `malloc/free` function pointers or default | +| Standalone CLI | Internal `page_allocator` or `GeneralPurposeAllocator` | **Migration**: Internal Arena usage → accept Allocator parameter. Existing C API (`zwasm_engine_new`) gains optional config struct with alloc/free callbacks. @@ -887,11 +887,11 @@ agreed-on stripping mechanism, the ceiling has no meaning. and forcing parity would either hobble Linux or grant Windows excess slack. - | OS | Stripped binary | Ceiling | Headroom | - |------------------|-----------------|-----------|----------| - | macOS aarch64 | ~1.20 MB | 1.30 MB | ~80 KB | - | Linux x86_64 | ~1.56 MB | 1.60 MB | ~40 KB | - | Windows x86_64 | ~1.70 MB | 1.80 MB | ~100 KB | + | OS | Stripped binary | Ceiling | Headroom | + |----------------|-----------------|---------|----------| + | macOS aarch64 | ~1.20 MB | 1.30 MB | ~80 KB | + | Linux x86_64 | ~1.56 MB | 1.60 MB | ~40 KB | + | Windows x86_64 | ~1.70 MB | 1.80 MB | ~100 KB | The Linux 1.60 MB number is the original W48 Phase-1 target and is unchanged; the macOS 1.30 MB number tightens on the prior implicit @@ -1069,3 +1069,106 @@ once the coalescer extension that needs them is debugged on x86_64 - `W54-libm`: `rw_c_math` is dominated by libm `sin`/`cos`/`pow` dispatch; intrinsic recognition + ARM64 FSQRT inline + soft-libm fallback. 
+ +--- + +## D139: arm64_32-apple-watchos (ILP32) static-lib support — best-effort, no CI gate + +**Status**: Accepted — landed via PR #97 (`arm64_32-apple-watchos` branch). + +**Context**: Apple Watch SE / SE2 / Series 4-8 (S4-S8 SoC family) +ships the ILP32 ABI: 32-bit pointers, 64-bit aarch64 instructions. +Zig 0.16 spells the triple `aarch64-watchos-ilp32` (the legacy +`arm64_32-` arch identifier was removed upstream in ziglang/zig +#20820). Apple's App Store policy forbids `MAP_JIT` outside +JavaScriptCore, so any wasm runtime targeting WatchKit apps must +be pure interpreter (`-Djit=false`). zwasm already supports that +mode on every 64-bit Apple platform; the missing piece was the +ILP32 build itself. + +The blocker on a clean build was that `std.Io.Threaded` in Zig +0.16 does not compile under ILP32 — `dirReadDarwin` / `pwrite` +and friends narrow `u64` syscall returns into a 32-bit `usize` +and the compiler rejects it. The full panic / `std.debug.*` paths +transitively pull `std.Io.Threaded` in via +`lockStderr → std_options.debug_io → debug_threaded_io.io()`, so +they trip the same compile error even when nothing else in zwasm +touches threading. + +**Decision**: Accept ILP32 as a **best-effort target with no +runtime CI gate**. Build-only smoke is wired into CI to catch +source-level rot; spec / e2e / realworld / ffi / bench coverage +is not added because GitHub Actions runners have no watchOS SDK +and the resulting archive cannot be linked or run there. Support +is conditional on the consumer (a) leaving JIT off, (b) not +enabling WASI host-dir access, and (c) not depending on +`memory.atomic.wait/notify` correctness. Every workaround is +gated comptime on `@sizeOf(usize) < 8` or +`target.result.ptrBitWidth() < 64` so 64-bit consumers are +byte-identical to before. 
+ +| Concern | ILP32 (watchOS) | 64-bit (Mac/iOS/Linux/Windows) | +|-----------------------|----------------------------------|--------------------------------| +| panic / log namespace | `std.debug.no_panic` (trap-only) | stdlib default | +| static-lib threading | `single_threaded = true` | default (multi-threaded) | +| auto-init Io | refused | `std.Io.Threaded` (per D135) | +| `Config.io` required | yes, if WASI or timeout | optional | +| Guard memory consts | 0 placeholders | 4 GiB / 8 GiB | +| JIT | unsupported (App Store) | enabled by default | + +**Safety strategy** (per failure mode): + +1. **panic / std_options override** (`src/c_api.zig`). Scoped to + ILP32 via a comptime `const ilp32 = @sizeOf(usize) < 8;`. + 64-bit C-API consumers keep + `std.debug.FullPanic(std.debug.defaultPanic)`, the stdlib + default — no behaviour change for them. +2. **`single_threaded = true`** (`build.zig`, `lib_static_mod`). + Gated on `target.result.ptrBitWidth() < 64`. 64-bit static-lib + consumers keep functioning `atomic.wait/notify` via + `std.Thread` primitives. +3. **Io acquisition** (`src/types.zig`, `loadCore`). On ILP32 with + no `config.io`: refuse with `error.MissingIo` if WASI or + `timeout_ms` is requested (both would dereference an undefined + vtable at runtime). Otherwise leave `io = undefined` — the + embedder has promised not to exercise io-dependent paths. The + error name is deliberately ABI-neutral so it does not leak the + watchOS detail into the cross-target public error set. +4. **Guard memory constants** (`src/guard.zig`). `GUARD_SIZE` / + `TOTAL_RESERVATION` overflow comptime under 32-bit usize. Set + to 0 placeholders on ILP32; runtime callers are predicated on + `jitSupported()` which is false for watchos. +5. **Memory index narrowing** (`src/memory.zig`, `src/vm.zig`). 
+ After the bounds check `effective: u64` provably fits in usize + (since `len: usize` bounds it), so the explicit + `@intCast(usize)` is a no-op on 64-bit and a + correctness-preserving narrowing on ILP32. + +**Alternatives considered**: + +- **Refuse the build entirely on ILP32.** Rejected — the watchOS + use case is real (Apple Watch wasm runtime comparison in + rebeckerspecialties/wasm-benchmark) and the comptime gates are + small enough to keep maintained. +- **Force `single_threaded = true` on every static-lib.** What + the initial PR proposed. Rejected — silently breaks + `atomic.wait/notify` for 64-bit Linux / macOS / Windows + static-lib consumers. +- **Reinvent the panic namespace inline.** What the initial PR + did (~80 lines of hand-rolled `@trap()` handlers). Rejected — + Zig 0.16 ships `std.debug.no_panic` for exactly this purpose. +- **Provide a trap-on-all-methods Io vtable.** Considered for the + ILP32 `undefined io` case. Rejected — `std.Io.VTable` has 50+ + function pointers, the surface is volatile across Zig releases, + and the `error.MissingIo` path achieves the same + loud-failure semantics with much less code. + +**Known follow-up**: tracked in `checklist.md` as +`W55-watchos-ilp32`. The Zig 0.16 `.a`-packing bug for this triple +(archive ends up as the 88-byte SYMDEF only; embedders must `ar +rcs` the `.o` themselves) is upstream's problem, not zwasm's, and +is left out of scope here. + +**Affected files**: `build.zig`, `src/c_api.zig`, `src/guard.zig`, +`src/memory.zig`, `src/types.zig`, `src/vm.zig`, +`.github/workflows/ci.yml`. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fe9ae35a3..b20952831 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -278,6 +278,44 @@ jobs: - name: Verify minimal build tests pass run: zig build test -Djit=false -Dcomponent=false -Dwat=false + # Build-only smoke for arm64_32-apple-watchos (ILP32) — see D139.
+ # GitHub Actions runners lack a watchOS SDK so the resulting + # archive cannot be linked or run; this job only catches + # source-level rot on the ILP32 comptime gates in src/c_api.zig, + # src/guard.zig, src/memory.zig, src/types.zig, src/vm.zig and + # the build.zig single_threaded gate. + watchos-ilp32-build: + name: build (aarch64-watchos-ilp32, ILP32 smoke) + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Zig + uses: goto-bus-stop/setup-zig@v2 + with: + version: 0.16.0 + + - name: Build static-lib for aarch64-watchos-ilp32 + run: | + zig build static-lib \ + -Dtarget=aarch64-watchos-ilp32 \ + -Djit=false -Dcomponent=false -Dwat=false + # Zig 0.16 ships an empty .a for this target (SYMDEF only, + # ~88 bytes). The .o is the actual artefact; verify it + # built and is the right ABI. `file(1)` output varies + # across versions ("64_32-bit armv8 object" on local Darwin + # 25, "arm64_32_v8" on macos-latest CI), so match + # either spelling rather than a single substring. + OBJ=$(find .zig-cache -name 'libzwasm_zcu.o' \ + -exec file {} + | grep -E 'arm64_32|64_32-bit armv8' | head -1 | cut -d: -f1) + if [ -z "$OBJ" ]; then + echo "FAIL: no arm64_32 (armv8 ILP32) object produced" + find .zig-cache -name 'libzwasm_zcu.o' -exec file {} + + exit 1 + fi + echo "PASS: built $OBJ" + ls -la "$OBJ" + # 3-OS bench matrix (C-g step 5, 2026-04-29). Each runner does its # own fresh measurement of base vs PR — never compared across # OSes / architectures because the hardware deltas dwarf any diff --git a/build.zig b/build.zig index c0699d59f..e3b5872d3 100644 --- a/build.zig +++ b/build.zig @@ -194,11 +194,20 @@ pub fn build(b: *std.Build) void { lib_shared.installHeader(b.path("include/zwasm.h"), "zwasm.h"); // Static library (libzwasm.a) + // + // single_threaded is forced on ILP32 targets (arm64_32-apple-watchos) + // because std.Io.Threaded fails to compile under that ABI (u64 → usize + // narrowing in dirReadDarwin / pwrite).
On 64-bit targets the static + // lib keeps the default (multi-threaded), so wasm-threads + // atomic.wait/notify continues to work for Linux / macOS / Windows + // C-API consumers. + const lib_static_single_threaded: ?bool = if (target.result.ptrBitWidth() < 64) true else null; const lib_static_mod = b.createModule(.{ .root_source_file = b.path("src/c_api.zig"), .target = target, .optimize = if (lib_optimize) optimize else if (optimize == .Debug) .ReleaseSafe else optimize, .link_libc = true, + .single_threaded = lib_static_single_threaded, .pic = if (enable_pic) true else null, }); lib_static_mod.addOptions("build_options", options); diff --git a/src/c_api.zig b/src/c_api.zig index 1c0df74fe..bd360a676 100644 --- a/src/c_api.zig +++ b/src/c_api.zig @@ -18,6 +18,34 @@ const types = @import("types.zig"); const WasmModule = types.WasmModule; const WasiOptions = types.WasiOptions; +// On arm64_32-apple-watchos (ILP32) Zig 0.16's default panic / std.debug +// machinery and std.log.defaultLog both route through std.Io.Threaded +// (lockStderr → std_options.debug_io → debug_threaded_io.io()), which +// fails to compile under ILP32 because of u64 → usize narrowing in +// dirReadDarwin / pwrite. On 64-bit targets the stdlib defaults work +// fine, so the override is scoped to ILP32 only — keeping panic +// messages and log output intact for every other C-API consumer +// (macOS / iOS / Linux / Windows static & shared libs). +const ilp32 = @sizeOf(usize) < 8; + +pub const panic = if (ilp32) + std.debug.no_panic +else + std.debug.FullPanic(std.debug.defaultPanic); + +fn noopLog( + comptime _: std.log.Level, + comptime _: @EnumLiteral(), + comptime _: []const u8, + _: anytype, +) void {} + +pub const std_options: std.Options = if (ilp32) .{ + .allow_stack_tracing = false, + .networking = false, + .logFn = noopLog, +} else .{}; + /// Convert isize (C intptr_t) to platform File.Handle. 
fn isizeToHandle(v: isize) std.Io.File.Handle { if (builtin.os.tag == .windows) { diff --git a/src/guard.zig b/src/guard.zig index a9bce6fbd..f44314025 100644 --- a/src/guard.zig +++ b/src/guard.zig @@ -117,11 +117,18 @@ const Ucontext = switch (builtin.os.tag) { /// Guard region size: 4 GiB + 64 KiB. /// This ensures any 32-bit index (0..0xFFFFFFFF) + small offset (up to 64 KiB) /// falls within the mapped region (data + guard). -pub const GUARD_SIZE: usize = 4 * 1024 * 1024 * 1024 + 64 * 1024; +/// +/// On ILP32 targets (arm64_32-apple-watchos) `usize` is 32-bit and these +/// values overflow comptime. Guard memory is only used when `jitSupported()` +/// returns true, which is false for watchos — so on 32-bit targets we set +/// the constants to zero placeholders to satisfy the type checker. Any +/// runtime call into the GuardedMem path on a 32-bit platform would be a +/// bug: `addMemory` in src/store.zig predicates on `jitSupported()`. +pub const GUARD_SIZE: usize = if (@sizeOf(usize) >= 8) 4 * 1024 * 1024 * 1024 + 64 * 1024 else 0; /// Total virtual reservation: data capacity + guard. /// Data capacity matches Wasm max 4 GiB. Guard provides PROT_NONE safety zone. -pub const TOTAL_RESERVATION: usize = 8 * 1024 * 1024 * 1024 + 64 * 1024; +pub const TOTAL_RESERVATION: usize = if (@sizeOf(usize) >= 8) 8 * 1024 * 1024 * 1024 + 64 * 1024 else 0; /// Recovery information for signal handler. /// Set before calling JIT code, cleared after. diff --git a/src/memory.zig b/src/memory.zig index 4bff7c458..2813e1eb8 100644 --- a/src/memory.zig +++ b/src/memory.zig @@ -176,7 +176,12 @@ pub const Memory = struct { const len = self.data.items.len; if (overflow != 0 or len < @sizeOf(T) or effective > len - @sizeOf(T)) return error.OutOfBoundsMemoryAccess; - const ptr: *const [@sizeOf(T)]u8 = @ptrCast(&self.data.items[effective]); + // After the bounds check, `effective` fits in usize because `len` + // is usize. 
The explicit @intCast is needed on ILP32 targets + // (arm64_32-apple-watchos) where usize is 32-bit but `effective` + // is u64. + const effective_usize: usize = @intCast(effective); + const ptr: *const [@sizeOf(T)]u8 = @ptrCast(&self.data.items[effective_usize]); return switch (T) { u8, u16, u32, u64, i8, i16, i32, i64 => mem.readInt(T, ptr, .little), u128 => mem.readInt(u128, ptr, .little), @@ -193,7 +198,9 @@ pub const Memory = struct { const len = self.data.items.len; if (overflow != 0 or len < @sizeOf(T) or effective > len - @sizeOf(T)) return error.OutOfBoundsMemoryAccess; - const ptr: *[@sizeOf(T)]u8 = @ptrCast(&self.data.items[effective]); + // See Memory.read above for why this cast is required on ILP32. + const effective_usize: usize = @intCast(effective); + const ptr: *[@sizeOf(T)]u8 = @ptrCast(&self.data.items[effective_usize]); switch (T) { u8, u16, u32, u64, i8, i16, i32, i64 => mem.writeInt(T, ptr, value, .little), u128 => mem.writeInt(u128, ptr, value, .little), diff --git a/src/types.zig b/src/types.zig index f257ad8e0..f20196ca8 100644 --- a/src/types.zig +++ b/src/types.zig @@ -468,8 +468,24 @@ pub const WasmModule = struct { // stand up a private `std.Io.Threaded` owned by this module. // Acquired early — applyWasiOptions's addPreopenPath needs io to open // host directories cross-platform (Zig 0.16's `std.Io.Dir.openDir`). + // + // On ILP32 targets (arm64_32-apple-watchos) `std.Io.Threaded` itself + // doesn't compile (u64 syscall returns vs 32-bit usize), so we cannot + // auto-construct one. If the caller asked for any feature that + // reaches into the io vtable (WASI host preopens, deadline timeout) + // they must supply config.io themselves; otherwise loadCore would + // dereference an undefined vtable at runtime. Wasm modules that + // execute `memory.atomic.wait/notify` reach io too — those embedders + // must also pass config.io, but we cannot detect that statically. 
const io: std.Io = blk: { if (config.io) |io_val| break :blk io_val; + if (@sizeOf(usize) < 8) { + if (config.wasi or config.timeout_ms != null) { + return error.MissingIo; + } + self.owned_io = null; + break :blk @as(std.Io, undefined); + } const threaded = try allocator.create(std.Io.Threaded); errdefer allocator.destroy(threaded); threaded.* = std.Io.Threaded.init(allocator, .{}); diff --git a/src/vm.zig b/src/vm.zig index 1100ab0d1..cc1907571 100644 --- a/src/vm.zig +++ b/src/vm.zig @@ -4008,9 +4008,13 @@ pub const Vm = struct { const byte_count = N * @sizeOf(NarrowT); const effective, const ov = @addWithOverflow(ma.offset, base); if (ov != 0 or m.data.items.len < byte_count or effective > m.data.items.len - byte_count) return error.OutOfBoundsMemoryAccess; + // After the bounds check `effective` fits in usize because + // `m.data.items.len` is usize. Explicit cast needed on ILP32 + // targets (arm64_32-apple-watchos). + const effective_usize: usize = @intCast(effective); var narrow: [N]NarrowT = undefined; for (&narrow, 0..) |*n, i| { - const ptr: *const [@sizeOf(NarrowT)]u8 = @ptrCast(&m.data.items[effective + i * @sizeOf(NarrowT)]); + const ptr: *const [@sizeOf(NarrowT)]u8 = @ptrCast(&m.data.items[effective_usize + i * @sizeOf(NarrowT)]); n.* = std.mem.readInt(NarrowT, ptr, .little); } // Extend to wide