From ac59b1d634f8269d0e6f31d2bcdb29def9350429 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 2 Jul 2026 19:53:19 +0000 Subject: [PATCH] feat(simd_soa): iter_i32x16 / iter_i64x8 typed lane iterators on MultiLaneColumn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up unblocking the gridlake wiring (lance-graph #635 COMMENTARY): lane J's GridBatch carries i32 min/max and i64 sum columns, but MultiLaneColumn only exposed f32/f64/u64/u8 lane views — #227's onebrc gridlake probe got away with f32 min/max columns. Add the signed integer lane widths so a batch SoA can be viewed through the gridlake carrier directly, no f32 recast. - `i32x16_from_chunk` / `i64x8_from_chunk` — LE decoders mirroring the existing `f32x16_from_chunk` / `u64x8_from_chunk` (scalar `from_le_bytes` loop, lowered to a single register-width load on LE targets; no pointer cast of the u8-aligned Arc<[u8]>). - `iter_i32x16` / `iter_i64x8` methods + `len_i32x16` / `len_i64x8`, routed through `crate::simd::{I32x16, I64x8}` per the W1a layering rule (never dipping into simd_avx512/simd_neon/scalar directly). - Parity tests: `iter_i32x16_le_round_trip` (incl. negatives, proves sign-extension survives the decode) + `iter_i64x8_le_round_trip`; extended the empty-count, 3-lane-count, and len asserts. These are layout-only zero-copy reinterpretations of the backing store (the same category as the existing typed iterators), not new compute kernels — no per-arch AVX/NEON/scalar backend needed beyond the lane types crate::simd already provides. simd_soa: 13/13 tests pass; clippy -D warnings clean; fmt clean. 
Co-Authored-By: Claude Fable 5 Claude-Session: https://claude.ai/code/session_01MLBnPuScZy6w9di2QEjsXM --- src/simd_soa.rs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/src/simd_soa.rs b/src/simd_soa.rs index 05903c2c..8bcde74a 100644 --- a/src/simd_soa.rs +++ b/src/simd_soa.rs @@ -35,7 +35,7 @@ use std::sync::Arc; // re-exports the right backend (AVX-512 / NEON / scalar) per `cfg`. Per // the W1a layering rule, `simd_soa.rs` MUST go through `crate::simd::` // rather than dipping into `simd_avx512` / `simd_neon` / `scalar` directly. -use crate::simd::{F32x16, F64x8, U64x8, U8x64}; +use crate::simd::{F32x16, F64x8, I32x16, I64x8, U64x8, U8x64}; // Endian-correct `&[u8; 4]` → `f32` / `&[u8; 8]` → `f64`/`u64` helpers. // `f32::from_le_bytes` is intrinsically optimised to a single load on @@ -89,6 +89,33 @@ fn u64x8_from_chunk(chunk: &[u8; 64]) -> U64x8 { U64x8::from_array(arr) } +#[inline(always)] +fn i32x16_from_chunk(chunk: &[u8; 64]) -> I32x16 { + let arr: [i32; 16] = core::array::from_fn(|i| { + let off = i * 4; + i32::from_le_bytes([chunk[off], chunk[off + 1], chunk[off + 2], chunk[off + 3]]) + }); + I32x16::from_array(arr) +} + +#[inline(always)] +fn i64x8_from_chunk(chunk: &[u8; 64]) -> I64x8 { + let arr: [i64; 8] = core::array::from_fn(|i| { + let off = i * 8; + i64::from_le_bytes([ + chunk[off], + chunk[off + 1], + chunk[off + 2], + chunk[off + 3], + chunk[off + 4], + chunk[off + 5], + chunk[off + 6], + chunk[off + 7], + ]) + }); + I64x8::from_array(arr) +} + // ════════════════════════════════════════════════════════════════════ // MultiLaneColumn — Arc<[u8]> carrier with typed lane-width chunk iters // ════════════════════════════════════════════════════════════════════ @@ -179,6 +206,16 @@ impl MultiLaneColumn { self.data.len() / 64 } + /// Number of `I32x16`-shaped (16 × i32 = 64-byte) chunks. 
+ pub fn len_i32x16(&self) -> usize { + self.data.len() / 64 + } + + /// Number of `I64x8`-shaped (8 × i64 = 64-byte) chunks. + pub fn len_i64x8(&self) -> usize { + self.data.len() / 64 + } + /// View the backing store as a raw byte slice. pub fn as_bytes(&self) -> &[u8] { &self.data @@ -235,6 +272,27 @@ impl MultiLaneColumn { pub fn iter_u64x8(&self) -> impl Iterator<Item = U64x8> + '_ { self.data.as_chunks::<64>().0.iter().map(u64x8_from_chunk) } + + /// Iterate the column as typed [`I32x16`] values dispatched via + /// `crate::simd::*`. + /// + /// Bytes are decoded little-endian (`i32::from_le_bytes`), the signed + /// sibling of [`iter_f32x16`](Self::iter_f32x16) — the lane width the + /// gridlake batch SoA needs for integer min/max/sum tile columns (the + /// consumer that could previously only view f32 min/max columns). + pub fn iter_i32x16(&self) -> impl Iterator<Item = I32x16> + '_ { + self.data.as_chunks::<64>().0.iter().map(i32x16_from_chunk) + } + + /// Iterate the column as typed [`I64x8`] values dispatched via + /// `crate::simd::*`. + /// + /// Bytes are decoded little-endian (`i64::from_le_bytes`), the signed + /// sibling of [`iter_u64x8`](Self::iter_u64x8) — the lane width for + /// 64-bit integer accumulator columns (running sums).
+ pub fn iter_i64x8(&self) -> impl Iterator<Item = I64x8> + '_ { + self.data.as_chunks::<64>().0.iter().map(i64x8_from_chunk) + } } // ════════════════════════════════════════════════════════════════════ @@ -255,6 +313,8 @@ mod tests { assert_eq!(col.len_f32x16(), 1); assert_eq!(col.len_f64x8(), 1); assert_eq!(col.len_u64x8(), 1); + assert_eq!(col.len_i32x16(), 1); + assert_eq!(col.len_i64x8(), 1); } #[test] @@ -273,6 +333,8 @@ mod tests { assert_eq!(col.iter_f32x16().count(), 0); assert_eq!(col.iter_f64x8().count(), 0); assert_eq!(col.iter_u64x8().count(), 0); + assert_eq!(col.iter_i32x16().count(), 0); + assert_eq!(col.iter_i64x8().count(), 0); } #[test] @@ -341,6 +403,32 @@ mod tests { assert_eq!(lane.to_array(), src); } + #[test] + fn iter_i32x16_le_round_trip() { + // Signed values incl. negatives, to prove sign-extension is + // preserved by the LE decode (the point of the i32 lane). + let src: [i32; 16] = core::array::from_fn(|i| (i as i32 - 8) * 0x0011_2233); + let mut bytes = vec![0u8; 64]; + for (i, &v) in src.iter().enumerate() { + bytes[i * 4..i * 4 + 4].copy_from_slice(&v.to_le_bytes()); + } + let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap(); + let lane = col.iter_i32x16().next().expect("one lane"); + assert_eq!(lane.to_array(), src); + } + + #[test] + fn iter_i64x8_le_round_trip() { + let src: [i64; 8] = core::array::from_fn(|i| (i as i64 - 4) * 0x0123_4567_89AB_CDEF); + let mut bytes = vec![0u8; 64]; + for (i, &v) in src.iter().enumerate() { + bytes[i * 8..i * 8 + 8].copy_from_slice(&v.to_le_bytes()); + } + let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap(); + let lane = col.iter_i64x8().next().expect("one lane"); + assert_eq!(lane.to_array(), src); + } + #[test] fn typed_iters_yield_three_lanes_over_192_bytes() { let v: Vec<u8> = (0u8..192).collect(); @@ -349,6 +437,8 @@ mod tests { assert_eq!(col.iter_f32x16().count(), 3); assert_eq!(col.iter_f64x8().count(), 3); assert_eq!(col.iter_u64x8().count(), 3); + assert_eq!(col.iter_i32x16().count(), 3); +
assert_eq!(col.iter_i64x8().count(), 3); } #[test]