diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index c4c0148..443c45a 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -154,18 +154,10 @@ pub const STAGES_COUNT: usize = Stage::GammaCompressSrgb as usize + 1; impl PixmapRef<'_> { #[inline(always)] pub(crate) fn gather(&self, index: u32x8) -> [PremultipliedColorU8; highp::STAGE_WIDTH] { - let index: [u32; 8] = bytemuck::cast(index); let pixels = self.pixels(); - [ - pixels[index[0] as usize], - pixels[index[1] as usize], - pixels[index[2] as usize], - pixels[index[3] as usize], - pixels[index[4] as usize], - pixels[index[5] as usize], - pixels[index[6] as usize], - pixels[index[7] as usize], - ] + // safety: callers clamp indices to [0, w*h) via gather_ix. + let gathered = unsafe { u32x8::gather_u32(pixels.as_ptr() as *const u32, index) }; + bytemuck::cast(gathered) } } diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs index 8b69a0e..4ae0f90 100644 --- a/src/wide/u32x8_t.rs +++ b/src/wide/u32x8_t.rs @@ -81,6 +81,34 @@ impl u32x8 { } } } + + /// Gathers 8 u32s from `base[index[i]]`. + /// + /// # Safety + /// Each lane in `index` must be a valid offset (in u32 units) into the + /// buffer at `base`. avx2 `vpgatherdd` faults on oob; the scalar fallback + /// indexes a raw slice and would UB on oob too. + #[inline(always)] + pub unsafe fn gather_u32(base: *const u32, index: Self) -> Self { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + let vindex: __m256i = cast(index); + Self(_mm256_i32gather_epi32::<4>(base as *const i32, vindex)) + } else { + let ix: [u32; 8] = bytemuck::cast(index); + bytemuck::cast([ + *base.add(ix[0] as usize), + *base.add(ix[1] as usize), + *base.add(ix[2] as usize), + *base.add(ix[3] as usize), + *base.add(ix[4] as usize), + *base.add(ix[5] as usize), + *base.add(ix[6] as usize), + *base.add(ix[7] as usize), + ]) + } + } + } } impl core::ops::Not for u32x8 {