From 6785121aef5dbcb13a9e8a7d0e5b9fcbf344ccfd Mon Sep 17 00:00:00 2001 From: ark Date: Sat, 23 May 2026 12:39:35 +0200 Subject: [PATCH 1/2] perf: AVX2 gather for PixmapRef::gather --- src/pipeline/mod.rs | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index c4c0148..0c3b320 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -154,18 +154,34 @@ pub const STAGES_COUNT: usize = Stage::GammaCompressSrgb as usize + 1; impl PixmapRef<'_> { #[inline(always)] pub(crate) fn gather(&self, index: u32x8) -> [PremultipliedColorU8; highp::STAGE_WIDTH] { - let index: [u32; 8] = bytemuck::cast(index); let pixels = self.pixels(); - [ - pixels[index[0] as usize], - pixels[index[1] as usize], - pixels[index[2] as usize], - pixels[index[3] as usize], - pixels[index[4] as usize], - pixels[index[5] as usize], - pixels[index[6] as usize], - pixels[index[7] as usize], - ] + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // gather faults on oob; callers clamp indices to [0, w*h) via gather_ix. + unsafe { + let vindex: __m256i = bytemuck::cast(index); + let gathered = _mm256_i32gather_epi32::<4>(pixels.as_ptr() as *const i32, vindex); + bytemuck::cast(gathered) + } + } else { + let index: [u32; 8] = bytemuck::cast(index); + [ + pixels[index[0] as usize], + pixels[index[1] as usize], + pixels[index[2] as usize], + pixels[index[3] as usize], + pixels[index[4] as usize], + pixels[index[5] as usize], + pixels[index[6] as usize], + pixels[index[7] as usize], + ] + } + } } } From d66820e7b586046dc8ab009134720764ad20c38b Mon Sep 17 00:00:00 2001 From: ark Date: Tue, 26 May 2026 21:04:09 +0200 Subject: [PATCH 2/2] refactor: move AVX2 gather intrinsic into wide module --- src/pipeline/mod.rs | 30 +++--------------------------- src/wide/u32x8_t.rs | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index 0c3b320..443c45a 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -155,33 +155,9 @@ impl PixmapRef<'_> { #[inline(always)] pub(crate) fn gather(&self, index: u32x8) -> [PremultipliedColorU8; highp::STAGE_WIDTH] { let pixels = self.pixels(); - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // gather faults on oob; callers clamp indices to [0, w*h) via gather_ix. - unsafe { - let vindex: __m256i = bytemuck::cast(index); - let gathered = _mm256_i32gather_epi32::<4>(pixels.as_ptr() as *const i32, vindex); - bytemuck::cast(gathered) - } - } else { - let index: [u32; 8] = bytemuck::cast(index); - [ - pixels[index[0] as usize], - pixels[index[1] as usize], - pixels[index[2] as usize], - pixels[index[3] as usize], - pixels[index[4] as usize], - pixels[index[5] as usize], - pixels[index[6] as usize], - pixels[index[7] as usize], - ] - } - } + // safety: callers clamp indices to [0, w*h) via gather_ix. + let gathered = unsafe { u32x8::gather_u32(pixels.as_ptr() as *const u32, index) }; + bytemuck::cast(gathered) } } diff --git a/src/wide/u32x8_t.rs b/src/wide/u32x8_t.rs index 8b69a0e..4ae0f90 100644 --- a/src/wide/u32x8_t.rs +++ b/src/wide/u32x8_t.rs @@ -81,6 +81,34 @@ impl u32x8 { } } } + + /// Gathers 8 u32s from `base[index[i]]`. + /// + /// # Safety + /// Each lane in `index` must be a valid offset (in u32 units) into the + /// buffer at `base`. avx2 `vpgatherdd` faults on oob; the scalar fallback + /// indexes a raw slice and would UB on oob too. + #[inline(always)] + pub unsafe fn gather_u32(base: *const u32, index: Self) -> Self { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + let vindex: __m256i = cast(index); + Self(_mm256_i32gather_epi32::<4>(base as *const i32, vindex)) + } else { + let ix: [u32; 8] = bytemuck::cast(index); + bytemuck::cast([ + *base.add(ix[0] as usize), + *base.add(ix[1] as usize), + *base.add(ix[2] as usize), + *base.add(ix[3] as usize), + *base.add(ix[4] as usize), + *base.add(ix[5] as usize), + *base.add(ix[6] as usize), + *base.add(ix[7] as usize), + ]) + } + } + } } impl core::ops::Not for u32x8 {