From 2d62200ee4de0f612a73135dad2f24311f11cb21 Mon Sep 17 00:00:00 2001 From: ark Date: Sat, 23 May 2026 12:17:48 +0200 Subject: [PATCH 1/2] perf: AVX2 8888 load/store + f32x16 save specializations --- src/pipeline/highp.rs | 145 +++++++++++++++++++++----------- src/pipeline/lowp.rs | 187 ++++++++++++++++++++++++++++++------------ src/wide/f32x16_t.rs | 66 +++++++++------ 3 files changed, 272 insertions(+), 126 deletions(-) diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs index 214e31a..6065460 100644 --- a/src/pipeline/highp.rs +++ b/src/pipeline/highp.rs @@ -1246,37 +1246,57 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8, ) { - // Surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`. - - const FACTOR: f32 = 1.0 / 255.0; - - *r = f32x8::from([ - data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR, - data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR, - data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR, - data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR, - ]); - - *g = f32x8::from([ - data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR, - data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR, - data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR, - data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR, - ]); - - *b = f32x8::from([ - data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR, - data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR, - data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR, - data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR, - ]); - - *a = f32x8::from([ - data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR, - data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR, - data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR, - data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR, - ]); + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + unsafe { + let p = _mm256_loadu_si256(data.as_ptr() as *const __m256i); + let mask = _mm256_set1_epi32(0xFF); + let factor = _mm256_set1_ps(1.0 / 255.0); + let to_f = |v| _mm256_mul_ps(_mm256_cvtepi32_ps(v), factor); + + *r = bytemuck::cast(to_f(_mm256_and_si256(p, mask))); + *g = bytemuck::cast(to_f(_mm256_and_si256(_mm256_srli_epi32::<8>(p), mask))); + *b = bytemuck::cast(to_f(_mm256_and_si256(_mm256_srli_epi32::<16>(p), mask))); + *a = bytemuck::cast(to_f(_mm256_srli_epi32::<24>(p))); + } + } else { + // surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`. + const FACTOR: f32 = 1.0 / 255.0; + + *r = f32x8::from([ + data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR, + data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR, + data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR, + data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR, + ]); + + *g = f32x8::from([ + data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR, + data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR, + data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR, + data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR, + ]); + + *b = f32x8::from([ + data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR, + data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR, + data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR, + data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR, + ]); + + *a = f32x8::from([ + data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR, + data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR, + data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR, + data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR, + ]); + } + } } #[inline(always)] @@ -1296,22 +1316,53 @@ fn store_8888( r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - let r: [i32; 8] = unnorm(r).into(); - let g: [i32; 8] = unnorm(g).into(); - let b: [i32; 8] = unnorm(b).into(); - let a: [i32; 8] = unnorm(a).into(); - - let conv = |rr, gg, bb, aa| - PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8); - - data[0] = conv(r[0], g[0], b[0], a[0]); - data[1] = conv(r[1], g[1], b[1], a[1]); - data[2] = conv(r[2], g[2], b[2], a[2]); - data[3] = conv(r[3], g[3], b[3], a[3]); - data[4] = conv(r[4], g[4], b[4], a[4]); - data[5] = conv(r[5], g[5], b[5], a[5]); - data[6] = conv(r[6], g[6], b[6], a[6]); - data[7] = conv(r[7], g[7], b[7], a[7]); + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // matches unnorm: clamp to [0,1], scale to [0,255], round to nearest (default MXCSR). + unsafe { + let scale = _mm256_set1_ps(255.0); + let zero = _mm256_setzero_ps(); + let one = _mm256_set1_ps(1.0); + let to_u32 = |v| { + let clamped = _mm256_min_ps(_mm256_max_ps(v, zero), one); + _mm256_cvtps_epi32(_mm256_mul_ps(clamped, scale)) + }; + + let ri = to_u32(bytemuck::cast(*r)); + let gi = to_u32(bytemuck::cast(*g)); + let bi = to_u32(bytemuck::cast(*b)); + let ai = to_u32(bytemuck::cast(*a)); + + let rgba = _mm256_or_si256( + _mm256_or_si256(ri, _mm256_slli_epi32::<8>(gi)), + _mm256_or_si256(_mm256_slli_epi32::<16>(bi), _mm256_slli_epi32::<24>(ai)), + ); + _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, rgba); + } + } else { + let r: [i32; 8] = unnorm(r).into(); + let g: [i32; 8] = unnorm(g).into(); + let b: [i32; 8] = unnorm(b).into(); + let a: [i32; 8] = unnorm(a).into(); + + let conv = |rr, gg, bb, aa| + PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8); + + data[0] = conv(r[0], g[0], b[0], a[0]); + data[1] = conv(r[1], g[1], b[1], a[1]); + data[2] = conv(r[2], g[2], b[2], a[2]); + data[3] = conv(r[3], g[3], b[3], a[3]); + data[4] = conv(r[4], g[4], b[4], a[4]); + data[5] = conv(r[5], g[5], b[5], a[5]); + data[6] = conv(r[6], g[6], b[6], a[6]); + data[7] = conv(r[7], g[7], b[7], a[7]); + } + } } #[inline(always)] diff --git a/src/pipeline/lowp.rs b/src/pipeline/lowp.rs index fc1a6c5..52dcd54 100644 --- a/src/pipeline/lowp.rs +++ b/src/pipeline/lowp.rs @@ -742,33 +742,67 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { - *r = u16x16([ - data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, - data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, - data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, - data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, - ]); - - *g = u16x16([ - data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, - data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, - data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, - data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, - ]); - - *b = u16x16([ - data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, - data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, - data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, - data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, - ]); - - *a = u16x16([ - data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, - data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, - data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, - data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, - ]); + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // extract each channel by shift+mask from u32 lanes, then saturate-pack u32x8 + u32x8 -> u16x16. + // packus_epi32 lane-swaps; permute4x64 with 0xD8 puts the halves back in order. + unsafe { + let p_lo = _mm256_loadu_si256(data.as_ptr() as *const __m256i); + let p_hi = _mm256_loadu_si256(data.as_ptr().add(8) as *const __m256i); + let mask = _mm256_set1_epi32(0xFF); + let pack = |lo, hi| _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(lo, hi)); + + let rr = pack(_mm256_and_si256(p_lo, mask), _mm256_and_si256(p_hi, mask)); + let gg = pack( + _mm256_and_si256(_mm256_srli_epi32::<8>(p_lo), mask), + _mm256_and_si256(_mm256_srli_epi32::<8>(p_hi), mask), + ); + let bb = pack( + _mm256_and_si256(_mm256_srli_epi32::<16>(p_lo), mask), + _mm256_and_si256(_mm256_srli_epi32::<16>(p_hi), mask), + ); + let aa = pack(_mm256_srli_epi32::<24>(p_lo), _mm256_srli_epi32::<24>(p_hi)); + + _mm256_storeu_si256(r.0.as_mut_ptr() as *mut __m256i, rr); + _mm256_storeu_si256(g.0.as_mut_ptr() as *mut __m256i, gg); + _mm256_storeu_si256(b.0.as_mut_ptr() as *mut __m256i, bb); + _mm256_storeu_si256(a.0.as_mut_ptr() as *mut __m256i, aa); + } + } else { + *r = u16x16([ + data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, + data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, + data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, + data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, + ]); + + *g = u16x16([ + data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, + data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, + data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, + data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, + ]); + + *b = u16x16([ + data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, + data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, + data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, + data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, + ]); + + *a = u16x16([ + data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, + data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, + data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, + data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, + ]); + } + } } #[inline(always)] @@ -788,27 +822,57 @@ fn store_8888( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); - - data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); - data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); - data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8); - data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8); - data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8); - data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8); - data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8); - data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8); - data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8); - data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8); - data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8); - data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8); - data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8); - data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8); - data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8); - data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8); + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // pack rgba into u32 pixels via (g<<8)|r and (a<<8)|b, then interleave; + // unpack_lo/hi cross 128-bit lanes, so a final permute2x128 reassembles in order. + unsafe { + let rv = _mm256_loadu_si256(r.0.as_ptr() as *const __m256i); + let gv = _mm256_loadu_si256(g.0.as_ptr() as *const __m256i); + let bv = _mm256_loadu_si256(b.0.as_ptr() as *const __m256i); + let av = _mm256_loadu_si256(a.0.as_ptr() as *const __m256i); + + let rg = _mm256_or_si256(rv, _mm256_slli_epi16::<8>(gv)); + let ba = _mm256_or_si256(bv, _mm256_slli_epi16::<8>(av)); + + let p_lo = _mm256_unpacklo_epi16(rg, ba); + let p_hi = _mm256_unpackhi_epi16(rg, ba); + + let out_lo = _mm256_permute2x128_si256::<0x20>(p_lo, p_hi); + let out_hi = _mm256_permute2x128_si256::<0x31>(p_lo, p_hi); + + _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, out_lo); + _mm256_storeu_si256(data.as_mut_ptr().add(8) as *mut __m256i, out_hi); + } + } else { + let r = r.as_slice(); + let g = g.as_slice(); + let b = b.as_slice(); + let a = a.as_slice(); + + data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); + data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); + data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8); + data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8); + data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8); + data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8); + data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8); + data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8); + data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8); + data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8); + data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8); + data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8); + data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8); + data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8); + data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8); + data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8); + } + } } #[inline(always)] @@ -837,12 +901,27 @@ fn store_8888_tail( #[inline(always)] fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { - *a = u16x16([ - data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, - data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, - data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, - data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, - ]); + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + unsafe { + let bytes = _mm_loadu_si128(data.as_ptr() as *const __m128i); + let widened = _mm256_cvtepu8_epi16(bytes); + _mm256_storeu_si256(a.0.as_mut_ptr() as *mut __m256i, widened); + } + } else { + *a = u16x16([ + data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, + data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, + data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, + data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, + ]); + } + } } #[inline(always)] diff --git a/src/wide/f32x16_t.rs b/src/wide/f32x16_t.rs index 3cd76a1..06c0119 100644 --- a/src/wide/f32x16_t.rs +++ b/src/wide/f32x16_t.rs @@ -85,31 +85,47 @@ impl f32x16 { // This method is too heavy and shouldn't be inlined. pub fn save_to_u16x16(&self, dst: &mut u16x16) { - // Do not use to_i32x8, because it involves rounding, - // and Skia cast's without it. - - let n0: [f32; 8] = self.0.into(); - let n1: [f32; 8] = self.1.into(); - - dst.0[0] = n0[0] as u16; - dst.0[1] = n0[1] as u16; - dst.0[2] = n0[2] as u16; - dst.0[3] = n0[3] as u16; - - dst.0[4] = n0[4] as u16; - dst.0[5] = n0[5] as u16; - dst.0[6] = n0[6] as u16; - dst.0[7] = n0[7] as u16; - - dst.0[8] = n1[0] as u16; - dst.0[9] = n1[1] as u16; - dst.0[10] = n1[2] as u16; - dst.0[11] = n1[3] as u16; - - dst.0[12] = n1[4] as u16; - dst.0[13] = n1[5] as u16; - dst.0[14] = n1[6] as u16; - dst.0[15] = n1[7] as u16; + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // truncate f32 -> i32 (skia casts without rounding), then saturate-pack to u16x16. + // packus_epi32 lane-swaps; permute4x64 with 0xD8 puts the halves back in order. + unsafe { + let i0 = _mm256_cvttps_epi32(bytemuck::cast(self.0)); + let i1 = _mm256_cvttps_epi32(bytemuck::cast(self.1)); + let packed = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(i0, i1)); + _mm256_storeu_si256(dst.0.as_mut_ptr() as *mut __m256i, packed); + } + } else { + // do not use to_i32x8, because it involves rounding, and skia casts without it. + let n0: [f32; 8] = self.0.into(); + let n1: [f32; 8] = self.1.into(); + + dst.0[0] = n0[0] as u16; + dst.0[1] = n0[1] as u16; + dst.0[2] = n0[2] as u16; + dst.0[3] = n0[3] as u16; + + dst.0[4] = n0[4] as u16; + dst.0[5] = n0[5] as u16; + dst.0[6] = n0[6] as u16; + dst.0[7] = n0[7] as u16; + + dst.0[8] = n1[0] as u16; + dst.0[9] = n1[1] as u16; + dst.0[10] = n1[2] as u16; + dst.0[11] = n1[3] as u16; + + dst.0[12] = n1[4] as u16; + dst.0[13] = n1[5] as u16; + dst.0[14] = n1[6] as u16; + dst.0[15] = n1[7] as u16; + } + } } } From a093b7f99d5557d15fe70514c10bf123cb42a81e Mon Sep 17 00:00:00 2001 From: ark Date: Tue, 26 May 2026 21:12:29 +0200 Subject: [PATCH 2/2] refactor: move AVX2 8888 load/store into wide module --- src/pipeline/highp.rs | 104 ++----------------------------- src/pipeline/lowp.rs | 140 +++--------------------------------------- src/wide/f32x8_t.rs | 79 ++++++++++++++++++++++++ src/wide/u16x16_t.rs | 119 +++++++++++++++++++++++++++++++++++ 4 files changed, 211 insertions(+), 231 deletions(-) diff --git a/src/pipeline/highp.rs b/src/pipeline/highp.rs index 6065460..329efbf 100644 --- a/src/pipeline/highp.rs +++ b/src/pipeline/highp.rs @@ -1246,57 +1246,11 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8, ) { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - unsafe { - let p = _mm256_loadu_si256(data.as_ptr() as *const __m256i); - let mask = _mm256_set1_epi32(0xFF); - let factor = _mm256_set1_ps(1.0 / 255.0); - let to_f = |v| _mm256_mul_ps(_mm256_cvtepi32_ps(v), factor); - - *r = bytemuck::cast(to_f(_mm256_and_si256(p, mask))); - *g = bytemuck::cast(to_f(_mm256_and_si256(_mm256_srli_epi32::<8>(p), mask))); - *b = bytemuck::cast(to_f(_mm256_and_si256(_mm256_srli_epi32::<16>(p), mask))); - *a = bytemuck::cast(to_f(_mm256_srli_epi32::<24>(p))); - } - } else { - // surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`. - const FACTOR: f32 = 1.0 / 255.0; - - *r = f32x8::from([ - data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR, - data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR, - data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR, - data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR, - ]); - - *g = f32x8::from([ - data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR, - data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR, - data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR, - data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR, - ]); - - *b = f32x8::from([ - data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR, - data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR, - data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR, - data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR, - ]); - - *a = f32x8::from([ - data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR, - data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR, - data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR, - data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR, - ]); - } - } + let [rr, gg, bb, aa] = f32x8::load_8888_unorm(bytemuck::cast_ref(data)); + *r = rr; + *g = gg; + *b = bb; + *a = aa; } #[inline(always)] @@ -1316,53 +1270,7 @@ fn store_8888( r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // matches unnorm: clamp to [0,1], scale to [0,255], round to nearest (default MXCSR). - unsafe { - let scale = _mm256_set1_ps(255.0); - let zero = _mm256_setzero_ps(); - let one = _mm256_set1_ps(1.0); - let to_u32 = |v| { - let clamped = _mm256_min_ps(_mm256_max_ps(v, zero), one); - _mm256_cvtps_epi32(_mm256_mul_ps(clamped, scale)) - }; - - let ri = to_u32(bytemuck::cast(*r)); - let gi = to_u32(bytemuck::cast(*g)); - let bi = to_u32(bytemuck::cast(*b)); - let ai = to_u32(bytemuck::cast(*a)); - - let rgba = _mm256_or_si256( - _mm256_or_si256(ri, _mm256_slli_epi32::<8>(gi)), - _mm256_or_si256(_mm256_slli_epi32::<16>(bi), _mm256_slli_epi32::<24>(ai)), - ); - _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, rgba); - } - } else { - let r: [i32; 8] = unnorm(r).into(); - let g: [i32; 8] = unnorm(g).into(); - let b: [i32; 8] = unnorm(b).into(); - let a: [i32; 8] = unnorm(a).into(); - - let conv = |rr, gg, bb, aa| - PremultipliedColorU8::from_rgba_unchecked(rr as u8, gg as u8, bb as u8, aa as u8); - - data[0] = conv(r[0], g[0], b[0], a[0]); - data[1] = conv(r[1], g[1], b[1], a[1]); - data[2] = conv(r[2], g[2], b[2], a[2]); - data[3] = conv(r[3], g[3], b[3], a[3]); - data[4] = conv(r[4], g[4], b[4], a[4]); - data[5] = conv(r[5], g[5], b[5], a[5]); - data[6] = conv(r[6], g[6], b[6], a[6]); - data[7] = conv(r[7], g[7], b[7], a[7]); - } - } + f32x8::store_8888_unorm(&[*r, *g, *b, *a], bytemuck::cast_mut(data)); } #[inline(always)] diff --git a/src/pipeline/lowp.rs b/src/pipeline/lowp.rs index 52dcd54..227e5d6 100644 --- a/src/pipeline/lowp.rs +++ b/src/pipeline/lowp.rs @@ -742,67 +742,11 @@ fn load_8888( data: &[PremultipliedColorU8; STAGE_WIDTH], r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, ) { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // extract each channel by shift+mask from u32 lanes, then saturate-pack u32x8 + u32x8 -> u16x16. - // packus_epi32 lane-swaps; permute4x64 with 0xD8 puts the halves back in order. - unsafe { - let p_lo = _mm256_loadu_si256(data.as_ptr() as *const __m256i); - let p_hi = _mm256_loadu_si256(data.as_ptr().add(8) as *const __m256i); - let mask = _mm256_set1_epi32(0xFF); - let pack = |lo, hi| _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(lo, hi)); - - let rr = pack(_mm256_and_si256(p_lo, mask), _mm256_and_si256(p_hi, mask)); - let gg = pack( - _mm256_and_si256(_mm256_srli_epi32::<8>(p_lo), mask), - _mm256_and_si256(_mm256_srli_epi32::<8>(p_hi), mask), - ); - let bb = pack( - _mm256_and_si256(_mm256_srli_epi32::<16>(p_lo), mask), - _mm256_and_si256(_mm256_srli_epi32::<16>(p_hi), mask), - ); - let aa = pack(_mm256_srli_epi32::<24>(p_lo), _mm256_srli_epi32::<24>(p_hi)); - - _mm256_storeu_si256(r.0.as_mut_ptr() as *mut __m256i, rr); - _mm256_storeu_si256(g.0.as_mut_ptr() as *mut __m256i, gg); - _mm256_storeu_si256(b.0.as_mut_ptr() as *mut __m256i, bb); - _mm256_storeu_si256(a.0.as_mut_ptr() as *mut __m256i, aa); - } - } else { - *r = u16x16([ - data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, - data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, - data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, - data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, - ]); - - *g = u16x16([ - data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, - data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, - data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, - data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, - ]); - - *b = u16x16([ - data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, - data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, - data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, - data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, - ]); - - *a = u16x16([ - data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, - data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, - data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, - data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, - ]); - } - } + let [rr, gg, bb, aa] = u16x16::load_8888(bytemuck::cast_ref(data)); + *r = rr; + *g = gg; + *b = bb; + *a = aa; } #[inline(always)] @@ -822,57 +766,7 @@ fn store_8888( r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, data: &mut [PremultipliedColorU8; STAGE_WIDTH], ) { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - // pack rgba into u32 pixels via (g<<8)|r and (a<<8)|b, then interleave; - // unpack_lo/hi cross 128-bit lanes, so a final permute2x128 reassembles in order. - unsafe { - let rv = _mm256_loadu_si256(r.0.as_ptr() as *const __m256i); - let gv = _mm256_loadu_si256(g.0.as_ptr() as *const __m256i); - let bv = _mm256_loadu_si256(b.0.as_ptr() as *const __m256i); - let av = _mm256_loadu_si256(a.0.as_ptr() as *const __m256i); - - let rg = _mm256_or_si256(rv, _mm256_slli_epi16::<8>(gv)); - let ba = _mm256_or_si256(bv, _mm256_slli_epi16::<8>(av)); - - let p_lo = _mm256_unpacklo_epi16(rg, ba); - let p_hi = _mm256_unpackhi_epi16(rg, ba); - - let out_lo = _mm256_permute2x128_si256::<0x20>(p_lo, p_hi); - let out_hi = _mm256_permute2x128_si256::<0x31>(p_lo, p_hi); - - _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, out_lo); - _mm256_storeu_si256(data.as_mut_ptr().add(8) as *mut __m256i, out_hi); - } - } else { - let r = r.as_slice(); - let g = g.as_slice(); - let b = b.as_slice(); - let a = a.as_slice(); - - data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); - data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); - data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8); - data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8); - data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8); - data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8); - data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8); - data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8); - data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8); - data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8); - data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8); - data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8); - data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8); - data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8); - data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8); - data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8); - } - } + u16x16::store_8888(&[*r, *g, *b, *a], bytemuck::cast_mut(data)); } #[inline(always)] @@ -901,27 +795,7 @@ fn store_8888_tail( #[inline(always)] fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { - cfg_if::cfg_if! { - if #[cfg(all(feature = "simd", target_feature = "avx2"))] { - #[cfg(target_arch = "x86")] - use core::arch::x86::*; - #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::*; - - unsafe { - let bytes = _mm_loadu_si128(data.as_ptr() as *const __m128i); - let widened = _mm256_cvtepu8_epi16(bytes); - _mm256_storeu_si256(a.0.as_mut_ptr() as *mut __m256i, widened); - } - } else { - *a = u16x16([ - data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, - data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, - data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, - data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, - ]); - } - } + *a = u16x16::load_u8(data); } #[inline(always)] diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs index 9448538..6b4b7ac 100644 --- a/src/wide/f32x8_t.rs +++ b/src/wide/f32x8_t.rs @@ -280,6 +280,85 @@ impl f32x8 { let skip = self.cmp_eq(f32x8::splat(0.0)) | self.cmp_eq(f32x8::splat(1.0)); skip.blend(self, x) } + + /// Loads 8 8888 RGBA pixels, unpacks each channel into a + /// normalized f32x8 in [0, 1] + #[inline(always)] + pub fn load_8888_unorm(data: &[u8; 32]) -> [Self; 4] { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + unsafe { + let p = _mm256_loadu_si256(data.as_ptr() as *const __m256i); + let mask = _mm256_set1_epi32(0xFF); + let factor = _mm256_set1_ps(1.0 / 255.0); + let to_f = |v| _mm256_mul_ps(_mm256_cvtepi32_ps(v), factor); + + [ + Self(to_f(_mm256_and_si256(p, mask))), + Self(to_f(_mm256_and_si256(_mm256_srli_epi32::<8>(p), mask))), + Self(to_f(_mm256_and_si256(_mm256_srli_epi32::<16>(p), mask))), + Self(to_f(_mm256_srli_epi32::<24>(p))), + ] + } + } else { + // surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`. + const FACTOR: f32 = 1.0 / 255.0; + let b = |i: usize, ch: usize| data[i * 4 + ch] as f32 * FACTOR; + [ + Self::from([b(0, 0), b(1, 0), b(2, 0), b(3, 0), b(4, 0), b(5, 0), b(6, 0), b(7, 0)]), + Self::from([b(0, 1), b(1, 1), b(2, 1), b(3, 1), b(4, 1), b(5, 1), b(6, 1), b(7, 1)]), + Self::from([b(0, 2), b(1, 2), b(2, 2), b(3, 2), b(4, 2), b(5, 2), b(6, 2), b(7, 2)]), + Self::from([b(0, 3), b(1, 3), b(2, 3), b(3, 3), b(4, 3), b(5, 3), b(6, 3), b(7, 3)]), + ] + } + } + } + + /// Packs 4 f32x8 channels in [0, 1] back into 8 8888 RGBA pixels (32 bytes). + /// Matches the scalar `unnorm` semantics: clamp -> *255 -> round-to-nearest. + #[inline(always)] + pub fn store_8888_unorm(rgba: &[Self; 4], data: &mut [u8; 32]) { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + unsafe { + let scale = _mm256_set1_ps(255.0); + let zero = _mm256_setzero_ps(); + let one = _mm256_set1_ps(1.0); + let to_u32 = |v| { + let clamped = _mm256_min_ps(_mm256_max_ps(v, zero), one); + _mm256_cvtps_epi32(_mm256_mul_ps(clamped, scale)) + }; + + let ri = to_u32(rgba[0].0); + let gi = to_u32(rgba[1].0); + let bi = to_u32(rgba[2].0); + let ai = to_u32(rgba[3].0); + + let packed = _mm256_or_si256( + _mm256_or_si256(ri, _mm256_slli_epi32::<8>(gi)), + _mm256_or_si256(_mm256_slli_epi32::<16>(bi), _mm256_slli_epi32::<24>(ai)), + ); + _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, packed); + } + } else { + let unnorm = |v: Self| -> [i32; 8] { + (v.max(Self::default()).min(Self::splat(1.0)) * Self::splat(255.0)) + .round_int() + .into() + }; + let r = unnorm(rgba[0]); + let g = unnorm(rgba[1]); + let b = unnorm(rgba[2]); + let a = unnorm(rgba[3]); + for i in 0..8 { + data[i * 4 + 0] = r[i] as u8; + data[i * 4 + 1] = g[i] as u8; + data[i * 4 + 2] = b[i] as u8; + data[i * 4 + 3] = a[i] as u8; + } + } + } + } } impl From<[f32; 8]> for f32x8 { diff --git a/src/wide/u16x16_t.rs b/src/wide/u16x16_t.rs index 5e1a464..7dca457 100644 --- a/src/wide/u16x16_t.rs +++ b/src/wide/u16x16_t.rs @@ -128,6 +128,125 @@ impl u16x16 { let pair: [uint16x8_t; 2] = cast(self.0); (pair[0], pair[1]) } + + /// Loads 16 8888 RGBA pixels (64 bytes) and unpacks each channel into a u16x16 + #[inline(always)] + pub fn load_8888(data: &[u8; 64]) -> [Self; 4] { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // extract each channel by shift+mask from u32 lanes, then saturate-pack u32x8 + u32x8 -> u16x16. + // packus_epi32 lane-swaps; permute4x64 with 0xD8 puts the halves back in order + unsafe { + let p_lo = _mm256_loadu_si256(data.as_ptr() as *const __m256i); + let p_hi = _mm256_loadu_si256(data.as_ptr().add(32) as *const __m256i); + let mask = _mm256_set1_epi32(0xFF); + let pack = |lo, hi| _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32(lo, hi)); + + let mut out = [Self::default(); 4]; + let rr = pack(_mm256_and_si256(p_lo, mask), _mm256_and_si256(p_hi, mask)); + let gg = pack( + _mm256_and_si256(_mm256_srli_epi32::<8>(p_lo), mask), + _mm256_and_si256(_mm256_srli_epi32::<8>(p_hi), mask), + ); + let bb = pack( + _mm256_and_si256(_mm256_srli_epi32::<16>(p_lo), mask), + _mm256_and_si256(_mm256_srli_epi32::<16>(p_hi), mask), + ); + let aa = pack(_mm256_srli_epi32::<24>(p_lo), _mm256_srli_epi32::<24>(p_hi)); + + _mm256_storeu_si256(out[0].0.as_mut_ptr() as *mut __m256i, rr); + _mm256_storeu_si256(out[1].0.as_mut_ptr() as *mut __m256i, gg); + _mm256_storeu_si256(out[2].0.as_mut_ptr() as *mut __m256i, bb); + _mm256_storeu_si256(out[3].0.as_mut_ptr() as *mut __m256i, aa); + out + } + } else { + let mut out = [Self::default(); 4]; + for i in 0..16 { + out[0].0[i] = data[i * 4 + 0] as u16; + out[1].0[i] = data[i * 4 + 1] as u16; + out[2].0[i] = data[i * 4 + 2] as u16; + out[3].0[i] = data[i * 4 + 3] as u16; + } + out + } + } + } + + /// Packs 4 u16x16 channels back into 16 8888 RGBA pixels (64 bytes), + /// (channel values must fit in u8) + #[inline(always)] + pub fn store_8888(rgba: &[Self; 4], data: &mut [u8; 64]) { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // pack rgba into u32 pixels via (g<<8)|r and (a<<8)|b, then interleave; + // unpack_lo/hi cross 128-bit lanes, so a final permute2x128 reassembles in order. + unsafe { + let rv = _mm256_loadu_si256(rgba[0].0.as_ptr() as *const __m256i); + let gv = _mm256_loadu_si256(rgba[1].0.as_ptr() as *const __m256i); + let bv = _mm256_loadu_si256(rgba[2].0.as_ptr() as *const __m256i); + let av = _mm256_loadu_si256(rgba[3].0.as_ptr() as *const __m256i); + + let rg = _mm256_or_si256(rv, _mm256_slli_epi16::<8>(gv)); + let ba = _mm256_or_si256(bv, _mm256_slli_epi16::<8>(av)); + + let p_lo = _mm256_unpacklo_epi16(rg, ba); + let p_hi = _mm256_unpackhi_epi16(rg, ba); + + let out_lo = _mm256_permute2x128_si256::<0x20>(p_lo, p_hi); + let out_hi = _mm256_permute2x128_si256::<0x31>(p_lo, p_hi); + + _mm256_storeu_si256(data.as_mut_ptr() as *mut __m256i, out_lo); + _mm256_storeu_si256(data.as_mut_ptr().add(32) as *mut __m256i, out_hi); + } + } else { + for i in 0..16 { + data[i * 4 + 0] = rgba[0].0[i] as u8; + data[i * 4 + 1] = rgba[1].0[i] as u8; + data[i * 4 + 2] = rgba[2].0[i] as u8; + data[i * 4 + 3] = rgba[3].0[i] as u8; + } + } + } + } + + /// Widens 16 u8 bytes into u16x16 + #[inline(always)] + pub fn load_u8(data: &[u8; 16]) -> Self { + cfg_if::cfg_if! { + if #[cfg(all(feature = "simd", target_feature = "avx2"))] { + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + unsafe { + let bytes = _mm_loadu_si128(data.as_ptr() as *const __m128i); + let widened = _mm256_cvtepu8_epi16(bytes); + let mut out = Self::default(); + _mm256_storeu_si256(out.0.as_mut_ptr() as *mut __m256i, widened); + out + } + } else { + Self([ + data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, + data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, + data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, + data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, + ]) + } + } + } } impl core::ops::Add for u16x16 {