Skip to content

Not using WASM bitmask / all_true instructions #351

@stepantubanov

Description

@stepantubanov

Hi,

Not sure whether this belongs here or in the compiler repo.
Example code (https://godbolt.org/z/4fqcdGqq6):

#![feature(portable_simd)]

use std::simd::{self, SimdPartialEq, ToBitMask};

/// Builds a bitmask marking which of the 16 bytes in `v` are zero.
///
/// Bit `i` of the result corresponds to `v[i]`; the generated code quoted in
/// this report places lane 0 in the least significant bit.
pub fn to_bitmask(v: &[u8; 16]) -> usize {
    // Copy the 16 bytes into a single 128-bit SIMD vector.
    let data = simd::u8x16::from_array(*v);
    let zero = simd::u8x16::splat(0);
    // Lane-wise equality: a mask lane is true where the input byte is zero.
    let mask = data.simd_eq(zero);
    // This is the call the report expects to lower to `i8x16.bitmask`.
    mask.to_bitmask() as usize
}

/// Returns `true` when every one of the 16 bytes in `v` is zero.
pub fn all_zeros(v: &[u8; 16]) -> bool {
    // Copy the 16 bytes into a single 128-bit SIMD vector.
    let data = simd::u8x16::from_array(*v);
    let zero = simd::u8x16::splat(0);
    // Lane-wise equality: a mask lane is true where the input byte is zero.
    let mask = data.simd_eq(zero);
    // This is the call the report expects to lower to `i8x16.all_true`.
    mask.all()
}

Ideally, these functions would compile to the i8x16.bitmask and i8x16.all_true instructions, respectively.

https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#all-lanes-true

Instead, the compiler generates slow code that extracts each lane individually and combines the bits with shifts and ORs.
Full wasm output:

to_bitmask wasm
example::to_bitmask:
        global.get      __stack_pointer
        i32.const       16
        i32.sub 
        drop
        local.get       0
        v128.load       0:p2align=0
        v128.const      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        i8x16.eq
        local.tee       1
        i8x16.extract_lane_u    0
        i32.const       1
        i32.and 
        local.get       1
        i8x16.extract_lane_u    1
        i32.const       1
        i32.and 
        i32.const       1
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    2
        i32.const       1
        i32.and 
        i32.const       2
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    3
        i32.const       1
        i32.and 
        i32.const       3
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    4
        i32.const       1
        i32.and 
        i32.const       4
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    5
        i32.const       1
        i32.and 
        i32.const       5
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    6
        i32.const       1
        i32.and 
        i32.const       6
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    7
        i32.const       1
        i32.and 
        i32.const       7
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    8
        i32.const       1
        i32.and 
        i32.const       8
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    9
        i32.const       1
        i32.and 
        i32.const       9
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    10
        i32.const       1
        i32.and 
        i32.const       10
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    11
        i32.const       1
        i32.and 
        i32.const       11
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    12
        i32.const       1
        i32.and 
        i32.const       12
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    13
        i32.const       1
        i32.and 
        i32.const       13
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    14
        i32.const       1
        i32.and 
        i32.const       14
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    15
        i32.const       15
        i32.shl 
        i32.or  
        i32.const       65535
        i32.and 
        end_function
all_zeros wasm
example::all_zeros:
        global.get      __stack_pointer
        i32.const       16
        i32.sub 
        drop
        local.get       0
        v128.load       0:p2align=0
        v128.const      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        i8x16.ne
        local.tee       1
        i8x16.extract_lane_u    0
        i32.const       1
        i32.and 
        local.get       1
        i8x16.extract_lane_u    1
        i32.const       1
        i32.and 
        i32.const       1
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    2
        i32.const       1
        i32.and 
        i32.const       2
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    3
        i32.const       1
        i32.and 
        i32.const       3
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    4
        i32.const       1
        i32.and 
        i32.const       4
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    5
        i32.const       1
        i32.and 
        i32.const       5
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    6
        i32.const       1
        i32.and 
        i32.const       6
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    7
        i32.const       1
        i32.and 
        i32.const       7
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    8
        i32.const       1
        i32.and 
        i32.const       8
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    9
        i32.const       1
        i32.and 
        i32.const       9
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    10
        i32.const       1
        i32.and 
        i32.const       10
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    11
        i32.const       1
        i32.and 
        i32.const       11
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    12
        i32.const       1
        i32.and 
        i32.const       12
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    13
        i32.const       1
        i32.and 
        i32.const       13
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    14
        i32.const       1
        i32.and 
        i32.const       14
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    15
        i32.const       15
        i32.shl 
        i32.or  
        i32.const       65535
        i32.and 
        i32.eqz
        end_function

Meta

rustc --version --verbose:

rustc 1.71.0-nightly (521f4dae1 2023-05-19)

Metadata

Metadata

Assignees

No one assigned

    Labels

    C-bug (Category: Bug)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions