Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions crates/onebrc-probe/src/lane_s.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
//! Lane S — the SWAR lane: the actual 1BRC-frontier compute levers on top of
//! the flat open-addressed table (lane F's group-by, which was already right).
//!
//! What the real 1BRC winners do that lanes A/C/F/R do NOT (see the answer in
//! the session log): find delimiters with SWAR (8 bytes at a time in a `u64`)
//! instead of a byte-by-byte `while data[i] != b';'` loop, and parse the
//! temperature branchlessly. Those are the two measurable levers here.
//!
//! - **(b) SWAR delimiter find** — the classic `haszero` bit trick:
//! `x = word ^ needle_bcast; (x - 0x0101…) & !x & 0x8080…` sets the high bit
//! of each byte where `word == needle`. `trailing_zeros() >> 3` gives the
//! first match position. Replaces the scalar name/temp scans.
//! - **(b) branchless temp parse** — the field is `-?\d?\d.\d`; parse to fixed
//! -point tenths with no data-dependent branches beyond the sign/width.
//! - **(c) name compare** — kept as `&[u8] == &[u8]`, which LLVM already lowers
//! to a vectorized `memcmp`; a hand-rolled 8-byte-chunk compare is the same
//! instruction sequence, so this is (c) already.
//!
//! **(a) mmap is deliberately NOT here and is NOT measurable in this harness:**
//! `main.rs` does `fs::read` BEFORE `Instant::now()`, so the `mrows_s` metric is
//! compute-only. mmap is a wall-clock / 13 GB-allocation lever for the full 1B
//! file, on an axis this timer does not observe. Claiming an mmap speedup on
//! `mrows_s` would be false; measuring it needs an end-to-end wall-clock mode.
//!
//! Reuses lane F's `SoaTable` (flat open-addressed accumulator) verbatim — the
//! ONLY variable vs lane F is the scan+parse. Same `chunk_bounds`/`merge_maps`
//! threading. std-only; keeps the crate's zero-dep contract.

use crate::lane_f::{fnv1a64, morton_slot, table_to_map, SoaTable};
use crate::{chunk_bounds, merge_maps, Stats};
use std::collections::BTreeMap;

const SEMI_BCAST: u64 = 0x3B3B_3B3B_3B3B_3B3B; // b';' in every byte
const NL_BCAST: u64 = 0x0A0A_0A0A_0A0A_0A0A; // b'\n' in every byte
const ONES: u64 = 0x0101_0101_0101_0101;
const HIGH: u64 = 0x8080_8080_8080_8080;

/// SWAR "has zero byte" applied to `word ^ bcast`: a set 0x80 in each byte
/// where `word`'s byte equals `bcast`'s byte.
#[inline(always)]
fn match_mask(word: u64, bcast: u64) -> u64 {
let x = word ^ bcast;
x.wrapping_sub(ONES) & !x & HIGH
}

/// First index `>= i` where `data[idx]` equals `bcast`'s byte — SWAR 8 bytes at
/// a time, scalar tail. Reads only within `data` (the `i + 8 <= len` guard).
#[inline(always)]
fn find(data: &[u8], mut i: usize, bcast: u64) -> usize {
let len = data.len();
while i + 8 <= len {
let word = u64::from_le_bytes(data[i..i + 8].try_into().unwrap());
let m = match_mask(word, bcast);
if m != 0 {
return i + ((m.trailing_zeros() >> 3) as usize);
}
i += 8;
}
let b = bcast as u8;
while i < len && data[i] != b {
i += 1;
}
i
}

/// Parse `-?\d?\d.\d` to fixed-point tenths. Branch only on sign and 1-vs-2
/// integer digits (both perfectly predicted at ~50/50 and ~90/10).
#[inline(always)]
fn parse_tenths(s: &[u8]) -> i32 {
let neg = s[0] == b'-';
let d = if neg { &s[1..] } else { s };
let v = if d.len() == 3 {
// d.d
(d[0] - b'0') as i32 * 10 + (d[2] - b'0') as i32
} else {
// dd.d
(d[0] - b'0') as i32 * 100 + (d[1] - b'0') as i32 * 10 + (d[3] - b'0') as i32
};
if neg {
-v
} else {
v
}
}

/// Scan `data` with the SWAR find + branchless parse, folding into lane F's
/// flat `SoaTable`. The ONLY difference from `lane_f::accumulate_table` is the
/// scan/parse — same hash, same slot fn, same table.
fn accumulate_swar(data: &[u8]) -> SoaTable {
let mut table = SoaTable::new();
let len = data.len();
let mut i = 0usize;
while i < len {
let name_start = i;
let semi = find(data, i, SEMI_BCAST);
let name = &data[name_start..semi];
let nl = find(data, semi + 1, NL_BCAST);
let tenths = parse_tenths(&data[semi + 1..nl]);
let h = fnv1a64(name);
table.observe(morton_slot(h), h, name, tenths);
i = nl + 1;
}
table
}

/// Lane S — SWAR delimiter scan + branchless parse over lane F's flat table.
pub fn lane_s_swar(data: &[u8], workers: usize) -> BTreeMap<String, Stats> {
let workers = workers.max(1);
let bounds = chunk_bounds(data, workers);
let results: Vec<BTreeMap<String, Stats>> = std::thread::scope(|scope| {
let handles: Vec<_> = bounds
.iter()
.map(|&(start, end)| {
let slice = &data[start..end];
scope.spawn(move || table_to_map(accumulate_swar(slice)))
})
.collect();
handles
.into_iter()
.map(|h| h.join().expect("lane S worker panicked"))
.collect()
});
merge_maps(results)
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn swar_find_matches_scalar() {
let data = b"hello;12.3\nab;-4.5\n";
// ';' after "hello" (idx 5), '\n' after "12.3" (idx 10), etc.
assert_eq!(find(data, 0, SEMI_BCAST), 5);
assert_eq!(find(data, 6, NL_BCAST), 10);
assert_eq!(find(data, 11, SEMI_BCAST), 13);
assert_eq!(find(data, 14, NL_BCAST), 18);
}

#[test]
fn parse_tenths_all_shapes() {
assert_eq!(parse_tenths(b"1.0"), 10);
assert_eq!(parse_tenths(b"12.3"), 123);
assert_eq!(parse_tenths(b"-4.5"), -45);
assert_eq!(parse_tenths(b"-99.9"), -999);
assert_eq!(parse_tenths(b"0.0"), 0);
}

#[test]
fn lane_s_agrees_with_lane_a() {
let dir = std::env::temp_dir();
let path = dir.join(format!("onebrc_probe_test_s_{}.txt", std::process::id()));
let result = crate::gen::gen(&path, 50_000, 97).expect("gen");
assert_eq!(result.rows, 50_000);
let data = std::fs::read(&path).expect("read generated corpus");
std::fs::remove_file(&path).ok();

let a = crate::lane_a_scalar(&data);
let s = lane_s_swar(&data, 3);
assert_eq!(a, s, "SWAR lane must produce identical aggregates to lane A");
assert!(!a.is_empty());
}
}
2 changes: 2 additions & 0 deletions crates/onebrc-probe/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ pub mod lane_j;
#[cfg(feature = "presets")]
pub mod presets;
pub mod sha256;
pub mod lane_s;

#[cfg(feature = "lane-b")]
pub use lane_b::lane_b_simd;
Expand All @@ -75,6 +76,7 @@ pub use lane_d::lane_d_ractor;
#[cfg(feature = "lane-e")]
pub use lane_e::lane_e_kanban;
pub use lane_f::{lane_f_morton, lane_r_radix};
pub use lane_s::lane_s_swar;
#[cfg(feature = "lane-g")]
pub use lane_g::{lane_g_kanban_soa, lane_g_kanban_soa_with_morsel};
#[cfg(feature = "lane-h")]
Expand Down
9 changes: 5 additions & 4 deletions crates/onebrc-probe/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//!
//! ```text
//! onebrc-probe gen <path> <rows> <seed>
//! onebrc-probe run <path> <lane:a|b|c|d|e|f|r> [workers] [batches]
//! onebrc-probe run <path> <lane:a|b|c|d|e|f|r|s> [workers] [batches]
//! ```
//!
//! Lane `b` requires `--features lane-b`; lane `d` and lane `e` require
Expand All @@ -23,7 +23,7 @@ fn main() {
Some("run") => cmd_run(&args[2..]),
_ => {
eprintln!(
"usage:\n onebrc-probe gen <path> <rows> <seed>\n onebrc-probe run <path> <lane:a|b|c|d|e|f|r> [workers] [batches]"
"usage:\n onebrc-probe gen <path> <rows> <seed>\n onebrc-probe run <path> <lane:a|b|c|d|e|f|r|s> [workers] [batches]"
);
std::process::exit(2);
}
Expand Down Expand Up @@ -55,7 +55,7 @@ fn cmd_gen(args: &[String]) {
fn cmd_run(args: &[String]) {
let path = PathBuf::from(
args.first()
.expect("usage: run <path> <lane:a|b|c|d|e|f|r> [workers] [batches]"),
.expect("usage: run <path> <lane:a|b|c|d|e|f|r|s> [workers] [batches]"),
);
let lane = args.get(1).map(String::as_str).unwrap_or("a");
let workers: usize = args
Expand Down Expand Up @@ -132,6 +132,7 @@ fn cmd_run(args: &[String]) {
// Morton-tile SoA lane and its plain-radix control (lane_f.rs).
"f" => onebrc_probe::lane_f_morton(&data, workers),
"r" => onebrc_probe::lane_r_radix(&data, workers),
"s" => onebrc_probe::lane_s_swar(&data, workers),
Comment thread
coderabbitai[bot] marked this conversation as resolved.
"g" => {
#[cfg(feature = "lane-g")]
{
Expand Down Expand Up @@ -200,7 +201,7 @@ fn cmd_run(args: &[String]) {
}
}
other => {
eprintln!("unknown lane '{other}' (expected 'a', 'b', 'c', 'd', 'e', 'f', or 'r')");
eprintln!("unknown lane '{other}' (expected 'a', 'b', 'c', 'd', 'e', 'f', 'r', or 's')");
std::process::exit(2);
}
};
Expand Down
Loading