From e849f512f15ac303b4163357c638d777989a907b Mon Sep 17 00:00:00 2001
From: Serhii Savchuk <ser.vasilich@hotmail.com>
Date: Tue, 19 May 2026 11:32:48 +0300
Subject: [PATCH 1/3] test: round1-2 RFL coverage push (10 new files)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds happy-path RFL coverage for recently-introduced operators and
public-API surfaces that had 0%/low coverage on upstream master.

Round 1:
- rfl/agg/rowform_topk.rfl     — OP_GROUP_TOPK_ROWFORM / BOTK
- rfl/agg/rowform_maxmin.rfl   — OP_GROUP_MAXMIN_ROWFORM
- rfl/agg/rowform_sum_count.rfl — OP_GROUP_SUM_COUNT_ROWFORM (3..8 keys)
- rfl/sort/fused_topn.rfl      — top/bot over filtered vectors
- rfl/query/per_group_buf.rfl  — nonagg_eval_per_group(_buf), const_str_expr_copy
- rfl/query/parallel_probe.rfl — idxbuf_hist_fn / idxbuf_scat_fn
  parallel row->gid probe path

Round 2:
- rfl/temporal/extract.rfl     — yyyy/mm/dd/hh/ss/minute/dow/doy
- rfl/agg/variance.rfl         — var / var_pop / stddev / stddev_pop / dev
- rfl/io/csv_splayed.rfl       — csv_splayed_writer_*, GUID writer,
  col_copy_str_pool roundtrip
- rfl/hof/wrappers.rfl         — pmap / fold-left / scan-left dispatchers

All tests happy-path only (correct types / shapes); error/null/wrong-type
branches deferred to a future round.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/rfl/agg/rowform_maxmin.rfl    | 103 ++++++++++++++++
 test/rfl/agg/rowform_sum_count.rfl | 132 +++++++++++++++++++++
 test/rfl/agg/rowform_topk.rfl      | 125 ++++++++++++++++++++
 test/rfl/agg/variance.rfl          | 114 ++++++++++++++++++
 test/rfl/hof/wrappers.rfl          | 129 ++++++++++++++++++++
 test/rfl/io/csv_splayed.rfl        | 181 +++++++++++++++++++++++++++++
 test/rfl/query/parallel_probe.rfl  | 107 +++++++++++++++++
 test/rfl/query/per_group_buf.rfl   | 136 ++++++++++++++++++++++
 test/rfl/sort/fused_topn.rfl       |  94 +++++++++++++++
 test/rfl/temporal/extract.rfl      | 142 ++++++++++++++++++++++
 10 files changed, 1263 insertions(+)
 create mode 100644 test/rfl/agg/rowform_maxmin.rfl
 create mode 100644 test/rfl/agg/rowform_sum_count.rfl
 create mode 100644 test/rfl/agg/rowform_topk.rfl
 create mode 100644 test/rfl/agg/variance.rfl
 create mode 100644 test/rfl/hof/wrappers.rfl
 create mode 100644 test/rfl/io/csv_splayed.rfl
 create mode 100644 test/rfl/query/parallel_probe.rfl
 create mode 100644 test/rfl/query/per_group_buf.rfl
 create mode 100644 test/rfl/sort/fused_topn.rfl
 create mode 100644 test/rfl/temporal/extract.rfl
diff --git a/test/rfl/agg/rowform_maxmin.rfl b/test/rfl/agg/rowform_maxmin.rfl
new file mode 100644
index 00000000..56cacb21
--- /dev/null
+++ b/test/rfl/agg/rowform_maxmin.rfl
@@ -0,0 +1,103 @@
+;; ════════════════════════════════════════════════════════════════════
+;; ROWFORM per-group max(x) + min(y) (src/ops/group.c: exec_group_maxmin_rowform)
+;;
+;; Planner gate (src/ops/query.c:5985) routes
+;;   (select {a: (max x) b: (min y) by: <single key col> from: T})
+;; to OP_GROUP_MAXMIN_ROWFORM when:
+;;   - exactly 2 aggs, the first OP_MAX and second OP_MIN
+;;   - 1 key, no where, no non-agg
+;;   - key, x, y all simple OP_SCAN
+;;   - key type in {I64,I32,I16,U8,BOOL,DATE,TIME,TIMESTAMP,SYM}
+;;   - x, y types in {I64,I32,I16,U8,BOOL}  (integer only — F64 falls back)
+;;
+;; ROWFORM emits one row per group with columns [key, mx, mn]: the first
+;; alias holds max(x) and the second min(y).  Group order is partition-
+;; induced, so tests use sum / count rather than positional checks.
+;;
+;; Parallel threshold: nrows >= 16384 (src/ops/group.c:10482).
+;; ════════════════════════════════════════════════════════════════════
+
+;; ─── basic shape: I64 key, I64 x, I64 y ─────────────────────────────
+(set T (table [k x y] (list (as 'I64 [0 0 0 1 1 1]) (as 'I64 [3 1 5 2 7 4]) (as 'I64 [50 30 70 20 60 10]))))
+;; g=0: x={3,1,5} max=5; y={50,30,70} min=30
+;; g=1: x={2,7,4} max=7; y={20,60,10} min=10
+(count (select {mx: (max x) mn: (min y) by: k from: T})) -- 2
+(sum (at (select {mx: (max x) mn: (min y) by: k from: T}) 'mx)) -- 12
+(sum (at (select {mx: (max x) mn: (min y) by: k from: T}) 'mn)) -- 40
+;; Output column types match source: I64 throughout.
+(type (at (select {mx: (max x) mn: (min y) by: k from: T}) 'mx)) -- 'I64
+(type (at (select {mx: (max x) mn: (min y) by: k from: T}) 'mn)) -- 'I64
+(type (at (select {mx: (max x) mn: (min y) by: k from: T}) 'k)) -- 'I64
+
+;; ─── SYM key (gate allows it) ───────────────────────────────────────
+(set Ts (table [k x y] (list [A A A B B] (as 'I64 [1 5 3 2 4]) (as 'I64 [9 2 8 7 6]))))
+;; g=A: max x=5, min y=2; g=B: max x=4, min y=6
+(count (select {mx: (max x) mn: (min y) by: k from: Ts})) -- 2
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ts}) 'mx)) -- 9
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ts}) 'mn)) -- 8
+(type (at (select {mx: (max x) mn: (min y) by: k from: Ts}) 'k)) -- 'SYM
+
+;; ─── narrow integer key + narrow integer values ─────────────────────
+(set Ti32 (table [k x y] (list (as 'I32 [0 0 1 1]) (as 'I32 [5 7 11 13]) (as 'I32 [20 10 50 40]))))
+;; g=0: max=7, min=10; g=1: max=13, min=40
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ti32}) 'mx)) -- 20
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ti32}) 'mn)) -- 50
+(type (at (select {mx: (max x) mn: (min y) by: k from: Ti32}) 'mx)) -- 'I32
+(type (at (select {mx: (max x) mn: (min y) by: k from: Ti32}) 'mn)) -- 'I32
+
+(set Ti16 (table [k x y] (list (as 'I16 [0 0 1 1]) (as 'I16 [10 20 30 40]) (as 'I16 [-1 -2 3 -4]))))
+;; g=0: max=20, min=-2; g=1: max=40, min=-4. Sums: 60, -6.
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ti16}) 'mx)) -- 60
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Ti16}) 'mn)) -- -6
+
+(set Tu8 (table [k x y] (list (as 'U8 [0 0 1 1]) (as 'U8 [10 30 5 7]) (as 'U8 [40 60 20 80]))))
+;; g=0: max=30, min=40; g=1: max=7, min=20. Sums: 37, 60.
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tu8}) 'mx)) -- 37
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tu8}) 'mn)) -- 60
+
+;; ─── BOOL x, BOOL y (degenerate but supported) ──────────────────────
+(set Tb (table [k x y] (list [A A B B] [false true true false] [true false true true])))
+;; g=A: max x = true; min y = false.  g=B: max x = true; min y = true.
+(count (select {mx: (max x) mn: (min y) by: k from: Tb})) -- 2
+;; sum of BOOL coerces to I64: true=1, false=0. mx: 1+1=2, mn: 0+1=1.
+(sum (as 'I64 (at (select {mx: (max x) mn: (min y) by: k from: Tb}) 'mx))) -- 2
+(sum (as 'I64 (at (select {mx: (max x) mn: (min y) by: k from: Tb}) 'mn))) -- 1
+
+;; ─── single group ───────────────────────────────────────────────────
+(set T1 (table [k x y] (list (as 'I64 [0 0 0 0 0]) (as 'I64 [3 1 5 2 7]) (as 'I64 [50 30 70 20 60]))))
+(count (select {mx: (max x) mn: (min y) by: k from: T1})) -- 1
+;; max x = 7, min y = 20
+(at (at (select {mx: (max x) mn: (min y) by: k from: T1}) 'mx) 0) -- 7
+(at (at (select {mx: (max x) mn: (min y) by: k from: T1}) 'mn) 0) -- 20
+
+;; ─── many small groups ──────────────────────────────────────────────
+(set Tm (table [k x y] (list (as 'I64 [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]) (as 'I64 [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29]) (as 'I64 [50 51 52 53 54 55 56 57 58 59 40 41 42 43 44 45 46 47 48 49]))))
+;; group j has x={j+10, j+20} -> max = j+20; y={j+50, j+40} -> min = j+40.
+;; sum of maxes: (0+20)+(1+20)+...+(9+20) = 45+200 = 245.
+;; sum of mins:  (0+40)+(1+40)+...+(9+40) = 45+400 = 445.
+(count (select {mx: (max x) mn: (min y) by: k from: Tm})) -- 10
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tm}) 'mx)) -- 245
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tm}) 'mn)) -- 445
+
+;; ─── parallel path: nrows >= 16384 ──────────────────────────────────
+;; 20000 rows. x = i, y = 2N-i.  10 groups (mod 10).
+(set N 20000)
+(set Tbig (table [k x y] (list (% (til N) 10) (til N) (- (* 2 N) (til N)))))
+;; Group j: x = {j, j+10, ..., j+19990}; max x = j+19990.
+;;         y = {2N-j, 2N-j-10, ..., 2N-j-19990}; min y = 2N-j-19990 = 20010-j.
+;; sum of max x: 10*19990 + 45 = 199945
+;; sum of min y: 10*20010 - 45 = 200055
+(count (select {mx: (max x) mn: (min y) by: k from: Tbig})) -- 10
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tbig}) 'mx)) -- 199945
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tbig}) 'mn)) -- 200055
+
+;; ─── parallel with SYM key (high-cardinality H2O id3 shape) ─────────
+(set Tsbig (table [k x y] (list (as 'SYMBOL (% (til N) 100)) (til N) (- (* 2 N) (til N)))))
+;; 100 groups. The group for key j (the symbol of j) contains the rows i where i % 100 = j.
+;; x values: {j, j+100, ..., j+19900}; max x = j+19900.
+;; y = 2N - i where N=20000: y values: {40000-j, ..., 40000-j-19900}; min y = 20100-j.
+;; sum of max x: 100*19900 + (0+..+99) = 1990000 + 4950 = 1994950.
+;; sum of min y: 100*20100 - (0+..+99) = 2010000 - 4950 = 2005050.
+(count (select {mx: (max x) mn: (min y) by: k from: Tsbig})) -- 100
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tsbig}) 'mx)) -- 1994950
+(sum (at (select {mx: (max x) mn: (min y) by: k from: Tsbig}) 'mn)) -- 2005050
diff --git a/test/rfl/agg/rowform_sum_count.rfl b/test/rfl/agg/rowform_sum_count.rfl
new file mode 100644
index 00000000..48484a97
--- /dev/null
+++ b/test/rfl/agg/rowform_sum_count.rfl
@@ -0,0 +1,132 @@
+;; ════════════════════════════════════════════════════════════════════
+;; ROWFORM multi-key per-group sum(v) + count(v)
+;; (src/ops/group.c: exec_group_sum_count_rowform)
+;;
+;; Planner gate (src/ops/query.c:6082) routes
+;;   (select {tot: (sum v) cnt: (count v) by: [k1 k2 ... kN] from: T})
+;; to OP_GROUP_SUM_COUNT_ROWFORM when:
+;;   - N keys with 3 <= N <= 8 (all simple OP_SCAN)
+;;   - exactly 2 aggs: (sum v) then (count v), same value column
+;;   - no where, no non-agg expressions
+;;   - all keys non-nullable, types in
+;;     {I64,I32,I16,U8,BOOL,DATE,TIME,TIMESTAMP,SYM}
+;;   - v non-nullable, type in {I64,I32,I16,U8,BOOL,F64}
+;;
+;; ROWFORM emits one row per distinct key tuple with columns
+;; [k1..kN, sum, count].  Sum is always F64 (executor casts integer
+;; v -> double); count is I64.  Group order is partition-induced; tests
+;; verify via aggregate sums / counts, not positional checks.
+;;
+;; Aliases avoid colliding with key names (the result schema is
+;; [keys..., tot, cnt] and a name collision lets `at` pick the wrong
+;; column).  We use `tot` and `cnt` throughout.
+;;
+;; Parallel threshold: nrows >= 16384 (src/ops/group.c:11656).
+;; Closes canonical H2O q10.
+;; ════════════════════════════════════════════════════════════════════
+
+;; ─── basic 3-key shape: I64 keys, I64 v ─────────────────────────────
+(set T (table [k1 k2 k3 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I64 [10 20 30 40]))))
+;; All 4 rows have unique (k1,k2,k3) tuples, so 4 groups each of size 1.
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: T})) -- 4
+;; Sum across all groups = sum of v = 100; count total = 4.
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: T}) 'tot)) -- 100.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: T}) 'cnt)) -- 4
+;; Sum column is F64 (executor always emits F64 sum); count is I64.
+(type (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: T}) 'tot)) -- 'F64
+(type (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: T}) 'cnt)) -- 'I64
+
+;; ─── 3 keys with collapses: distinct group count < n_rows ───────────
+(set Tg (table [k1 k2 k3 v] (list (as 'I64 [0 0 0 1 1]) (as 'I64 [0 0 1 0 0]) (as 'I64 [0 0 0 0 0]) (as 'I64 [10 20 30 40 50]))))
+;; Distinct (k1,k2,k3): (0,0,0)->v={10,20}=30, (0,1,0)->v=30, (1,0,0)->v={40,50}=90.
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg})) -- 3
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'tot)) -- 150.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'cnt)) -- 5
+;; Verify max per-group sum is 90 (from (1,0,0)) and min is 30 (shared by (0,0,0) and the singleton (0,1,0)).
+(max (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'tot)) -- 90.0
+(min (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'tot)) -- 30.0
+(max (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'cnt)) -- 2
+(min (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tg}) 'cnt)) -- 1
+
+;; ─── 4 keys ─────────────────────────────────────────────────────────
+(set T4 (table [k1 k2 k3 k4 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [1 2 3 4]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4] from: T4})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4] from: T4}) 'tot)) -- 10.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4] from: T4}) 'cnt)) -- 4
+
+;; ─── 5 keys ─────────────────────────────────────────────────────────
+(set T5 (table [k1 k2 k3 k4 k5 v] (list (as 'I64 [0 0 1 1 0]) (as 'I64 [0 1 0 1 0]) (as 'I64 [0 0 0 0 0]) (as 'I64 [0 0 0 0 0]) (as 'I64 [0 0 0 0 0]) (as 'I64 [10 20 30 40 100]))))
+;; (0,0,0,0,0) seen twice (rows 0 and 4) -> sum 110, count 2.  Three other singletons.
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5] from: T5})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5] from: T5}) 'tot)) -- 200.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5] from: T5}) 'cnt)) -- 5
+(max (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5] from: T5}) 'cnt)) -- 2
+
+;; ─── 6 keys ─────────────────────────────────────────────────────────
+(set T6 (table [k1 k2 k3 k4 k5 k6 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [11 22 33 44]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6] from: T6})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6] from: T6}) 'tot)) -- 110.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6] from: T6}) 'cnt)) -- 4
+
+;; ─── 7 keys ─────────────────────────────────────────────────────────
+(set T7 (table [k1 k2 k3 k4 k5 k6 k7 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [5 6 7 8]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7] from: T7})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7] from: T7}) 'tot)) -- 26.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7] from: T7}) 'cnt)) -- 4
+
+;; ─── 8 keys (gate upper bound) ──────────────────────────────────────
+(set T8 (table [k1 k2 k3 k4 k5 k6 k7 k8 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [0 0 0 0]) (as 'I64 [10 20 30 40]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7 k8] from: T8})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7 k8] from: T8}) 'tot)) -- 100.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6 k7 k8] from: T8}) 'cnt)) -- 4
+
+;; ─── F64 v column ───────────────────────────────────────────────────
+(set Tf (table [k1 k2 k3 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'F64 [1.5 2.5 3.5 4.5]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tf})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tf}) 'tot)) -- 12.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tf}) 'cnt)) -- 4
+
+;; ─── narrow integer v (I32 / U8; I16 not exercised here) ────────────
+(set Ti32 (table [k1 k2 k3 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'I32 [10 20 30 40]))))
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Ti32}) 'tot)) -- 100.0
+(set Tu8 (table [k1 k2 k3 v] (list (as 'I64 [0 0 1 1]) (as 'I64 [0 1 0 1]) (as 'I64 [0 0 0 0]) (as 'U8 [1 2 3 4]))))
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tu8}) 'tot)) -- 10.0
+
+;; ─── SYM keys (canonical H2O q10 shape) ─────────────────────────────
+(set Ts (table [k1 k2 k3 v] (list [A A B B] [X Y X Y] [P P P P] (as 'I64 [10 20 30 40]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Ts})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Ts}) 'tot)) -- 100.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Ts}) 'cnt)) -- 4
+
+;; SYM keys with collapses
+(set Tsc (table [k1 k2 k3 v] (list [A A A B] [X X Y Y] [P P P P] (as 'I64 [10 20 30 40]))))
+;; Distinct: (A,X,P) -> {10,20}=30, (A,Y,P) -> 30, (B,Y,P) -> 40.
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsc})) -- 3
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsc}) 'tot)) -- 100.0
+(max (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsc}) 'cnt)) -- 2
+
+;; ─── mixed key types: I64 + SYM + I32 ───────────────────────────────
+(set Tmix (table [k1 k2 k3 v] (list (as 'I64 [0 0 1 1]) [A B A B] (as 'I32 [0 0 0 0]) (as 'I64 [10 20 30 40]))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tmix})) -- 4
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tmix}) 'tot)) -- 100.0
+
+;; ─── parallel path: nrows >= 16384 ──────────────────────────────────
+;; 20000 rows; 3 keys with pairwise-coprime cardinalities 5, 4, 3, so all 5 * 4 * 3 = 60 key tuples occur.
+(set N 20000)
+(set Tbig (table [k1 k2 k3 v] (list (% (til N) 5) (% (til N) 4) (% (til N) 3) (til N))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tbig})) -- 60
+;; Total sum of v across all groups = N*(N-1)/2 = 199990000.
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tbig}) 'tot)) -- 199990000.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tbig}) 'cnt)) -- 20000
+
+;; ─── parallel + SYM key ─────────────────────────────────────────────
+(set Tsbig (table [k1 k2 k3 v] (list (as 'SYMBOL (% (til N) 5)) (as 'SYMBOL (% (til N) 4)) (% (til N) 3) (til N))))
+(count (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsbig})) -- 60
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsbig}) 'tot)) -- 199990000.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3] from: Tsbig}) 'cnt)) -- 20000
+
+;; ─── parallel + 6 keys + F64 v ──────────────────────────────────────
+(set Tf6 (table [k1 k2 k3 k4 k5 k6 v] (list (% (til N) 5) (% (til N) 4) (% (til N) 3) (% (til N) 2) (% (til N) 2) (% (til N) 2) (as 'F64 (til N)))))
+;; k4..k6 (mod 2) are determined by k2 (mod 4), so there are still 60 groups; only the total sum/count are asserted.
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6] from: Tf6}) 'tot)) -- 199990000.0
+(sum (at (select {tot: (sum v) cnt: (count v) by: [k1 k2 k3 k4 k5 k6] from: Tf6}) 'cnt)) -- 20000
diff --git a/test/rfl/agg/rowform_topk.rfl b/test/rfl/agg/rowform_topk.rfl
new file mode 100644
index 00000000..288939ee
--- /dev/null
+++ b/test/rfl/agg/rowform_topk.rfl
@@ -0,0 +1,125 @@
+;; ════════════════════════════════════════════════════════════════════
+;; ROWFORM per-group top-K / bot-K (src/ops/group.c: exec_group_topk_rowform)
+;;
+;; Planner gate (src/ops/query.c:5885) routes
+;;   (select {alias: (top|bot col K) by: <single key col> from: T})
+;; to OP_GROUP_TOPK_ROWFORM / OP_GROUP_BOTK_ROWFORM when:
+;;   - single key, single agg, no where, no non-agg
+;;   - K in [1, 255]
+;;   - key & val columns are simple OP_SCAN, types in
+;;     {I64, I32, I16, U8, BOOL, DATE, TIME, TIMESTAMP, F64}
+;;     (SYM keys fall through to the LIST-cell OP_TOP_N path)
+;;
+;; ROWFORM emits one row per kept value: result has columns
+;; [key_col, alias] with K * n_groups rows (or fewer if a group has
+;; < K elements).  Within each group, top-K is descending; bot-K is
+;; ascending.  Order across groups is partition-induced (8-bit radix)
+;; so tests use sum / membership rather than positional assertions.
+;;
+;; Parallel threshold: nrows >= 16384 (src/ops/group.c:9436).
+;; ════════════════════════════════════════════════════════════════════
+
+;; ─── basic shape: I64 key, I64 val, K=2, 2 groups ───────────────────
+(set T (table [k v] (list (as 'I64 [0 0 0 1 1 1]) (as 'I64 [10 20 30 40 50 60]))))
+;; Top-2 per group: g=0 -> {30,20}, g=1 -> {60,50}; total 4 rows.
+(count (select {t: (top v 2) by: k from: T})) -- 4
+;; Sum of top-2 values: 30+20+60+50 = 160.
+(sum (at (select {t: (top v 2) by: k from: T}) 't)) -- 160
+;; ROWFORM keeps the source vector type (not LIST).
+(type (at (select {t: (top v 2) by: k from: T}) 't)) -- 'I64
+;; Bot-2 per group: g=0 -> {10,20}, g=1 -> {40,50}; total 4 rows, sum 120.
+(count (select {b: (bot v 2) by: k from: T})) -- 4
+(sum (at (select {b: (bot v 2) by: k from: T}) 'b)) -- 120
+
+;; ─── K=1: degenerate to per-group max / min ─────────────────────────
+;; K=1 top = max per group; total rows = n_groups.
+(count (select {t: (top v 1) by: k from: T})) -- 2
+(sum (at (select {t: (top v 1) by: k from: T}) 't)) -- 90
+(sum (at (select {b: (bot v 1) by: k from: T}) 'b)) -- 50
+
+;; ─── K equals group size: full group emitted ────────────────────────
+(count (select {t: (top v 3) by: k from: T})) -- 6
+;; Sum equals total of all source v's (10..60) = 210.
+(sum (at (select {t: (top v 3) by: k from: T}) 't)) -- 210
+
+;; ─── K > group size: capped at group size (no padding) ──────────────
+(count (select {t: (top v 5) by: k from: T})) -- 6
+(sum (at (select {t: (top v 5) by: k from: T}) 't)) -- 210
+
+;; ─── narrow integer key types (I32 / I16 / U8 / BOOL) ──────────────
+(set Ti32 (table [k v] (list (as 'I32 [0 0 1 1]) (as 'I64 [5 7 11 13]))))
+(sum (at (select {t: (top v 1) by: k from: Ti32}) 't)) -- 20
+(type (at (select {t: (top v 1) by: k from: Ti32}) 'k)) -- 'I32
+(set Ti16 (table [k v] (list (as 'I16 [0 0 1 1]) (as 'I64 [100 200 300 400]))))
+(sum (at (select {t: (top v 1) by: k from: Ti16}) 't)) -- 600
+(set Tu8 (table [k v] (list (as 'U8 [0 0 1 1]) (as 'I64 [1 2 3 4]))))
+(sum (at (select {b: (bot v 1) by: k from: Tu8}) 'b)) -- 4
+(set Tbool (table [k v] (list [false false true true] (as 'I64 [9 8 7 6]))))
+(sum (at (select {t: (top v 1) by: k from: Tbool}) 't)) -- 16
+
+;; ─── F64 key and F64 value ──────────────────────────────────────────
+(set Tf (table [k v] (list (as 'F64 [0.0 0.0 0.0 1.0 1.0 1.0]) (as 'F64 [1.5 2.5 0.5 4.5 3.5 5.5]))))
+(count (select {t: (top v 2) by: k from: Tf})) -- 4
+;; Top-2 per group: g=0 -> {2.5,1.5}=4.0; g=1 -> {5.5,4.5}=10.0; sum 14.0.
+(sum (at (select {t: (top v 2) by: k from: Tf}) 't)) -- 14.0
+(type (at (select {t: (top v 2) by: k from: Tf}) 't)) -- 'F64
+;; Bot-1 per group: g=0 -> 0.5; g=1 -> 3.5; sum 4.0.
+(sum (at (select {b: (bot v 1) by: k from: Tf}) 'b)) -- 4.0
+
+;; ─── F64 key with I64 value (mixed) ─────────────────────────────────
+(set Tfi (table [k v] (list (as 'F64 [0.0 0.0 1.0 1.0]) (as 'I64 [10 20 30 40]))))
+(sum (at (select {t: (top v 1) by: k from: Tfi}) 't)) -- 60
+(type (at (select {t: (top v 1) by: k from: Tfi}) 'k)) -- 'F64
+(type (at (select {t: (top v 1) by: k from: Tfi}) 't)) -- 'I64
+
+;; ─── single group: all rows share one key ───────────────────────────
+(set T1 (table [k v] (list (as 'I64 [0 0 0 0 0 0]) (as 'I64 [3 1 5 2 7 4]))))
+(count (select {t: (top v 3) by: k from: T1})) -- 3
+;; Top-3 of {3,1,5,2,7,4} = {7,5,4}; sum 16.
+(sum (at (select {t: (top v 3) by: k from: T1}) 't)) -- 16
+(sum (at (select {b: (bot v 3) by: k from: T1}) 'b)) -- 6
+
+;; ─── ties: top-3 of [5 5 5 1] = [5 5 5] regardless of which 5 wins ──
+(set Tt (table [k v] (list (as 'I64 [0 0 0 0]) (as 'I64 [5 5 5 1]))))
+(sum (at (select {t: (top v 3) by: k from: Tt}) 't)) -- 15
+
+;; ─── many groups, small per group; total kept = K * n_groups ────────
+(set Tm (table [k v] (list (as 'I64 [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]) (as 'I64 [10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29]))))
+;; 10 groups × K=1 = 10 rows; max per group is (k+20).
+(count (select {t: (top v 1) by: k from: Tm})) -- 10
+;; Sum of maxes: 20+21+...+29 = 245.
+(sum (at (select {t: (top v 1) by: k from: Tm}) 't)) -- 245
+;; Bot per group: k+10; sum 10+11+...+19 = 145.
+(sum (at (select {b: (bot v 1) by: k from: Tm}) 'b)) -- 145
+;; K=2 -> 20 rows; each group keeps {k+10, k+20}; total sum = 245+145 = 390.
+(count (select {t: (top v 2) by: k from: Tm})) -- 20
+(sum (at (select {t: (top v 2) by: k from: Tm}) 't)) -- 390
+
+;; ─── parallel path: nrows >= 16384 (radix dispatch) ─────────────────
+;; 20000 rows, 10 groups (mod 10). Each group has 2000 elements equal
+;; to {j, j+10, j+20, ..., j+19990}. Top-1 = j+19990 -> sum 10*19990+45.
+(set N 20000)
+(set Tbig (table [k v] (list (% (til N) 10) (til N))))
+(count (select {t: (top v 1) by: k from: Tbig})) -- 10
+(sum (at (select {t: (top v 1) by: k from: Tbig}) 't)) -- 199945
+;; Bot-1 sum = sum of j over j=0..9 = 45.
+(sum (at (select {b: (bot v 1) by: k from: Tbig}) 'b)) -- 45
+;; Top-2: each group keeps {j+19990, j+19980}; sum = 10*(19990+19980) + 2*45 = 399790.
+(count (select {t: (top v 2) by: k from: Tbig})) -- 20
+(sum (at (select {t: (top v 2) by: k from: Tbig}) 't)) -- 399790
+;; Bot-2: {j, j+10}; sum = 10*10 + 2*45 = 190.
+(sum (at (select {b: (bot v 2) by: k from: Tbig}) 'b)) -- 190
+
+;; Parallel path, 1000 distinct keys (high cardinality stress).
+(set Mbig (table [k v] (list (% (til N) 1000) (til N))))
+;; 1000 groups × K=1 = 1000 rows.
+(count (select {t: (top v 1) by: k from: Mbig})) -- 1000
+;; Each group j has 20 values: j, j+1000, ..., j+19000. Top-1 = j+19000.
+;; Sum = 1000*19000 + (0+1+..+999) = 19000000 + 499500 = 19499500.
+(sum (at (select {t: (top v 1) by: k from: Mbig}) 't)) -- 19499500
+(sum (at (select {b: (bot v 1) by: k from: Mbig}) 'b)) -- 499500
+
+;; ─── F64 value parallel ─────────────────────────────────────────────
+(set Mf (table [k v] (list (% (til N) 10) (as 'F64 (til N)))))
+(sum (at (select {t: (top v 1) by: k from: Mf}) 't)) -- 199945.0
+(sum (at (select {b: (bot v 1) by: k from: Mf}) 'b)) -- 45.0
diff --git a/test/rfl/agg/variance.rfl b/test/rfl/agg/variance.rfl
new file mode 100644
index 00000000..47d37199
--- /dev/null
+++ b/test/rfl/agg/variance.rfl
@@ -0,0 +1,114 @@
+;; Happy-path invariants for the variance / stddev / dev wrappers in
+;; src/ops/agg.c.  All five wrappers route through the shared
+;; var_stddev_core(sample, take_sqrt):
+;;
+;;   var        -> sample=1, take_sqrt=0
+;;   var_pop    -> sample=0, take_sqrt=0
+;;   stddev     -> sample=1, take_sqrt=1
+;;   stddev_pop -> sample=0, take_sqrt=1
+;;   dev        -> sample=0, take_sqrt=1 (alias of stddev_pop)
+;;
+;; Canonical Wikipedia fixture: [2 4 4 4 5 5 7 9]
+;;   mean         = 5
+;;   sum sq diff  = 32
+;;   pop_var      = 32/8 = 4.0
+;;   pop_stddev   = 2.0
+;;   sample_var   = 32/7 ≈ 4.5714285714…
+;;   sample_stddev = √(32/7) ≈ 2.1380899352…
+
+;; ─── canonical fixture: exact integer answers ─────────────────────
+(var_pop    [2 4 4 4 5 5 7 9]) -- 4.0
+(stddev_pop [2 4 4 4 5 5 7 9]) -- 2.0
+(dev        [2 4 4 4 5 5 7 9]) -- 2.0
+
+;; sample_var = 32/7 — compare with tolerance.
+(< (abs (- (var    [2 4 4 4 5 5 7 9]) 4.571428571428571)) 0.000001) -- true
+;; sample_stddev = sqrt(32/7) — compare with tolerance.
+(< (abs (- (stddev [2 4 4 4 5 5 7 9]) 2.138089935299395)) 0.000001) -- true
+
+;; ─── F64 input gives the same answers as I64 input ────────────────
+(var_pop    [2.0 4.0 4.0 4.0 5.0 5.0 7.0 9.0]) -- 4.0
+(stddev_pop [2.0 4.0 4.0 4.0 5.0 5.0 7.0 9.0]) -- 2.0
+(dev        [2.0 4.0 4.0 4.0 5.0 5.0 7.0 9.0]) -- 2.0
+(< (abs (- (var    [2.0 4.0 4.0 4.0 5.0 5.0 7.0 9.0]) 4.571428571428571)) 0.000001) -- true
+(< (abs (- (stddev [2.0 4.0 4.0 4.0 5.0 5.0 7.0 9.0]) 2.138089935299395)) 0.000001) -- true
+
+;; ─── return type is F64 for every wrapper ─────────────────────────
+(type (var        [1 2 3])) -- 'f64
+(type (var_pop    [1 2 3])) -- 'f64
+(type (stddev     [1 2 3])) -- 'f64
+(type (stddev_pop [1 2 3])) -- 'f64
+(type (dev        [1 2 3])) -- 'f64
+
+;; ─── constant column → zero variance / dispersion ─────────────────
+(var        [3 3 3 3]) -- 0.0
+(var_pop    [3 3 3 3]) -- 0.0
+(stddev     [3 3 3 3]) -- 0.0
+(stddev_pop [3 3 3 3]) -- 0.0
+(dev        [3 3 3 3]) -- 0.0
+
+;; constant F64 column — same story.
+(var_pop    [7.5 7.5 7.5 7.5 7.5]) -- 0.0
+(stddev_pop [7.5 7.5 7.5 7.5 7.5]) -- 0.0
+
+;; ─── two-element diff [1 5]: mean=3, Σ(x-mean)² = 4+4 = 8 ─────────
+;;   pop_var = 8/2 = 4,  sample_var = 8/1 = 8
+;;   pop_stddev = 2,     sample_stddev = √8 ≈ 2.828427…
+(var_pop    [1 5]) -- 4.0
+(var        [1 5]) -- 8.0
+(stddev_pop [1 5]) -- 2.0
+(dev        [1 5]) -- 2.0
+(< (abs (- (stddev [1 5]) 2.828427124746190)) 0.000001) -- true
+
+;; ─── narrow-int coercion to F64 (I32 / I16 / U8 paths) ────────────
+(var_pop    (as 'I32 [2 4 4 4 5 5 7 9])) -- 4.0
+(stddev_pop (as 'I32 [2 4 4 4 5 5 7 9])) -- 2.0
+(dev        (as 'I32 [2 4 4 4 5 5 7 9])) -- 2.0
+(type (var_pop (as 'I32 [2 4 4 4 5 5 7 9]))) -- 'f64
+
+(var_pop    (as 'I16 [2 4 4 4 5 5 7 9])) -- 4.0
+(stddev_pop (as 'I16 [2 4 4 4 5 5 7 9])) -- 2.0
+(type (stddev_pop (as 'I16 [2 4 4 4 5 5 7 9]))) -- 'f64
+
+(var_pop    (as 'U8 [2 4 4 4 5 5 7 9])) -- 4.0
+(dev        (as 'U8 [2 4 4 4 5 5 7 9])) -- 2.0
+(type (dev (as 'U8 [2 4 4 4 5 5 7 9]))) -- 'f64
+
+;; sample stats over narrow-int input — tolerance-checked (result type is asserted above for the pop_* / dev variants).
+(< (abs (- (var    (as 'I32 [2 4 4 4 5 5 7 9])) 4.571428571428571)) 0.000001) -- true
+(< (abs (- (stddev (as 'I16 [2 4 4 4 5 5 7 9])) 2.138089935299395)) 0.000001) -- true
+
+;; ─── length-1 input: pop_* = 0, sample_* = typed F64 null (0Nf) ──
+;; var_stddev_core(sample=0) → make_f64(0.0)
+;; var_stddev_core(sample=1, n<=1) → ray_typed_null(-RAY_F64) == 0Nf
+(var_pop    [42]) -- 0.0
+(stddev_pop [42]) -- 0.0
+(dev        [42]) -- 0.0
+(var    [42]) -- 0Nf
+(stddev [42]) -- 0Nf
+(var    [3.14]) -- 0Nf
+(stddev [3.14]) -- 0Nf
+
+;; F64 length-1 pop side.
+(var_pop    [3.14]) -- 0.0
+(stddev_pop [3.14]) -- 0.0
+
+;; ─── algebraic invariant: stddev² == var (within fp tolerance) ────
+(set V [2 4 4 4 5 5 7 9])
+(< (abs (- (* (stddev V) (stddev V)) (var V))) 0.000001) -- true
+(< (abs (- (* (stddev_pop V) (stddev_pop V)) (var_pop V))) 0.000001) -- true
+(< (abs (- (* (dev V) (dev V)) (var_pop V))) 0.000001) -- true
+
+;; dev is an alias for stddev_pop — must be bit-equal.
+(== (dev V) (stddev_pop V)) -- true
+
+;; population variance is bounded above by sample variance for n>=2.
+(<= (var_pop V) (var V)) -- true
+
+;; non-negativity over a random vector — drives the general path.
+(set R (rand 200 1000))
+(>= (var R) 0.0) -- true
+(>= (var_pop R) 0.0) -- true
+(>= (stddev R) 0.0) -- true
+(>= (stddev_pop R) 0.0) -- true
+(>= (dev R) 0.0) -- true
diff --git a/test/rfl/hof/wrappers.rfl b/test/rfl/hof/wrappers.rfl
new file mode 100644
index 00000000..f199ef9d
--- /dev/null
+++ b/test/rfl/hof/wrappers.rfl
@@ -0,0 +1,129 @@
+;; Happy-path coverage for the HOF wrapper dispatchers in
+;; src/ops/collection.c:
+;;     ray_pmap_fn       → forwards to ray_map_fn
+;;     ray_fold_left_fn  → forwards to ray_fold_fn
+;;     ray_scan_left_fn  → forwards to ray_scan_fn
+;;
+;; Existing test/rfl/hof/{pmap,fold,scan,right,map,filter,apply,lambda}.rfl
+;; exercise the inner ops directly.  This file specifically drives each
+;; *public-API wrapper* across I64 / F64 / mixed-type inputs and verifies
+;; result values, lengths, and the wrapper-equals-inner-op invariant.
+
+;; ─────────────────────────────────────────────────────────────────
+;; pmap — parallel map wrapper (ray_pmap_fn)
+;; ─────────────────────────────────────────────────────────────────
+
+;; I64 happy path — small list, lambda
+(pmap (fn [x] (* x 2)) [1 2 3 4 5]) -- [2 4 6 8 10]
+(pmap (fn [x] (+ x 1)) [10 20 30]) -- [11 21 31]
+
+;; I64 happy path — builtin verbs as fn
+(pmap neg [1 2 3 4]) -- [-1 -2 -3 -4]
+(pmap abs [-3 -2 -1 0 1 2 3]) -- [3 2 1 0 1 2 3]
+
+;; F64 happy path
+(pmap (fn [x] (* x 2.0)) [1.5 2.5 3.5]) -- [3.0 5.0 7.0]
+(pmap (fn [x] (+ x 0.5)) [1.0 2.0 3.0]) -- [1.5 2.5 3.5]
+
+;; Mixed-type coercion — I64 list with F64 lambda promotes result
+(pmap (fn [x] (* x 1.5)) [2 4 6]) -- [3.0 6.0 9.0]
+
+;; Large list — force the parallel-dispatch path (if thresholded)
+(set BIG (til 1000))
+(count (pmap (fn [x] (+ x 1)) BIG)) -- 1000
+(sum (pmap (fn [x] (+ x 1)) BIG)) -- (+ 1000 (sum BIG))
+
+;; Wrapper equivalence: pmap == map on the same input
+(set L (rand 200 500))
+(pmap (fn [x] (* x 2)) L) -- (map (fn [x] (* x 2)) L)
+(pmap neg L) -- (map neg L)
+
+;; Result count invariants — output length equals input length
+(count (pmap (fn [x] x) [1 2 3 4 5])) -- 5
+(count (pmap (fn [x] (* x x)) (til 50))) -- 50
+
+;; ─────────────────────────────────────────────────────────────────
+;; fold-left — left-associative fold wrapper (ray_fold_left_fn)
+;; ─────────────────────────────────────────────────────────────────
+
+;; I64 happy path — + with seed 0 (sum)
+(fold-left + 0 [1 2 3 4 5]) -- 15
+(fold-left + 0 [10 20 30 40]) -- 100
+
+;; I64 happy path — * with seed 1 (product)
+(fold-left * 1 [1 2 3 4]) -- 24
+(fold-left * 1 [2 3 5]) -- 30
+
+;; F64 happy path — sum and product
+(fold-left + 0.0 [1.5 2.5 3.0]) -- 7.0
+(fold-left * 1.0 [2.0 0.5 4.0]) -- 4.0
+
+;; Mixed-type coercion — I64 seed + F64 list
+(fold-left + 0 [1.5 2.5 3.0]) -- 7.0
+
+;; Lambda fn — explicit binary lambda
+(fold-left (fn [a b] (+ a b)) 0 [1 2 3 4]) -- 10
+(fold-left (fn [a b] (* a b)) 1 [2 3 4]) -- 24
+
+;; Non-associative op (subtraction) — left-fold is well-defined:
+;; (((0-1)-2)-3)-4 = -10
+(fold-left - 0 [1 2 3 4]) -- -10
+
+;; Empty list + seed returns seed unchanged
+(fold-left + 42 []) -- 42
+(fold-left + 0.0 []) -- 0.0
+
+;; 2-arg form (no seed) uses first element as initial accumulator
+(fold-left + [1 2 3 4 5]) -- 15
+(fold-left * [1 2 3 4]) -- 24
+(fold-left + [10]) -- 10
+
+;; Wrapper equivalence: fold-left == fold on same inputs
+(set V (rand 50 100))
+(fold-left + 0 V) -- (fold + 0 V)
+(fold-left + V) -- (fold + V)
+(fold-left * 1 [2 3 4 5]) -- (fold * 1 [2 3 4 5])
+
+;; fold-left + 0 V equals (sum V)
+(fold-left + 0 V) -- (sum V)
+
+;; ─────────────────────────────────────────────────────────────────
+;; scan-left — left-prefix scan wrapper (ray_scan_left_fn)
+;; ─────────────────────────────────────────────────────────────────
+
+;; I64 happy path — prefix sums
+(scan-left + [1 2 3 4]) -- [1 3 6 10]
+(scan-left + [1 2 3 4 5]) -- [1 3 6 10 15]
+
+;; I64 happy path — prefix products
+(scan-left * [1 2 3 4]) -- [1 2 6 24]
+(scan-left * [2 3 4]) -- [2 6 24]
+
+;; F64 happy path
+(scan-left + [1.0 2.0 3.0]) -- [1.0 3.0 6.0]
+(scan-left * [1.0 2.0 3.0 4.0]) -- [1.0 2.0 6.0 24.0]
+
+;; Singleton — scan of one element is itself
+(scan-left + [42]) -- [42]
+(scan-left * [7]) -- [7]
+
+;; Lambda fn
+(scan-left (fn [a b] (+ a b)) [1 2 3 4]) -- [1 3 6 10]
+
+;; Length preserved (count invariant)
+(set W (rand 50 100))
+(count W) -- (count (scan-left + W))
+(count (scan-left * [1 2 3 4 5])) -- 5
+
+;; last(scan-left + v) equals (fold + 0 v) when v non-empty
+(last (scan-left + [1 2 3 4 5])) -- 15
+(last (scan-left + W)) -- (fold + 0 W)
+
+;; first(scan-left f v) equals v[0]
+(first (scan-left + [7 1 2 3])) -- 7
+(first (scan-left * [9 2 3])) -- 9
+
+;; Wrapper equivalence: scan-left == scan on same inputs
+(scan-left + [1 2 3 4 5]) -- (scan + [1 2 3 4 5])
+(scan-left * [1 2 3 4]) -- (scan * [1 2 3 4])
+(scan-left + W) -- (scan + W)
diff --git a/test/rfl/io/csv_splayed.rfl b/test/rfl/io/csv_splayed.rfl
new file mode 100644
index 00000000..f91b220d
--- /dev/null
+++ b/test/rfl/io/csv_splayed.rfl
@@ -0,0 +1,181 @@
+;; Coverage for src/io/csv.c splayed-writer paths, src/store/col.c str-pool
+;; copy on load, and the GUID column writer.
+;;
+;; Reachability map (per llvm-cov, 0% on master):
+;;
+;;   csv_splayed_writer_open     — opens per-column tmp file
+;;   csv_splayed_writer_append   — per-chunk col append
+;;   csv_splayed_writer_close    — finalizes header + nullmap, atomic rename
+;;   csv_splayed_writer_null_bit — per-row null-bit accumulator
+;;   csv_splayed_writer_zero_nulls — backfill / pad helpers (count=0 fast path
+;;                                   reached here; the inner loop requires
+;;                                   multi-chunk input which needs >1M rows —
+;;                                   see "Reachability notes" at end)
+;;
+;; All paths above are exercised through the `.csv.splayed` builtin
+;; (src/ops/builtins.c::ray_read_csv_splayed_fn), which writes a CSV into
+;; a directory as one file per column via ray_csv_save_splayed_named_opts.
+;;
+;; col_copy_str_pool / col_load_str_vec are reached via the standard
+;; ray_col_load path when the saved column carries a STR str_pool.
+;;
+;; Every assertion is deterministic — no rand on the LHS, no timestamps.
+;; Splayed dirs and CSV fixtures use the `rf_test_splayed_*` prefix so the
+;; Makefile-side `rm -f rf_test_*.csv` keeps fixtures bounded; the splayed
+;; *directories* are removed by the scrub below and the final cleanup.
+
+;; ────────────── scrub stale state from a prior run ──────────────
+;; `.csv.splayed` errors out if the destination dir already has stale
+;; column files from a previous interrupted run.  A leftover sym file
+;; with stale IDs would also flip the load-back path to "corrupt".
+(.sys.exec "rm -rf rf_test_splayed_basic rf_test_splayed_nulls rf_test_splayed_str rf_test_splayed_sym rf_test_splayed_guid rf_test_splayed_*.csv") -- 0
+
+;; ════════════════════════════════════════════════════════════════
+;; 1. Basic mixed-column round trip — no nulls.
+;;
+;; Hits csv_splayed_writer_open + _append + _close for each column.
+;; Three columns ⇒ three open/close pairs, exercising the SYM_W32 attr
+;; branch in writer_open (the `s` column) and the non-SYM branch (`id`
+;; and `px`).
+;; ════════════════════════════════════════════════════════════════
+(.sys.exec "printf 'id,s,px\\n1,aa,1.5\\n2,bb,2.5\\n3,aa,3.5\\n4,cc,4.5\\n5,aa,5.5\\n' > rf_test_splayed_basic.csv") -- 0
+(set Tbasic (.csv.splayed "rf_test_splayed_basic.csv" "rf_test_splayed_basic/"))
+
+(count Tbasic) -- 5
+(sum (at Tbasic 'id)) -- 15
+(sum (at Tbasic 'px)) -- 17.5
+;; SYM column survives — duplicates intern to the same id.
+(count (at Tbasic 's)) -- 5
+;; Header was parsed → column names match.
+(key Tbasic) -- ['id 's 'px]
+
+;; Round-trip load via .db.splayed.get exercises ray_read_splayed which
+;; walks every column file and validates the writer's on-disk format.
+(set Rbasic (.db.splayed.get "rf_test_splayed_basic/"))
+(count Rbasic) -- 5
+(sum (at Rbasic 'id)) -- 15
+(sum (at Rbasic 'px)) -- 17.5
+(key Rbasic) -- ['id 's 'px]
+;; Values match position-by-position — proves writer + reader agree on
+;; row order and width-narrowed SYM widths.
+(at Rbasic 'id) -- (at Tbasic 'id)
+(at Rbasic 'px) -- (at Tbasic 'px)
+(at Rbasic 's)  -- (at Tbasic 's)
+
+;; ════════════════════════════════════════════════════════════════
+;; 2. Null rows — empty CSV fields produce typed nulls.
+;;
+;; Materialized columns carry RAY_ATTR_HAS_NULLS, which routes through
+;; csv_splayed_writer_null_bit for every row (14 regions) and the
+;; count=0 early-return in csv_splayed_writer_zero_nulls (1 region).
+;; The bit accumulator flushes when null_bits reaches 8, so 9+ rows
+;; force at least one full-byte fwrite of null_acc and exercise the
+;; flush-then-reset path inside null_bit.
+;; ════════════════════════════════════════════════════════════════
+(.sys.exec "printf 'a,b\\n1,1.5\\n,2.5\\n3,\\n,\\n5,5.5\\n6,6.5\\n,7.5\\n8,\\n9,9.5\\n10,10.5\\n,11.5\\n12,12.5\\n' > rf_test_splayed_nulls.csv") -- 0
+(set Tnull (.csv.splayed "rf_test_splayed_nulls.csv" "rf_test_splayed_nulls/"))
+
+(count Tnull) -- 12
+(key Tnull) -- ['a 'b]
+;; Sum skips nulls — only the non-null `a` cells contribute.
+(sum (at Tnull 'a)) -- 54
+;; F64 sum of non-null `b` cells: 1.5+2.5+5.5+6.5+7.5+9.5+10.5+11.5+12.5 = 67.5
+(sum (at Tnull 'b)) -- 67.5
+
+;; Round-trip load: the saved column file carries an external nullmap
+;; bitmap (RAY_ATTR_NULLMAP_EXT in writer_close), and ray_col_load
+;; restores it.  Re-loaded sums match the original.
+(set Rnull (.db.splayed.get "rf_test_splayed_nulls/"))
+(count Rnull) -- 12
+(sum (at Rnull 'a)) -- 54
+(sum (at Rnull 'b)) -- 67.5
+(at Rnull 'a) -- (at Tnull 'a)
+(at Rnull 'b) -- (at Tnull 'b)
+
+;; ════════════════════════════════════════════════════════════════
+;; 3. STR column via explicit schema — exercises ray_splay_save_bulk
+;; fallback inside ray_csv_save_splayed_named_opts (col.c writes the
+;; STR + adjacent str_pool inline), and col_copy_str_pool on load.
+;;
+;; The schema-overridden form takes (.csv.splayed [names] [types] csv dir),
+;; same dispatcher as ray_read_csv_splayed_fn.
+;; ════════════════════════════════════════════════════════════════
+(.sys.exec "printf '1,alpha\\n2,beta\\n3,gamma\\n4,delta\\n5,epsilon\\n' > rf_test_splayed_str.csv") -- 0
+(set Tstr (.csv.splayed [id name] [I64 STR] "rf_test_splayed_str.csv" "rf_test_splayed_str/"))
+(count Tstr) -- 5
+(sum (at Tstr 'id)) -- 15
+;; STR column round-trips via the str_pool copy in col_copy_str_pool.
+(at (at Tstr 'name) 0) -- "alpha"
+(at (at Tstr 'name) 4) -- "epsilon"
+
+;; Re-load from disk also hits col_copy_str_pool — fresh STR + pool.
+(set Rstr (.db.splayed.get "rf_test_splayed_str/"))
+(count Rstr) -- 5
+(at (at Rstr 'name) 0) -- "alpha"
+(at (at Rstr 'name) 2) -- "gamma"
+(at (at Rstr 'name) 4) -- "epsilon"
+(at Rstr 'id) -- (at Tstr 'id)
+
+;; ════════════════════════════════════════════════════════════════
+;; 4. SYM column round-trip — drives the writer's SYM_W32 branch
+;; (uint32_t fwrite per chunk) and the on-load sym validation.
+;; Multiple distinct symbol values + duplicates exercise the intern
+;; table and the saved-sym-count footer.
+;; ════════════════════════════════════════════════════════════════
+(.sys.exec "printf 'tag,v\\nAAPL,100\\nGOOG,200\\nMSFT,300\\nAAPL,400\\nGOOG,500\\nMSFT,600\\nAAPL,700\\n' > rf_test_splayed_sym.csv") -- 0
+(set Tsym (.csv.splayed "rf_test_splayed_sym.csv" "rf_test_splayed_sym/"))
+(count Tsym) -- 7
+(sum (at Tsym 'v)) -- 2800
+(count (at Tsym 'tag)) -- 7
+
+;; Round-trip — re-loading hits ray_sym_load via the splayed/sym file.
+(set Rsym (.db.splayed.get "rf_test_splayed_sym/"))
+(count Rsym) -- 7
+(sum (at Rsym 'v)) -- 2800
+(at Rsym 'tag) -- (at Tsym 'tag)
+
+;; ════════════════════════════════════════════════════════════════
+;; 5. GUID column via .csv.write — exercises csv_write_guid (8-4-4-4-12).
+;; (.csv.splayed has no GUID type inference path from text input, so
+;; the writer is reached via the row-formatted .csv.write builtin.)
+;;
+;; The exact bytes depend on `guid` codegen, so the assertion pins the
+;; structural invariant: 36-char canonical form, dashes at offsets
+;; 8/13/18/23.
+;; ════════════════════════════════════════════════════════════════
+(set Tguid (table [g] (list (guid 3))))
+(.csv.write Tguid "rf_test_splayed_guid.csv") -- 0
+(set rawg (read "rf_test_splayed_guid.csv"))
+(set linesg (split rawg "\n"))
+(count linesg) -- 5
+(at linesg 0) -- "g"
+;; Each data row is a 36-char canonical GUID string.
+(count (at linesg 1)) -- 36
+(count (at linesg 2)) -- 36
+(count (at linesg 3)) -- 36
+;; Dashes appear at the canonical offsets (8, 13, 18, 23).
+(at (at linesg 1) 8)  -- "-"
+(at (at linesg 1) 13) -- "-"
+(at (at linesg 1) 18) -- "-"
+(at (at linesg 1) 23) -- "-"
+
+;; ────────────── cleanup ──────────────
+;; CSV files match the Makefile's `rf_test_*.csv` clean rule; splayed
+;; dirs we created need explicit removal.
+(.sys.exec "rm -rf rf_test_splayed_basic rf_test_splayed_nulls rf_test_splayed_str rf_test_splayed_sym rf_test_splayed_*.csv") -- 0
+
+;; ────────────── reachability notes ──────────────
+;; csv_splayed_writer_zero_nulls — only the count<=0 early-return is
+;; hit here.  The inner loop requires either (a) a chunked write where
+;; one chunk has a null-bearing column and a later chunk doesn't, or
+;; (b) the reverse, both of which need >1M rows to cross the default
+;; CSV_PART_ROWS_DEFAULT chunk boundary.  Reachable but not regression-
+;; sized — would need a C-level test that passes a smaller rows_per_chunk.
+;;
+;; csv_splayed_writer_abort — only the err-path in writer_open's caller
+;; loop and the post-close abort exercise it.  Requires open() or
+;; fwrite() failure mid-flight; not reachable from a happy-path test.
+;;
+;; sym_lazy_materialize_to_locked / sym_lazy_unmap_locked — gated by
+;; SYM_LAZY_LOAD_MIN_BYTES = 64 MiB.  Out of reach for a regression
+;; test; covered by larger benchmarks.
diff --git a/test/rfl/query/parallel_probe.rfl b/test/rfl/query/parallel_probe.rfl
new file mode 100644
index 00000000..5f85efca
--- /dev/null
+++ b/test/rfl/query/parallel_probe.rfl
@@ -0,0 +1,107 @@
+;; Coverage for the parallel row→gid probe path in `src/ops/query.c`:
+;; `idxbuf_hist_fn` + `idxbuf_scat_fn`.  Activated by the post-DAG
+;; non-agg scatter when:
+;;   - `select` has a non-agg projection (a column ref, not a
+;;     reduction), so n_nonaggs > 0;
+;;   - the by-clause is a single scalar key (single-key scatter path);
+;;   - nrows >= 200_000  (the dispatch-overhead amortisation gate);
+;;   - 0 < n_groups <= 65_536  (per-task histogram sizing).
+;;
+;; The path also requires >= 2 worker threads.  Single-worker dev
+;; boxes fall through to the serial histogram path instead; the asserts
+;; here are correctness-only (totals, per-group sizes), so they must
+;; hold whichever branch runs.
+;;
+;; Shape:
+;;   `(select {v: v by: k from: T})`
+;; produces a keyed table with one LIST-column row per group, each cell
+;; holding the per-group slice of `v`.  Compare to a reference computed
+;; via `(group k)` + per-group sum / count.
+;;
+;; History: commit 774ce68f "perf(query): parallel row→gid probe for
+;; non-agg scatter".
+
+;; ─── Fixture 1: 200_000 rows, 100 groups (well under the 65_536 cap)
+(set Nbig 200000)
+(set Ngrp 100)
+(set Tbig (table [k v] (list (% (til Nbig) Ngrp) (til Nbig))))
+
+;; Result shape: keyed table with `k` key column + `v` LIST column.
+(set Rbig (select {v: v by: k from: Tbig}))
+(count Rbig) -- 100
+
+;; Per-group count = Nbig / Ngrp = 2000 for every group.
+(count (at Rbig 'v)) -- 100
+(count (at (at Rbig 'v) 0))  -- 2000
+(count (at (at Rbig 'v) 50)) -- 2000
+(count (at (at Rbig 'v) 99)) -- 2000
+
+;; Sum of per-group counts equals total row count.
+(fold + 0 (map count (at Rbig 'v))) -- 200000
+
+;; Total of all values across groups equals sum(til Nbig)
+;; = 200000 * 199999 / 2 = 19_999_900_000.
+(fold + 0 (map sum (at Rbig 'v))) -- 19999900000
+
+;; Group 0 holds rows {0, 100, 200, ..., 199900}.
+;; AP sum: 2000 * (0 + 199900) / 2 = 199_900_000.
+(sum (at (at Rbig 'v) 0)) -- 199900000
+
+;; Group 99 holds rows {99, 199, 299, ..., 199999}.
+;; AP sum: 2000 * (99 + 199999) / 2 = 200_098_000.
+(sum (at (at Rbig 'v) 99)) -- 200098000
+
+;; First-cell value in the per-group LIST: group `g` starts at row `g`.
+(at (at (at Rbig 'v) 0)  0) -- 0
+(at (at (at Rbig 'v) 1)  0) -- 1
+(at (at (at Rbig 'v) 99) 0) -- 99
+
+;; ─── Fixture 2: SYM key, same gating
+;; SYM keys flow through KEY_READ's SYM branch and remain in the parallel
+;; scatter (key not LIST/STR/GUID, so no eval-group force-out).
+(set Nsym 200000)
+(set syms ['AAA 'BBB 'CCC 'DDD 'EEE])
+(set Tsym (table [k v] (list (take syms Nsym) (til Nsym))))
+(set Rsym (select {v: v by: k from: Tsym}))
+(count Rsym) -- 5
+;; Even partition across 5 syms in round-robin order → 40_000 per group.
+(count (at (at Rsym 'v) 0)) -- 40000
+(count (at (at Rsym 'v) 4)) -- 40000
+(fold + 0 (map count (at Rsym 'v))) -- 200000
+;; Total sum unchanged from total til.
+(fold + 0 (map sum (at Rsym 'v))) -- 19999900000
+
+;; ─── Fixture 3: with WHERE filter — rowsel masking is applied to
+;; row_gid before scatter, so dropped rows must not appear in any
+;; per-group LIST.  Survives the parallel idx_buf scatter via the
+;; rgid_did_mask path (per-segment masking inside the probe).
+(set Twh (table [k v] (list (% (til Nbig) Ngrp) (til Nbig))))
+(set Rwh (select {v: v by: k from: Twh where: (< v 100000)}))
+(count Rwh) -- 100
+;; First half kept → per-group size = 100_000 / 100 = 1000.
+(count (at (at Rwh 'v) 0))  -- 1000
+(count (at (at Rwh 'v) 50)) -- 1000
+;; Total surviving rows = 100_000; sum = 100_000 * 99_999 / 2 = 4_999_950_000.
+(fold + 0 (map count (at Rwh 'v))) -- 100000
+(fold + 0 (map sum (at Rwh 'v))) -- 4999950000
+
+;; ─── Fixture 4: non-agg projection alongside aggs — exercises the
+;; same scatter index for the agg result AND the non-agg LIST column.
+;; The `idx_buf` is shared by both downstream consumers.
+(set Rmix (select {c: (count v) v: v by: k from: Tbig}))
+(count Rmix) -- 100
+(at (at Rmix 'c) 0) -- 2000
+(sum (at Rmix 'c)) -- 200000
+(count (at (at Rmix 'v) 0)) -- 2000
+
+;; ─── Fixture 5: just above the parallel-dispatch threshold (200_001).
+;; The +1 row lands in group 0 (200_000 % 100 = 0); total row count
+;; must still match.
+(set Nedge 200001)
+(set Tedge (table [k v] (list (% (til Nedge) Ngrp) (til Nedge))))
+(set Redge (select {v: v by: k from: Tedge}))
+(count Redge) -- 100
+(fold + 0 (map count (at Redge 'v))) -- 200001
+;; Group 0 gets the extra row (row 200_000 → k = 200_000 % 100 = 0).
+(count (at (at Redge 'v) 0)) -- 2001
+(count (at (at Redge 'v) 1)) -- 2000
diff --git a/test/rfl/query/per_group_buf.rfl b/test/rfl/query/per_group_buf.rfl
new file mode 100644
index 00000000..47917292
--- /dev/null
+++ b/test/rfl/query/per_group_buf.rfl
@@ -0,0 +1,136 @@
+;; Coverage for the per-group buffer aggregators and helpers in
+;; `src/ops/query.c`:
+;;   - `typed_vec_to_list`        — demotes a partly-typed result to a
+;;                                  LIST on a mid-loop type mismatch
+;;                                  (only the no-demotion path runs here).
+;;   - `const_str_expr_copy`      — constant-fold `(concat str ...)`
+;;                                  expressions inside select dict
+;;                                  projections.
+;;   - eval-level / DAG fast-path scatter for non-agg projections that
+;;     compute per group (`nonagg_eval_per_group_core` /
+;;     `nonagg_eval_per_group_buf`).
+;;
+;; Note on `aggr_unary_per_group_buf` / `aggr_med_per_group_buf`
+;; (also targeted in this file's scope): the dispatch gate at
+;; `query.c:7684` requires `is_streaming_aggr_unary_call(expr)` to fire
+;; AND that the expr was bucketed into `nonagg_exprs` (i.e. NOT
+;; `is_group_dag_agg_expr`).  As of `resolve_agg_opcode` covering all
+;; RAY_FN_AGGR unaries (sum/avg/min/max/med/dev/var/stddev/...), every
+;; `(agg col)` shape now satisfies `is_agg_expr` → `is_group_dag_agg_expr`
+;; → buckets into `n_aggs`, never reaches the non-agg scatter.  These
+;; two helpers are therefore not reachable from RFL today, so this
+;; file carries no assertions for them.
+
+;; ────────────────────────────────────────────────────────────────────
+;; const_str_expr_copy via `(concat <strlit>...)` constant folding in
+;; a select projection.  The DAG compiler `compile_const_str_expr`
+;; walks the expression with `const_str_expr_len` and emits a single
+;; folded `ray_const_str` op.  Stays on the stack buffer (≤ 256 B) for
+;; the small cases and heap-allocates for the longer one.
+;; ────────────────────────────────────────────────────────────────────
+(set Tcs (table [k v] (list ['a 'b 'c] [1 2 3])))
+
+;; Two-piece concat — small, stack buffer (< 256 B).
+(at (at (select {s: (concat "hello" "-world") from: Tcs}) 's) 0) -- "hello-world"
+(count (at (select {s: (concat "hello" "-world") from: Tcs}) 's)) -- 3
+
+;; Three-piece concat — recurses through const_str_expr_copy.
+(at (at (select {s: (concat "a" "b" "c") from: Tcs}) 's) 0) -- "abc"
+
+;; Nested concat — const_str_expr_len recognises the inner (concat ...)
+;; head and recurses, const_str_expr_copy follows the same walk.
+(at (at (select {s: (concat "x" (concat "y" "z")) from: Tcs}) 's) 0) -- "xyz"
+(at (at (select {s: (concat (concat "ab" "cd") (concat "ef" "gh")) from: Tcs}) 's) 0) -- "abcdefgh"
+
+;; Longer folded result that overflows the 256-byte stack buffer →
+;; heap path inside `compile_const_str_expr`.  Folded length is
+;; 26 * 12 = 312 bytes.  Verify the projection emits 3 rows and the
+;; first cell holds the full folded string.
+(count (at (select {s: (concat "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz") from: Tcs}) 's)) -- 3
+(at (at (select {s: (concat "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz" "abcdefghijklmnopqrstuvwxyz") from: Tcs}) 's) 0) -- "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
+
+;; ────────────────────────────────────────────────────────────────────
+;; Multi-key by + non-agg-with-inner-agg → eval-level group path that
+;; runs `nonagg_eval_per_group` over a LIST<(key,idx_list)> layout.
+;; First-of-group cell decides the typed-vec storage type; subsequent
+;; cells must match or the result demotes via typed_vec_to_list.
+;;
+;; Happy path: homogeneous numeric output → stays on the typed-vec
+;; path, no demotion.  Confirms the per-group eval correctness and
+;; the typed-direct (collapsable) branch in nonagg_eval_per_group_core.
+;; ────────────────────────────────────────────────────────────────────
+
+;; Single-key q7 shape — non-agg outer, inner aggs reduce per-group.
+;; Routes through the DAG fast scatter (`nonagg_eval_per_group_buf`).
+(set Ta (table [k v1 v2] (list ['A 'A 'B 'B 'C 'C] [10 20 30 40 50 60] [5 15 25 35 45 55])))
+(count (select {r: (- (max v1) (min v2)) by: k from: Ta})) -- 3
+;; A: max(v1)=20 min(v2)=5  → 15
+;; B: max(v1)=40 min(v2)=25 → 15
+;; C: max(v1)=60 min(v2)=45 → 15
+(sum (at (select {r: (- (max v1) (min v2)) by: k from: Ta}) 'r)) -- 45
+
+;; Multi-key by — forces use_eval_group → `nonagg_eval_per_group`
+;; over the LIST<groups> layout.
+(set Tb (table [g h v1 v2] (list ['A 'A 'B 'B 'A 'A 'B 'B] ['X 'Y 'X 'Y 'X 'Y 'X 'Y] [10 20 30 40 50 60 70 80] [1 2 3 4 5 6 7 8])))
+(count (select {r: (- (max v1) (min v2)) by: [g h] from: Tb})) -- 4
+;; (A,X): v1=[10,50] max=50; v2=[1,5] min=1 → 49
+;; (A,Y): v1=[20,60] max=60; v2=[2,6] min=2 → 58
+;; (B,X): v1=[30,70] max=70; v2=[3,7] min=3 → 67
+;; (B,Y): v1=[40,80] max=80; v2=[4,8] min=4 → 76
+;; Sum = 49+58+67+76 = 250
+(sum (at (select {r: (- (max v1) (min v2)) by: [g h] from: Tb}) 'r)) -- 250
+
+;; F64 arithmetic-of-aggregates per group — same routing, F64 typed
+;; vec result.
+(set Tf (table [k x y] (list ['A 'A 'B 'B 'C 'C] [1.0 2.0 3.0 4.0 5.0 6.0] [2.0 4.0 6.0 8.0 10.0 12.0])))
+(count (select {r: (- (max y) (min x)) by: k from: Tf})) -- 3
+;; A: max(y)=4, min(x)=1 → 3
+;; B: max(y)=8, min(x)=3 → 5
+;; C: max(y)=12, min(x)=5 → 7
+(sum (at (select {r: (- (max y) (min x)) by: k from: Tf}) 'r)) -- 15.0
+
+;; Integer key (non-symbol) — same row→gid + per-group eval.
+(set Ti (table [k v] (list [1 1 2 2 3 3] [10 20 30 40 50 60])))
+;; (+ (max v) (min v)) per group:
+;;   k=1: max=20, min=10 → 30
+;;   k=2: max=40, min=30 → 70
+;;   k=3: max=60, min=50 → 110
+(sum (at (select {r: (+ (max v) (min v)) by: k from: Ti}) 'r)) -- 210
+(count (select {r: (+ (max v) (min v)) by: k from: Ti})) -- 3
+
+;; Two-key with integer keys — eval-level multi-key, typed-direct.
+(set Tk (table [k1 k2 v] (list [1 1 2 2 1 2] [10 20 10 20 10 20] [100 200 300 400 500 600])))
+;; Groups (k1, k2) — 4 distinct pairs:
+;;   (1, 10) v=[100, 500] → max=500, min=100 → diff=400
+;;   (1, 20) v=[200]      → max=200, min=200 → diff=0
+;;   (2, 10) v=[300]      → max=300, min=300 → diff=0
+;;   (2, 20) v=[400, 600] → max=600, min=400 → diff=200
+(count (select {r: (- (max v) (min v)) by: [k1 k2] from: Tk})) -- 4
+(sum (at (select {r: (- (max v) (min v)) by: [k1 k2] from: Tk}) 'r)) -- 600
+
+;; ────────────────────────────────────────────────────────────────────
+;; typed-direct happy path inside `nonagg_eval_per_group_core`.
+;; First-group cell is a collapsable primitive (F64 scalar after
+;; column-level type coercion of the mixed I64/F64 list), so the
+;; result emerges as a typed F64 vec without invoking
+;; `typed_vec_to_list` demotion.  Confirms the row→gid scatter +
+;; per-group eval produces a row-aligned typed column.
+;;
+;; (Reachability note: constructing a true
+;; typed_vec_to_list demotion from RFL requires a per-group eval
+;; whose cell type GENUINELY varies across groups — table column
+;; coercion collapses mixed-I64/F64 literal lists to a single F64
+;; vector before the eval ever runs, so the cells are homogeneous
+;; F64.  The demotion line is exercised via internal callers; the
+;; happy-path typed-direct branch is what RFL surfaces.)
+;; ────────────────────────────────────────────────────────────────────
+(set Tlist (table [k v] (list ['A 'B 'C 'D] (list 1 2.5 3 4.5))))
+(set Rlist (select {r: (+ 0 (first v)) by: k from: Tlist}))
+(count Rlist) -- 4
+;; Column v is coerced to F64; per-group first-of-slice yields
+;; F64 atoms → all result cells are F64.
+(at (at Rlist 'r) 0) -- 1.0
+(at (at Rlist 'r) 1) -- 2.5
+(at (at Rlist 'r) 2) -- 3.0
+(at (at Rlist 'r) 3) -- 4.5
+(type (at Rlist 'r)) -- 'F64
diff --git a/test/rfl/sort/fused_topn.rfl b/test/rfl/sort/fused_topn.rfl
new file mode 100644
index 00000000..38e3ab98
--- /dev/null
+++ b/test/rfl/sort/fused_topn.rfl
@@ -0,0 +1,94 @@
+;; Fused top-N / bottom-N pipeline — Rayfall coverage.
+;;
+;; Drives ray_top_fn / ray_bot_fn through the RFL surface, which
+;; flows into topk_take_vec → topk_indices_single (radix-encoded
+;; bounded-heap path) or topk_indices_cmp_single (SYM comparator
+;; heap, sort.c:3173).
+;;
+;; Complements arith/top_bot.rfl by widening type / shape coverage
+;; (DATE, TIME, sorted-input, deeper filter+top fusion, K=1 /
+;; K=mid / K=N corners).  Each assertion targets a happy-path
+;; configuration; null / wrong-type cases live in top_bot.rfl.
+
+;; ─── numeric fast path: I64 ────────────────────────────────────────
+(top [10 20 5 25 15 30 1 35 8 22] 4) -- [35 30 25 22]
+(bot [10 20 5 25 15 30 1 35 8 22] 4) -- [1 5 8 10]
+
+;; K=1 (smallest possible) — verify max/min compatibility.
+(top [42 17 99 3 88 56] 1) -- [99]
+(bot [42 17 99 3 88 56] 1) -- [3]
+
+;; ─── numeric fast path: F64 (negative, fractional) ─────────────────
+(top [-1.5 2.5 -0.5 3.5 -1.0 4.5 2.0] 3) -- [4.5 3.5 2.5]
+(bot [-1.5 2.5 -0.5 3.5 -1.0 4.5 2.0] 3) -- [-1.5 -1.0 -0.5]
+
+;; ─── narrow integer types ──────────────────────────────────────────
+;; I32, I16, U8 fast paths — preserve the input type, k < n.
+(top (as 'I32 [100 200 50 250 150 300 75]) 2) -- (as 'I32 [300 250])
+(bot (as 'I32 [100 200 50 250 150 300 75]) 2) -- (as 'I32 [50 75])
+(top (as 'I16 [9 1 5 7 3 8 2 6 4]) 3) -- (as 'I16 [9 8 7])
+(bot (as 'I16 [9 1 5 7 3 8 2 6 4]) 3) -- (as 'I16 [1 2 3])
+(top (as 'U8  [9 1 5 7 3 8 2 6 4]) 3) -- (as 'U8  [9 8 7])
+(bot (as 'U8  [9 1 5 7 3 8 2 6 4]) 3) -- (as 'U8  [1 2 3])
+
+;; ─── already-sorted / reverse-sorted inputs ────────────────────────
+;; topk_indices_single must still scan and produce the correct prefix
+;; even when the source is monotonic.
+(top [1 2 3 4 5 6 7 8 9 10] 3) -- [10 9 8]
+(bot [1 2 3 4 5 6 7 8 9 10] 3) -- [1 2 3]
+(top [10 9 8 7 6 5 4 3 2 1] 3) -- [10 9 8]
+(bot [10 9 8 7 6 5 4 3 2 1] 3) -- [1 2 3]
+
+;; ─── filter + top-N fusion (the "fused topk" benchmark shape) ──────
+;; (top (filter v (> v 100)) k) — filter then partial-sort.  This is
+;; the Q25-Q27 shape from be3b5364.
+(set Vf [50 120 80 200 30 150 90 175 60 110])
+(top (filter Vf (> Vf 100)) 3) -- [200 175 150]
+(bot (filter Vf (> Vf 100)) 2) -- [110 120]
+
+;; Predicate selects exactly K elements: top must return them all.
+(top (filter Vf (> Vf 150)) 2) -- [200 175]
+(bot (filter Vf (> Vf 150)) 2) -- [175 200]
+
+;; ─── algebraic identity: (top v k) is a prefix of (desc v) ─────────
+(set V32 [100 5 201 12 302 7 403 9 50 25 75 33 66])
+(top V32 5) -- (take (desc V32) 5)
+(bot V32 5) -- (take (asc V32) 5)
+
+;; ─── K = count(v) edge: top/bot equal full sort ────────────────────
+(set Vk [5 2 8 1 9 4 7 3 6])
+(top Vk (count Vk)) -- (desc Vk)
+(bot Vk (count Vk)) -- (asc Vk)
+
+;; ─── temporal types: DATE and TIME fast path ───────────────────────
+;; The radix-encoded heap path handles DATE/TIME/TIMESTAMP identically
+;; to their underlying I32 / I64 representations, so top/bot on DATE and
+;; TIME is a meaningful smoke check (TIMESTAMP is not asserted here).
+(set D (as 'DATE [2025.03.15 2025.01.01 2025.06.20 2025.02.10 2025.12.31]))
+(top D 2) -- (as 'DATE [2025.12.31 2025.06.20])
+(bot D 2) -- (as 'DATE [2025.01.01 2025.02.10])
+
+(set T (as 'TIME [12:00:00.000 09:30:00.000 15:45:00.000 06:15:00.000 23:59:59.999]))
+(top T 2) -- (as 'TIME [23:59:59.999 15:45:00.000])
+(bot T 2) -- (as 'TIME [06:15:00.000 09:30:00.000])
+
+;; ─── larger N to exercise the multi-morsel scan ────────────────────
+;; (top of a larger random-ish vector) must agree with (take (desc v) k).
+(set Vbig (rand 1024 10000))
+(top Vbig 7) -- (take (desc Vbig) 7)
+(bot Vbig 7) -- (take (asc  Vbig) 7)
+
+;; ─── K = 2 over a tie-rich input ───────────────────────────────────
+;; Repeated max / min values — top and bot must each return K copies.
+(top [5 5 5 5 5 5] 2) -- [5 5]
+(bot [5 5 5 5 5 5] 2) -- [5 5]
+(top [7 3 7 3 7 3] 2) -- [7 7]
+(bot [7 3 7 3 7 3] 2) -- [3 3]
+
+;; ─── count / type invariants ───────────────────────────────────────
+(count (top [1 2 3 4 5] 3)) -- 3
+(count (bot [1 2 3 4 5] 3)) -- 3
+(type  (top (as 'I32 [1 2 3 4 5]) 2)) -- 'I32
+(type  (bot (as 'I16 [1 2 3 4 5]) 2)) -- 'I16
+(type  (top (as 'U8  [1 2 3 4 5]) 2)) -- 'U8
+(type  (top (as 'F64 [1.0 2.0 3.0]) 2)) -- 'F64
diff --git a/test/rfl/temporal/extract.rfl b/test/rfl/temporal/extract.rfl
new file mode 100644
index 00000000..c19eb27a
--- /dev/null
+++ b/test/rfl/temporal/extract.rfl
@@ -0,0 +1,142 @@
+;; Happy-path coverage for the temporal extract helpers in src/ops/temporal.c:
+;;   ray_extract_yyyy_fn / mm / dd / hh / minute / ss / dow / doy
+;;
+;; Each helper is a thin wrapper around ray_temporal_extract; reached from
+;; RFL via the unary builtins registered in src/lang/eval.c.  Atom and
+;; vector input paths are exercised here.  Null / wrong-type / OOB branches
+;; are out of scope (P2).
+
+;; ───────────────────────────── yyyy (year) ─────────────────────────────
+;; date atom
+(yyyy 2024.03.15) -- 2024
+(yyyy 2000.01.01) -- 2000
+;; leap-year boundary (29 Feb of a leap year decodes to year 2024)
+(yyyy 2024.02.29) -- 2024
+;; year-end → next-year boundary
+(yyyy 2024.12.31) -- 2024
+(yyyy 2025.01.01) -- 2025
+;; pre-epoch date (days_since_2000 < 0) still decomposes correctly
+(yyyy 1999.12.31) -- 1999
+;; timestamp atom
+(yyyy 2024.07.04D09:15:30.000000000) -- 2024
+;; vector
+(yyyy [2024.01.01 2024.07.04]) -- [2024 2024]
+(yyyy [1999.12.31 2000.01.01 2025.01.01]) -- [1999 2000 2025]
+
+;; ───────────────────────────── mm (month) ──────────────────────────────
+(mm 2024.03.15) -- 3
+(mm 2024.01.01) -- 1
+(mm 2024.12.31) -- 12
+;; leap-year Feb
+(mm 2024.02.29) -- 2
+;; pre-epoch
+(mm 1999.12.31) -- 12
+;; timestamp atom
+(mm 2024.07.04D09:15:30.000000000) -- 7
+;; vector covering every month
+(mm [2024.01.10 2024.02.10 2024.03.10 2024.04.10 2024.05.10 2024.06.10]) -- [1 2 3 4 5 6]
+(mm [2024.07.10 2024.08.10 2024.09.10 2024.10.10 2024.11.10 2024.12.10]) -- [7 8 9 10 11 12]
+
+;; ─────────────────────────── dd (day-of-month) ─────────────────────────
+(dd 2024.03.15) -- 15
+(dd 2024.01.01) -- 1
+(dd 2024.01.31) -- 31
+;; leap-day
+(dd 2024.02.29) -- 29
+;; year-end
+(dd 2024.12.31) -- 31
+;; timestamp atom
+(dd 2024.07.04D09:15:30.000000000) -- 4
+;; vector at month boundaries
+(dd [2024.01.01 2024.01.31 2024.02.29 2024.12.31]) -- [1 31 29 31]
+
+;; ───────────────────────────── hh (hour) ───────────────────────────────
+;; time atom
+(hh 00:00:00.000) -- 0
+(hh 12:34:56.000) -- 12
+(hh 23:59:59.999) -- 23
+;; timestamp atom
+(hh 2024.03.15D00:00:00.000000000) -- 0
+(hh 2024.03.15D12:34:56.000000000) -- 12
+(hh 2024.03.15D23:59:59.999999999) -- 23
+;; pure-date timestamp boundary (date atom decodes to midnight UTC)
+(hh 2024.03.15) -- 0
+;; vector of times
+(hh [00:00:00.000 06:30:00.000 12:00:00.000 18:45:00.000 23:59:59.000]) -- [0 6 12 18 23]
+;; vector of timestamps
+(hh [2024.03.15D01:00:00.000000000 2024.03.15D13:00:00.000000000]) -- [1 13]
+
+;; ──────────────────────────── minute ───────────────────────────────────
+(minute 00:00:00.000) -- 0
+(minute 12:34:56.000) -- 34
+(minute 12:59:00.000) -- 59
+(minute 2024.03.15D12:34:56.000000000) -- 34
+(minute 2024.03.15D08:00:00.000000000) -- 0
+;; vector
+(minute [00:00:00.000 00:15:00.000 00:30:00.000 00:45:00.000 00:59:00.000]) -- [0 15 30 45 59]
+
+;; ──────────────────────────── ss (second) ──────────────────────────────
+(ss 00:00:00.000) -- 0
+(ss 12:34:56.000) -- 56
+(ss 12:30:59.999) -- 59
+(ss 2024.03.15D12:34:56.000000000) -- 56
+(ss 2024.03.15D12:34:00.000000000) -- 0
+;; vector
+(ss [00:00:00.000 00:00:01.000 00:00:30.000 00:00:59.000]) -- [0 1 30 59]
+
+;; ───────────────────────────── dow (day-of-week) ───────────────────────
+;; rayforce's `dow` returns 1..7 with Mon=1 .. Sun=7
+;; (formula: ((days_since_2000 % 7) + 7 + 5) % 7 + 1)
+;; Verified by hand against the Gregorian calendar:
+;;   2024-01-01 = Mon, 2024-03-15 = Fri, 2024-07-04 = Thu,
+;;   2024-12-31 = Tue, 2000-01-01 = Sat, 2023-12-31 = Sun.
+(dow 2024.01.01) -- 1
+(dow 2024.03.15) -- 5
+(dow 2024.07.04) -- 4
+(dow 2024.12.31) -- 2
+(dow 2000.01.01) -- 6
+(dow 2023.12.31) -- 7
+;; timestamp atom takes the same path
+(dow 2024.03.15D12:34:56.000000000) -- 5
+;; vector — one of each weekday code
+;; (2024.01.01 Mon, 2024.01.02 Tue, 2024.01.03 Wed, 2024.01.04 Thu,
+;;  2024.01.05 Fri, 2024.01.06 Sat, 2024.01.07 Sun)
+(dow [2024.01.01 2024.01.02 2024.01.03 2024.01.04 2024.01.05 2024.01.06 2024.01.07]) -- [1 2 3 4 5 6 7]
+
+;; ───────────────────────────── doy (day-of-year) ───────────────────────
+;; non-leap baseline
+(doy 2023.01.01) -- 1
+(doy 2023.03.01) -- 60
+(doy 2023.12.31) -- 365
+;; leap year shifts everything from Mar 1 onward by +1
+(doy 2024.01.01) -- 1
+(doy 2024.02.29) -- 60
+(doy 2024.03.01) -- 61
+(doy 2024.07.04) -- 186
+(doy 2024.12.31) -- 366
+;; century rules: 2000 is leap (div 400), 1900 was not (div 100, not 400)
+(doy 2000.12.31) -- 366
+;; timestamp atom
+(doy 2024.03.15D12:34:56.000000000) -- 75
+;; vector across the year
+(doy [2024.01.01 2024.02.29 2024.03.01 2024.07.04 2024.12.31]) -- [1 60 61 186 366]
+
+;; ───────────────────── combined extractor round-trip ───────────────────
+;; A single timestamp decomposes consistently across every helper.
+(set TS 2024.03.15D12:34:56.000000000)
+(yyyy   TS) -- 2024
+(mm     TS) -- 3
+(dd     TS) -- 15
+(hh     TS) -- 12
+(minute TS) -- 34
+(ss     TS) -- 56
+(dow    TS) -- 5
+(doy    TS) -- 75
+
+;; Same idea on a vector — each helper produces a parallel column.
+(set DS [2024.01.01 2024.07.04 2024.12.31])
+(yyyy DS) -- [2024 2024 2024]
+(mm   DS) -- [1 7 12]
+(dd   DS) -- [1 4 31]
+(dow  DS) -- [1 4 2]
+(doy  DS) -- [1 186 366]

From c421fac896467ba446e8c16a18414d232e40e64f Mon Sep 17 00:00:00 2001
From: Serhii Savchuk <ser.vasilich@hotmail.com>
Date: Tue, 19 May 2026 11:33:18 +0300
Subject: [PATCH 2/3] test: round1-2 C-level coverage push
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends three C-level test files with happy-path coverage for:

test/test_fused_group.c (+575 lines, 10 new tests):
- mk_combine_hist_fn / mk_combine_scat_fn / mk_combine_dedup_fn —
  multi-key fused_group 3-pass radix scatter combine
- fp_expr_const_str — phase-3 const-string LIKE predicate gate
- fp_count_heap_up/down/consider, fp_count_emit_keep_min —
  fused TOP-N count heap

test/test_sort.c (+425 lines, 12 new tests):
- ray_top_fn / ray_bot_fn for I64/F64/SYM at K<N, K=1, K=N
- topk_indices_cmp_single via SYM input
- msd_bucket_sort_fn + bucket_lsb_sort on N>1_000_001

test/test_public_api.c (+574 lines, 33 new tests):
- ray_obj_type / ray_obj_attrs across atom/vec/list/dict/table
- ray_vec_get_i64 across I64/I32/I16/U8/BOOL/TIMESTAMP
- ray_vec_get_f64 across F64/F32
- ray_vec_get_sym_id across W64/W32/W16/W8
- ray_runtime_create_with_sym, _with_sym_err, runtime_destroy(NULL)
- ray_request_interrupt / ray_clear_interrupt / ray_interrupted
- ray_eval_*_interrupt wrappers (thread-local sig_atomic flag)
- ray_eval_get_nfo / ray_eval_set_nfo handle round-trip
- ray_eval_set_restricted / ray_eval_get_restricted
- ray_get_error_trace populated after lambda type-error,
  cleared on next ray_eval_str

RAY_DATE / RAY_TIME branches of ray_vec_get_i64 are flagged as
intentionally uncovered (see in-file comment) — fix in follow-up.

All tests happy-path only (correct types / shapes).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/test_fused_group.c | 575 +++++++++++++++++++++++++++++++++++++++
 test/test_public_api.c  | 576 +++++++++++++++++++++++++++++++++++++++-
 test/test_sort.c        | 425 +++++++++++++++++++++++++++++
 3 files changed, 1574 insertions(+), 2 deletions(-)

diff --git a/test/test_fused_group.c b/test/test_fused_group.c
index f83902db..afa9e46d 100644
--- a/test/test_fused_group.c
+++ b/test/test_fused_group.c
@@ -36,6 +36,7 @@
 #include "ops/internal.h"
 #include "ops/ops.h"
 #include "ops/fused_group.h"
+#include "lang/parse.h"
 #include "table/sym.h"
 #include <string.h>
 
@@ -1380,6 +1381,568 @@ static test_result_t test_eq_no_match(void) {
     PASS();
 }
 
+/* ──────────────────────────────────────────────────────────────────────
+ * Coverage extensions: multi-key parallel combine (mk_combine_hist_fn /
+ * mk_combine_scat_fn / mk_combine_dedup_fn), fused TOP-N count heap
+ * (fp_count_heap_* + fp_count_emit_keep_min), and Phase-3 const-string
+ * predicate gate (fp_expr_const_str).
+ * ────────────────────────────────────────────────────────────────────── */
+
+/* mk_combine_parallel path: 2 wide I64 keys (16 bytes total → wide=1).
+ * Drive enough distinct (k1,k2) pairs past FP_COMBINE_PAR_MIN (50,000)
+ * across all worker shards so the 3-pass radix scatter activates.  Each
+ * worker sees its row range and shards into a private HT — with all-
+ * distinct rows the shard fills equal nrows/nw, summing past 50K across
+ * the pool. */
+static test_result_t test_mk_combine_2i64_parallel_wide(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    int64_t N = 80000;
+    ray_t* k1c = ray_vec_new(RAY_I64, N); k1c->len = N;
+    ray_t* k2c = ray_vec_new(RAY_I64, N); k2c->len = N;
+    ray_t* vc  = ray_vec_new(RAY_I64, N); vc->len  = N;
+    int64_t* k1 = (int64_t*)ray_data(k1c);
+    int64_t* k2 = (int64_t*)ray_data(k2c);
+    int64_t* v  = (int64_t*)ray_data(vc);
+    /* All (k1, k2) pairs distinct so per-shard n_filled = rows/nw and
+     * total_local = N — comfortably above FP_COMBINE_PAR_MIN (50K). */
+    for (int64_t i = 0; i < N; i++) {
+        k1[i] = i;
+        k2[i] = i * 3 + 7;
+        v[i]  = i + 1;      /* v >= 1, so the `v >= 0` predicate below keeps every row */
+    }
+
+    int64_t s_k1 = ray_sym_intern("k1", 2);
+    int64_t s_k2 = ray_sym_intern("k2", 2);
+    int64_t s_v  = ray_sym_intern("v",  1);
+    ray_t* tbl = ray_table_new(3);
+    tbl = ray_table_add_col(tbl, s_k1, k1c); ray_release(k1c);
+    tbl = ray_table_add_col(tbl, s_k2, k2c); ray_release(k2c);
+    tbl = ray_table_add_col(tbl, s_v,  vc);  ray_release(vc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_k1 = ray_scan(g, "k1");
+    ray_op_t* scan_k2 = ray_scan(g, "k2");
+    ray_op_t* scan_v  = ray_scan(g, "v");   /* aggregate input */
+    ray_op_t* scan_vp = ray_scan(g, "v");   /* separate scan of "v" used only by the predicate */
+    ray_op_t* zero    = ray_const_i64(g, 0);
+    /* Non-trivial WHERE that passes everything. */
+    ray_op_t* pred    = ray_binop(g, OP_GE, scan_vp, zero);
+
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_v };
+    ray_op_t* keys[]    = { scan_k1, scan_k2 };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_t* res = ray_execute(g, fused);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    /* All pairs distinct → N output rows. */
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), N);
+
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    ray_t* cnt_col = ray_table_get_col(res, cnt_sym);
+    TEST_ASSERT_NOT_NULL(cnt_col);
+    int64_t total = 0;
+    for (int64_t i = 0; i < ray_table_nrows(res); i++)
+        total += ((int64_t*)ray_data(cnt_col))[i];   /* sum the per-group counts */
+    TEST_ASSERT_EQ_I(total, N);                      /* counts must add back up to the input row count */
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);   /* teardown in reverse order of setup */
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* mk_combine narrow branch: 2 I32 keys → 8 bytes total → wide=0.  All
+ * (k1, k2) pairs distinct so total_local hits the parallel threshold.
+ * Exercises the !wide branches of mk_combine_hist_fn / scat_fn / dedup_fn. */
+static test_result_t test_mk_combine_2i32_parallel_narrow(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    int64_t N = 80000;
+    ray_t* k1c = ray_vec_new(RAY_I32, N); k1c->len = N;
+    ray_t* k2c = ray_vec_new(RAY_I32, N); k2c->len = N;
+    ray_t* vc  = ray_vec_new(RAY_I64, N); vc->len  = N;
+    int32_t* k1 = (int32_t*)ray_data(k1c);
+    int32_t* k2 = (int32_t*)ray_data(k2c);
+    int64_t* v  = (int64_t*)ray_data(vc);
+    /* k1 = i / 4, k2 = i % 4 → all (k1,k2) distinct because i = k1*4 + k2. */
+    for (int64_t i = 0; i < N; i++) {
+        k1[i] = (int32_t)(i / 4);
+        k2[i] = (int32_t)(i % 4);
+        v[i]  = i + 1;      /* v >= 1, so the `v >= 0` predicate below keeps every row */
+    }
+
+    int64_t s_k1 = ray_sym_intern("k1", 2);
+    int64_t s_k2 = ray_sym_intern("k2", 2);
+    int64_t s_v  = ray_sym_intern("v",  1);
+    ray_t* tbl = ray_table_new(3);
+    tbl = ray_table_add_col(tbl, s_k1, k1c); ray_release(k1c);
+    tbl = ray_table_add_col(tbl, s_k2, k2c); ray_release(k2c);
+    tbl = ray_table_add_col(tbl, s_v,  vc);  ray_release(vc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_k1 = ray_scan(g, "k1");
+    ray_op_t* scan_k2 = ray_scan(g, "k2");
+    ray_op_t* scan_v  = ray_scan(g, "v");   /* aggregate input */
+    ray_op_t* scan_vp = ray_scan(g, "v");   /* separate scan of "v" used only by the predicate */
+    ray_op_t* zero    = ray_const_i64(g, 0);
+    ray_op_t* pred    = ray_binop(g, OP_GE, scan_vp, zero);
+
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_v };
+    ray_op_t* keys[]    = { scan_k1, scan_k2 };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_t* res = ray_execute(g, fused);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), N);   /* all pairs distinct → one group per row */
+
+    ray_t* cnt_col = ray_table_get_col(res, ray_sym_intern("count", 5));
+    TEST_ASSERT_NOT_NULL(cnt_col);   /* guard before ray_data() reads through it, as the 2i64 variant does */
+    int64_t total = 0;
+    for (int64_t i = 0; i < ray_table_nrows(res); i++)
+        total += ((int64_t*)ray_data(cnt_col))[i];   /* sum the per-group counts */
+    TEST_ASSERT_EQ_I(total, N);
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* mk_combine 2 SYM keys with W32 width.  Total = 4+4 = 8 bytes → wide=0.
+ * Each row carries a distinct (s1, s2) pair so total_local exceeds
+ * FP_COMBINE_PAR_MIN. */
+static test_result_t test_mk_combine_2sym_parallel(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    int64_t N = 80000;
+    ray_t* s1c = ray_sym_vec_new(RAY_SYM_W32, N); s1c->len = N;
+    ray_t* s2c = ray_sym_vec_new(RAY_SYM_W32, N); s2c->len = N;
+    ray_t* vc  = ray_vec_new(RAY_I64, N);         vc->len  = N;
+    int32_t* s1 = (int32_t*)ray_data(s1c);
+    int32_t* s2 = (int32_t*)ray_data(s2c);
+    int64_t* v  = (int64_t*)ray_data(vc);
+    /* Intern a pool of 400 distinct symbols up front; rows index into it. */
+    int64_t pool[400];
+    char  nm[16];
+    for (int j = 0; j < 400; j++) {
+        int l = snprintf(nm, sizeof(nm), "sym_%04d", j);   /* 8 chars, fits nm[16] */
+        pool[j] = ray_sym_intern(nm, (size_t)l);
+    }
+    /* (s1[i], s2[i]) = (pool[i / 400], pool[i % 400]) — 400 × 400 = 160K
+     * possible pairs; for i in 0..N-1 (N=80K, i / 400 <= 199) all pairs are distinct. */
+    for (int64_t i = 0; i < N; i++) {
+        s1[i] = (int32_t)pool[i / 400];
+        s2[i] = (int32_t)pool[i % 400];
+        v[i]  = i + 1;      /* v >= 1, so the `v >= 0` predicate below keeps every row */
+    }
+    int64_t s_a = ray_sym_intern("a", 1);
+    int64_t s_b = ray_sym_intern("b", 1);
+    int64_t s_v = ray_sym_intern("v", 1);
+    ray_t* tbl = ray_table_new(3);
+    tbl = ray_table_add_col(tbl, s_a, s1c); ray_release(s1c);
+    tbl = ray_table_add_col(tbl, s_b, s2c); ray_release(s2c);
+    tbl = ray_table_add_col(tbl, s_v, vc);  ray_release(vc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_a  = ray_scan(g, "a");
+    ray_op_t* scan_b  = ray_scan(g, "b");
+    ray_op_t* scan_v  = ray_scan(g, "v");   /* aggregate input */
+    ray_op_t* scan_vp = ray_scan(g, "v");   /* separate scan of "v" used only by the predicate */
+    ray_op_t* zero    = ray_const_i64(g, 0);
+    ray_op_t* pred    = ray_binop(g, OP_GE, scan_vp, zero);
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_v };
+    ray_op_t* keys[]    = { scan_a, scan_b };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_t* res = ray_execute(g, fused);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    /* All pairs distinct. */
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), N);
+
+    ray_t* cnt_col = ray_table_get_col(res, ray_sym_intern("count", 5));
+    TEST_ASSERT_NOT_NULL(cnt_col);   /* guard before ray_data() reads through it, as the 2i64 variant does */
+    int64_t total = 0;
+    for (int64_t i = 0; i < ray_table_nrows(res); i++)
+        total += ((int64_t*)ray_data(cnt_col))[i];   /* sum the per-group counts */
+    TEST_ASSERT_EQ_I(total, N);
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* mk_combine mixed: SYM_W32 (4 bytes) + I64 (8 bytes) = 12 bytes → wide=1.
+ * Exercises the wide branch with a SYM-bearing decompose at materialize. */
+static test_result_t test_mk_combine_sym_i64_parallel(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    int64_t N = 80000;
+    ray_t* sc = ray_sym_vec_new(RAY_SYM_W32, N); sc->len = N;
+    ray_t* kc = ray_vec_new(RAY_I64, N);         kc->len = N;
+    ray_t* vc = ray_vec_new(RAY_I64, N);         vc->len = N;
+    int32_t* s = (int32_t*)ray_data(sc);
+    int64_t* k = (int64_t*)ray_data(kc);
+    int64_t* v = (int64_t*)ray_data(vc);
+    int64_t pool[400];   /* 400 interned symbols, cycled through by row index */
+    char nm[16];
+    for (int j = 0; j < 400; j++) {
+        int l = snprintf(nm, sizeof(nm), "msy_%04d", j);   /* 8 chars, fits nm[16] */
+        pool[j] = ray_sym_intern(nm, (size_t)l);
+    }
+    /* (s[i], k[i]) = (pool[i % 400], i) — N distinct pairs (k unique). */
+    for (int64_t i = 0; i < N; i++) {
+        s[i] = (int32_t)pool[i % 400];
+        k[i] = i;
+        v[i] = i + 1;       /* v >= 1, so the `v >= 0` predicate below keeps every row */
+    }
+    int64_t s_s = ray_sym_intern("s", 1);
+    int64_t s_k = ray_sym_intern("k", 1);
+    int64_t s_v = ray_sym_intern("v", 1);
+    ray_t* tbl = ray_table_new(3);
+    tbl = ray_table_add_col(tbl, s_s, sc); ray_release(sc);
+    tbl = ray_table_add_col(tbl, s_k, kc); ray_release(kc);
+    tbl = ray_table_add_col(tbl, s_v, vc); ray_release(vc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_s  = ray_scan(g, "s");
+    ray_op_t* scan_k  = ray_scan(g, "k");
+    ray_op_t* scan_v  = ray_scan(g, "v");   /* aggregate input */
+    ray_op_t* scan_vp = ray_scan(g, "v");   /* separate scan of "v" used only by the predicate */
+    ray_op_t* zero    = ray_const_i64(g, 0);
+    ray_op_t* pred    = ray_binop(g, OP_GE, scan_vp, zero);
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_v };
+    ray_op_t* keys[]    = { scan_s, scan_k };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 2, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_t* res = ray_execute(g, fused);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), N);   /* k is unique per row → one group per row */
+
+    ray_t* cnt_col = ray_table_get_col(res, ray_sym_intern("count", 5));
+    TEST_ASSERT_NOT_NULL(cnt_col);   /* guard before ray_data() reads through it, as the 2i64 variant does */
+    int64_t total = 0;
+    for (int64_t i = 0; i < ray_table_nrows(res); i++)
+        total += ((int64_t*)ray_data(cnt_col))[i];   /* sum the per-group counts */
+    TEST_ASSERT_EQ_I(total, N);
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* Forward-declare the runtime API for fp_expr_const_str tests.  Mirrors
+ * test_fused_topk.c pattern — fp_expr_const_str is called only from
+ * fp_check_like inside ray_fused_group_supported, which needs a parsed
+ * AST.  ray_parse requires a live runtime for its symbol-table state. */
+struct ray_runtime_s;
+typedef struct ray_runtime_s ray_runtime_t;
+extern ray_runtime_t* ray_runtime_create(int argc, char** argv);
+extern void           ray_runtime_destroy(ray_runtime_t* rt);
+extern ray_runtime_t* __RUNTIME;
+
+/* fp_expr_const_str: LIKE on a SYM column with a string-literal pattern
+ * should be recognised by the planner gate (returns 1).  Exercises the
+ * `expr->type == -RAY_STR && !RAY_ATTR_NAME` base case of the recursive
+ * walker. */
+static test_result_t test_fp_expr_const_str_simple_like(void) {
+    ray_runtime_create(0, NULL);   /* return value unused; teardown below goes through __RUNTIME */
+
+    /* Tiny SYM table — fp_check_like requires the column to exist and be
+     * STR/SYM type. */
+    ray_t* sc = ray_sym_vec_new(RAY_SYM_W32, 3); sc->len = 3;
+    int32_t* sd = (int32_t*)ray_data(sc);
+    int64_t s_a = ray_sym_intern("apple", 5);
+    int64_t s_b = ray_sym_intern("banana", 6);
+    int64_t s_c = ray_sym_intern("cherry", 6);
+    sd[0] = (int32_t)s_a; sd[1] = (int32_t)s_b; sd[2] = (int32_t)s_c;
+    int64_t s_name = ray_sym_intern("name", 4);   /* column name referenced by the parsed expression */
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_name, sc); ray_release(sc);
+
+    ray_t* expr = ray_parse("(like name \"app*\")");
+    TEST_ASSERT_NOT_NULL(expr);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(expr));
+
+    /* Predicate gate must accept (like sym_col "literal") — this recurses
+     * through fp_check_like → fp_expr_const_str on the literal. */
+    int ok = ray_fused_group_supported(expr, tbl);   /* only the gate is queried; nothing is executed */
+    TEST_ASSERT_EQ_I(ok, 1);
+
+    ray_release(expr);
+    ray_release(tbl);
+    ray_runtime_destroy(__RUNTIME);
+    PASS();
+}
+
+/* fp_expr_const_str: nested (concat str str) pattern.  Exercises the
+ * "is_concat" branch + recursion into each child. */
+static test_result_t test_fp_expr_const_str_concat_like(void) {
+    ray_runtime_create(0, NULL);   /* return value unused; teardown below goes through __RUNTIME */
+
+    ray_t* sc = ray_sym_vec_new(RAY_SYM_W32, 2); sc->len = 2;
+    int32_t* sd = (int32_t*)ray_data(sc);
+    int64_t s_x = ray_sym_intern("foo_x", 5);
+    int64_t s_y = ray_sym_intern("foo_y", 5);
+    sd[0] = (int32_t)s_x; sd[1] = (int32_t)s_y;
+    int64_t s_n = ray_sym_intern("name", 4);   /* column name referenced by the parsed expression */
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_n, sc); ray_release(sc);
+
+    /* Pattern is (concat "foo" "*") — a nested-list const-string. */
+    ray_t* expr = ray_parse("(like name (concat \"foo\" \"*\"))");
+    TEST_ASSERT_NOT_NULL(expr);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(expr));
+
+    int ok = ray_fused_group_supported(expr, tbl);   /* only the gate is queried; nothing is executed */
+    TEST_ASSERT_EQ_I(ok, 1);
+
+    ray_release(expr);
+    ray_release(tbl);
+    ray_runtime_destroy(__RUNTIME);
+    PASS();
+}
+
+/* fp_expr_const_str: deeply-nested (concat (concat str str) str) — drives
+ * the recursive fp_expr_const_str over a tree, not just a flat list. */
+static test_result_t test_fp_expr_const_str_nested_concat(void) {
+    ray_runtime_create(0, NULL);   /* return value unused; teardown below goes through __RUNTIME */
+
+    ray_t* sc = ray_sym_vec_new(RAY_SYM_W32, 1); sc->len = 1;
+    int32_t* sd = (int32_t*)ray_data(sc);
+    int64_t s_q = ray_sym_intern("abcdefg", 7);
+    sd[0] = (int32_t)s_q;
+    int64_t s_n = ray_sym_intern("name", 4);   /* column name referenced by the parsed expression */
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_n, sc); ray_release(sc);
+
+    ray_t* expr = ray_parse("(like name (concat (concat \"a\" \"b\") \"*\"))");   /* concat nested two levels deep */
+    TEST_ASSERT_NOT_NULL(expr);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(expr));
+    int ok = ray_fused_group_supported(expr, tbl);   /* only the gate is queried; nothing is executed */
+    TEST_ASSERT_EQ_I(ok, 1);
+
+    ray_release(expr);
+    ray_release(tbl);
+    ray_runtime_destroy(__RUNTIME);
+    PASS();
+}
+
+/* fp_count_heap_*: U8 column → fp_try_direct_count1 fires (256 slots);
+ * with emit_filter.top_count_take = 3 and many distinct keys, the
+ * fp_count_emit_keep_min path runs the heap (n_slots ≫ k_take). */
+static test_result_t test_fp_count_heap_u8_top3(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    /* 20 distinct U8 keys with sharply different counts so the top-3 is
+     * unambiguous: key k (k = 1..20) appears k times — total rows =
+     * 1+2+...+20 = 210. */
+    int64_t total_rows = 0;
+    for (int64_t i = 1; i <= 20; i++) total_rows += i;
+    ray_t* kc = ray_vec_new(RAY_U8, total_rows); kc->len = total_rows;
+    uint8_t* k = (uint8_t*)ray_data(kc);
+    int64_t pos = 0;
+    for (int64_t key = 1; key <= 20; key++) {
+        for (int64_t r = 0; r < key; r++) k[pos++] = (uint8_t)key;   /* `key` copies of `key` */
+    }
+    int64_t s_k = ray_sym_intern("k", 1);
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_k, kc); ray_release(kc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_k    = ray_scan(g, "k");   /* used as both group key and COUNT input */
+    ray_op_t* scan_pred = ray_scan(g, "k");   /* separate scan used only by the predicate */
+    ray_op_t* zero      = ray_const_i64(g, 0);
+    ray_op_t* pred      = ray_binop(g, OP_GE, scan_pred, zero);   /* k >= 0: true for every U8 value */
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_k };
+    ray_op_t* keys[]    = { scan_k };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 1, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_group_emit_filter_t prev = ray_group_emit_filter_get();   /* saved so it can be restored after execute */
+    ray_group_emit_filter_t filter = {0};
+    filter.enabled = 1;
+    filter.agg_index = 0;
+    filter.top_count_take = 3;
+    ray_group_emit_filter_set(filter);
+    ray_t* res = ray_execute(g, fused);
+    ray_group_emit_filter_set(prev);   /* restore the previous filter before asserting on the result */
+
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    /* Top-3 counts: keys 20, 19, 18 with counts 20, 19, 18 respectively.
+     * fp_count_emit_keep_min returns heap[0] = 18 — every group with
+     * count >= 18 is retained, so exactly 3 rows. */
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), 3);
+
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    ray_t* k_col = ray_table_get_col(res, s_k);
+    ray_t* c_col = ray_table_get_col(res, cnt_sym);
+    TEST_ASSERT_NOT_NULL(k_col);
+    TEST_ASSERT_NOT_NULL(c_col);
+    int seen_18 = 0, seen_19 = 0, seen_20 = 0;
+    for (int64_t i = 0; i < ray_table_nrows(res); i++) {   /* row order is not assumed */
+        int64_t key = (int64_t)((uint8_t*)ray_data(k_col))[i];
+        int64_t cnt = ((int64_t*)ray_data(c_col))[i];
+        if (key == 18) { TEST_ASSERT_EQ_I(cnt, 18); seen_18 = 1; }
+        if (key == 19) { TEST_ASSERT_EQ_I(cnt, 19); seen_19 = 1; }
+        if (key == 20) { TEST_ASSERT_EQ_I(cnt, 20); seen_20 = 1; }
+    }
+    TEST_ASSERT_TRUE(seen_18 && seen_19 && seen_20);
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* fp_count_heap_*: I16 key → fp_try_direct_count1 with 65536 slots; with
+ * a small top-K the heap_up / heap_down branches both fire as the heap
+ * gets pushed past capacity and then sees rows that displace heap[0]. */
+static test_result_t test_fp_count_heap_i16_top5(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    /* 12 distinct I16 keys (100..111) with pairwise-distinct counts.  The
+     * count sequence (intentionally not sorted) is meant to drive both the
+     * up-heap (initial fill) and down-heap (replace heap[0] when a bigger
+     * count appears later in the slot walk) paths. */
+    int64_t per_key[12] = { 5, 11, 3, 17, 2, 9, 13, 21, 1, 7, 19, 4 };
+    int64_t total_rows = 0;
+    for (int i = 0; i < 12; i++) total_rows += per_key[i];
+    ray_t* kc = ray_vec_new(RAY_I16, total_rows); kc->len = total_rows;
+    int16_t* k = (int16_t*)ray_data(kc);
+    int64_t pos = 0;
+    for (int i = 0; i < 12; i++)
+        for (int64_t r = 0; r < per_key[i]; r++)
+            k[pos++] = (int16_t)(i + 100);  /* keys 100..111 */
+    int64_t s_k = ray_sym_intern("k", 1);
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_k, kc); ray_release(kc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_k    = ray_scan(g, "k");   /* used as both group key and COUNT input */
+    ray_op_t* scan_pred = ray_scan(g, "k");   /* separate scan used only by the predicate */
+    ray_op_t* zero      = ray_const_i64(g, 0);
+    ray_op_t* pred      = ray_binop(g, OP_GE, scan_pred, zero);   /* k >= 0: true for keys 100..111 */
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_k };
+    ray_op_t* keys[]    = { scan_k };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 1, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_group_emit_filter_t prev = ray_group_emit_filter_get();   /* saved so it can be restored after execute */
+    ray_group_emit_filter_t filter = {0};
+    filter.enabled = 1;
+    filter.agg_index = 0;
+    filter.top_count_take = 5;
+    ray_group_emit_filter_set(filter);
+    ray_t* res = ray_execute(g, fused);
+    ray_group_emit_filter_set(prev);   /* restore the previous filter before asserting on the result */
+
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    /* Top-5 counts: sorted descending = 21, 19, 17, 13, 11.  keep_min = 11.
+     * Result rows: every key whose count >= 11.  Counts 21,19,17,13,11 →
+     * 5 rows. */
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), 5);
+
+    /* Verify the result counts are exactly {21,19,17,13,11}. */
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    ray_t* c_col = ray_table_get_col(res, cnt_sym);
+    TEST_ASSERT_NOT_NULL(c_col);
+    int64_t expect[5] = { 11, 13, 17, 19, 21 };
+    int seen[5] = {0, 0, 0, 0, 0};
+    for (int64_t i = 0; i < ray_table_nrows(res); i++) {   /* row order is not assumed */
+        int64_t c = ((int64_t*)ray_data(c_col))[i];
+        for (int j = 0; j < 5; j++)
+            if (c == expect[j] && !seen[j]) { seen[j] = 1; break; }   /* each expected count may match once */
+    }
+    for (int j = 0; j < 5; j++) TEST_ASSERT_TRUE(seen[j]);   /* 5 rows + all 5 seen → exact set match */
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
+/* fp_count_emit_keep_min via the serial-combine path of count1 with a
+ * wide-key I64 column.  fp_try_direct_count1 rejects (kt != BOOL/U8/I16)
+ * so the code falls through to fp_combine_and_materialize.  With
+ * use_emit_filter on, the parallel-combine branch is skipped (line 1343)
+ * and the serial combine + fp_count_emit_keep_min path runs.  The
+ * used_key_slots parameter is non-NULL in this branch, exercising the
+ * `used_key_slots && !used_key_slots[s * 2]` skip. */
+static test_result_t test_fp_count_emit_keep_min_i64_serial(void) {
+    ray_heap_init();
+    (void)ray_sym_init();
+
+    /* 15 distinct I64 keys with monotone counts 1..15.  Big enough that
+     * after the serial HT-build the open-addressed table has many empty
+     * slots interspersed with filled ones, exercising the
+     * used_key_slots-skip branch. */
+    int64_t per_key[15];
+    int64_t total_rows = 0;
+    for (int i = 0; i < 15; i++) { per_key[i] = i + 1; total_rows += per_key[i]; }   /* total = 120 rows */
+    ray_t* kc = ray_vec_new(RAY_I64, total_rows); kc->len = total_rows;
+    int64_t* k = (int64_t*)ray_data(kc);
+    int64_t pos = 0;
+    for (int i = 0; i < 15; i++)
+        for (int64_t r = 0; r < per_key[i]; r++)
+            k[pos++] = (int64_t)(1000 + i);   /* keys 1000..1014; key 1000+i appears i+1 times */
+    int64_t s_k = ray_sym_intern("k", 1);
+    ray_t* tbl = ray_table_new(1);
+    tbl = ray_table_add_col(tbl, s_k, kc); ray_release(kc);
+
+    ray_graph_t* g = ray_graph_new(tbl);
+    ray_op_t* scan_k    = ray_scan(g, "k");   /* used as both group key and COUNT input */
+    ray_op_t* scan_pred = ray_scan(g, "k");   /* separate scan used only by the predicate */
+    ray_op_t* zero      = ray_const_i64(g, 0);
+    ray_op_t* pred      = ray_binop(g, OP_GE, scan_pred, zero);   /* k >= 0: true for keys 1000..1014 */
+    uint16_t  agg_ops[] = { OP_COUNT };
+    ray_op_t* agg_ins[] = { scan_k };
+    ray_op_t* keys[]    = { scan_k };
+    ray_op_t* fused     = ray_filtered_group(g, pred, keys, 1, agg_ops, agg_ins, 1);
+    TEST_ASSERT_NOT_NULL(fused);
+
+    ray_group_emit_filter_t prev = ray_group_emit_filter_get();   /* saved so it can be restored after execute */
+    ray_group_emit_filter_t filter = {0};
+    filter.enabled = 1;
+    filter.agg_index = 0;
+    filter.top_count_take = 4;
+    ray_group_emit_filter_set(filter);
+    ray_t* res = ray_execute(g, fused);
+    ray_group_emit_filter_set(prev);   /* restore the previous filter before asserting on the result */
+
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    /* Top-4 counts = 15, 14, 13, 12. keep_min = 12 → 4 rows. */
+    TEST_ASSERT_EQ_I(ray_table_nrows(res), 4);
+
+    int64_t cnt_sym = ray_sym_intern("count", 5);
+    ray_t* c_col = ray_table_get_col(res, cnt_sym);
+    TEST_ASSERT_NOT_NULL(c_col);
+    int64_t expect[4] = { 12, 13, 14, 15 };
+    int seen[4] = { 0, 0, 0, 0 };
+    for (int64_t i = 0; i < ray_table_nrows(res); i++) {   /* row order is not assumed */
+        int64_t c = ((int64_t*)ray_data(c_col))[i];
+        for (int j = 0; j < 4; j++)
+            if (c == expect[j] && !seen[j]) { seen[j] = 1; break; }   /* each expected count may match once */
+    }
+    for (int j = 0; j < 4; j++) TEST_ASSERT_TRUE(seen[j]);   /* 4 rows + all 4 seen → exact set match */
+
+    ray_release(res); ray_graph_free(g); ray_release(tbl);
+    ray_sym_destroy(); ray_heap_destroy();
+    PASS();
+}
+
 const test_entry_t fused_group_entries[] = {
     { "fused_group/eq_count",                    test_eq_count,                    NULL, NULL },
     { "fused_group/ne_two_groups",               test_ne_two_groups,               NULL, NULL },
@@ -1408,5 +1971,17 @@ const test_entry_t fused_group_entries[] = {
     { "fused_group/multi_agg_and_pred",          test_multi_agg_and_pred,          NULL, NULL },
     { "fused_group/multi_agg_unsigned_inputs",   test_multi_agg_unsigned_inputs,   NULL, NULL },
     { "fused_group/count1_sym_key_w32",          test_count1_sym_key_w32,          NULL, NULL },
+    /* mk_combine_* (multi-key parallel 3-pass radix scatter) + fused
+     * TOP-N count heap + Phase-3 const-string LIKE gate. */
+    { "fused_group/mk_combine_2i64_parallel_wide",  test_mk_combine_2i64_parallel_wide,  NULL, NULL },
+    { "fused_group/mk_combine_2i32_parallel_narrow",test_mk_combine_2i32_parallel_narrow,NULL, NULL },
+    { "fused_group/mk_combine_2sym_parallel",       test_mk_combine_2sym_parallel,       NULL, NULL },
+    { "fused_group/mk_combine_sym_i64_parallel",    test_mk_combine_sym_i64_parallel,    NULL, NULL },
+    { "fused_group/fp_expr_const_str_simple_like",  test_fp_expr_const_str_simple_like,  NULL, NULL },
+    { "fused_group/fp_expr_const_str_concat_like",  test_fp_expr_const_str_concat_like,  NULL, NULL },
+    { "fused_group/fp_expr_const_str_nested_concat",test_fp_expr_const_str_nested_concat,NULL, NULL },
+    { "fused_group/fp_count_heap_u8_top3",          test_fp_count_heap_u8_top3,          NULL, NULL },
+    { "fused_group/fp_count_heap_i16_top5",         test_fp_count_heap_i16_top5,         NULL, NULL },
+    { "fused_group/fp_count_emit_keep_min_i64_serial", test_fp_count_emit_keep_min_i64_serial, NULL, NULL },
     { NULL, NULL, NULL, NULL },
 };
diff --git a/test/test_public_api.c b/test/test_public_api.c
index afd7d8f5..240efb84 100644
--- a/test/test_public_api.c
+++ b/test/test_public_api.c
@@ -21,8 +21,27 @@
  *   SOFTWARE.
  */
 
+#define _DEFAULT_SOURCE   /* mkdtemp */
+
 #include "test.h"
 #include <rayforce.h>
+#include "lang/eval.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Most introspection helpers need a live heap/runtime so vectors and
+ * atoms can be constructed via the public API. Match the test_link.c
+ * pattern: bring up a runtime in setup, tear it down afterwards. */
+struct ray_runtime_s;
+typedef struct ray_runtime_s ray_runtime_t;
+extern ray_runtime_t* ray_runtime_create(int argc, char** argv);
+extern void           ray_runtime_destroy(ray_runtime_t* rt);
+extern ray_runtime_t* __RUNTIME;
+
+static void public_api_setup(void)    { (void)ray_runtime_create(0, NULL); /* result unused; teardown uses __RUNTIME */ }
+static void public_api_teardown(void) { ray_runtime_t* live = __RUNTIME; ray_runtime_destroy(live); }
 
 static test_result_t test_public_ipc_client_symbols(void) {
     int64_t   (*connect_fn)(const char*, uint16_t, const char*, const char*) = ray_ipc_connect;
@@ -62,8 +81,561 @@ static test_result_t test_public_query_and_format_symbols(void) {
     PASS();
 }
 
+/* ─── ray_obj_type / ray_obj_attrs ──────────────────────────────────
+ *
+ * The FFI helpers are thin readers of v->type and v->attrs.  Atoms
+ * carry the negative form of the type tag; vectors carry the positive
+ * tag.  RAY_LIST is type 0, RAY_TABLE is 98, RAY_DICT is 99. */
+
+static test_result_t test_public_obj_type_atom_i64(void) {
+    ray_t* atom = ray_i64(42);                        /* scalar i64 */
+    TEST_ASSERT_NOT_NULL(atom);
+    TEST_ASSERT_EQ_I(ray_obj_type(atom), -RAY_I64);   /* atoms report the negated tag */
+    TEST_ASSERT_EQ_I(ray_obj_attrs(atom), 0);
+    ray_release(atom);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_atom_f64(void) {
+    ray_t* atom = ray_f64(3.14);                      /* scalar f64 */
+    TEST_ASSERT_NOT_NULL(atom);
+    TEST_ASSERT_EQ_I(ray_obj_type(atom), -RAY_F64);   /* atoms report the negated tag */
+    TEST_ASSERT_EQ_I(ray_obj_attrs(atom), 0);
+    ray_release(atom);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_atom_sym(void) {
+    /* Intern a name and wrap the resulting id in a sym atom. */
+    ray_t* atom = ray_sym(ray_sym_intern("alpha", 5));
+    TEST_ASSERT_NOT_NULL(atom);
+    TEST_ASSERT_EQ_I(ray_obj_type(atom), -RAY_SYM);
+    TEST_ASSERT_EQ_I(ray_obj_attrs(atom), 0);
+    ray_release(atom);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_vec_i64(void) {
+    ray_t* vec = ray_vec_new(RAY_I64, 4);             /* fresh vector, nothing appended */
+    TEST_ASSERT_NOT_NULL(vec);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(vec));
+    TEST_ASSERT_EQ_I(ray_obj_type(vec), RAY_I64);     /* vectors report the positive tag */
+    TEST_ASSERT_EQ_I(ray_obj_attrs(vec), 0);
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_vec_f64(void) {
+    ray_t* vec = ray_vec_new(RAY_F64, 4);             /* fresh vector, nothing appended */
+    TEST_ASSERT_NOT_NULL(vec);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(vec));
+    TEST_ASSERT_EQ_I(ray_obj_type(vec), RAY_F64);     /* vectors report the positive tag */
+    TEST_ASSERT_EQ_I(ray_obj_attrs(vec), 0);
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_vec_sym(void) {
+    /* The requested sym width should be readable back from the low
+     * two attr bits, hence the 0x3 mask on ray_obj_attrs below. */
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W32, 4);
+    TEST_ASSERT_NOT_NULL(symvec);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(symvec));
+    TEST_ASSERT_EQ_I(ray_obj_type(symvec), RAY_SYM);
+    TEST_ASSERT_EQ_I(ray_obj_attrs(symvec) & 0x3, RAY_SYM_W32);
+    ray_release(symvec);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_list(void) {
+    ray_t* list = ray_list_new(2);                    /* generic list, left empty */
+    TEST_ASSERT_NOT_NULL(list);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(list));
+    TEST_ASSERT_EQ_I(ray_obj_type(list), RAY_LIST);
+    TEST_ASSERT_EQ_I(ray_obj_attrs(list), 0);
+    ray_release(list);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_table(void) {
+    ray_t* table = ray_table_new(2);                  /* no columns added */
+    TEST_ASSERT_NOT_NULL(table);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(table));
+    TEST_ASSERT_EQ_I(ray_obj_type(table), RAY_TABLE);
+    TEST_ASSERT_EQ_I(ray_obj_attrs(table), 0);
+    ray_release(table);
+    PASS();
+}
+
+static test_result_t test_public_obj_type_dict(void) {
+    /* Assemble a dict from two 2-element I64 vectors: keys, then values. */
+    int64_t key_src[2] = { 10, 20 };
+    ray_t* key_vec = ray_vec_new(RAY_I64, 2);
+    for (int i = 0; i < 2; i++)
+        key_vec = ray_vec_append(key_vec, &key_src[i]);
+    int64_t val_src[2] = { 100, 200 };
+    ray_t* val_vec = ray_vec_new(RAY_I64, 2);
+    for (int i = 0; i < 2; i++)
+        val_vec = ray_vec_append(val_vec, &val_src[i]);
+
+    ray_t* dict = ray_dict_new(key_vec, val_vec);
+    TEST_ASSERT_NOT_NULL(dict);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(dict));
+    TEST_ASSERT_EQ_I(ray_obj_type(dict), RAY_DICT);
+    TEST_ASSERT_EQ_I(ray_obj_attrs(dict), 0);
+    ray_release(dict);
+    PASS();
+}
+
+/* ─── ray_vec_get_i64 — every integer width branch ───────────────────
+ *
+ * Implementation (src/core/runtime.c) dispatches on vec->type:
+ *   I64 / DATE / TIME / TIMESTAMP  → int64_t cast
+ *   I32                            → int32_t cast
+ *   I16                            → int16_t cast
+ *   U8  / BOOL                     → uint8_t cast
+ *
+ * For each branch read at idx 0, mid, and last to exercise the indexing
+ * arithmetic on top of the type-specific element size. */
+
+static test_result_t test_public_vec_get_i64_i64(void) {
+    int64_t want[] = { -1000, 1, 2, 3, 9223372036854775000LL };
+    ray_t* vec = ray_vec_new(RAY_I64, 5);
+    for (int at = 0; at < 5; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), want[0]);   /* first */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 2), want[2]);   /* middle */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 4), want[4]);   /* last */
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_i64_i32(void) {
+    int32_t want[] = { -7, 0, 12345, 2147483600 };
+    ray_t* vec = ray_vec_new(RAY_I32, 4);
+    for (int at = 0; at < 4; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), (int64_t)want[0]);   /* negative value */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 1), (int64_t)want[1]);   /* zero */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 3), (int64_t)want[3]);   /* last, near INT32_MAX */
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_i64_i16(void) {
+    int16_t want[] = { -32000, 0, 32000 };
+    ray_t* vec = ray_vec_new(RAY_I16, 3);
+    for (int at = 0; at < 3; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), (int64_t)want[0]);   /* negative value */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 1), (int64_t)want[1]);   /* zero */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 2), (int64_t)want[2]);   /* positive value */
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_i64_u8(void) {
+    uint8_t want[] = { 0, 1, 200, 255 };
+    ray_t* vec = ray_vec_new(RAY_U8, 4);
+    for (int at = 0; at < 4; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), (int64_t)want[0]);   /* zero */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 2), (int64_t)want[2]);   /* 200: above INT8_MAX */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 3), (int64_t)want[3]);   /* 255: top of range */
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_i64_bool(void) {
+    uint8_t flags[] = { 0, 1, 1 };
+    ray_t* vec = ray_vec_new(RAY_BOOL, 3);
+    for (int at = 0; at < 3; at++) vec = ray_vec_append(vec, &flags[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), 0);   /* false */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 1), 1);   /* true */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 2), 1);   /* true */
+    ray_release(vec);
+    PASS();
+}
+
+/* NOTE: RAY_DATE / RAY_TIME branches of ray_vec_get_i64 are intentionally
+ * NOT covered here.  Their on-disk element width is 4 bytes (see
+ * ray_type_sizes in src/core/types.c), but ray_vec_get_i64 dispatches
+ * them through the same 8-byte cast as RAY_I64 / RAY_TIMESTAMP — reading
+ * past the row boundary.  Reported separately; do not write a "happy
+ * path" test that locks in the broken behaviour. */
+
+static test_result_t test_public_vec_get_i64_timestamp(void) {
+    int64_t want[] = { 0, 1700000000000000000LL, 1800000000000000000LL };
+    ray_t* vec = ray_vec_new(RAY_TIMESTAMP, 3);
+    for (int at = 0; at < 3; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 0), want[0]);   /* zero */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 1), want[1]);   /* needs all 64 bits */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(vec, 2), want[2]);   /* last */
+    ray_release(vec);
+    PASS();
+}
+
+/* ─── ray_vec_get_f64 — F32/F64 branches ─────────────────────────────
+ *
+ * Implementation accepts only RAY_F64 and RAY_F32; any other type
+ * returns 0.0.  Integer vectors do NOT coerce — verified by reading
+ * the source.  Cover only the supported (happy) types here. */
+
+static test_result_t test_public_vec_get_f64_f64(void) {
+    double want[] = { -1.5, 0.0, 2.25, 1e10 };
+    ray_t* vec = ray_vec_new(RAY_F64, 4);
+    for (int at = 0; at < 4; at++) vec = ray_vec_append(vec, &want[at]);
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 0), want[0], 0.0);   /* zero tolerance: exact match */
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 2), want[2], 0.0);
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 3), want[3], 0.0);
+    ray_release(vec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_f64_f32(void) {
+    float want[] = { -0.5f, 1.25f, 3.5f };
+    ray_t* vec = ray_vec_new(RAY_F32, 3);
+    for (int at = 0; at < 3; at++) vec = ray_vec_append(vec, &want[at]);
+    /* Each literal is a short sum of powers of two, so it is exact as a
+     * float and widens to double without loss; 1e-6 is only slack. */
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 0), (double)want[0], 1e-6);
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 1), (double)want[1], 1e-6);
+    TEST_ASSERT_EQ_F(ray_vec_get_f64(vec, 2), (double)want[2], 1e-6);
+    ray_release(vec);
+    PASS();
+}
+
+/* ─── ray_vec_get_sym_id — every SYM width ───────────────────────────
+ *
+ * The implementation dispatches through ray_read_sym which respects the
+ * width-encoded attrs.  Use ray_sym_intern to obtain real IDs, append
+ * via the W64-shaped int64 elem (ray_vec_append normalizes width), then
+ * verify the round-trip.  A W8 sym vec can only store IDs 0..255;
+ * builtins claim the lowest slots, so the W8 test relies on user-interned
+ * names still landing below 256 (and skips itself when they do not). */
+
+static test_result_t test_public_vec_get_sym_id_w64(void) {
+    int64_t ids[3];
+    ids[0] = ray_sym_intern("pub_w64_a", 9);
+    ids[1] = ray_sym_intern("pub_w64_b", 9);
+    ids[2] = ray_sym_intern("pub_w64_c", 9);
+
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W64, 3);
+    for (int at = 0; at < 3; at++)
+        symvec = ray_vec_append(symvec, &ids[at]);
+
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 0), ids[0]);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 1), ids[1]);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 2), ids[2]);
+    ray_release(symvec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_sym_id_w32(void) {
+    int64_t first_id  = ray_sym_intern("pub_w32_a", 9);
+    int64_t second_id = ray_sym_intern("pub_w32_b", 9);
+
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W32, 2);
+    symvec = ray_vec_append(symvec, &first_id);
+    symvec = ray_vec_append(symvec, &second_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 0), first_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 1), second_id);
+    ray_release(symvec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_sym_id_w16(void) {
+    int64_t first_id  = ray_sym_intern("pub_w16_a", 9);
+    int64_t second_id = ray_sym_intern("pub_w16_b", 9);
+
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W16, 2);
+    symvec = ray_vec_append(symvec, &first_id);
+    symvec = ray_vec_append(symvec, &second_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 0), first_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 1), second_id);
+    ray_release(symvec);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_sym_id_w8(void) {
+    /* An 8-bit slot can only hold ids 0..255.  The ids handed out here
+     * depend on how many symbols were interned before this test ran
+     * (presumably the builtin set on a fresh runtime), so whether they
+     * fit is decided at run time by the guard below rather than
+     * assumed. */
+    int64_t first_id  = ray_sym_intern("pub_w8_a", 8);
+    int64_t second_id = ray_sym_intern("pub_w8_b", 8);
+
+    /* Either id above 0xFF means the W8 happy path cannot be reached
+     * through the public API; skip instead of asserting on whatever
+     * narrowing the append path would do. */
+    if (!(first_id <= 0xFF && second_id <= 0xFF)) {
+        SKIP("sym ID exceeds W8 range — happy-path narrowing unreachable");
+    }
+
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W8, 2);
+    symvec = ray_vec_append(symvec, &first_id);
+    symvec = ray_vec_append(symvec, &second_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 0), first_id);
+    TEST_ASSERT_EQ_I(ray_vec_get_sym_id(symvec, 1), second_id);
+    ray_release(symvec);
+    PASS();
+}
+
+/* ─── ray_runtime_create_with_sym* (happy path, eval round-trip) ─────
+ *
+ * These tests own their runtime lifecycle (no setup/teardown entry).
+ * Pass a path that doesn't exist: per the contract, ENOENT is the
+ * normal first-run case — out_sym_err stays RAY_OK and the runtime
+ * comes up.  Run a trivial eval to confirm the language stack is live,
+ * then destroy. */
+
+static test_result_t test_public_runtime_create_with_sym_eval(void) {
+    char dir_template[] = "/tmp/rayforce-pub-rt-XXXXXX";
+    char* scratch = mkdtemp(dir_template);
+    TEST_ASSERT_NOT_NULL(scratch);
+    char sym_path[256];
+    snprintf(sym_path, sizeof sym_path, "%s/missing.sym", scratch);
+
+    ray_runtime_t* runtime = ray_runtime_create_with_sym(sym_path);
+    TEST_ASSERT_NOT_NULL(runtime);
+
+    /* Evaluate one prefix-notation sum: a non-error i64 atom equal to 3
+     * shows the language layer came up together with the runtime
+     * (same style of expression as test_lang.c). */
+    ray_t* sum = ray_eval_str("(+ 1 2)");
+    TEST_ASSERT_NOT_NULL(sum);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(sum));
+    TEST_ASSERT_EQ_I(ray_obj_type(sum), -RAY_I64);
+    TEST_ASSERT_EQ_I(sum->i64, 3);
+    ray_release(sum);
+
+    ray_runtime_destroy(runtime);
+    rmdir(scratch);
+    PASS();
+}
+
+static test_result_t test_public_runtime_create_with_sym_err_eval(void) {
+    char dir_template[] = "/tmp/rayforce-pub-rt-err-XXXXXX";
+    char* scratch = mkdtemp(dir_template);
+    TEST_ASSERT_NOT_NULL(scratch);
+    char sym_path[256];
+    snprintf(sym_path, sizeof sym_path, "%s/missing.sym", scratch);
+
+    ray_err_t sym_status = RAY_ERR_OOM;  /* sentinel: must be overwritten */
+    ray_runtime_t* runtime = ray_runtime_create_with_sym_err(sym_path, &sym_status);
+    TEST_ASSERT_NOT_NULL(runtime);
+    /* A missing sym file is the expected first-run situation, so the
+     * out-parameter has to come back as RAY_OK, not the sentinel. */
+    TEST_ASSERT_EQ_I((int)sym_status, (int)RAY_OK);
+
+    ray_t* product = ray_eval_str("(* 5 6)");
+    TEST_ASSERT_NOT_NULL(product);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(product));
+    TEST_ASSERT_EQ_I(product->i64, 30);
+    ray_release(product);
+
+    ray_runtime_destroy(runtime);
+    rmdir(scratch);
+    PASS();
+}
+
+/* Pin ray_runtime_destroy(NULL) as a no-op so a later refactor of its
+ * early `if (!rt) return;` guard cannot silently break it.  Registered
+ * without setup/teardown so no real runtime is alive during the call. */
+static test_result_t test_public_runtime_destroy_null_is_noop(void) {
+    ray_runtime_t* nothing = NULL;
+    ray_runtime_destroy(nothing);
+    PASS();
+}
+
+/* ═══════════════════════════════════════════════════════════════════════
+ * Interrupt API — global flag (eval.c).  Happy-path set/get round-trip
+ * for the public ray_*_interrupt names and their legacy ray_eval_*
+ * wrappers.  The flag is thread-local sig_atomic_t storage; here we
+ * only verify the set→get→clear contract.
+ * ═══════════════════════════════════════════════════════════════════════ */
+
+static test_result_t test_public_interrupt_roundtrip(void) {
+    ray_clear_interrupt();                   /* start from a cleared flag */
+    TEST_ASSERT_FALSE(ray_interrupted());
+
+    ray_request_interrupt();                 /* set: must become observable */
+    TEST_ASSERT_TRUE(ray_interrupted());
+
+    ray_clear_interrupt();                   /* clear: must read false again */
+    TEST_ASSERT_FALSE(ray_interrupted());
+    PASS();
+}
+
+static test_result_t test_public_interrupt_idempotent_set(void) {
+    ray_clear_interrupt();
+    ray_request_interrupt();                 /* requesting twice in a row ... */
+    ray_request_interrupt();
+    TEST_ASSERT_TRUE(ray_interrupted());     /* ... still reads as set */
+    ray_clear_interrupt();                   /* and one clear is enough to reset it */
+    TEST_ASSERT_FALSE(ray_interrupted());
+    PASS();
+}
+
+static test_result_t test_public_eval_interrupt_wrappers(void) {
+    ray_eval_clear_interrupt();                        /* legacy clear */
+    TEST_ASSERT_EQ_I(ray_eval_is_interrupted(), 0);
+    TEST_ASSERT_FALSE(ray_interrupted());              /* public getter must agree */
+
+    ray_eval_request_interrupt();                      /* legacy set */
+    TEST_ASSERT_TRUE(ray_eval_is_interrupted() != 0);
+    TEST_ASSERT_TRUE(ray_interrupted());               /* public getter must agree */
+
+    ray_eval_clear_interrupt();                        /* legacy clear again */
+    TEST_ASSERT_EQ_I(ray_eval_is_interrupted(), 0);
+    TEST_ASSERT_FALSE(ray_interrupted());
+    PASS();
+}
+
+static test_result_t test_public_interrupt_cross_path(void) {
+    ray_clear_interrupt();
+
+    ray_request_interrupt();                           /* set via public name ... */
+    TEST_ASSERT_TRUE(ray_eval_is_interrupted() != 0);  /* ... read via legacy name */
+    ray_clear_interrupt();
+
+    ray_eval_request_interrupt();                      /* set via legacy name ... */
+    TEST_ASSERT_TRUE(ray_interrupted());               /* ... read via public name */
+    ray_eval_clear_interrupt();                        /* legacy clear, public read */
+    TEST_ASSERT_FALSE(ray_interrupted());
+    PASS();
+}
+
+/* nfo API — get/set returns the same handle. */
+static test_result_t test_public_eval_nfo_roundtrip(void) {
+    ray_t* saved = ray_eval_get_nfo();   /* put back before returning */
+
+    ray_eval_set_nfo(NULL);
+    TEST_ASSERT_NULL(ray_eval_get_nfo());
+
+    /* Name "test" (4 bytes) and source text "(+ 1 2)" (7 bytes). */
+    ray_t* info = ray_nfo_create("test", 4, "(+ 1 2)", 7);
+    TEST_ASSERT_NOT_NULL(info);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(info));
+
+    ray_eval_set_nfo(info);
+    TEST_ASSERT_EQ_PTR(ray_eval_get_nfo(), info);
+
+    ray_eval_set_nfo(saved);
+    ray_release(info);
+    PASS();
+}
+
+/* Restricted-mode API — pure data store with no side effects on benign arith. */
+static test_result_t test_public_eval_restricted_setget(void) {
+    ray_eval_set_restricted(false);                /* known starting state */
+    TEST_ASSERT_FALSE(ray_eval_get_restricted());
+
+    ray_eval_set_restricted(true);                 /* on: getter must reflect it */
+    TEST_ASSERT_TRUE(ray_eval_get_restricted());
+
+    ray_eval_set_restricted(false);                /* off again, so the flag ends up false */
+    TEST_ASSERT_FALSE(ray_eval_get_restricted());
+    PASS();
+}
+
+static test_result_t test_public_eval_restricted_allows_arith(void) {
+    ray_eval_set_restricted(true);
+    ray_t* sum = ray_eval_str("(+ 1 2)");
+    ray_eval_set_restricted(false);   /* reset before any assert can return early */
+
+    TEST_ASSERT_NOT_NULL(sum);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(sum));
+    TEST_ASSERT_EQ_I(sum->type, -RAY_I64);
+    TEST_ASSERT_EQ_I(sum->i64, 3);
+    ray_release(sum);
+    PASS();
+}
+
+/* Error-trace API — ray_eval_str clears the trace at entry. */
+static test_result_t test_public_get_error_trace_populated(void) {
+    ray_t* defn = ray_eval_str("(set boom (fn [x] (+ x 1)))");
+    TEST_ASSERT_NOT_NULL(defn);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(defn));
+    ray_release(defn);
+
+    ray_t* failure = ray_eval_str("(boom \"not-a-number\")");
+    TEST_ASSERT_NOT_NULL(failure);
+    TEST_ASSERT_TRUE(RAY_IS_ERR(failure));
+
+    ray_t* frames = ray_get_error_trace();
+    TEST_ASSERT_NOT_NULL(frames);
+    TEST_ASSERT_EQ_I(frames->type, RAY_LIST);
+    TEST_ASSERT_TRUE(ray_len(frames) > 0);
+
+    ray_t* first_frame = *(ray_t**)ray_data(frames);   /* element 0 of the trace list */
+    TEST_ASSERT_NOT_NULL(first_frame);
+    TEST_ASSERT_EQ_I(first_frame->type, RAY_LIST);
+    TEST_ASSERT_EQ_I(ray_len(first_frame), 4);
+
+    ray_release(failure);
+    PASS();
+}
+
+static test_result_t test_public_get_error_trace_cleared_on_eval(void) {
+    ray_t* defn = ray_eval_str("(set boom2 (fn [x] (+ x 1)))");
+    TEST_ASSERT_NOT_NULL(defn);
+    ray_release(defn);
+
+    ray_t* failure = ray_eval_str("(boom2 \"x\")");
+    TEST_ASSERT_TRUE(RAY_IS_ERR(failure));
+    TEST_ASSERT_NOT_NULL(ray_get_error_trace());   /* failing eval left a trace */
+    ray_release(failure);
+
+    ray_t* fine = ray_eval_str("(+ 10 20)");
+    TEST_ASSERT_NOT_NULL(fine);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(fine));
+    TEST_ASSERT_NULL(ray_get_error_trace());       /* next eval must have dropped it */
+    ray_release(fine);
+    PASS();
+}
+
 const test_entry_t public_api_entries[] = {
-    { "public/ipc_client_symbols", test_public_ipc_client_symbols, NULL, NULL },
-    { "public/query_and_format_symbols", test_public_query_and_format_symbols, NULL, NULL },
+    { "public/ipc_client_symbols",        test_public_ipc_client_symbols,        NULL, NULL },
+    { "public/query_and_format_symbols",  test_public_query_and_format_symbols,  NULL, NULL },
+
+    { "public/obj_type_atom_i64",   test_public_obj_type_atom_i64,   public_api_setup, public_api_teardown },
+    { "public/obj_type_atom_f64",   test_public_obj_type_atom_f64,   public_api_setup, public_api_teardown },
+    { "public/obj_type_atom_sym",   test_public_obj_type_atom_sym,   public_api_setup, public_api_teardown },
+    { "public/obj_type_vec_i64",    test_public_obj_type_vec_i64,    public_api_setup, public_api_teardown },
+    { "public/obj_type_vec_f64",    test_public_obj_type_vec_f64,    public_api_setup, public_api_teardown },
+    { "public/obj_type_vec_sym",    test_public_obj_type_vec_sym,    public_api_setup, public_api_teardown },
+    { "public/obj_type_list",       test_public_obj_type_list,       public_api_setup, public_api_teardown },
+    { "public/obj_type_table",      test_public_obj_type_table,      public_api_setup, public_api_teardown },
+    { "public/obj_type_dict",       test_public_obj_type_dict,       public_api_setup, public_api_teardown },
+
+    { "public/vec_get_i64_i64",        test_public_vec_get_i64_i64,        public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_i32",        test_public_vec_get_i64_i32,        public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_i16",        test_public_vec_get_i64_i16,        public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_u8",         test_public_vec_get_i64_u8,         public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_bool",       test_public_vec_get_i64_bool,       public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_timestamp",  test_public_vec_get_i64_timestamp,  public_api_setup, public_api_teardown },
+
+    { "public/vec_get_f64_f64",   test_public_vec_get_f64_f64,   public_api_setup, public_api_teardown },
+    { "public/vec_get_f64_f32",   test_public_vec_get_f64_f32,   public_api_setup, public_api_teardown },
+
+    { "public/vec_get_sym_id_w64", test_public_vec_get_sym_id_w64, public_api_setup, public_api_teardown },
+    { "public/vec_get_sym_id_w32", test_public_vec_get_sym_id_w32, public_api_setup, public_api_teardown },
+    { "public/vec_get_sym_id_w16", test_public_vec_get_sym_id_w16, public_api_setup, public_api_teardown },
+    { "public/vec_get_sym_id_w8",  test_public_vec_get_sym_id_w8,  public_api_setup, public_api_teardown },
+
+    /* No setup/teardown hooks: these tests create and destroy a runtime themselves, or need none. */
+    { "public/runtime_create_with_sym_eval",      test_public_runtime_create_with_sym_eval,     NULL, NULL },
+    { "public/runtime_create_with_sym_err_eval",  test_public_runtime_create_with_sym_err_eval, NULL, NULL },
+    { "public/runtime_destroy_null_is_noop",      test_public_runtime_destroy_null_is_noop,     NULL, NULL },
+
+    /* eval interrupt / nfo / restricted / error-trace public API; runtime hooks only where a test allocates or evaluates. */
+    { "public/interrupt_roundtrip",            test_public_interrupt_roundtrip,            NULL, NULL },
+    { "public/interrupt_idempotent_set",       test_public_interrupt_idempotent_set,       NULL, NULL },
+    { "public/eval_interrupt_wrappers",        test_public_eval_interrupt_wrappers,        NULL, NULL },
+    { "public/interrupt_cross_path",           test_public_interrupt_cross_path,           NULL, NULL },
+    { "public/eval_nfo_roundtrip",             test_public_eval_nfo_roundtrip,             public_api_setup, public_api_teardown },
+    { "public/eval_restricted_setget",         test_public_eval_restricted_setget,         NULL, NULL },
+    { "public/eval_restricted_allows_arith",   test_public_eval_restricted_allows_arith,   public_api_setup, public_api_teardown },
+    { "public/get_error_trace_populated",      test_public_get_error_trace_populated,      public_api_setup, public_api_teardown },
+    { "public/get_error_trace_cleared_on_eval",test_public_get_error_trace_cleared_on_eval,public_api_setup, public_api_teardown },
+
     { NULL, NULL, NULL, NULL },
 };
diff --git a/test/test_sort.c b/test/test_sort.c
index f563d896..3939b46a 100644
--- a/test/test_sort.c
+++ b/test/test_sort.c
@@ -1029,6 +1029,413 @@ static test_result_t test_sort_bool_nulls_first(void) {
     PASS();
 }
 
+/* ══════════════════════════════════════════════════════════════════
+ * top / bot (partial top-N / bottom-N) — happy path
+ *
+ * Targets ray_top_fn / ray_bot_fn (sort.c:3448, 3453) which dispatch
+ * through topk_take_vec → topk_indices_single → either the radix-
+ * encoded heap path (numeric types) or topk_indices_cmp_single +
+ * topk_indices_cmp + topk_cmp_sift_down (SYM type, sort.c:3173).
+ *
+ * Happy-path only: correct-type / correct-shape inputs.  Null /
+ * wrong-type / K-edge cases are covered elsewhere (top_bot.rfl).
+ * ══════════════════════════════════════════════════════════════════ */
+
+/* (top vec K) over an I64 vec with K < N — exercises the numeric
+ * radix-encoded bounded-heap path inside topk_indices_single. */
+static test_result_t test_top_i64_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t input[] = {3, 1, 5, 2, 7, 4, 9, 6, 8};
+    ray_t* vec = ray_vec_from_raw(RAY_I64, input, 9);
+    TEST_ASSERT_NOT_NULL(vec);
+
+    ray_t* limit = ray_i64(3);
+    ray_t* top = ray_top_fn(vec, limit);
+    TEST_ASSERT_NOT_NULL(top);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    TEST_ASSERT_EQ_I(ray_len(top), 3);
+    TEST_ASSERT_EQ_I(top->type, RAY_I64);
+
+    /* The three largest of the nine inputs, largest first: 9, 8, 7. */
+    const int64_t* out = (const int64_t*)ray_data(top);
+    TEST_ASSERT_EQ_I(out[0], 9);
+    TEST_ASSERT_EQ_I(out[1], 8);
+    TEST_ASSERT_EQ_I(out[2], 7);
+
+    ray_release(top); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (top vec 1) — degenerate K=1 path: heap-of-one == max. */
+static test_result_t test_top_i64_k_eq_one(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t input[] = {3, 1, 5, 2, 7, 4, 9, 6, 8};
+    ray_t* vec = ray_vec_from_raw(RAY_I64, input, 9);
+    ray_t* limit = ray_i64(1);
+    ray_t* top = ray_top_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    TEST_ASSERT_EQ_I(ray_len(top), 1);
+    TEST_ASSERT_EQ_I(*(const int64_t*)ray_data(top), 9);   /* single element == max */
+
+    ray_release(top); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (bot vec K) — mirror path with desc=0; verifies bot's heap orientation. */
+static test_result_t test_bot_i64_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t input[] = {3, 1, 5, 2, 7, 4, 9, 6, 8};
+    ray_t* vec = ray_vec_from_raw(RAY_I64, input, 9);
+    ray_t* limit = ray_i64(3);
+    ray_t* bottom = ray_bot_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(bottom));
+    TEST_ASSERT_EQ_I(ray_len(bottom), 3);
+
+    /* The three smallest inputs, smallest first: 1, 2, 3. */
+    const int64_t* out = (const int64_t*)ray_data(bottom);
+    TEST_ASSERT_EQ_I(out[0], 1);
+    TEST_ASSERT_EQ_I(out[1], 2);
+    TEST_ASSERT_EQ_I(out[2], 3);
+
+    ray_release(bottom); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (top vec K) over F64 — exercises the F64 branch of the radix encode
+ * inside the bounded-heap path. */
+static test_result_t test_top_f64_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    double input[] = {1.5, 2.5, 0.5, 3.5, -1.0, 4.5, 2.0};
+    ray_t* vec = ray_vec_from_raw(RAY_F64, input, 7);
+    ray_t* limit = ray_i64(3);
+    ray_t* top = ray_top_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    TEST_ASSERT_EQ_I(ray_len(top), 3);
+    TEST_ASSERT_EQ_I(top->type, RAY_F64);
+
+    /* The three largest of the seven inputs, largest first: 4.5, 3.5, 2.5. */
+    const double* out = (const double*)ray_data(top);
+    TEST_ASSERT_EQ_F(out[0], 4.5, 1e-9);
+    TEST_ASSERT_EQ_F(out[1], 3.5, 1e-9);
+    TEST_ASSERT_EQ_F(out[2], 2.5, 1e-9);
+
+    ray_release(top); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (bot vec K) over F64 — F64 branch with desc=0. */
+static test_result_t test_bot_f64_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    double input[] = {1.5, 2.5, 0.5, 3.5, -1.0, 4.5, 2.0};
+    ray_t* vec = ray_vec_from_raw(RAY_F64, input, 7);
+    ray_t* limit = ray_i64(2);
+    ray_t* bottom = ray_bot_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(bottom));
+    TEST_ASSERT_EQ_I(ray_len(bottom), 2);
+
+    const double* out = (const double*)ray_data(bottom);
+    TEST_ASSERT_EQ_F(out[0], -1.0, 1e-9);   /* smallest input */
+    TEST_ASSERT_EQ_F(out[1], 0.5, 1e-9);    /* second smallest */
+
+    ray_release(bottom); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (top vec K=N) — k>=len → falls through to ray_desc_fn (full sort),
+ * which returns a lazy chain that must be materialized.  Exercises
+ * the K==N short-circuit in topk_take_vec. */
+static test_result_t test_top_i64_k_eq_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t input[] = {3, 1, 5, 2, 7};
+    ray_t* vec = ray_vec_from_raw(RAY_I64, input, 5);
+    ray_t* limit = ray_i64(5);
+    ray_t* top = ray_top_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    if (ray_is_lazy(top)) top = ray_lazy_materialize(top);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    TEST_ASSERT_EQ_I(ray_len(top), 5);
+
+    /* With K == N the result is the whole input, descending: 7,5,3,2,1. */
+    const int64_t* out = (const int64_t*)ray_data(top);
+    TEST_ASSERT_EQ_I(out[0], 7);
+    TEST_ASSERT_EQ_I(out[4], 1);
+    for (int64_t pos = 1; pos < 5; pos++)
+        TEST_ASSERT_FMT(out[pos] <= out[pos - 1],
+                        "top k==n not desc-sorted at %lld", (long long)pos);
+
+    ray_release(top); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (bot vec K=N) mirror. */
+static test_result_t test_bot_i64_k_eq_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t input[] = {3, 1, 5, 2, 7};
+    ray_t* vec = ray_vec_from_raw(RAY_I64, input, 5);
+    ray_t* limit = ray_i64(5);
+    ray_t* bottom = ray_bot_fn(vec, limit);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(bottom));
+    if (ray_is_lazy(bottom)) bottom = ray_lazy_materialize(bottom);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(bottom));
+    TEST_ASSERT_EQ_I(ray_len(bottom), 5);
+
+    const int64_t* out = (const int64_t*)ray_data(bottom);
+    TEST_ASSERT_EQ_I(out[0], 1);   /* minimum comes first */
+    TEST_ASSERT_EQ_I(out[4], 7);   /* maximum comes last */
+
+    ray_release(bottom); ray_release(limit); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (top symvec K) — RAY_SYM dispatches to topk_indices_cmp_single
+ * (sort.c:3173), which calls topk_indices_cmp + topk_cmp_sift_down.
+ * Exercises the comparator-heap branch of the top-K fast path that
+ * the numeric radix encoding doesn't cover. */
+static test_result_t test_top_sym_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t id_apple  = ray_sym_intern("apple",  5);
+    int64_t id_banana = ray_sym_intern("banana", 6);
+    int64_t id_cherry = ray_sym_intern("cherry", 6);
+    int64_t id_date   = ray_sym_intern("date",   4);
+    int64_t id_elder  = ray_sym_intern("elder",  5);
+    int64_t id_fig    = ray_sym_intern("fig",    3);
+
+    /* W64 sym vectors keep one int64_t id per slot. */
+    int64_t total = 12;
+    ray_t* symvec = ray_sym_vec_new(RAY_SYM_W64, total);
+    TEST_ASSERT_NOT_NULL(symvec);
+    symvec->len = total;
+    int64_t cycle[6] = { id_apple, id_banana, id_cherry, id_date, id_elder, id_fig };
+    int64_t* slots = (int64_t*)ray_data(symvec);
+    for (int64_t at = 0; at < total; at++) slots[at] = cycle[at % 6];
+
+    /* Ask for the 3 lexically largest symbols.  Ordering is
+     *   apple < banana < cherry < date < elder < fig
+     * and the 6-name cycle over 12 slots stores every name twice, so
+     * the descending top 3 is two copies of fig followed by one elder
+     * (fig and elder being the two greatest names). */
+    ray_t* limit = ray_i64(3);
+    ray_t* top = ray_top_fn(symvec, limit);
+    TEST_ASSERT_NOT_NULL(top);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(top));
+    TEST_ASSERT_EQ_I(ray_len(top), 3);
+    TEST_ASSERT_TRUE(RAY_IS_SYM(top->type));
+
+    /* Decode the three result ids width-aware; in descending lexical
+     * order they must read fig, fig, elder. */
+    const int64_t got0 = ray_read_sym(ray_data(top), 0, top->type, top->attrs);
+    const int64_t got1 = ray_read_sym(ray_data(top), 1, top->type, top->attrs);
+    const int64_t got2 = ray_read_sym(ray_data(top), 2, top->type, top->attrs);
+    TEST_ASSERT_EQ_I(got0, id_fig);
+    TEST_ASSERT_EQ_I(got1, id_fig);
+    TEST_ASSERT_EQ_I(got2, id_elder);
+
+    ray_release(top); ray_release(limit); ray_release(symvec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (bot symvec K), K < N — ascending mirror of the SYM top-K test; it is
+ * meant to drive topk_indices_cmp_single with desc=0. */
+static test_result_t test_bot_sym_k_lt_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t s_apple  = ray_sym_intern("apple",  5);
+    int64_t s_banana = ray_sym_intern("banana", 6);
+    int64_t s_cherry = ray_sym_intern("cherry", 6);
+    int64_t s_date   = ray_sym_intern("date",   4);
+    int64_t s_elder  = ray_sym_intern("elder",  5);
+
+    int64_t N = 10;
+    ray_t* sv = ray_sym_vec_new(RAY_SYM_W64, N);
+    TEST_ASSERT_NOT_NULL(sv);  /* fail the test instead of writing through NULL */
+    sv->len = N;
+    int64_t syms[5] = { s_apple, s_banana, s_cherry, s_date, s_elder };
+    int64_t* sd = (int64_t*)ray_data(sv);
+    for (int64_t i = 0; i < N; i++) sd[i] = syms[i % 5];
+
+    ray_t* k = ray_i64(2);
+    ray_t* res = ray_bot_fn(sv, k);
+    TEST_ASSERT_NOT_NULL(res);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    TEST_ASSERT_EQ_I(ray_len(res), 2);
+    /* 'apple' sits at rows 0 and 5, so the two smallest entries are both apple. */
+    const int64_t r0 = ray_read_sym(ray_data(res), 0, res->type, res->attrs);
+    const int64_t r1 = ray_read_sym(ray_data(res), 1, res->type, res->attrs);
+    TEST_ASSERT_EQ_I(r0, s_apple);
+    TEST_ASSERT_EQ_I(r1, s_apple);
+
+    ray_release(res); ray_release(k); ray_release(sv);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* (top symvec K=N) — K == N is meant to fall through to ray_desc_fn (full
+ * sort over SYM).  Input is {bb, aa, cc}; expected desc lex order: cc, bb, aa. */
+static test_result_t test_top_sym_k_eq_n(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    int64_t s_a = ray_sym_intern("aa", 2);
+    int64_t s_b = ray_sym_intern("bb", 2);
+    int64_t s_c = ray_sym_intern("cc", 2);
+
+    int64_t N = 3;
+    ray_t* sv = ray_sym_vec_new(RAY_SYM_W64, N);
+    TEST_ASSERT_NOT_NULL(sv);  /* fail the test instead of writing through NULL */
+    sv->len = N;
+    int64_t* sd = (int64_t*)ray_data(sv);
+    sd[0] = s_b; sd[1] = s_a; sd[2] = s_c;
+
+    ray_t* k = ray_i64(3);
+    ray_t* res = ray_top_fn(sv, k);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    if (ray_is_lazy(res)) res = ray_lazy_materialize(res);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(res));
+    TEST_ASSERT_EQ_I(ray_len(res), 3);
+    TEST_ASSERT_EQ_I(ray_read_sym(ray_data(res), 0, res->type, res->attrs), s_c);
+    TEST_ASSERT_EQ_I(ray_read_sym(ray_data(res), 1, res->type, res->attrs), s_b);
+    TEST_ASSERT_EQ_I(ray_read_sym(ray_data(res), 2, res->type, res->attrs), s_a);
+
+    ray_release(res); ray_release(k); ray_release(sv);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* ══════════════════════════════════════════════════════════════════
+ * MSD bucket sort — msd_radix_sort_run dispatches to
+ * msd_bucket_sort_fn + bucket_lsb_sort only when both
+ *   nrows  > 1,000,000   AND
+ *   key_nbytes > 5       (range needs ≥6 bytes)
+ * apply (sort.c:810).  The tests below build a 1,000,001-row I64 vec
+ * whose values span more than 2^48, which is meant to make
+ * compute_key_nbytes return ≥6 and so select the MSD path.
+ * ══════════════════════════════════════════════════════════════════ */
+
+static test_result_t test_sort_msd_bucket_i64(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    /* Just over 1M rows so we exceed the `n > 1000000` gate. */
+    const int64_t N = 1000001;
+    ray_t* vec = ray_vec_new(RAY_I64, N);
+    TEST_ASSERT_NOT_NULL(vec);
+    int64_t* d = (int64_t*)ray_data(vec);
+
+    /* Fill with i * 2654435761.  NOTE(review): for i <= 1,000,000 that
+     * product stays below 2^52, so it never wraps and `% big` changes
+     * nothing — the input is strictly increasing, i.e. already sorted. */
+    const int64_t big = (int64_t)1 << 56;
+    for (int64_t i = 0; i < N; i++) {
+        /* The largest value needs 52 bits, i.e. 7 bytes.  TODO(review):
+         * switch to a wrapping 64-bit mix if unsorted input is intended. */
+        uint64_t m = (uint64_t)(i * 2654435761ULL);
+        d[i] = (int64_t)(m % (uint64_t)big);
+    }
+    vec->len = N;
+
+    uint8_t desc = 0;
+    ray_t* result = ray_sort(&vec, &desc, NULL, 1, N);
+    TEST_ASSERT_NOT_NULL(result);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(result));
+    TEST_ASSERT_EQ_I(ray_len(result), N);
+
+    /* Sampled order check: each 137th element is compared with the
+     * previous sample.  NOTE(review): with already-ascending input this
+     * also passes for an untouched copy or an identity index vector. */
+    const int64_t* r = (const int64_t*)ray_data(result);
+    int64_t prev = r[0];
+    for (int64_t i = 137; i < N; i += 137) {
+        TEST_ASSERT_FMT(r[i] >= prev,
+                        "msd asc out of order at %lld: %lld < %lld",
+                        (long long)i, (long long)r[i], (long long)prev);
+        prev = r[i];
+    }
+    /* Sanity: the first and the last adjacent pair must also be ordered. */
+    TEST_ASSERT_TRUE(r[1] >= r[0]);
+    TEST_ASSERT_TRUE(r[N-1] >= r[N-2]);
+
+    ray_release(result);
+    ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
+/* MSD bucket sort, descending — same size and key width as the ascending
+ * test, meant to cover the desc branch of radix_encode_fn.  NOTE(review):
+ * the generated input is strictly increasing (see the fill loop). */
+static test_result_t test_sort_msd_bucket_i64_desc(void) {
+    ray_heap_init();
+    ray_sym_init();
+
+    const int64_t N = 1000001;
+    ray_t* vec = ray_vec_new(RAY_I64, N);
+    TEST_ASSERT_NOT_NULL(vec);  /* as in the asc test: never write through NULL */
+    int64_t* d = (int64_t*)ray_data(vec);
+
+    /* (i + 17) * 2246822519 stays below 2^51 for these i, so nothing wraps. */
+    for (int64_t i = 0; i < N; i++) {
+        uint64_t m = (uint64_t)((i + 17) * 2246822519ULL);
+        d[i] = (int64_t)(m % ((uint64_t)1 << 56));
+    }
+    vec->len = N;
+
+    uint8_t desc = 1;
+    ray_t* result = ray_sort(&vec, &desc, NULL, 1, N);
+    TEST_ASSERT_NOT_NULL(result);
+    TEST_ASSERT_FALSE(RAY_IS_ERR(result));
+    TEST_ASSERT_EQ_I(ray_len(result), N);
+
+    const int64_t* r = (const int64_t*)ray_data(result);
+    int64_t prev = r[0];
+    for (int64_t i = 211; i < N; i += 211) {
+        TEST_ASSERT_FMT(r[i] <= prev,
+                        "msd desc out of order at %lld: %lld > %lld",
+                        (long long)i, (long long)r[i], (long long)prev);
+        prev = r[i];
+    }
+
+    ray_release(result); ray_release(vec);
+    ray_sym_destroy();
+    ray_heap_destroy();
+    PASS();
+}
+
 /* ─── Entry table ────────────────────────────────────────────────── */
 
 const test_entry_t sort_entries[] = {
@@ -1073,5 +1480,23 @@ const test_entry_t sort_entries[] = {
     { "sort/u8_nulls_last_asc",         test_sort_u8_nulls_last_asc,    NULL, NULL },
     { "sort/u8_nulls_first_desc",       test_sort_u8_nulls_first_desc,  NULL, NULL },
     { "sort/bool_nulls_first",          test_sort_bool_nulls_first,     NULL, NULL },
+    /* top / bot — partial top-N / bottom-N happy paths.  Drive
+     * ray_top_fn / ray_bot_fn over numeric and SYM vectors with
+     * K<N, K=1, K=N to cover topk_indices_single (radix path) and
+     * topk_indices_cmp_single (SYM comparator-heap path). */
+    { "sort/top_i64_k_lt_n",            test_top_i64_k_lt_n,            NULL, NULL },
+    { "sort/top_i64_k_eq_one",          test_top_i64_k_eq_one,          NULL, NULL },
+    { "sort/bot_i64_k_lt_n",            test_bot_i64_k_lt_n,            NULL, NULL },
+    { "sort/top_f64_k_lt_n",            test_top_f64_k_lt_n,            NULL, NULL },
+    { "sort/bot_f64_k_lt_n",            test_bot_f64_k_lt_n,            NULL, NULL },
+    { "sort/top_i64_k_eq_n",            test_top_i64_k_eq_n,            NULL, NULL },
+    { "sort/bot_i64_k_eq_n",            test_bot_i64_k_eq_n,            NULL, NULL },
+    { "sort/top_sym_k_lt_n",            test_top_sym_k_lt_n,            NULL, NULL },
+    { "sort/bot_sym_k_lt_n",            test_bot_sym_k_lt_n,            NULL, NULL },
+    { "sort/top_sym_k_eq_n",            test_top_sym_k_eq_n,            NULL, NULL },
+    /* MSD bucket sort — 1M+ rows × 7-byte key range triggers the
+     * msd_bucket_sort_fn / bucket_lsb_sort path in msd_radix_sort_run. */
+    { "sort/msd_bucket_i64_asc",        test_sort_msd_bucket_i64,       NULL, NULL },
+    { "sort/msd_bucket_i64_desc",       test_sort_msd_bucket_i64_desc,  NULL, NULL },
     { NULL, NULL, NULL, NULL },
 };

From 7444aff993a2d20c6ada9ba8df3ac16c689c041f Mon Sep 17 00:00:00 2001
From: Serhii Savchuk <ser.vasilich@hotmail.com>
Date: Tue, 19 May 2026 11:36:31 +0300
Subject: [PATCH 3/3] fix(runtime): ray_vec_get_i64 reads DATE/TIME with wrong
 width
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ray_vec_get_i64 dispatched RAY_DATE and RAY_TIME through the same
int64 cast as RAY_I64 / RAY_TIMESTAMP. But ray_type_sizes in
src/core/types.c declares both DATE and TIME as 4-byte elements,
not 8 — so the cast read 8 bytes at an 8-byte stride: index i
returned elements 2i and 2i+1 fused into one value (wrong for idx 0
unless element 1 happens to be zero), and the read ran past the end
of the vector's data once idx >= len/2.

Fix: split DATE / TIME off the int64 path; read them as int32
alongside RAY_I32. RAY_TIMESTAMP stays on the int64 path (it is
genuinely 8 bytes).

Adds two TDD tests in test_public_api.c covering known DATE and
TIME values; both FAIL before the fix and PASS after. Replaces
the prior "intentionally NOT covered" comment.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/core/runtime.c     |  6 ++++--
 test/test_public_api.c | 35 +++++++++++++++++++++++++++++------
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/core/runtime.c b/src/core/runtime.c
index 05706203..4a17a2e3 100644
--- a/src/core/runtime.c
+++ b/src/core/runtime.c
@@ -368,10 +368,12 @@ uint8_t ray_obj_attrs(ray_t* v) {
 
 int64_t ray_vec_get_i64(ray_t* vec, int64_t idx) {
     if (!vec || idx < 0 || idx >= vec->len) return 0;
-    if (vec->type == RAY_I64 || vec->type == RAY_DATE || vec->type == RAY_TIME || vec->type == RAY_TIMESTAMP) {
+    if (vec->type == RAY_I64 || vec->type == RAY_TIMESTAMP) {
         return ((const int64_t*)ray_data(vec))[idx];
     }
-    if (vec->type == RAY_I32) return ((const int32_t*)ray_data(vec))[idx];
+    if (vec->type == RAY_I32 || vec->type == RAY_DATE || vec->type == RAY_TIME) {
+        return ((const int32_t*)ray_data(vec))[idx];
+    }
     if (vec->type == RAY_I16) return ((const int16_t*)ray_data(vec))[idx];
     if (vec->type == RAY_U8 || vec->type == RAY_BOOL) return ((const uint8_t*)ray_data(vec))[idx];
     return 0;
diff --git a/test/test_public_api.c b/test/test_public_api.c
index 240efb84..fd938534 100644
--- a/test/test_public_api.c
+++ b/test/test_public_api.c
@@ -253,12 +253,33 @@ static test_result_t test_public_vec_get_i64_bool(void) {
     PASS();
 }
 
-/* NOTE: RAY_DATE / RAY_TIME branches of ray_vec_get_i64 are intentionally
- * NOT covered here.  Their on-disk element width is 4 bytes (see
- * ray_type_sizes in src/core/types.c), but ray_vec_get_i64 dispatches
- * them through the same 8-byte cast as RAY_I64 / RAY_TIMESTAMP — reading
- * past the row boundary.  Reported separately; do not write a "happy
- * path" test that locks in the broken behaviour. */
+/* RAY_DATE / RAY_TIME branches — element width is 4 bytes (int32) per
+ * ray_type_sizes in src/core/types.c.  ray_vec_get_i64 must read them as
+ * int32, not int64. */
+
+static test_result_t test_public_vec_get_i64_date(void) {
+    ray_t* v = ray_vec_new(RAY_DATE, 3);
+    /* Distinct, non-zero neighbours: an 8-byte read at index 0 would fuse
+     * xs[0] with xs[1], and one at index 1 would start at xs[2]. */
+    int32_t xs[] = { 0, 8766, 19724 };  /* with a 1970.01.01 epoch: 1970.01.01, 1994.01.01, 2024.01.02 */
+    for (int i = 0; i < 3; i++) v = ray_vec_append(v, &xs[i]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 0), xs[0]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 1), xs[1]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 2), xs[2]);
+    ray_release(v);
+    PASS();
+}
+
+static test_result_t test_public_vec_get_i64_time(void) {
+    ray_t* v = ray_vec_new(RAY_TIME, 3);
+    int32_t xs[] = { 0, 43200000, 86399000 };  /* read as milliseconds of day: 00:00:00.000, 12:00:00.000, 23:59:59.000 */
+    for (int i = 0; i < 3; i++) v = ray_vec_append(v, &xs[i]);  /* keep the pointer ray_vec_append returns */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 0), xs[0]);  /* each int32 element must come back unchanged */
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 1), xs[1]);
+    TEST_ASSERT_EQ_I(ray_vec_get_i64(v, 2), xs[2]);
+    ray_release(v);
+    PASS();
+}
 
 static test_result_t test_public_vec_get_i64_timestamp(void) {
     ray_t* v = ray_vec_new(RAY_TIMESTAMP, 3);
@@ -611,6 +632,8 @@ const test_entry_t public_api_entries[] = {
     { "public/vec_get_i64_i16",        test_public_vec_get_i64_i16,        public_api_setup, public_api_teardown },
     { "public/vec_get_i64_u8",         test_public_vec_get_i64_u8,         public_api_setup, public_api_teardown },
     { "public/vec_get_i64_bool",       test_public_vec_get_i64_bool,       public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_date",       test_public_vec_get_i64_date,       public_api_setup, public_api_teardown },
+    { "public/vec_get_i64_time",       test_public_vec_get_i64_time,       public_api_setup, public_api_teardown },
     { "public/vec_get_i64_timestamp",  test_public_vec_get_i64_timestamp,  public_api_setup, public_api_teardown },
 
     { "public/vec_get_f64_f64",   test_public_vec_get_f64_f64,   public_api_setup, public_api_teardown },