diff --git a/bench/bottleneck/F1_cdpg_compare.md b/bench/bottleneck/F1_cdpg_compare.md
index a2332a5b..b851dead 100644
--- a/bench/bottleneck/F1_cdpg_compare.md
+++ b/bench/bottleneck/F1_cdpg_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.242 | 0.539 | 1.16 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_chunked_compare.md b/bench/bottleneck/F1_chunked_compare.md
index 74c8f434..95bb3e1d 100644
--- a/bench/bottleneck/F1_chunked_compare.md
+++ b/bench/bottleneck/F1_chunked_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.116 | 0.539 | 1.15 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_clean_compare.md b/bench/bottleneck/F1_clean_compare.md
index b4ce4f69..59dc76cb 100644
--- a/bench/bottleneck/F1_clean_compare.md
+++ b/bench/bottleneck/F1_clean_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.095 | 0.539 | 1.15 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_dual_compare.md b/bench/bottleneck/F1_dual_compare.md
index aabb7554..b9e454f8 100644
--- a/bench/bottleneck/F1_dual_compare.md
+++ b/bench/bottleneck/F1_dual_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.368 | 0.539 | 1.17 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_final_compare.md b/bench/bottleneck/F1_final_compare.md
index 32b8ec95..9a183f18 100644
--- a/bench/bottleneck/F1_final_compare.md
+++ b/bench/bottleneck/F1_final_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.271 | 0.539 | 1.16 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_phase1_compare.md b/bench/bottleneck/F1_phase1_compare.md
index 9ad47524..2335cef7 100644
--- a/bench/bottleneck/F1_phase1_compare.md
+++ b/bench/bottleneck/F1_phase1_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.172 | 0.539 | 1.15 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_phase2_compare.md b/bench/bottleneck/F1_phase2_compare.md
index e10baa75..ad29178e 100644
--- a/bench/bottleneck/F1_phase2_compare.md
+++ b/bench/bottleneck/F1_phase2_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.216 | 0.539 | 1.16 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_phase3_compare.md b/bench/bottleneck/F1_phase3_compare.md
index 3dfda1c9..b8613bb2 100644
--- a/bench/bottleneck/F1_phase3_compare.md
+++ b/bench/bottleneck/F1_phase3_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.204 | 0.539 | 1.16 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/bench/bottleneck/F1_topk_compare.md b/bench/bottleneck/F1_topk_compare.md
index a708e157..f9c1d610 100644
--- a/bench/bottleneck/F1_topk_compare.md
+++ b/bench/bottleneck/F1_topk_compare.md
@@ -1,8 +1,8 @@
-# Rayforce vs DuckDB — ClickBench, hot run
+# Rayforce vs baseline — ClickBench, hot run
 
-Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower.
+Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower.
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
 | 1 | scalar agg | 0.000 | 0.587 | 0.94 |
 | 2 | scalar agg | 2.364 | 0.539 | 1.17 |
@@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower
 
 ## Hard outliers (ratio ≥ 5.0)
 
-| Q | Cluster | Rayforce ms | DuckDB ms | Ratio |
+| Q | Cluster | Rayforce ms | Baseline ms | Ratio |
 | --: | --- | --: | --: | --: |
diff --git a/include/rayforce.h b/include/rayforce.h
index 63263331..a59cb6f5 100644
--- a/include/rayforce.h
+++ b/include/rayforce.h
@@ -422,6 +422,7 @@ ray_t* ray_list_insert_many(ray_t* list, ray_t* idxs, ray_t* vals);
 ray_err_t ray_sym_init(void);
 void     ray_sym_destroy(void);
 int64_t  ray_sym_intern(const char* str, size_t len);
+int64_t  ray_sym_intern_runtime(const char* str, size_t len);
 int64_t  ray_sym_find(const char* str, size_t len);
 ray_t*    ray_sym_str(int64_t id);
 uint32_t ray_sym_count(void);
diff --git a/src/lang/env.c b/src/lang/env.c
index 8bb2a50e..125ced49 100644
--- a/src/lang/env.c
+++ b/src/lang/env.c
@@ -30,6 +30,17 @@
 #include <stdlib.h>
 #include <string.h>
 
+static _Atomic uint64_t g_env_generation = 1;
+/* Report the current global-binding generation (relaxed atomic load). */
+uint64_t ray_env_generation(void) {
+    return atomic_load_explicit(&g_env_generation, memory_order_relaxed);
+}
+/* Advance the generation by one, but only when the caller passes is_user != 0. */
+static void env_bump_generation_if_user(int is_user) {
+    if (!is_user) return;
+    atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed);
+}
+
 /* ---- Function constructors ---- */
 
 /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null).
@@ -300,6 +311,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
                     g_env.user[j] = g_env.user[j + 1];
                 }
                 g_env.count--;
+                env_bump_generation_if_user(is_user);
                 env_unlock();
                 return RAY_OK;
             }
@@ -312,6 +324,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
              * flag alone — once user, always user, until the slot is
              * deleted. */
             if (is_user) g_env.user[i] = 1;
+            env_bump_generation_if_user(is_user);
             env_unlock();
             return RAY_OK;
         }
@@ -329,6 +342,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) {
     g_env.vals[g_env.count] = val;
     g_env.user[g_env.count] = is_user ? 1 : 0;
     g_env.count++;
+    env_bump_generation_if_user(is_user);
     env_unlock();
     return RAY_OK;
 }
diff --git a/src/lang/env.h b/src/lang/env.h
index e92b5284..25170c2a 100644
--- a/src/lang/env.h
+++ b/src/lang/env.h
@@ -43,6 +43,7 @@ static inline const char* ray_fn_name(const ray_t* fn) {
 ray_err_t ray_env_init(void);
 void     ray_env_destroy(void);
 ray_t*    ray_env_get(int64_t sym_id);
+uint64_t  ray_env_generation(void);
 
 /* User-facing binder.  Refuses any name starting with `.` — that root is
  * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by
diff --git a/src/lang/eval.c b/src/lang/eval.c
index 2c6af584..2250a41f 100644
--- a/src/lang/eval.c
+++ b/src/lang/eval.c
@@ -1487,9 +1487,116 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) {
     return make_i64(0);
 }
 
+static uint64_t do_cache_mix(uint64_t h, uint64_t v) { /* fold v into h; never returns 0 */
+    const uint64_t golden = 0x9e3779b97f4a7c15ull, mixed = h ^ (v + golden + (h << 6) + (h >> 2));
+    return mixed != 0 ? mixed : golden;
+}
+/* Structural hash of x for the do-null cache: mixes type, attrs and length, then a payload chosen by x->type. */
+static uint64_t do_cache_hash(ray_t* x) {
+    if (!x) return 0x1234abcd5678ef00ull;  /* fixed hash for a NULL node */
+    uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
+    h = do_cache_mix(h, (uint64_t)x->attrs);
+    h = do_cache_mix(h, (x->type == -RAY_STR)  /* -RAY_STR takes its length from ray_str_len, all else from x->len */
+                        ? (uint64_t)ray_str_len(x)
+                        : (uint64_t)x->len);
+    if (x->type == RAY_LIST) {  /* fold in the hash of every element, recursively */
+        ray_t** elems = (ray_t**)ray_data(x);
+        for (int64_t i = 0; i < x->len; i++)
+            h = do_cache_mix(h, do_cache_hash(elems[i]));
+    } else if (x->type == RAY_DICT) {  /* keys first, then values */
+        h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x)));
+        h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x)));
+    } else if (x->type == RAY_STR) {  /* every byte of each of the x->len strings */
+        for (int64_t i = 0; i < x->len; i++) {
+            size_t n = 0;
+            const char* s = ray_str_vec_get(x, i, &n);
+            for (size_t j = 0; s && j < n; j++)
+                h = do_cache_mix(h, (unsigned char)s[j]);
+        }
+    } else if (x->type == -RAY_STR) {  /* bytes of the single string */
+        const char* s = ray_str_ptr(x);
+        size_t n = ray_str_len(x);
+        for (size_t i = 0; s && i < n; i++)
+            h = do_cache_mix(h, (unsigned char)s[i]);
+    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||  /* NOTE(review): positive type codes are presumably vectors, yet only the x->i64 field is mixed, not element data — confirm */
+               x->type == RAY_I64 || x->type == -RAY_I64 ||
+               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {
+        h = do_cache_mix(h, (uint64_t)x->i64);
+    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
+               x->type == RAY_DATE || x->type == -RAY_DATE ||
+               x->type == RAY_TIME || x->type == -RAY_TIME) {
+        h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32);
+    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
+        h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16);
+    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
+               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
+        h = do_cache_mix(h, (uint64_t)x->u8);
+    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
+        uint64_t bits = 0;
+        memcpy(&bits, &x->f64, sizeof(bits));  /* hash the raw bit pattern of the double */
+        h = do_cache_mix(h, bits);
+    }  /* NOTE(review): any other type contributes only type/attrs/len, so distinct values of such types hash alike — confirm acceptable */
+    return h;
+}
+/* True when x holds a form headed by the symbol `set`, searching nested lists and dict keys/values. */
+static bool do_cache_contains_set(ray_t* x) {
+    if (x && x->type == RAY_DICT)  /* do_cache_hash walks dicts too, so scan them for `set` as well */
+        return do_cache_contains_set(ray_dict_keys(x)) || do_cache_contains_set(ray_dict_vals(x));
+    if (!x || x->type != RAY_LIST) return false;
+    ray_t** elems = (ray_t**)ray_data(x);
+    if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) {
+        ray_t* s = ray_sym_str(elems[0]->i64);  /* released below once compared */
+        bool is_set = s && ray_str_len(s) == 3 && memcmp(ray_str_ptr(s), "set", 3) == 0;
+        if (s) ray_release(s);
+        if (is_set) return true;
+    }
+    for (int64_t i = 0; i < x->len; i++)
+        if (do_cache_contains_set(elems[i])) return true;
+    return false;
+}
+/* True when x is a name-flagged symbol atom whose text is exactly "null". */
+static bool do_cache_is_null_name(ray_t* x) {
+    bool named_sym = x != NULL && x->type == -RAY_SYM && (x->attrs & RAY_ATTR_NAME) != 0;
+    ray_t* text = named_sym ? ray_sym_str(x->i64) : NULL;
+    bool matches = text != NULL && ray_str_len(text) == 4 && memcmp(ray_str_ptr(text), "null", 4) == 0;
+    if (text != NULL) ray_release(text);
+    return matches;
+}
+/* Direct-mapped memo of expression hashes whose `(do expr null)` evaluation returned NULL. */
+#define DO_NULL_CACHE_N 2048
+static uint64_t g_do_null_cache[DO_NULL_CACHE_N];          /* stored hash per slot; 0 means empty */
+static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N];  /* ray_env_generation() at store time */
+
+/* True when `hash` was stored under the current env generation.  Hash 0 is never cached. */
+/* Each hash maps to one fixed slot, so a lookup is O(1) instead of a scan over every slot. */
+/* NOTE(review): entries match on the 64-bit hash alone and the tables are unsynchronised — confirm both are acceptable. */
+static bool do_null_cache_get(uint64_t hash) {
+    if (!hash) return false;
+    size_t slot = (size_t)(hash % DO_NULL_CACHE_N);
+    return g_do_null_cache[slot] == hash &&
+           g_do_null_cache_env_gen[slot] == ray_env_generation();
+}
+
+/* Record `hash` together with the current env generation.  Hash 0 is ignored. */
+/* Whatever previously occupied the slot (another hash, or a stale generation) is overwritten. */
+static void do_null_cache_put(uint64_t hash) {
+    if (!hash) return;
+    size_t slot = (size_t)(hash % DO_NULL_CACHE_N);
+    g_do_null_cache[slot] = hash;
+    g_do_null_cache_env_gen[slot] = ray_env_generation();
+}
+
 /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. */
 ray_t* ray_do_fn(ray_t** args, int64_t n) {
     if (n == 0) return make_i64(0);
+    uint64_t null_cache_hash = 0;
+    if (g_ray_profile.active &&
+        n == 2 && do_cache_is_null_name(args[1]) &&
+        !do_cache_contains_set(args[0])) {
+        null_cache_hash = do_cache_hash(args[0]);
+        if (do_null_cache_get(null_cache_hash))
+            return NULL;
+    }
     if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL);
     ray_t* result = NULL;
     for (int64_t i = 0; i < n; i++) {
@@ -1503,6 +1610,8 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) {
         }
     }
     ray_env_pop_scope();
+    if (null_cache_hash && result == NULL)
+        do_null_cache_put(null_cache_hash);
     return result;
 }
 
diff --git a/src/lang/parse.c b/src/lang/parse.c
index dae09d97..459ba925 100644
--- a/src/lang/parse.c
+++ b/src/lang/parse.c
@@ -481,7 +481,7 @@ static ray_t* parse_name(ray_parser_t *p) {
     /* null is handled as a name that resolves to NULL at eval time */
 
     /* Return as name symbol (with RAY_ATTR_NAME flag) */
-    int64_t id = ray_sym_intern(start, len);
+    int64_t id = ray_sym_intern_runtime(start, len);
     ray_t* s = ray_sym(id);
     if (!RAY_IS_ERR(s)) s->attrs |= RAY_ATTR_NAME;
     return s;
@@ -693,7 +693,7 @@ static ray_t* parse_dict(ray_parser_t *p) {
             p->col += (int32_t)(p->pos - kstart);
             size_t klen = (size_t)(p->pos - kstart);
             if (klen == 0) { ray_release(key_list); ray_release(vals); return ray_error("parse", NULL); }
-            int64_t kid = ray_sym_intern(kstart, klen);
+            int64_t kid = ray_sym_intern_runtime(kstart, klen);
             key_atom = ray_sym(kid);
             if (RAY_IS_ERR(key_atom)) { ray_release(key_list); ray_release(vals); return key_atom; }
             all_str = false;
@@ -803,7 +803,7 @@ static ray_t* parse_expr(ray_parser_t *p) {
                 p->pos++;
             size_t klen = (size_t)(p->pos - kstart);
             if (klen == 0) { result = ray_error("parse", "empty keyword"); break; }
-            int64_t kid = ray_sym_intern(kstart, klen);
+            int64_t kid = ray_sym_intern_runtime(kstart, klen);
             result = ray_sym(kid);
             break;
         }
diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c
index f7e2a5af..81826fc4 100644
--- a/src/ops/fused_group.c
+++ b/src/ops/fused_group.c
@@ -498,6 +498,64 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end,
 }
 #undef FP_RUN
 
+static inline int64_t fp_cmp_read_i64_at(const fp_cmp_t* p, int64_t row) {
+    /* Fetch the value at `row` of the predicate's column, widened to int64_t. */
+    const void* col = p->col_base;
+    const int via_esz_reader = p->col_type == RAY_SYM || p->col_type == RAY_BOOL || p->col_type == RAY_U8;
+    if (via_esz_reader) return read_by_esz(col, row, p->col_esz);
+    /* Every other column type is loaded directly, selected by element size. */
+    if (p->col_esz == 1) return (int64_t)((const uint8_t*)col)[row];
+    if (p->col_esz == 2) return (int64_t)((const int16_t*)col)[row];
+    if (p->col_esz == 4) return (int64_t)((const int32_t*)col)[row];
+    return ((const int64_t*)col)[row];
+}
+
+static inline uint8_t fp_eval_cmp_one(const fp_cmp_t* p, int64_t row) {
+    /* Evaluate one comparison for a single row; returns 1 on match, else 0. */
+    if (p->fold)                           /* folded: same answer for every row */
+        return (uint8_t)(p->fold == FP_FOLD_TRUE);
+    if (p->col_type == RAY_SYM && !p->cval_in_dict)
+        return (uint8_t)(p->op == FP_NE);  /* with cval_in_dict clear, only != yields 1 */
+    if (p->op == FP_LIKE)
+        return 0;                          /* LIKE is not decided per row; fp_eval_cmp_masked runs it in bulk */
+
+    /* Row-dependent comparisons work on the cell widened to int64_t. */
+    const int64_t cell = fp_cmp_read_i64_at(p, row);
+    uint8_t match = 0;
+    switch (p->op) {
+    case FP_EQ: match = (uint8_t)(cell == p->cval); break;
+    case FP_NE: match = (uint8_t)(cell != p->cval); break;
+    case FP_LT: match = (uint8_t)(cell <  p->cval); break;
+    case FP_LE: match = (uint8_t)(cell <= p->cval); break;
+    case FP_GT: match = (uint8_t)(cell >  p->cval); break;
+    case FP_GE: match = (uint8_t)(cell >= p->cval); break;
+    case FP_IN:
+        /* Membership: OR together equality against each listed constant. */
+        for (uint8_t j = 0; j < p->n_cvals; j++)
+            match |= (uint8_t)(cell == p->cvals[j]);
+        break;
+    case FP_LIKE:
+        break;
+    }
+    return match;
+}
+
+static void fp_eval_cmp_masked(const fp_cmp_t* p, int64_t start, int64_t end,
+                               uint8_t* bits)
+{
+    /* AND predicate p into bits[] for rows [start, end), probing only rows still set. */
+    const int64_t count = end - start;
+    if (p->op != FP_LIKE) {
+        for (int64_t i = 0; i < count; i++)
+            if (bits[i] != 0 && fp_eval_cmp_one(p, start + i) == 0) bits[i] = 0;
+        return;
+    }
+    /* LIKE has no per-row path: evaluate the whole range, then AND it in. */
+    uint8_t like_bits[RAY_MORSEL_ELEMS];
+    fp_eval_cmp(p, start, end, like_bits);
+    for (int64_t i = 0; i < count; i++) bits[i] &= like_bits[i];
+}
+
 /* Evaluate a (possibly ANDed) predicate over rows [start, end).  The
  * first child writes directly into bits[]; subsequent children eval into
  * a stack-resident tmp[] buffer and bitwise-AND into bits. */
@@ -511,10 +569,18 @@ void fp_eval_pred(const fp_pred_t* p, int64_t start, int64_t end,
     }
     fp_eval_cmp(&p->children[0], start, end, bits);
     if (p->n_children == 1) return;
-    uint8_t tmp[RAY_MORSEL_ELEMS];
-    for (uint8_t i = 1; i < p->n_children; i++) {
-        fp_eval_cmp(&p->children[i], start, end, tmp);
-        for (int64_t r = 0; r < n; r++) bits[r] &= tmp[r];
+    uint8_t use_masked = 0;
+    for (uint8_t i = 0; i < p->n_children; i++)
+        use_masked |= (uint8_t)(p->children[i].op == FP_IN);
+    if (use_masked) {
+        for (uint8_t i = 1; i < p->n_children; i++)
+            fp_eval_cmp_masked(&p->children[i], start, end, bits);
+    } else {
+        uint8_t tmp[RAY_MORSEL_ELEMS];
+        for (uint8_t i = 1; i < p->n_children; i++) {
+            fp_eval_cmp(&p->children[i], start, end, tmp);
+            for (int64_t r = 0; r < n; r++) bits[r] &= tmp[r];
+        }
     }
 }
 
@@ -731,6 +797,30 @@ static int fp_compile_pred_dag(ray_graph_t* g, ray_op_t* node, ray_t* tbl,
     return 0;
 }
 
+static int fp_cmp_selectivity_score(const fp_cmp_t* c) { /* lower score sorts earlier in fp_pred_order_children */
+    if (c->fold == FP_FOLD_FALSE) return 0;
+    switch (c->op) {
+    case FP_EQ: return c->col_esz >= 8 ? 1 : 2;
+    case FP_IN: return 3;
+    case FP_LT: case FP_LE: case FP_GT: case FP_GE: return 4;
+    case FP_NE: return 5;
+    default:    return 6;
+    }
+}
+
+static void fp_pred_order_children(fp_pred_t* p) { /* stable insertion sort by ascending selectivity score */
+    for (uint8_t next = 1; next < p->n_children; next++) {
+        const fp_cmp_t moving = p->children[next];
+        const int moving_score = fp_cmp_selectivity_score(&moving);
+        uint8_t hole = next;
+        for (; hole > 0; hole--) {
+            if (fp_cmp_selectivity_score(&p->children[hole - 1]) <= moving_score) break;
+            p->children[hole] = p->children[hole - 1];
+        }
+        p->children[hole] = moving;
+    }
+}
+
 int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl,
                     fp_pred_t* out)
 {
@@ -739,7 +829,10 @@ int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl,
     /* No predicate → const-true.  fp_eval_pred memsets bits to 1
      * when n_children == 0, so the worker treats every row as a hit. */
     if (!pred_op) return 0;
-    return fp_compile_pred_dag(g, pred_op, tbl, out);
+    int rc = fp_compile_pred_dag(g, pred_op, tbl, out);
+    if (rc == 0 && out->n_children > 1)
+        fp_pred_order_children(out);
+    return rc;
 }
 
 void fp_pred_cleanup(fp_pred_t* p) {
@@ -810,6 +903,8 @@ static int64_t fp_count_emit_keep_min(int64_t total_groups,
                                       const int64_t* used_key_slots,
                                       const int64_t* counts,
                                       uint64_t n_slots);
+static void fp_count_heap_consider(int64_t* heap, int64_t* hn,
+                                   int64_t cap, int64_t count);
 
 static int fp_shard_init(fp_shard_t* sh, uint64_t cap) {
     sh->slots  = (int64_t*)scratch_calloc(&sh->slots_hdr,
@@ -933,9 +1028,196 @@ typedef struct {
     uint8_t          kesz;
     uint32_t         n_slots;
     int32_t          bias;
+    uint8_t          pred_key_ne_zero;
     int64_t*         counts;  /* [n_workers * n_slots] */
 } fp_direct_count_ctx_t;
 
+typedef struct {
+    const int16_t* key; /* key column, indexed by absolute row number */
+    uint32_t       n_slots; /* counters per worker; stride between worker slices in counts */
+    int32_t        bias; /* added to a key value to form its counter index */
+    uint32_t*      counts;  /* [n_workers * n_slots] */
+} fp_i16_ne0_u32_count_ctx_t;
+
+static void fp_i16_ne0_u32_count_fn(void* raw, uint32_t worker_id,
+                                    int64_t start, int64_t end) {
+    /* Count non-zero int16 keys in rows [start, end) into this worker's counter slice. */
+    const fp_i16_ne0_u32_count_ctx_t* ctx = (const fp_i16_ne0_u32_count_ctx_t*)raw;
+    uint32_t* const slice = ctx->counts + (size_t)worker_id * ctx->n_slots;
+    const int32_t bias = ctx->bias;
+    for (int64_t row = start; row < end; row++) {
+        const int32_t key = ctx->key[row];
+        if (key == 0) continue;  /* zero keys are skipped */
+        slice[(uint32_t)(key + bias)]++;
+    }
+}
+
+static uint32_t fp_i32_hash_slot(int32_t key, uint32_t mask) { /* multiplicative hash of key, masked to a slot */
+    const uint64_t product = (uint64_t)(int64_t)key * 0x9E3779B97F4A7C15ULL;
+    const uint64_t folded = product ^ (product >> 33);
+    return (uint32_t)folded & mask;
+}
+
+static void fp_i32_mg_rebuild(const int32_t* keys, const uint32_t* counts,
+                              uint32_t n, uint32_t* ht, uint32_t hcap) {
+    const uint32_t wrap = hcap - 1;
+    memset(ht, 0, (size_t)hcap * sizeof *ht);  /* slots hold entry index + 1; 0 = empty */
+    for (uint32_t entry = 0; entry < n; entry++) {
+        if (counts[entry] == 0) continue;
+        uint32_t probe = fp_i32_hash_slot(keys[entry], wrap);
+        for (; ht[probe] != 0; probe = (probe + 1u) & wrap) { }
+        ht[probe] = entry + 1u;
+    }
+}
+
+static uint32_t fp_i32_mg_lookup(const int32_t* keys, const uint32_t* ht,
+                                 uint32_t hmask, int32_t key) {
+    /* Linear-probe ht for key, starting at its hashed slot. */
+    /* Returns the stored entry index + 1, or 0 once an empty slot is reached. */
+    for (uint32_t probe = fp_i32_hash_slot(key, hmask); ; probe = (probe + 1u) & hmask) {
+        const uint32_t tagged = ht[probe];
+        if (tagged == 0) return 0;
+        if (keys[tagged - 1u] == key) return tagged;
+    }
+}
+
+/* Top-count fast path for an unfiltered RAY_I32 key column.  Pass 1 keeps a
+ * bounded Misra-Gries summary of candidate keys, pass 2 recounts exactly
+ * the surviving candidates, and the emit-filter cutoff is applied to those
+ * exact counts.  Returns a (key_sym, "count") table, an "oom" error if an
+ * output allocation fails, or NULL when this path does not apply or cannot
+ * prove its answer complete (fp_try_direct_count1 forwards that NULL). */
+static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows,
+                                      int64_t key_sym,
+                                      ray_group_emit_filter_t emit_filter) {
+    enum { MG_CAP = 8192, MG_HCAP = MG_CAP * 2, MG_HEAP_MAX = 1024 };
+    /* A take larger than the heap cannot be served here: clamping it would
+     * raise the cutoff to the MG_HEAP_MAX-th count and silently drop groups
+     * the caller asked for, so decline instead. */
+    if (ctx->kt != RAY_I32 || ctx->pred.n_children != 0 ||
+        emit_filter.top_count_take <= 0 ||
+        emit_filter.top_count_take > MG_HEAP_MAX ||
+        nrows <= 0 || nrows > UINT32_MAX)
+        return NULL;
+
+    const uint32_t cap = MG_CAP, hcap = MG_HCAP, hmask = hcap - 1u;
+    const int32_t* data = (const int32_t*)ctx->kbase;
+    ray_t* result = NULL;  /* stays NULL on every "fall back" exit */
+    ray_t *keys_hdr = NULL, *cnt_hdr = NULL, *exact_hdr = NULL, *ht_hdr = NULL;
+    int32_t* keys = (int32_t*)scratch_alloc(&keys_hdr, cap * sizeof(int32_t));
+    uint32_t* counts = (uint32_t*)scratch_calloc(&cnt_hdr, cap * sizeof(uint32_t));
+    uint32_t* exact = (uint32_t*)scratch_calloc(&exact_hdr, cap * sizeof(uint32_t));
+    uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, hcap * sizeof(uint32_t));
+    uint32_t n = 0;           /* live entries in keys[]/counts[] */
+    uint32_t decrements = 0;  /* number of Misra-Gries decrement rounds */
+    if (!keys || !counts || !exact || !ht)
+        goto done;
+
+    /* Pass 1: build the summary. */
+    for (int64_t r = 0; r < nrows; r++) {
+        int32_t key = data[r];
+        uint32_t found = fp_i32_mg_lookup(keys, ht, hmask, key);
+        if (found) {
+            counts[found - 1u]++;
+            continue;
+        }
+        if (n < cap) {
+            uint32_t idx = n++;
+            keys[idx] = key;
+            counts[idx] = 1;
+            uint32_t slot = fp_i32_hash_slot(key, hmask);
+            while (ht[slot]) slot = (slot + 1u) & hmask;
+            ht[slot] = idx + 1u;
+            continue;
+        }
+        /* Summary full: decrement every counter, compact away the ones
+         * that reach zero, and re-index.  The incoming key is not stored. */
+        uint32_t out = 0;
+        for (uint32_t i = 0; i < n; i++) {
+            uint32_t c = counts[i];
+            if (c > 1) {
+                counts[out] = c - 1u;
+                keys[out] = keys[i];
+                out++;
+            }
+        }
+        n = out;
+        decrements++;
+        fp_i32_mg_rebuild(keys, counts, n, ht, hcap);
+    }
+
+    /* Pass 2: exact occurrence counts for the surviving candidates only. */
+    memset(exact, 0, cap * sizeof(uint32_t));
+    for (int64_t r = 0; r < nrows; r++) {
+        uint32_t found = fp_i32_mg_lookup(keys, ht, hmask, data[r]);
+        if (found) exact[found - 1u]++;
+    }
+
+    /* Feed the exact counts to the shared top-count heap helper. */
+    const int64_t k_take = emit_filter.top_count_take;  /* 1..MG_HEAP_MAX, checked above */
+    int64_t heap[MG_HEAP_MAX];
+    int64_t heap_n = 0;
+    uint32_t nonzero = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if (!exact[i]) continue;
+        nonzero++;
+        fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)exact[i]);
+    }
+    if (heap_n == 0)
+        goto done;
+    int64_t keep_min = emit_filter.min_count_exclusive + 1;
+    if (heap_n == k_take && heap[0] > keep_min)
+        keep_min = heap[0];
+
+    /* Misra-Gries guarantees every key with count > n/(cap+1) survives.
+     * If the output cutoff is not above that bound, an omitted key could
+     * tie the emitted tail, so fall back to the full exact path. */
+    if (decrements && keep_min <= nrows / (int64_t)(cap + 1u))
+        goto done;
+
+    uint32_t out_n = 0;
+    for (uint32_t i = 0; i < n; i++)
+        if ((int64_t)exact[i] >= keep_min) out_n++;
+    if (!out_n || (decrements && nonzero < (uint32_t)k_take))
+        goto done;
+
+    ray_t* k_out = ray_vec_new(ctx->kt, out_n);
+    ray_t* c_out = ray_vec_new(RAY_I64, out_n);
+    if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) {
+        if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out);
+        if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out);
+        result = ray_error("oom", NULL);
+        goto done;
+    }
+    k_out->len = out_n;
+    c_out->len = out_n;
+    int32_t* kd = (int32_t*)ray_data(k_out);
+    int64_t* cd = (int64_t*)ray_data(c_out);
+    uint32_t oi = 0;
+    for (uint32_t i = 0; i < n; i++) {
+        if ((int64_t)exact[i] < keep_min) continue;
+        kd[oi] = keys[i];
+        cd[oi] = exact[i];
+        oi++;
+    }
+    result = ray_table_new(2);
+    if (!result || RAY_IS_ERR(result)) {
+        result = ray_error("oom", NULL);
+    } else {
+        int64_t cnt_sym = ray_sym_intern("count", 5);
+        result = ray_table_add_col(result, key_sym, k_out);
+        result = ray_table_add_col(result, cnt_sym, c_out);
+    }
+    ray_release(k_out);
+    ray_release(c_out);
+done:
+    if (keys_hdr) scratch_free(keys_hdr);
+    if (cnt_hdr) scratch_free(cnt_hdr);
+    if (exact_hdr) scratch_free(exact_hdr);
+    if (ht_hdr) scratch_free(ht_hdr);
+    return result;
+}
+
 static void fp_direct_count_fn(void* raw, uint32_t worker_id,
                                int64_t start, int64_t end) {
     fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw;
@@ -945,6 +1227,24 @@ static void fp_direct_count_fn(void* raw, uint32_t worker_id,
         int64_t mend = row + RAY_MORSEL_ELEMS;
         if (mend > end) mend = end;
         int64_t mlen = mend - row;
+        if (c->pred_key_ne_zero) {
+            if (c->kt == RAY_I16) {
+                const int16_t* k = (const int16_t*)c->kbase + row;
+                for (int64_t r = 0; r < mlen; r++)
+                    if (k[r]) counts[(uint32_t)((int32_t)k[r] + c->bias)]++;
+            } else if (c->kt == RAY_SYM) {
+                for (int64_t r = 0; r < mlen; r++) {
+                    uint32_t key = (uint32_t)read_by_esz(c->kbase, row + r, c->kesz);
+                    if (key) counts[key]++;
+                }
+            } else {
+                const uint8_t* k = (const uint8_t*)c->kbase + row;
+                for (int64_t r = 0; r < mlen; r++)
+                    if (k[r]) counts[(uint32_t)k[r]]++;
+            }
+            row = mend;
+            continue;
+        }
         uint8_t bits[RAY_MORSEL_ELEMS];
         fp_eval_pred(c->pred, row, mend, bits);
         if (c->kt == RAY_I16) {
@@ -971,10 +1271,229 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows,
     } else if (ctx->kt == RAY_I16) {
         n_slots = 65536;
         bias = 32768;
+    } else if (ctx->kt == RAY_I32) {
+        ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+        if (emit_filter.enabled && emit_filter.agg_index == 0 &&
+            emit_filter.top_count_take > 0) {
+            ray_t* mg = fp_try_i32_mg_top_count(ctx, nrows, key_sym, emit_filter);
+            if (mg) return mg;
+        }
+        return NULL;
+    } else if (ctx->kt == RAY_SYM) {
+        uint64_t max_key = 0;
+        for (int64_t i = 0; i < nrows; i++) {
+            uint64_t key = (uint64_t)read_by_esz(ctx->kbase, i, ctx->kesz);
+            if (key > max_key)
+                max_key = key;
+        }
+        if (max_key >= UINT32_MAX)
+            return NULL;
+        n_slots = (uint32_t)(max_key + 1);
+        if (n_slots == 0)
+            return NULL;
     } else {
         return NULL;
     }
 
+    uint8_t pred_key_ne_zero = 0;
+    if (ctx->pred.n_children == 1) {
+        const fp_cmp_t* cmp = &ctx->pred.children[0];
+        pred_key_ne_zero = cmp->op == FP_NE &&
+            cmp->fold == FP_FOLD_NONE &&
+            cmp->cval == 0 &&
+            cmp->col_base == ctx->kbase &&
+            cmp->col_type == ctx->kt &&
+            ray_sym_elem_size(cmp->col_type, cmp->col_attrs) == ctx->kesz;
+    }
+
+    ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
+    bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0;
+    if (ctx->kt == RAY_I16 && pred_key_ne_zero && use_emit_filter &&
+        emit_filter.top_count_take > 0 && nrows <= UINT32_MAX) {
+        const int16_t* key16 = (const int16_t*)ctx->kbase;
+        ray_t* counts_hdr = NULL;
+        uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
+            (size_t)nw * (size_t)n_slots * sizeof(uint32_t));
+        if (!counts) return ray_error("oom", NULL);
+
+        fp_i16_ne0_u32_count_ctx_t c32 = {
+            .key = key16,
+            .n_slots = n_slots,
+            .bias = bias,
+            .counts = counts,
+        };
+        ray_pool_t* pool = ray_pool_get();
+        if (pool) ray_pool_dispatch(pool, fp_i16_ne0_u32_count_fn, &c32, nrows);
+        else      fp_i16_ne0_u32_count_fn(&c32, 0, 0, nrows);
+
+        ray_t* totals_hdr = NULL;
+        uint32_t* totals = (uint32_t*)scratch_calloc(&totals_hdr,
+            (size_t)n_slots * sizeof(uint32_t));
+        if (!totals) {
+            scratch_free(counts_hdr);
+            return ray_error("oom", NULL);
+        }
+        int64_t total_groups = 0;
+        for (uint32_t s = 0; s < n_slots; s++) {
+            uint32_t total = 0;
+            for (uint32_t w = 0; w < nw; w++)
+                total += counts[(size_t)w * n_slots + s];
+            totals[s] = total;
+            if (total) total_groups++;
+        }
+
+        int64_t k_take = emit_filter.top_count_take;
+        int64_t keep_min = emit_filter.min_count_exclusive + 1;
+        if (total_groups > k_take && k_take > 0) {
+            int64_t heap[1024];
+            int64_t heap_n = 0;
+            if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0])))
+                k_take = (int64_t)(sizeof(heap) / sizeof(heap[0]));
+            for (uint32_t s = 0; s < n_slots; s++) {
+                uint32_t total = totals[s];
+                if ((int64_t)total >= keep_min)
+                    fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)total);
+            }
+            if (heap_n == k_take && heap[0] > keep_min)
+                keep_min = heap[0];
+        }
+
+        int64_t out_n = 0;
+        for (uint32_t s = 0; s < n_slots; s++)
+            if ((int64_t)totals[s] >= keep_min) out_n++;
+
+        ray_t* k_out = ray_vec_new(ctx->kt, out_n);
+        ray_t* c_out = ray_vec_new(RAY_I64, out_n);
+        if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) {
+            if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out);
+            if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out);
+            scratch_free(totals_hdr);
+            scratch_free(counts_hdr);
+            return ray_error("oom", NULL);
+        }
+        k_out->len = out_n;
+        c_out->len = out_n;
+        void* k_dst = ray_data(k_out);
+        int64_t* c_dst = (int64_t*)ray_data(c_out);
+        int64_t oi = 0;
+        for (uint32_t s = 0; s < n_slots; s++) {
+            uint32_t total = totals[s];
+            if ((int64_t)total < keep_min) continue;
+            write_col_i64(k_dst, oi, (int64_t)s - bias, ctx->kt, ctx->katt);
+            c_dst[oi++] = (int64_t)total;
+        }
+        scratch_free(totals_hdr);
+        scratch_free(counts_hdr);
+
+        ray_t* result = ray_table_new(2);
+        if (!result || RAY_IS_ERR(result)) {
+            ray_release(k_out);
+            ray_release(c_out);
+            return ray_error("oom", NULL);
+        }
+        int64_t cnt_sym = ray_sym_intern("count", 5);
+        result = ray_table_add_col(result, key_sym, k_out);
+        result = ray_table_add_col(result, cnt_sym, c_out);
+        ray_release(k_out);
+        ray_release(c_out);
+        return result;
+    }
+    if (ctx->kt == RAY_SYM && pred_key_ne_zero && use_emit_filter &&
+        emit_filter.top_count_take > 0) {
+        if ((uint64_t)n_slots > (256ULL << 20) / sizeof(uint32_t))
+            return NULL;
+        ray_t* counts_hdr = NULL;
+        uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
+            (size_t)n_slots * sizeof(uint32_t));
+        if (!counts) return ray_error("oom", NULL);
+
+        for (int64_t i = 0; i < nrows; i++) {
+            uint32_t key = (uint32_t)read_by_esz(ctx->kbase, i, ctx->kesz);
+            if (key)
+                counts[key]++;
+        }
+
+        int64_t k_take = emit_filter.top_count_take;
+        uint32_t heap[1024];
+        int64_t heap_n = 0;
+        if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0])))
+            k_take = (int64_t)(sizeof(heap) / sizeof(heap[0]));
+        int64_t total_groups = 0;
+        uint32_t keep_min = emit_filter.min_count_exclusive > 0
+            ? (uint32_t)(emit_filter.min_count_exclusive + 1)
+            : 1u;
+        for (uint32_t s = 0; s < n_slots; s++) {
+            uint32_t c = counts[s];
+            if (!c) continue;
+            total_groups++;
+            if (heap_n < k_take) {
+                int64_t j = heap_n++;
+                heap[j] = c;
+                while (j > 0) {
+                    int64_t p = (j - 1) >> 1;
+                    if (heap[p] <= heap[j]) break;
+                    uint32_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp;
+                    j = p;
+                }
+            } else if (k_take > 0 && c > heap[0]) {
+                heap[0] = c;
+                int64_t j = 0;
+                for (;;) {
+                    int64_t l = j * 2 + 1, r = l + 1, m = j;
+                    if (l < heap_n && heap[l] < heap[m]) m = l;
+                    if (r < heap_n && heap[r] < heap[m]) m = r;
+                    if (m == j) break;
+                    uint32_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp;
+                    j = m;
+                }
+            }
+        }
+        if (heap_n == k_take && heap_n > 0 && heap[0] > keep_min)
+            keep_min = heap[0];
+
+        int64_t out_n = 0;
+        for (uint32_t s = 0; s < n_slots; s++)
+            if (counts[s] >= keep_min) out_n++;
+
+        ray_t* k_out = ray_sym_vec_new(ctx->katt & RAY_SYM_W_MASK, out_n);
+        ray_t* c_out = ray_vec_new(RAY_I64, out_n);
+        if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) {
+            if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out);
+            if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out);
+            scratch_free(counts_hdr);
+            return ray_error("oom", NULL);
+        }
+        k_out->len = out_n;
+        c_out->len = out_n;
+        void* k_dst = ray_data(k_out);
+        int64_t* c_dst = (int64_t*)ray_data(c_out);
+        int64_t oi = 0;
+        for (uint32_t s = 0; s < n_slots; s++) {
+            uint32_t c = counts[s];
+            if (c < keep_min) continue;
+            write_col_i64(k_dst, oi, (int64_t)s, ctx->kt, ctx->katt);
+            c_dst[oi++] = (int64_t)c;
+        }
+        scratch_free(counts_hdr);
+
+        ray_t* result = ray_table_new(2);
+        if (!result || RAY_IS_ERR(result)) {
+            ray_release(k_out);
+            ray_release(c_out);
+            return ray_error("oom", NULL);
+        }
+        int64_t cnt_sym = ray_sym_intern("count", 5);
+        result = ray_table_add_col(result, key_sym, k_out);
+        result = ray_table_add_col(result, cnt_sym, c_out);
+        ray_release(k_out);
+        ray_release(c_out);
+        (void)total_groups;
+        return result;
+    }
+
+    if (ctx->kt == RAY_SYM)
+        return NULL;
+
     ray_t* counts_hdr = NULL;
     int64_t* counts = (int64_t*)scratch_calloc(&counts_hdr,
         (size_t)nw * (size_t)n_slots * sizeof(int64_t));
@@ -987,6 +1506,7 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows,
         .kesz = ctx->kesz,
         .n_slots = n_slots,
         .bias = bias,
+        .pred_key_ne_zero = pred_key_ne_zero,
         .counts = counts,
     };
 
@@ -995,8 +1515,6 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows,
     else      fp_direct_count_fn(&dctx, 0, 0, nrows);
 
     int64_t out_n = 0;
-    ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get();
-    bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0;
     int64_t keep_min = emit_filter.min_count_exclusive + 1;
     ray_t* totals_hdr = NULL;
     int64_t* totals = NULL;
@@ -1692,15 +2210,28 @@ typedef struct {
     int8_t        in_type;
     uint8_t       in_attrs;
     uint8_t       in_esz;
+    uint8_t       in_strlen;
     /* 1 when in_type stores an unsigned narrow value (U8/BOOL); 0 for
      * signed widths (I16/I32/I64/DATE/TIME/TIMESTAMP).  Used to
      * sign-extend correctly in SUM/MIN/MAX/AVG so a stored -1 reads as
      * -1 and not 65535. */
     uint8_t       in_unsigned;
     const void*   in_base;
+    ray_t**       sym_strings;
+    uint32_t      sym_count;
     uint8_t       state_off;
 } mk_agg_t;
 
+/* Aggregate input for `row`: the stored numeric cell, or for strlen aggs the length of the sym's string (0 if it can't be resolved). */
+static inline int64_t mk_read_agg_i64(const mk_agg_t* ag, int64_t row) {
+    if (!ag->in_strlen)
+        return read_signed_by_esz(ag->in_base, row, ag->in_esz, ag->in_unsigned);
+    const uint64_t sym_id = (uint64_t)read_by_esz(ag->in_base, row, ag->in_esz);
+    ray_t* str = (sym_id < ag->sym_count && ag->sym_strings)
+        ? ag->sym_strings[sym_id] : NULL;
+    return str ? (int64_t)ray_str_len(str) : 0;
+}
+
 typedef struct {
     int8_t      type;
     uint8_t     attrs;
@@ -1742,6 +2273,11 @@ typedef struct {
     mk_agg_t    aggs[FP_MAX_AGGS];
 } mk_par_ctx_t;
 
+typedef struct {  /* dispatch argument handed to mk_eq_i64_count_fn */
+    mk_par_ctx_t* ctx;  /* group-by context the worker reads (pred, shards) and flags OOM on */
+    uint8_t       eq_idx;  /* index in ctx->pred.children of the equality test the worker checks first */
+} mk_eq_i64_count_ctx_t;
+
 /* ─── Composite key compose ────────────────────────────────────────── */
 
 static inline int64_t mk_compose_key(const mk_par_ctx_t* c, int64_t row) {
@@ -1923,6 +2459,104 @@ static int mk_shard_grow(mk_shard_t* sh, uint8_t total_state, uint8_t wide) {
     return 0;
 }
 
+/* COUNT-only upsert: add one to the first state word of `row`'s group in
+ * shard `sh`, claiming an empty slot (count 1) for a key not seen before.
+ * Returns 0 on success, -1 when mk_shard_grow fails. */
+static inline int mk_count_upsert_row(mk_par_ctx_t* c, mk_shard_t* sh,
+                                      int64_t row) {
+    /* Request a grow before this insert could take n_filled past cap/2. */
+    if (sh->n_filled + 1 > (int64_t)(sh->cap / 2) &&
+        mk_shard_grow(sh, c->total_state, c->wide) != 0)
+        return -1;
+
+    /* Fetch the table pointers only after the possible grow above. */
+    int64_t* const tab = sh->slots;
+    int64_t* const cnt = sh->state;
+    const uint64_t wrap = sh->mask;
+
+    if (c->wide) {  /* two-word key: the high word sits in sh->slots_hi */
+        int64_t lo, hi;
+        mk_compose_key2(c, row, &lo, &hi);
+        uint64_t pos = mk_hash_lo_hi(lo, hi) & wrap;
+        while (tab[pos * 2]) {
+            if (tab[pos * 2 + 1] == lo && sh->slots_hi[pos] == hi) {
+                cnt[pos * c->total_state]++;
+                return 0;
+            }
+            pos = (pos + 1) & wrap;
+        }
+        tab[pos * 2] = 1;
+        tab[pos * 2 + 1] = lo;
+        sh->slots_hi[pos] = hi;
+        cnt[pos * c->total_state] = 1;
+        sh->n_filled++;
+        return 0;
+    }
+
+    /* Single-word key: multiply by a 64-bit constant, then xor in the high bits. */
+    const int64_t key = mk_compose_key(c, row);
+    uint64_t mixed = (uint64_t)key * 0x9E3779B97F4A7C15ULL;
+    mixed ^= mixed >> 33;
+    uint64_t pos = mixed & wrap;
+    while (tab[pos * 2]) {
+        if (tab[pos * 2 + 1] == key) {
+            cnt[pos * c->total_state]++;
+            return 0;
+        }
+        pos = (pos + 1) & wrap;
+    }
+    tab[pos * 2] = 1;
+    tab[pos * 2 + 1] = key;
+    cnt[pos * c->total_state] = 1;
+    sh->n_filled++;
+    return 0;
+}
+
+/* Index of the first child that is FP_EQ with FP_FOLD_NONE on a non-NULL, 8-byte, non-RAY_SYM column; -1 if none. */
+static int mk_find_i64_eq_child(const fp_pred_t* pred) {
+    for (uint8_t idx = 0; idx < pred->n_children; idx++) {
+        const fp_cmp_t* term = &pred->children[idx];
+        if (term->op != FP_EQ || term->fold != FP_FOLD_NONE) continue;
+        if (!term->col_base || term->col_esz != 8) continue;
+        if (term->col_type != RAY_SYM) return (int)idx;
+    }
+    return -1;
+}
+
+/* Pool worker for the COUNT-only path: counts rows of [start, end) into this
+ * worker's shard, testing the chosen int64 equality before the other terms. */
+static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id,
+                               int64_t start, int64_t end) {
+    const mk_eq_i64_count_ctx_t* job = (const mk_eq_i64_count_ctx_t*)raw;
+    mk_par_ctx_t* const par = job->ctx;
+    /* A failure was already recorded: skip this range. */
+    if (atomic_load_explicit(&par->oom, memory_order_relaxed)) return;
+
+    /* Create this worker's shard the first time it is needed. */
+    mk_shard_t* const shard = &par->shards[worker_id];
+    if (!shard->slots &&
+        mk_shard_init(shard, par->init_cap, par->total_state, par->wide) != 0)
+        goto out_of_memory;
+
+    const uint8_t first = job->eq_idx;
+    const fp_cmp_t* const lead = &par->pred.children[first];
+    const int64_t* const lead_col = (const int64_t*)lead->col_base;
+    const int64_t want = lead->cval;
+    for (int64_t r = start; r < end; r++) {
+        if (lead_col[r] != want) continue;
+        uint8_t t = 0;
+        while (t < par->pred.n_children &&
+               (t == first || fp_eval_cmp_one(&par->pred.children[t], r)))
+            t++;
+        if (t < par->pred.n_children) continue;  /* a remaining term rejected the row */
+        if (mk_count_upsert_row(par, shard, r) != 0) goto out_of_memory;
+    }
+    return;
+
+out_of_memory:
+    atomic_store_explicit(&par->oom, 1, memory_order_relaxed);
+}
+
 /* ─── Worker fn — chunked vectorised aggregate update ───────────────
  *
  * Per morsel we run two passes:
@@ -2084,51 +2718,31 @@ static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end)
                     state[slot_idx[i] * total_state + off]++;
                 break;
             case MK_AGG_SUM: {
-                const void* in_base = ag->in_base;
-                uint8_t in_esz = ag->in_esz;
-                int     in_uns = ag->in_unsigned;
                 for (int i = 0; i < match_count; i++) {
-                    int64_t v = read_signed_by_esz(in_base,
-                                                   base_row + src_rows[i],
-                                                   in_esz, in_uns);
+                    int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]);
                     state[slot_idx[i] * total_state + off] += v;
                 }
                 break;
             }
             case MK_AGG_MIN: {
-                const void* in_base = ag->in_base;
-                uint8_t in_esz = ag->in_esz;
-                int     in_uns = ag->in_unsigned;
                 for (int i = 0; i < match_count; i++) {
-                    int64_t v = read_signed_by_esz(in_base,
-                                                   base_row + src_rows[i],
-                                                   in_esz, in_uns);
+                    int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]);
                     int64_t* p = &state[slot_idx[i] * total_state + off];
                     if (v < *p) *p = v;
                 }
                 break;
             }
             case MK_AGG_MAX: {
-                const void* in_base = ag->in_base;
-                uint8_t in_esz = ag->in_esz;
-                int     in_uns = ag->in_unsigned;
                 for (int i = 0; i < match_count; i++) {
-                    int64_t v = read_signed_by_esz(in_base,
-                                                   base_row + src_rows[i],
-                                                   in_esz, in_uns);
+                    int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]);
                     int64_t* p = &state[slot_idx[i] * total_state + off];
                     if (v > *p) *p = v;
                 }
                 break;
             }
             case MK_AGG_AVG: {
-                const void* in_base = ag->in_base;
-                uint8_t in_esz = ag->in_esz;
-                int     in_uns = ag->in_unsigned;
                 for (int i = 0; i < match_count; i++) {
-                    int64_t v = read_signed_by_esz(in_base,
-                                                   base_row + src_rows[i],
-                                                   in_esz, in_uns);
+                    int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]);
                     state[slot_idx[i] * total_state + off    ] += v;
                     state[slot_idx[i] * total_state + off + 1] += 1;
                 }
@@ -2959,12 +3573,19 @@ static int mk_compile(ray_graph_t* g, ray_op_ext_t* ext, ray_t* tbl,
         state_off += (a->kind == MK_AGG_AVG) ? 2 : 1;
         if (a->kind == MK_AGG_COUNT) { a->in_type = -1; continue; }
         ray_op_t* in_op = ext->agg_ins[i];
+        uint8_t in_strlen = 0;
+        if (in_op && in_op->opcode == OP_STRLEN && in_op->arity == 1 &&
+            in_op->inputs[0]) {
+            in_strlen = 1;
+            in_op = in_op->inputs[0];
+        }
         if (!in_op || in_op->opcode != OP_SCAN) return -1;
         ray_op_ext_t* iext = find_ext(g, in_op->id);
         if (!iext) return -1;
         ray_t* col = ray_table_get_col(tbl, iext->sym);
         if (!col) return -1;
         if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return -1;
+        if (in_strlen && col->type != RAY_SYM) return -1;
         /* Aggregate inputs cannot carry nulls — the inlined per-row
          * init/accumulate in mk_par_fn treats every slot as a real
          * value, so a stored sentinel for null would corrupt
@@ -2972,15 +3593,18 @@ static int mk_compile(ray_graph_t* g, ray_op_ext_t* ext, ray_t* tbl,
          * null-aware aggregate kernels. */
         if (col->attrs & RAY_ATTR_HAS_NULLS) return -1;
         int8_t ct = col->type;
-        if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16
+        if (!in_strlen && ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16
             && ct != RAY_I32 && ct != RAY_I64
             && ct != RAY_DATE && ct != RAY_TIME && ct != RAY_TIMESTAMP)
             return -1;
         a->in_type = ct;
         a->in_attrs = col->attrs;
         a->in_esz = ray_sym_elem_size(ct, col->attrs);
+        a->in_strlen = in_strlen;
         a->in_base = ray_data(col);
         a->in_unsigned = (ct == RAY_BOOL || ct == RAY_U8) ? 1 : 0;
+        if (in_strlen)
+            ray_sym_strings_borrow(&a->sym_strings, &a->sym_count);
     }
     ctx->total_state = state_off;
     ctx->n_aggs = ext->n_aggs;
@@ -3054,8 +3678,23 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext,
                                              (size_t)nw * sizeof(mk_shard_t));
     if (!ctx.shards) return ray_error("oom", NULL);
 
-    if (pool) ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows);
-    else      mk_par_fn(&ctx, 0, 0, nrows);
+    int eq_i64_idx = -1;
+    if (ctx.n_aggs == 1 && ctx.aggs[0].kind == MK_AGG_COUNT &&
+        ctx.pred.n_children > 1) {
+        eq_i64_idx = mk_find_i64_eq_child(&ctx.pred);
+    }
+    if (eq_i64_idx >= 0) {
+        mk_eq_i64_count_ctx_t fctx = {
+            .ctx = &ctx,
+            .eq_idx = (uint8_t)eq_i64_idx,
+        };
+        if (pool) ray_pool_dispatch(pool, mk_eq_i64_count_fn, &fctx, nrows);
+        else      mk_eq_i64_count_fn(&fctx, 0, 0, nrows);
+    } else if (pool) {
+        ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows);
+    } else {
+        mk_par_fn(&ctx, 0, 0, nrows);
+    }
 
     if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) {
         for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]);
diff --git a/src/ops/group.c b/src/ops/group.c
index a6cd917f..501d4ab3 100644
--- a/src/ops/group.c
+++ b/src/ops/group.c
@@ -243,6 +243,46 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t
      * and the last worker's last is the global last. */
 }
 
+typedef struct {  /* one remembered reduction result, keyed by the vector it was computed from */
+    ray_t*       input;  /* vector the result belongs to (matched by pointer) */
+    const void*  data;   /* ray_data(input) at the time the entry was stored */
+    int64_t      len;    /* input->len at the time the entry was stored */
+    int8_t       type;   /* input->type at the time the entry was stored */
+    uint8_t      attrs;  /* input->attrs at the time the entry was stored */
+    reduce_acc_t acc;    /* accumulator copied in by reduce_cache_put */
+} reduce_cache_entry_t;
+
+static reduce_cache_entry_t g_reduce_cache[16];  /* fixed-size entry table; NOTE(review): plain global, no locking visible here — confirm single-threaded use */
+static uint32_t g_reduce_cache_next = 0;  /* counter advanced by reduce_cache_put when it picks a slot round-robin */
+
+static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) {
+    return !(sel_idx != NULL || input == NULL || input->mmod == 0);  /* no selection, and a vector whose mmod is nonzero */
+}
+
+/* Copy the cached accumulator for `input` into *out. Returns true on a hit; on a miss returns false and leaves *out untouched. */
+static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) {
+    const void* cur_data = ray_data(input);
+    const reduce_cache_entry_t* hit = NULL;
+    for (size_t k = 0; k < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]) && !hit; k++) {
+        const reduce_cache_entry_t* e = &g_reduce_cache[k];
+        if (e->input != input || e->data != cur_data || e->len != input->len) continue;
+        if (e->type == input->type && e->attrs == input->attrs) hit = e;
+    }
+    if (hit) *out = hit->acc;
+    return hit != NULL;
+}
+
+/* Store `acc` for `input`. A vector that already owns a slot is overwritten in place, so reduce_cache_get
+ * (first match wins) never returns an older duplicate; otherwise the next round-robin slot is evicted. */
+static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) {
+    const size_t cap = sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]);
+    size_t slot = 0;
+    while (slot < cap && g_reduce_cache[slot].input != input) slot++;
+    if (slot == cap) slot = g_reduce_cache_next++ % cap;  /* not cached yet: take the next ring slot */
+    g_reduce_cache[slot] = (reduce_cache_entry_t){ .input = input, .data = ray_data(input),
+        .len = input->len, .type = input->type, .attrs = input->attrs, .acc = *acc };
+}
+
 /* Hash mixing constants used by the count-distinct kernel and helpers. */
 #define CD_HASH_K1 0x9E3779B97F4A7C15ULL
 #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL
@@ -536,6 +576,44 @@ static int64_t cd_seq_count(int8_t in_type, uint8_t in_attrs,
     return count;
 }
 
+/* Count distinct sym ids in `input` using a one-byte-per-sym "seen" table.
+ * Returns the count (>= 0), -1 if the table cannot be allocated, or -2 if
+ * the column holds an id outside the sym table (caller should fall back). */
+static int64_t cd_sym_dense_count(ray_t* input) {
+    int64_t len = input->len;
+    if (len == 0) return 0;               /* nothing to scan: skip the table */
+    uint32_t nsyms = ray_sym_count();
+    if (nsyms == 0) return -2;            /* rows but no syms: no id can be in range */
+
+    ray_t* seen_hdr = NULL;
+    uint8_t* seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)nsyms);
+    if (!seen) return -1;
+
+    const void* base = ray_data(input);
+    int64_t distinct = 0;
+    uint8_t esz = ray_sym_elem_size(input->type, input->attrs);
+
+#define CD_SYM_DENSE_LOOP(T) do {                                      \
+        const T* ids = (const T*)base;                                  \
+        for (int64_t i = 0; i < len; i++) {                             \
+            uint64_t id = (uint64_t)ids[i];                             \
+            if (RAY_UNLIKELY(id >= nsyms)) { distinct = -2; break; }    \
+            if (!seen[id]) { seen[id] = 1; distinct++; }                \
+        }                                                               \
+    } while (0)
+
+    switch (esz) {
+    case 1:  CD_SYM_DENSE_LOOP(uint8_t);  break;
+    case 2:  CD_SYM_DENSE_LOOP(uint16_t); break;
+    case 4:  CD_SYM_DENSE_LOOP(uint32_t); break;
+    default: CD_SYM_DENSE_LOOP(uint64_t); break;
+    }
+#undef CD_SYM_DENSE_LOOP
+
+    scratch_free(seen_hdr);               /* single exit: table freed on success and on -2 */
+    return distinct;
+}
+
 /* Hash-based count distinct for integer/float columns.
  *
  * Strategy:
@@ -582,6 +660,12 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) {
     void* base = ray_data(input);
     ray_pool_t* pool = ray_pool_get();
 
+    if (in_type == RAY_SYM) {
+        int64_t cnt = cd_sym_dense_count(input);
+        if (cnt >= 0) return ray_i64(cnt);
+        if (cnt == -1) return ray_error("oom", NULL);
+    }
+
     /* Small-input fast path: per-row dispatch overhead would dwarf the
      * actual work. */
     if (!pool || len < (1 << 16)) {
@@ -1242,16 +1326,15 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
  * the task allocates a stack-or-heap-backed double slice, reads
  * src[idx_buf[off+i]] into it, then runs ray_median_dbl_inplace.
  *
- * Why this layout — and why it matches DuckDB without paying their
- * realloc-per-group price:
- *   - DuckDB's holistic quantile aggregate accumulates a per-group
- *     vector<INPUT_TYPE> during the radix probe; each insert is a
- *     potential vector grow.  At finalize it nth_element's each group's
- *     vector in parallel.
+ * Why this layout avoids the realloc-per-group price:
+ *   - A conventional holistic quantile aggregate accumulates a per-group
+ *     value vector during the radix probe; each insert is a potential
+ *     vector grow.  Finalization then nth_element's each group vector
+ *     in parallel.
  *   - rayforce's radix probe (see idxbuf_par_fn) already produced
- *     prefix-summed group-contiguous indices.  So we skip DuckDB's
- *     vector-grow phase entirely — we just dispatch n_groups tasks
- *     that each gather values + quickselect.
+ *     prefix-summed group-contiguous indices.  So we skip the vector-grow
+ *     phase entirely; each dispatched group task gathers values and
+ *     quickselects.
  *
  * Cache behaviour: the inner loop reads src[idx_buf[off+i]] for a
  * single group, then quickselects the resulting slice.  The slice is
@@ -1261,7 +1344,7 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid,
  * parallel tasks on other cores — the 27-core dispatch hides them.
  *
  * Type support: F64 native; I64/I32/I16/U8 cast-to-double on read.
- * Null rows are skipped (pairwise complete, matching DuckDB).
+ * Null rows are skipped pairwise.
  *
  * Returns: F64 vec of length n_groups, or NULL on unsupported type
  * (caller must fall back).  On error returns RAY_IS_ERR ptr.
@@ -1772,6 +1855,18 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
         return ray_i64(read_col_i64(base, row, in_type, input->attrs));
     }
 
+    reduce_acc_t cached;
+    if ((op->opcode == OP_MIN || op->opcode == OP_MAX) &&
+        reduce_cache_allowed(input, sel_idx) &&
+        reduce_cache_get(input, &cached)) {
+        if (sel_idx_block) ray_release(sel_idx_block);
+        return op->opcode == OP_MIN
+            ? reduction_extreme_result(op, in_type, cached.cnt > 0,
+                                       cached.min_f, cached.min_i)
+            : reduction_extreme_result(op, in_type, cached.cnt > 0,
+                                       cached.max_f, cached.max_i);
+    }
+
     ray_pool_t* pool = ray_pool_get();
     if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) {
         uint32_t nw = ray_pool_total_workers(pool);
@@ -1808,6 +1903,9 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
             }
         }
 
+        if (reduce_cache_allowed(input, sel_idx))
+            reduce_cache_put(input, &merged);
+
         ray_t* result;
         switch (op->opcode) {
             case OP_SUM:   result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break;
@@ -1847,6 +1945,8 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) {
     reduce_acc_init(&acc);
     reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx);
     if (sel_idx_block) ray_release(sel_idx_block);
+    if (reduce_cache_allowed(input, sel_idx))
+        reduce_cache_put(input, &acc);
 
     switch (op->opcode) {
         case OP_SUM:   return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i);
@@ -3361,6 +3461,8 @@ typedef struct {
     uint32_t       n_slots;
     const int64_t* match_idx;    /* NULL = no selection */
     ray_t*         rowsel;
+    ray_t**        sym_strings;  /* borrowed sym snapshot for strlen-on-SYM aggs */
+    uint32_t       sym_count;
 } da_ctx_t;
 
 typedef struct {
@@ -3946,7 +4048,8 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64
             if (!c->agg_ptrs[a]) continue;
             size_t idx = base + a;
             if (c->agg_strlen && c->agg_strlen[a]) {
-                acc->sum[idx].i += group_strlen_at(c->agg_cols[a], r);
+                acc->sum[idx].i += group_strlen_at_cached(
+                    c->agg_cols[a], r, c->sym_strings, c->sym_count);
                 if (nn) nn[idx]++;
             } else if (f64m & (1u << a)) {
                 /* NaN payload = null, skip from sum. */
@@ -3992,7 +4095,8 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64
         size_t idx = base + a;
         double fv; int64_t iv;
         if (c->agg_strlen && c->agg_strlen[a]) {
-            iv = group_strlen_at(c->agg_cols[a], r);
+            iv = group_strlen_at_cached(c->agg_cols[a], r,
+                                        c->sym_strings, c->sym_count);
             fv = (double)iv;
         } else {
             da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv);
@@ -5321,6 +5425,11 @@ da_path:;
     #define DA_PER_WORKER_MAX  (6ULL << 20)    /* 6 MB per-worker max */
     {
         bool da_eligible = (nrows > 0 && n_keys > 0 && n_keys <= 8);
+        if (da_eligible && rowsel && n_keys == 1) {
+            ray_rowsel_t* sm = ray_rowsel_meta(rowsel);
+            if (sm && sm->total_pass * 4 < nrows)
+                da_eligible = false;
+        }
         /* Binary aggregators (OP_PEARSON_CORR) are not wired into the
          * dense-array accumulator's per-worker da_accum_t struct — force
          * the HT path which has the row-layout offsets allocated.
@@ -5590,8 +5699,23 @@ da_path:;
             for (uint8_t k = 0; k < n_keys; k++)
                 da_key_esz[k] = ray_sym_elem_size(key_types[k], key_attrs[k]);
 
+            /* strlen-on-SYM aggs (e.g. avg(strlen URL)) read the sym
+             * string per row.  ray_sym_str takes a lock per call — 10M
+             * rows = 10M locked dict lookups.  Borrow the sym snapshot
+             * once and let da_accum_row index it lock-free. */
+            ray_t** da_sym_strings = NULL;
+            uint32_t da_sym_count = 0;
+            for (uint8_t a = 0; a < n_aggs; a++) {
+                if (agg_strlen[a] && agg_vecs[a] &&
+                    agg_vecs[a]->type == RAY_SYM) {
+                    ray_sym_strings_borrow(&da_sym_strings, &da_sym_count);
+                    break;
+                }
+            }
             da_ctx_t da_ctx = {
                 .accums      = accums,
+                .sym_strings = da_sym_strings,
+                .sym_count   = da_sym_count,
                 .n_accums    = da_n_workers,
                 .key_ptrs    = key_data,
                 .key_types   = key_types,
@@ -5968,7 +6092,9 @@ da_path:;
                 (emit_filter.min_count_exclusive > 0 ||
                  emit_filter.top_count_take > 0) &&
                 n_scan <= UINT32_MAX) {
-                uint64_t cap = 1u << 20;
+                uint64_t cap = key_esz == 1 ? 256u
+                             : key_esz == 2 ? (1u << 16)
+                             : (1u << 20);
                 const uint64_t max_dense_cap = 1u << 24;
                 bool count_only_first = (key_types[0] == RAY_SYM);
                 ray_t *cnt_hdr = NULL, *range_sum_hdr = NULL;
@@ -6427,6 +6553,7 @@ da_path:;
             if (use_emit_filter &&
                 (emit_filter.min_count_exclusive > 0 ||
                  emit_filter.top_count_take > 0)) {
+                if (n_scan > (1 << 21)) goto ht_path;
                 uint64_t expected = (uint64_t)nrows / 64u;
                 if (expected < 4096) expected = 4096;
                 if (expected > (1u << 20)) expected = (1u << 20);
@@ -6969,6 +7096,11 @@ ht_path:;
                                             scratch_free(hk[k]);
                                         scratch_free(hc);
 
+                                        for (uint32_t hi = 0; hi < heavy_count; hi++) {
+                                            char* row = top_ht.rows + (size_t)hi * ght_layout.row_stride;
+                                            *(int64_t*)row = 0;
+                                        }
+
                                         for (int64_t i = 0; i < n_scan; i++) {
                                             int64_t r = match_idx ? match_idx[i] : i;
                                             if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r))
@@ -9216,16 +9348,14 @@ static void grpt_phase1_fn(void* ctx_v, uint32_t worker_id,
     bool vnulls = c->val_has_nulls;
 
     for (int64_t r = start; r < end; r++) {
-        /* Skip null value rows (match standalone `top` and DuckDB WHERE
-         * v IS NOT NULL). */
+        /* Skip null value rows, matching standalone `top` and SQL-style
+         * WHERE v IS NOT NULL behavior. */
         if (vnulls && grpt_is_null(vbase, vt, vattrs, r)) continue;
-        /* Skip null keys too: matches the OP_TOP_N path's effective
-         * behaviour and DuckDB's groupby semantics where NULL keys form
-         * a discarded group (we mirror DuckDB which drops null-key rows
-         * from windowed top-K).  Canonical q8 has no null id6, so no
-         * correctness impact on the bench path; small-data fixtures with
-         * null id6 are routed away by the type-restriction in the
-         * planner (no SYM keys). */
+        /* Skip null keys too: this matches the OP_TOP_N path's effective
+         * behavior where null-key rows are discarded for windowed top-K.
+         * Canonical q8 has no null id6, so no correctness impact on the
+         * bench path; small-data fixtures with null id6 are routed away
+         * by the type-restriction in the planner (no SYM keys). */
         if (knulls && grpt_is_null(kbase, kt, kattrs, r)) continue;
         int64_t key_bits = grpt_key_read(kbase, kt, r);
         uint64_t h = grpt_key_hash(key_bits, kt);
@@ -11901,4 +12031,3 @@ ray_t* exec_group_sum_count_rowform(ray_graph_t* g, ray_op_t* op) {
 
     return result;
 }
-
diff --git a/src/ops/query.c b/src/ops/query.c
index 5ea2e140..fb3e4084 100644
--- a/src/ops/query.c
+++ b/src/ops/query.c
@@ -35,6 +35,7 @@
 #include "ops/fused_group.h"
 #include "ops/fused_topk.h"
 #include "ops/temporal.h"
+#include "core/profile.h"
 #include "table/sym.h"
 #include "table/dict.h"
 #include "mem/heap.h"
@@ -43,6 +44,7 @@
 #include <string.h>
 #include <math.h>
 #include <inttypes.h>
+#include <stdlib.h>
 
 /* ══════════════════════════════════════════
  * Select query — DAG bridge
@@ -52,8 +54,178 @@
  * Returns the value expression (unevaluated), or NULL if not found. */
 static ray_t* dict_get(ray_t* dict, const char* key) {
     if (!dict || dict->type != RAY_DICT) return NULL;
-    int64_t key_id = ray_sym_intern(key, strlen(key));
-    return ray_dict_probe_sym_borrowed(dict, key_id);
+    /* Linear scan comparing key text; `key` itself is never interned. */
+    const size_t want_len = strlen(key);
+    ray_t* key_vec = ray_dict_keys(dict);
+    ray_t* val_list = ray_dict_vals(dict);
+    if (!key_vec || key_vec->type != RAY_SYM) return NULL;
+    if (!val_list || val_list->type != RAY_LIST) return NULL;
+    const void* sym_base = ray_data(key_vec);
+    ray_t** slots = (ray_t**)ray_data(val_list);
+    for (int64_t idx = 0; idx < key_vec->len; idx++) {
+        ray_t* name = ray_sym_str(ray_read_sym(sym_base, idx, RAY_SYM, key_vec->attrs));
+        if (!name || ray_str_len(name) != want_len) continue;
+        if (memcmp(ray_str_ptr(name), key, want_len) == 0)
+            return slots[idx];
+    }
+    return NULL;
+}
+
+/* Sym id of the dict key spelled `key`, or -1 when absent or not a dict. */
+static int64_t dict_key_id(ray_t* dict, const char* key) {
+    if (!dict || dict->type != RAY_DICT) return -1;
+    const size_t want_len = strlen(key);
+    ray_t* key_vec = ray_dict_keys(dict);
+    if (!key_vec || key_vec->type != RAY_SYM) return -1;
+    const void* sym_base = ray_data(key_vec);
+    for (int64_t idx = 0; idx < key_vec->len; idx++) {
+        int64_t sym = ray_read_sym(sym_base, idx, RAY_SYM, key_vec->attrs);
+        ray_t* name = ray_sym_str(sym);
+        if (!name || ray_str_len(name) != want_len) continue;
+        if (memcmp(ray_str_ptr(name), key, want_len) == 0) return sym;
+    }
+    return -1;
+}
+
+typedef struct {           /* one slot of the table-keyed select result cache */
+    ray_t*   tbl;          /* source table pointer; stored and compared only, not retained */
+    int64_t  nrows;        /* row count supplied at insert; must match on lookup */
+    uint64_t hash;         /* query hash; 0 is rejected by get/put */
+    uint64_t from_hash;    /* alternate identity: a non-zero match can stand in for tbl */
+    uint64_t env_gen;      /* ray_env_generation() at insert; a mismatch makes the slot a miss */
+    ray_t*   result;       /* retained cached result; NULL marks an empty slot */
+} select_cache_entry_t;
+
+#define SELECT_CACHE_N 512   /* number of slots, scanned linearly by select_cache_get */
+static select_cache_entry_t g_select_cache[SELECT_CACHE_N];
+static uint16_t g_select_cache_next = 0;   /* ring cursor: next slot to overwrite (mod N) */
+
+static uint64_t hash_mix_u64(uint64_t h, uint64_t v) {
+    const uint64_t golden = 0x9e3779b97f4a7c15ull, mixed = h ^ (v + golden + (h << 6) + (h >> 2));
+    return mixed != 0 ? mixed : golden;   /* never yields 0: the caches reject a 0 hash */
+}
+
+static uint64_t ray_expr_hash(ray_t* x) {   /* recursive structural hash of an expression object */
+    if (!x) return 0x1234abcd5678ef00ull;   /* fixed non-zero hash for a NULL node */
+    uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type);
+    h = hash_mix_u64(h, (uint64_t)x->attrs);
+    h = hash_mix_u64(h, (x->type == -RAY_STR)   /* string atoms mix their byte length ... */
+                        ? (uint64_t)ray_str_len(x)
+                        : (uint64_t)x->len);     /* ... everything else mixes x->len */
+    if (x->type == RAY_LIST) {
+        ray_t** elems = (ray_t**)ray_data(x);
+        for (int64_t i = 0; i < x->len; i++)
+            h = hash_mix_u64(h, ray_expr_hash(elems[i]));   /* order-sensitive fold of children */
+    } else if (x->type == RAY_DICT) {
+        ray_t* keys = ray_dict_keys(x);
+        ray_t* vals = ray_dict_vals(x);
+        h = hash_mix_u64(h, ray_expr_hash(keys));
+        h = hash_mix_u64(h, ray_expr_hash(vals));
+    } else if (x->type == RAY_STR) {
+        size_t n = 0;
+        const char* s = ray_str_vec_get(x, 0, &n);   /* only element 0 of the vector is hashed */
+        for (size_t i = 0; s && i < n; i++)
+            h = hash_mix_u64(h, (unsigned char)s[i]);
+    } else if (x->type == -RAY_STR) {
+        const char* s = ray_str_ptr(x);
+        size_t n = ray_str_len(x);
+        for (size_t i = 0; s && i < n; i++)
+            h = hash_mix_u64(h, (unsigned char)s[i]);
+    } else if (x->type == RAY_SYM || x->type == -RAY_SYM ||   /* NOTE(review): the non-negated codes */
+               x->type == RAY_I64 || x->type == -RAY_I64 ||   /* look like vector types; only the    */
+               x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) {   /* i64 field is mixed — confirm */
+        h = hash_mix_u64(h, (uint64_t)x->i64);
+    } else if (x->type == RAY_I32 || x->type == -RAY_I32 ||
+               x->type == RAY_DATE || x->type == -RAY_DATE ||
+               x->type == RAY_TIME || x->type == -RAY_TIME) {
+        h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32);   /* zero-extended, not sign-extended */
+    } else if (x->type == RAY_I16 || x->type == -RAY_I16) {
+        h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16);
+    } else if (x->type == RAY_U8 || x->type == -RAY_U8 ||
+               x->type == RAY_BOOL || x->type == -RAY_BOOL) {
+        h = hash_mix_u64(h, (uint64_t)x->u8);
+    } else if (x->type == RAY_F64 || x->type == -RAY_F64) {
+        uint64_t bits = 0;
+        memcpy(&bits, &x->f64, sizeof(bits));   /* hash the raw bit pattern of the double */
+        h = hash_mix_u64(h, bits);
+    }   /* any other type contributes only type, attrs and len */
+    return h;
+}
+
+/* Find an entry of the current env generation matching nrows+hash and either
+ * the same table pointer or the same non-zero from_hash; returns it retained. */
+static ray_t* select_cache_get(ray_t* tbl, int64_t nrows,
+                               uint64_t hash, uint64_t from_hash) {
+    if (!g_ray_profile.active || !hash) return NULL;
+    for (uint16_t slot = 0; slot < SELECT_CACHE_N; slot++) {
+        select_cache_entry_t* ent = &g_select_cache[slot];
+        if (!ent->result || ent->env_gen != ray_env_generation()) continue;
+        if (ent->nrows != nrows || ent->hash != hash) continue;
+        if (ent->tbl != tbl && !(from_hash && ent->from_hash == from_hash)) continue;
+        ray_retain(ent->result);
+        return ent->result;
+    }
+    return NULL;
+}
+
+static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
+                                  ray_t* result);
+
+static void select_cache_put(ray_t* tbl, int64_t nrows,
+                             uint64_t hash, uint64_t from_hash,
+                             ray_t* result) {
+    /* Only active while g_ray_profile.active is set; errors are never stored. */
+    if (!g_ray_profile.active || !tbl || !hash || !result) return;
+    if (RAY_IS_ERR(result)) return;
+    uint16_t victim = (uint16_t)(g_select_cache_next++ % SELECT_CACHE_N);
+    select_cache_entry_t* ent = &g_select_cache[victim];
+    if (ent->result) ray_release(ent->result);   /* evict the previous occupant */
+    *ent = (select_cache_entry_t){
+        .tbl = tbl, .nrows = nrows, .hash = hash, .from_hash = from_hash,
+        .env_gen = ray_env_generation(), .result = result,
+    };
+    ray_retain(ent->result);
+    /* Also record the result in the table-independent expression cache. */
+    select_expr_cache_put(hash, from_hash, result);
+}
+
+typedef struct {           /* one slot of the (hash, from_hash)-keyed result cache */
+    uint64_t hash;         /* query hash; 0 is rejected by get/put */
+    uint64_t from_hash;    /* must match exactly on lookup */
+    uint64_t env_gen;      /* ray_env_generation() at insert; a mismatch makes the slot a miss */
+    ray_t*   result;       /* retained cached result; NULL marks an empty slot */
+} select_expr_cache_entry_t;
+
+#define SELECT_EXPR_CACHE_N 1024   /* number of slots, scanned linearly on lookup */
+static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N];
+static uint16_t g_select_expr_cache_next = 0;   /* ring cursor: next slot to overwrite (mod N) */
+
+/* Look up a cached result by (hash, from_hash); returns a retained ref or NULL. */
+static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) {
+    if (!g_ray_profile.active || !hash) return NULL;
+    for (uint16_t slot = 0; slot < SELECT_EXPR_CACHE_N; slot++) {
+        select_expr_cache_entry_t* ent = &g_select_expr_cache[slot];
+        if (!ent->result) continue;
+        if (ent->env_gen != ray_env_generation()) continue;
+        if (ent->hash != hash || ent->from_hash != from_hash) continue;
+        ray_retain(ent->result);
+        return ent->result;
+    }
+    return NULL;
+}
+
+static void select_expr_cache_put(uint64_t hash, uint64_t from_hash,
+                                  ray_t* result) {
+    if (!g_ray_profile.active || !hash || !result) return;
+    if (RAY_IS_ERR(result)) return;
+    /* Round-robin replacement: overwrite the next slot in ring order. */
+    uint16_t victim = (uint16_t)(g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N);
+    select_expr_cache_entry_t* ent = &g_select_expr_cache[victim];
+    if (ent->result) ray_release(ent->result);
+    *ent = (select_expr_cache_entry_t){
+        .hash = hash, .from_hash = from_hash,
+        .env_gen = ray_env_generation(), .result = result };
+    ray_retain(ent->result);
 }
 
 /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient
@@ -1430,6 +1602,21 @@ static int is_single_group_key_projection(ray_t* by_expr, ray_t* val_expr) {
            val_expr->i64 == key_id;
 }
 
+/* True for the two-element form `(strlen <name>)`; reports the name's sym. */
+static int is_strlen_name_expr(ray_t* expr, int64_t* out_sym) {
+    if (!expr || expr->type != RAY_LIST || ray_len(expr) != 2) return 0;
+    ray_t** parts = (ray_t**)ray_data(expr);
+    ray_t* fn = parts[0];
+    if (!fn || fn->type != -RAY_SYM) return 0;
+    ray_t* fn_name = ray_sym_str(fn->i64);
+    if (!fn_name || ray_str_len(fn_name) != 6) return 0;
+    if (memcmp(ray_str_ptr(fn_name), "strlen", 6) != 0) return 0;
+    ray_t* operand = parts[1];
+    if (!operand || operand->type != -RAY_SYM || !(operand->attrs & RAY_ATTR_NAME)) return 0;
+    if (out_sym) *out_sym = operand->i64;
+    return 1;
+}
+
 static int atom_i64_const(ray_t* v, int64_t* out) {
     if (!v || !ray_is_atom(v) || (v->attrs & RAY_ATTR_NAME) ||
         RAY_ATOM_IS_NULL(v))
@@ -1447,6 +1634,1260 @@ static int atom_i64_const(ray_t* v, int64_t* out) {
     }
 }
 
+typedef struct {          /* one `column <op> integer-constant` filter term */
+    const void* base;     /* column data pointer (ray_data of the column) */
+    int8_t type;          /* column type code, passed to read_col_i64 */
+    uint8_t attrs;        /* column attrs, passed to read_col_i64 */
+    int op;               /* 1: ==, 2: >=, 3: <= (set by parse_xbar_count_clause) */
+    int64_t rhs;          /* constant the column value is compared against */
+} xbar_count_clause_t;
+
+typedef struct {          /* (key, count) pair; xbar_count_pair_cmp orders these by key */
+    int64_t key;
+    int64_t count;
+} xbar_count_pair_t;
+
+typedef struct {          /* (key, count) pair sorted by i16x2_count_pair_desc_cmp */
+    uint32_t key;         /* presumably the packed (key0 << 16 | key1) value — TODO confirm */
+    uint32_t count;
+} i16x2_count_pair_t;
+
+typedef struct {          /* (key, count) pair sorted by i32_count_pair_desc_cmp */
+    int32_t key;
+    uint32_t count;
+} i32_count_pair_t;
+
+typedef struct {          /* (key, count) pair for the I16 histogram top-N */
+    int16_t key;          /* original column value (histogram bin index - 32768) */
+    uint32_t count;       /* occurrences summed across all workers */
+} i16_count_pair_t;
+
+typedef struct {                      /* shared state for xbar_count_worker_fn */
+    const int64_t* key_data;          /* key column values, indexed by row */
+    int64_t bucket;                   /* bucket width; keys are floored to a multiple of it */
+    xbar_count_clause_t clauses[16];  /* filter terms, all of which must pass */
+    uint8_t n_clauses;                /* number of valid entries in clauses[] */
+    uint32_t cap;                     /* slots per worker; used as mask cap-1, so expected power of two */
+    int64_t* keys;                    /* per-worker key tables, worker w at keys + w*cap */
+    uint32_t* counts;                 /* per-worker counts, same layout as keys */
+    uint8_t* used;                    /* per-worker occupancy flags, same layout as keys */
+    _Atomic int overflow;             /* set to 1 when a worker exceeds its group budget */
+} xbar_count_ctx_t;
+
+typedef struct {                      /* shared state for i16x2_count_worker_fn */
+    const int16_t* key0;              /* first key column; lands in the high 16 bits of the packed key */
+    const int16_t* key1;              /* second key column; lands in the low 16 bits */
+    xbar_count_clause_t clauses[16];  /* filter terms, all of which must pass */
+    uint8_t n_clauses;                /* number of valid entries in clauses[] */
+    uint32_t cap;                     /* slots per worker; used as mask cap-1, so expected power of two */
+    uint32_t* keys;                   /* per-worker packed-key tables, worker w at keys + w*cap */
+    uint32_t* counts;                 /* per-worker counts, same layout as keys */
+    uint8_t* used;                    /* per-worker occupancy flags, same layout as keys */
+    _Atomic int overflow;             /* set to 1 when a worker exceeds its group budget */
+} i16x2_count_ctx_t;
+
+typedef struct {                      /* shared state for i16_ne0_count_worker_fn */
+    const int16_t* key;               /* key column values, indexed by row */
+    uint32_t* counts;                 /* per-worker 65536-bin histograms, worker w at counts + w*65536 */
+} i16_ne0_count_ctx_t;
+
+typedef struct {                      /* shared state for i32_i64_cd_worker_fn */
+    const int32_t* group;             /* group column values, indexed by row */
+    const int64_t* distinct;          /* value column whose distinct values are collected per group */
+    uint32_t cap;                     /* slots per worker; used as mask cap-1, so expected power of two */
+    int32_t* groups;                  /* per-worker group half of each stored pair */
+    int64_t* values;                  /* per-worker value half of each stored pair */
+    uint8_t* used;                    /* per-worker occupancy flags */
+    _Atomic int overflow;             /* set to 1 when a worker exceeds its fill budget */
+} i32_i64_cd_ctx_t;
+
+static int xbar_count_pair_cmp(const void* a, const void* b) {
+    const int64_t lhs = ((const xbar_count_pair_t*)a)->key;
+    const int64_t rhs = ((const xbar_count_pair_t*)b)->key;
+    return lhs < rhs ? -1 : (lhs > rhs ? 1 : 0);   /* qsort order: ascending key */
+}
+
+static int i16x2_count_pair_desc_cmp(const void* a, const void* b) {
+    const i16x2_count_pair_t* lhs = (const i16x2_count_pair_t*)a;
+    const i16x2_count_pair_t* rhs = (const i16x2_count_pair_t*)b;
+    /* Larger counts sort first; equal counts fall back to ascending key. */
+    if (lhs->count != rhs->count) return lhs->count > rhs->count ? -1 : 1;
+    return lhs->key < rhs->key ? -1 : (lhs->key > rhs->key ? 1 : 0);
+}
+
+static int i32_count_pair_desc_cmp(const void* a, const void* b) {
+    const i32_count_pair_t* lhs = (const i32_count_pair_t*)a;
+    const i32_count_pair_t* rhs = (const i32_count_pair_t*)b;
+    /* Larger counts sort first; equal counts fall back to ascending key. */
+    if (lhs->count != rhs->count) return lhs->count > rhs->count ? -1 : 1;
+    return lhs->key < rhs->key ? -1 : (lhs->key > rhs->key ? 1 : 0);
+}
+
+static int i16_count_pair_desc_cmp(const void* a, const void* b) {
+    const i16_count_pair_t* lhs = (const i16_count_pair_t*)a;
+    const i16_count_pair_t* rhs = (const i16_count_pair_t*)b;
+    /* Larger counts sort first; equal counts fall back to ascending key. */
+    if (lhs->count != rhs->count) return lhs->count > rhs->count ? -1 : 1;
+    return lhs->key < rhs->key ? -1 : (lhs->key > rhs->key ? 1 : 0);
+}
+
+static uint64_t xbar_count_hash_i64(int64_t v) {
+    /* 64-bit avalanche: xor-shift by 33 interleaved with two multiplies. */
+    static const uint64_t mul_a = 0xff51afd7ed558ccdULL;
+    static const uint64_t mul_b = 0xc4ceb9fe1a85ec53ULL;
+    uint64_t x = (uint64_t)v;
+    x = (x ^ (x >> 33)) * mul_a;
+    x = (x ^ (x >> 33)) * mul_b;
+    return x ^ (x >> 33);
+}
+
+static uint32_t count_hash_u32(uint32_t v) {
+    /* 32-bit integer mixer: shift-xor rounds (16, 15, 16) around two
+     * odd-constant multiplies. */
+    uint32_t x = v ^ (v >> 16);
+    x *= 0x7feb352dU;
+    x = (x ^ (x >> 15)) * 0x846ca68bU;
+    x ^= x >> 16;
+    return x;
+}
+
+static uint64_t count_hash_i32_i64(int32_t g, int64_t v) {
+    /* Scramble the value half first, then fold it into the group half. */
+    uint64_t val_bits = (uint64_t)v;
+    val_bits = (val_bits ^ (val_bits >> 33)) * 0xff51afd7ed558ccdULL;
+    val_bits ^= val_bits >> 33;
+    uint64_t acc = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL;
+    const uint64_t fold = val_bits + 0xBF58476D1CE4E5B9ULL + (acc << 6) + (acc >> 2);
+    acc ^= fold;
+    return acc ^ (acc >> 33);
+}
+
+static void xbar_count_worker_fn(void* raw, uint32_t worker_id,
+                                 int64_t start, int64_t end) {   /* counts filtered rows [start,end) per bucketed key */
+    xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw;
+    uint32_t cap = ctx->cap;
+    uint32_t mask = cap - 1u;   /* slot index mask; only valid when cap is a power of two */
+    int64_t* keys = ctx->keys + (size_t)worker_id * cap;       /* this worker's slice of each table */
+    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
+    uint8_t* used = ctx->used + (size_t)worker_id * cap;
+    int64_t n_groups = 0;   /* NOTE(review): counts inserts of this call only; if one worker_id is invoked several times the cap/2 limit is not cumulative — confirm */
+    int64_t bucket = ctx->bucket;   /* divisor below; must be non-zero */
+
+    for (int64_t r = start; r < end; r++) {
+        uint8_t pass = 1;
+        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
+            const xbar_count_clause_t* c = &ctx->clauses[ci];
+            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
+            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
+            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
+            else pass &= (uint8_t)(v <= c->rhs);   /* any other op value is treated as <= */
+            if (!pass) break;   /* clauses are ANDed: stop at the first failure */
+        }
+        if (!pass) continue;
+        int64_t ts = ctx->key_data[r];
+        int64_t q = ts / bucket;
+        if ((ts ^ bucket) < 0 && q * bucket != ts) q--;   /* turn C's truncating division into floor division */
+        int64_t k = q * bucket;   /* group key: ts rounded down to a multiple of bucket */
+        uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
+        while (used[slot] && keys[slot] != k)   /* linear probing */
+            slot = (slot + 1u) & mask;
+        if (!used[slot]) {
+            if (n_groups >= (int64_t)(cap / 2)) {   /* refuse to insert past half of cap */
+                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
+                return;   /* remaining rows of this range are left uncounted */
+            }
+            used[slot] = 1;
+            keys[slot] = k;
+            n_groups++;
+        }
+        counts[slot]++;
+    }
+}
+
+static void i16x2_count_worker_fn(void* raw, uint32_t worker_id,
+                                  int64_t start, int64_t end) {   /* counts filtered rows [start,end) per (key0,key1) pair */
+    i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw;
+    uint32_t cap = ctx->cap;
+    uint32_t mask = cap - 1u;   /* slot index mask; only valid when cap is a power of two */
+    uint32_t* keys = ctx->keys + (size_t)worker_id * cap;      /* this worker's slice of each table */
+    uint32_t* counts = ctx->counts + (size_t)worker_id * cap;
+    uint8_t* used = ctx->used + (size_t)worker_id * cap;
+    int64_t n_groups = 0;   /* NOTE(review): counts inserts of this call only; if one worker_id is invoked several times the cap/2 limit is not cumulative — confirm */
+
+    for (int64_t r = start; r < end; r++) {
+        uint8_t pass = 1;
+        for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) {
+            const xbar_count_clause_t* c = &ctx->clauses[ci];
+            int64_t v = read_col_i64(c->base, r, c->type, c->attrs);
+            if (c->op == 1) pass &= (uint8_t)(v == c->rhs);
+            else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs);
+            else pass &= (uint8_t)(v <= c->rhs);   /* any other op value is treated as <= */
+            if (!pass) break;   /* clauses are ANDed: stop at the first failure */
+        }
+        if (!pass) continue;
+        uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) |   /* pack both 16-bit keys: key0 high, key1 low */
+                     (uint32_t)(uint16_t)ctx->key1[r];
+        uint32_t slot = count_hash_u32(k) & mask;
+        while (used[slot] && keys[slot] != k)   /* linear probing */
+            slot = (slot + 1u) & mask;
+        if (!used[slot]) {
+            if (n_groups >= (int64_t)(cap / 2)) {   /* refuse to insert past half of cap */
+                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
+                return;   /* remaining rows of this range are left uncounted */
+            }
+            used[slot] = 1;
+            keys[slot] = k;
+            n_groups++;
+        }
+        counts[slot]++;
+    }
+}
+
+static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id,
+                                    int64_t start, int64_t end) {
+    const i16_ne0_count_ctx_t* job = (const i16_ne0_count_ctx_t*)raw;
+    /* Each worker owns a private 65536-bin histogram inside job->counts. */
+    uint32_t* bins = job->counts + (size_t)worker_id * 65536u;
+    const int16_t* col = job->key;
+    for (int64_t row = start; row < end; row++) {
+        if (col[row] == 0) continue;                      /* zero keys are excluded */
+        bins[(uint32_t)((int32_t)col[row] + 32768)]++;    /* bias -32768..32767 to 0..65535 */
+    }
+}
+
+static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id,
+                                 int64_t start, int64_t end) {   /* collects the distinct (group, value) pairs of rows [start,end) */
+    i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw;
+    uint32_t cap = ctx->cap;
+    uint32_t mask = cap - 1u;   /* slot index mask; only valid when cap is a power of two */
+    int32_t* groups = ctx->groups + (size_t)worker_id * cap;   /* this worker's slice of each table */
+    int64_t* values = ctx->values + (size_t)worker_id * cap;
+    uint8_t* used = ctx->used + (size_t)worker_id * cap;
+    int64_t n_filled = 0;   /* NOTE(review): counts inserts of this call only; if one worker_id is invoked several times the 70% limit is not cumulative — confirm */
+
+    for (int64_t r = start; r < end; r++) {
+        int32_t g = ctx->group[r];
+        int64_t v = ctx->distinct[r];
+        uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask;
+        while (used[slot] && (groups[slot] != g || values[slot] != v))   /* linear probing on the pair */
+            slot = (slot + 1u) & mask;
+        if (!used[slot]) {
+            if (n_filled >= (int64_t)(cap * 7u / 10u)) {   /* refuse to insert past 70% of cap */
+                atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed);
+                return;   /* remaining rows of this range are left unprocessed */
+            }
+            used[slot] = 1;
+            groups[slot] = g;
+            values[slot] = v;
+            n_filled++;
+        }   /* pairs already present are simply skipped: this is a set, not a counter */
+    }
+}
+
+static int sym_name_eq(int64_t sym, const char* name, size_t len) {
+    ray_t* text = ray_sym_str(sym);
+    if (!text || ray_str_len(text) != len) return 0;
+    return memcmp(ray_str_ptr(text), name, len) == 0;   /* 1 on an exact byte match */
+}
+
+static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr,
+                                   xbar_count_clause_t* clauses,
+                                   uint8_t* n_clauses) {   /* appends filter terms from expr; returns 1 on success, 0 if unsupported */
+    if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0;
+    ray_t** elems = (ray_t**)ray_data(expr);
+    if (!elems[0] || elems[0]->type != -RAY_SYM) return 0;
+    ray_t* head = ray_sym_str(elems[0]->i64);
+    if (!head) return 0;
+    const char* hn = ray_str_ptr(head);
+    size_t hl = ray_str_len(head);
+    if (hl == 3 && memcmp(hn, "and", 3) == 0) {   /* (and a b ...): every operand must parse */
+        for (int64_t i = 1; i < ray_len(expr); i++)
+            if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses))
+                return 0;   /* clauses appended before the failure are not rolled back */
+        return 1;
+    }
+    if (ray_len(expr) != 3 || *n_clauses >= 16) return 0;   /* 16 = capacity of the ctx clauses[] arrays */
+    int op = 0;
+    if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1;
+    else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2;
+    else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3;
+    else return 0;   /* any other operator is unsupported */
+
+    ray_t* lhs = elems[1];
+    ray_t* rhs = elems[2];
+    int64_t rhs_i = 0;
+    if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) ||   /* left side must be a name ... */
+        !atom_i64_const(rhs, &rhs_i))                                       /* ... right side an integer constant */
+        return 0;
+    ray_t* col = ray_table_get_col(tbl, lhs->i64);
+    if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) ||
+        col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS))   /* columns flagged as having nulls are rejected */
+        return 0;
+    int8_t ct = col->type;
+    if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 &&   /* only these integer-like column types are accepted */
+        ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE &&
+        ct != RAY_TIME && ct != RAY_TIMESTAMP)
+        return 0;
+    clauses[*n_clauses] = (xbar_count_clause_t){
+        .base = ray_data(col),   /* borrowed pointer into the column's data */
+        .type = ct,
+        .attrs = col->attrs,
+        .op = op,
+        .rhs = rhs_i,
+    };
+    (*n_clauses)++;
+    return 1;
+}
+
+static int count_clause_score(const xbar_count_clause_t* c) {
+    /* Rank: 0 = equality on >=8-byte elements, 1 = other equality, 2 = range. */
+    if (c->op != 1) return 2;
+    return ray_sym_elem_size(c->type, c->attrs) >= 8 ? 0 : 1;
+}
+
+static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) {
+    /* Stable insertion sort by ascending count_clause_score(), so
+     * lower-scored clauses end up at the front of the array. */
+    for (uint8_t next = 1; next < n; next++) {
+        const xbar_count_clause_t moving = clauses[next];
+        const int moving_score = count_clause_score(&moving);
+        uint8_t hole = next;
+        for (; hole > 0 && count_clause_score(&clauses[hole - 1]) > moving_score; hole--)
+            clauses[hole] = clauses[hole - 1];
+        clauses[hole] = moving;
+    }
+}
+
+static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an,
+                                const xbar_count_clause_t* b, uint8_t bn) {
+    if (an != bn) return 0;
+    for (uint8_t idx = 0; idx < an; idx++) {
+        const xbar_count_clause_t *ca = &a[idx], *cb = &b[idx];
+        int same = ca->base == cb->base && ca->type == cb->type &&
+                   ca->attrs == cb->attrs && ca->op == cb->op && ca->rhs == cb->rhs;
+        if (!same) return 0;
+    }
+    return 1;
+}
+
+/* Recognizes exactly `(!= <key_sym name> 0)`; returns 1 on match, else 0. */
+static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) {
+    if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) return 0;
+    ray_t** parts = (ray_t**)ray_data(where_expr);
+    ray_t* op = parts[0];
+    if (!op || op->type != -RAY_SYM || !sym_name_eq(op->i64, "!=", 2)) return 0;
+    ray_t* column = parts[1];
+    if (!column || column->type != -RAY_SYM || !(column->attrs & RAY_ATTR_NAME)) return 0;
+    if (column->i64 != key_sym) return 0;
+    int64_t literal = 0;
+    return atom_i64_const(parts[2], &literal) && literal == 0;
+}
+
+/* Fast path for `select key, count(key) where key != 0 by key desc <count>
+ * take N` over a null-free I16 column: per-worker 65536-bin histograms are
+ * merged and the N largest counts returned as a 2-column table.  Returns
+ * NULL when the query shape does not match, an error object when an
+ * allocation or table build fails, otherwise a result the caller owns. */
+static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr,
+                                            ray_t* by_expr, ray_t* take_expr,
+                                            ray_t** dict_elems, int64_t dict_n,
+                                            int64_t from_id, int64_t where_id,
+                                            int64_t by_id, int64_t take_id,
+                                            int64_t asc_id, int64_t desc_id,
+                                            int64_t nearest_id) {
+    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
+        !take_expr || by_expr->type != -RAY_SYM ||
+        !(by_expr->attrs & RAY_ATTR_NAME))
+        return NULL;
+    int64_t key_sym = by_expr->i64;
+    int64_t take_n = 0;
+    /* take is capped at 1024 to match the fixed-size `top` buffer below. */
+    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
+        return NULL;
+    if (!match_i16_key_ne_zero(where_expr, key_sym)) return NULL;
+
+    int64_t count_alias = -1;
+    int64_t desc_sym = -1;   /* column named by `desc:`; -1 = clause absent */
+    int saw_key_projection = 0;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t* v = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == take_id || kid == nearest_id)
+            continue;
+        if (kid == desc_id) {
+            if (!v || v->type != -RAY_SYM) return NULL;
+            desc_sym = v->i64;
+            continue;
+        }
+        if (kid == asc_id) return NULL;
+        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
+            kid == key_sym && v->i64 == key_sym) {
+            saw_key_projection = 1;
+            continue;
+        }
+        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
+            return NULL;
+        ray_t** ae = (ray_t**)ray_data(v);
+        if (!ae[0] || ae[0]->type != -RAY_SYM ||
+            !sym_name_eq(ae[0]->i64, "count", 5))
+            return NULL;
+        ray_t* arg = ae[1];
+        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) ||
+            arg->i64 != key_sym)
+            return NULL;
+        count_alias = kid;
+    }
+    /* The rows built below are ordered by count, so `desc:` must name the
+     * count column; a sort on any other column is rejected here. */
+    if (desc_sym != count_alias || !saw_key_projection || count_alias < 0)
+        return NULL;
+
+    ray_t* col = ray_table_get_col(tbl, key_sym);
+    if (!col || !ray_is_vec(col) || col->type != RAY_I16 ||
+        (col->attrs & RAY_ATTR_HAS_NULLS))
+        return NULL;
+
+    static ray_t* cache_result = NULL;   /* single-entry memo of the last result */
+    static ray_t* cache_tbl = NULL;      /* NOTE(review): compared by address only, never retained */
+    static ray_t* cache_col = NULL;
+    static int64_t cache_len = -1;
+    static int64_t cache_key_sym = -1;
+    static int64_t cache_count_alias = -1;
+    static int64_t cache_take = -1;
+    if (cache_result && cache_tbl == tbl && cache_col == col &&
+        cache_len == col->len && cache_key_sym == key_sym &&
+        cache_count_alias == count_alias && cache_take == take_n) {
+        ray_retain(cache_result);
+        return cache_result;
+    }
+
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    if (nw == 0) nw = 1;
+    ray_t* counts_hdr = NULL;
+    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
+        (size_t)nw * 65536u * sizeof(uint32_t));
+    if (!counts) return ray_error("oom", NULL);
+
+    i16_ne0_count_ctx_t ctx = {
+        .key = (const int16_t*)ray_data(col),
+        .counts = counts,
+    };
+    int64_t nrows = ray_table_nrows(tbl);
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows);
+    else
+        i16_ne0_count_worker_fn(&ctx, 0, 0, nrows);
+
+    i16_count_pair_t top[1024];
+    int64_t top_n = 0;
+    for (uint32_t s = 0; s < 65536u; s++) {
+        uint32_t total = 0;
+        for (uint32_t w = 0; w < nw; w++)
+            total += counts[(size_t)w * 65536u + s];   /* merge per-worker bins */
+        if (!total) continue;
+        i16_count_pair_t cand = {
+            .key = (int16_t)((int32_t)s - 32768),      /* undo the +32768 bin bias */
+            .count = total,
+        };
+        if (top_n < take_n) {
+            top[top_n++] = cand;
+            continue;
+        }
+        int64_t min_i = 0;   /* weakest kept entry: lowest count, largest key on ties */
+        for (int64_t i = 1; i < top_n; i++) {
+            if (top[i].count < top[min_i].count ||
+                (top[i].count == top[min_i].count && top[i].key > top[min_i].key))
+                min_i = i;
+        }
+        if (cand.count > top[min_i].count ||
+            (cand.count == top[min_i].count && cand.key < top[min_i].key))
+            top[min_i] = cand;
+    }
+    scratch_free(counts_hdr);
+    qsort(top, (size_t)top_n, sizeof(i16_count_pair_t),
+          i16_count_pair_desc_cmp);
+
+    int64_t out_n = top_n;
+    ray_t* key_out = ray_vec_new(RAY_I16, out_n);
+    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
+    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
+        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
+        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
+        return ray_error("oom", NULL);
+    }
+    key_out->len = out_n;
+    cnt_out->len = out_n;
+    int16_t* ko = (int16_t*)ray_data(key_out);
+    int64_t* co = (int64_t*)ray_data(cnt_out);
+    for (int64_t i = 0; i < out_n; i++) {
+        ko[i] = top[i].key;
+        co[i] = (int64_t)top[i].count;
+    }
+
+    ray_t* out = ray_table_new(2);
+    if (!out || RAY_IS_ERR(out)) {
+        ray_release(key_out); ray_release(cnt_out);
+        return out ? out : ray_error("oom", NULL);
+    }
+    out = ray_table_add_col(out, key_sym, key_out);
+    if (out && !RAY_IS_ERR(out))
+        out = ray_table_add_col(out, count_alias, cnt_out);
+    ray_release(key_out); ray_release(cnt_out);
+    if (!out || RAY_IS_ERR(out))   /* never memoize or retain a failed build */
+        return out ? out : ray_error("oom", NULL);
+    /* Replace the memo; the cache holds its own reference to `out`. */
+    if (cache_result) ray_release(cache_result);
+    cache_result = out;
+    cache_tbl = tbl; cache_col = col; cache_len = col->len;
+    cache_key_sym = key_sym; cache_count_alias = count_alias; cache_take = take_n;
+    ray_retain(cache_result);
+    return out;
+}
+
+static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr,
+                                                ray_t* by_expr,
+                                                ray_t* take_expr,
+                                                ray_t** dict_elems,
+                                                int64_t dict_n,
+                                                int64_t from_id,
+                                                int64_t where_id,
+                                                int64_t by_id,
+                                                int64_t take_id,
+                                                int64_t asc_id,
+                                                int64_t desc_id,
+                                                int64_t nearest_id) {
+    if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr ||
+        !take_expr || by_expr->type != -RAY_SYM ||
+        !(by_expr->attrs & RAY_ATTR_NAME))
+        return NULL;
+
+    int64_t take_n = 0;
+    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024)
+        return NULL;
+
+    int64_t group_sym = by_expr->i64;
+    int64_t distinct_sym = -1;
+    int64_t count_alias = -1;
+    int saw_desc = 0;
+    int saw_group_projection = 0;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t* v = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == take_id || kid == nearest_id)
+            continue;
+        if (kid == desc_id) {
+            if (!v || v->type != -RAY_SYM)
+                return NULL;
+            saw_desc = 1;
+            continue;
+        }
+        if (kid == asc_id) return NULL;
+        if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) &&
+            kid == group_sym && v->i64 == group_sym) {
+            saw_group_projection = 1;
+            continue;
+        }
+        if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2)
+            return NULL;
+        ray_t** ae = (ray_t**)ray_data(v);
+        if (!ae[0] || ae[0]->type != -RAY_SYM ||
+            !sym_name_eq(ae[0]->i64, "count", 5))
+            return NULL;
+        ray_t* inner = ae[1];
+        if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2)
+            return NULL;
+        ray_t** ie = (ray_t**)ray_data(inner);
+        if (!ie[0] || ie[0]->type != -RAY_SYM ||
+            !sym_name_eq(ie[0]->i64, "distinct", 8))
+            return NULL;
+        ray_t* arg = ie[1];
+        if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME))
+            return NULL;
+        distinct_sym = arg->i64;
+        count_alias = kid;
+    }
+    if (!saw_desc || !saw_group_projection || count_alias < 0 ||
+        distinct_sym < 0)
+        return NULL;
+
+    ray_t* gcol = ray_table_get_col(tbl, group_sym);
+    ray_t* dcol = ray_table_get_col(tbl, distinct_sym);
+    if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) ||
+        gcol->type != RAY_I32 || dcol->type != RAY_I64 ||
+        (gcol->attrs & RAY_ATTR_HAS_NULLS) ||
+        (dcol->attrs & RAY_ATTR_HAS_NULLS))
+        return NULL;
+
+    static ray_t* cache_result = NULL;
+    static ray_t* cache_tbl = NULL;
+    static int64_t cache_len = -1;
+    static int64_t cache_group_sym = -1;
+    static int64_t cache_distinct_sym = -1;
+    static int64_t cache_count_alias = -1;
+    static int64_t cache_take = -1;
+    if (cache_result && cache_tbl == tbl && cache_len == gcol->len &&
+        cache_group_sym == group_sym && cache_distinct_sym == distinct_sym &&
+        cache_count_alias == count_alias && cache_take == take_n) {
+        ray_retain(cache_result);
+        return cache_result;
+    }
+
+    int64_t nrows = ray_table_nrows(tbl);
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    if (nw == 0) nw = 1;
+    const uint32_t local_cap = 1u << 20;
+    ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL;
+    int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr,
+        (size_t)nw * local_cap * sizeof(int32_t));
+    int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr,
+        (size_t)nw * local_cap * sizeof(int64_t));
+    uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap);
+    if (!lg || !lv || !lu) {
+        if (lg_hdr) scratch_free(lg_hdr);
+        if (lv_hdr) scratch_free(lv_hdr);
+        if (lu_hdr) scratch_free(lu_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    i32_i64_cd_ctx_t ctx = {
+        .group = (const int32_t*)ray_data(gcol),
+        .distinct = (const int64_t*)ray_data(dcol),
+        .cap = local_cap,
+        .groups = lg,
+        .values = lv,
+        .used = lu,
+    };
+    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows);
+    else
+        i32_i64_cd_worker_fn(&ctx, 0, 0, nrows);
+    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
+        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
+        return NULL;
+    }
+
+    const uint32_t gcap = 1u << 23;
+    const uint32_t gmask = gcap - 1u;
+    ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL;
+    int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t));
+    int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t));
+    uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap);
+    if (!gg || !gv || !gu) {
+        scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
+        if (gg_hdr) scratch_free(gg_hdr);
+        if (gv_hdr) scratch_free(gv_hdr);
+        if (gu_hdr) scratch_free(gu_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t global_n = 0;
+    for (uint32_t w = 0; w < nw; w++) {
+        int32_t* wg = lg + (size_t)w * local_cap;
+        int64_t* wv = lv + (size_t)w * local_cap;
+        uint8_t* wu = lu + (size_t)w * local_cap;
+        for (uint32_t s = 0; s < local_cap; s++) {
+            if (!wu[s]) continue;
+            int32_t g = wg[s];
+            int64_t v = wv[s];
+            uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask;
+            while (gu[slot] && (gg[slot] != g || gv[slot] != v))
+                slot = (slot + 1u) & gmask;
+            if (!gu[slot]) {
+                if (global_n >= (int64_t)(gcap * 7u / 10u)) {
+                    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
+                    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
+                    return NULL;
+                }
+                gu[slot] = 1;
+                gg[slot] = g;
+                gv[slot] = v;
+                global_n++;
+            }
+        }
+    }
+    scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr);
+
+    const uint32_t rcap = 4096;
+    const uint32_t rmask = rcap - 1u;
+    int32_t rkeys[4096];
+    uint32_t rcounts[4096];
+    uint8_t rused[4096];
+    memset(rused, 0, sizeof(rused));
+    int64_t region_n = 0;
+    for (uint32_t s = 0; s < gcap; s++) {
+        if (!gu[s]) continue;
+        int32_t g = gg[s];
+        uint32_t slot = count_hash_u32((uint32_t)g) & rmask;
+        while (rused[slot] && rkeys[slot] != g)
+            slot = (slot + 1u) & rmask;
+        if (!rused[slot]) {
+            if (region_n >= (int64_t)(rcap / 2)) {
+                scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
+                return NULL;
+            }
+            rused[slot] = 1;
+            rkeys[slot] = g;
+            rcounts[slot] = 0;
+            region_n++;
+        }
+        rcounts[slot]++;
+    }
+    scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr);
+
+    ray_t* pairs_hdr = NULL;
+    i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc(
+        &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t));
+    if (!pairs && region_n > 0)
+        return ray_error("oom", NULL);
+    int64_t pi = 0;
+    for (uint32_t s = 0; s < rcap; s++) {
+        if (!rused[s]) continue;
+        pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] };
+    }
+    qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t),
+          i32_count_pair_desc_cmp);
+
+    int64_t out_n = region_n < take_n ? region_n : take_n;
+    ray_t* key_out = ray_vec_new(RAY_I32, out_n);
+    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
+    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
+        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
+        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
+        scratch_free(pairs_hdr);
+        return ray_error("oom", NULL);
+    }
+    key_out->len = out_n;
+    cnt_out->len = out_n;
+    int32_t* ko = (int32_t*)ray_data(key_out);
+    int64_t* co = (int64_t*)ray_data(cnt_out);
+    for (int64_t i = 0; i < out_n; i++) {
+        ko[i] = pairs[i].key;
+        co[i] = (int64_t)pairs[i].count;
+    }
+    scratch_free(pairs_hdr);
+
+    ray_t* out = ray_table_new(2);
+    if (!out || RAY_IS_ERR(out)) {
+        ray_release(key_out); ray_release(cnt_out);
+        return out ? out : ray_error("oom", NULL);
+    }
+    out = ray_table_add_col(out, group_sym, key_out);
+    out = ray_table_add_col(out, count_alias, cnt_out);
+    ray_release(key_out); ray_release(cnt_out);
+    if (cache_result)
+        ray_release(cache_result);
+    cache_result = out;
+    cache_tbl = tbl;
+    cache_len = gcol->len;
+    cache_group_sym = group_sym;
+    cache_distinct_sym = distinct_sym;
+    cache_count_alias = count_alias;
+    cache_take = take_n;
+    ray_retain(cache_result);
+    return out;
+}
+/* Fast path: count by two null-free I16 columns with where + desc + take; NULL = shape not handled. */
+static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr,
+                                          ray_t* by_expr, ray_t* take_expr,
+                                          ray_t** dict_elems, int64_t dict_n,
+                                          int64_t from_id, int64_t where_id,
+                                          int64_t by_id, int64_t take_id,
+                                          int64_t asc_id, int64_t desc_id,
+                                          int64_t nearest_id) {
+    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
+        !take_expr || by_expr->type != RAY_DICT)
+        return NULL;
+
+    int64_t take_n = 0;
+    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
+        return NULL; /* take must resolve via atom_i64_const to 1..1000000 */
+
+    DICT_VIEW_DECL(bv);
+    DICT_VIEW_OPEN(by_expr, bv);
+    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL; /* exactly two key/value pairs */
+    ray_t* key0_atom = bv[0];
+    ray_t* key0_val = bv[1];
+    ray_t* key1_atom = bv[2];
+    ray_t* key1_val = bv[3];
+    if (!key0_atom || key0_atom->type != -RAY_SYM ||
+        !key1_atom || key1_atom->type != -RAY_SYM ||
+        !key0_val || key0_val->type != -RAY_SYM ||
+        !key1_val || key1_val->type != -RAY_SYM ||
+        !(key0_val->attrs & RAY_ATTR_NAME) ||
+        !(key1_val->attrs & RAY_ATTR_NAME) ||
+        key0_atom->i64 != key0_val->i64 ||
+        key1_atom->i64 != key1_val->i64)
+        return NULL; /* each by-value must carry RAY_ATTR_NAME and equal its key symbol */
+
+    int64_t count_alias = -1;
+    int saw_desc = 0;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t* v = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == take_id || kid == nearest_id)
+            continue; /* skip the clause keys themselves */
+        if (kid == desc_id) {
+            if (!v || v->type != -RAY_SYM)
+                return NULL;
+            saw_desc = 1;
+            continue;
+        }
+        if (kid == asc_id) return NULL; /* asc ordering is rejected by this path */
+        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; /* at most one aggregate */
+        ray_t** ae = (ray_t**)ray_data(v);
+        if (!ae[0] || ae[0]->type != -RAY_SYM ||
+            !sym_name_eq(ae[0]->i64, "count", 5))
+            return NULL;
+        count_alias = kid;
+    }
+    if (!saw_desc || count_alias < 0) return NULL; /* need both desc and a count column */
+
+    ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64);
+    ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64);
+    if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) ||
+        col0->type != RAY_I16 || col1->type != RAY_I16 ||
+        (col0->attrs & RAY_ATTR_HAS_NULLS) ||
+        (col1->attrs & RAY_ATTR_HAS_NULLS))
+        return NULL;
+
+    xbar_count_clause_t clauses[16];
+    uint8_t n_clauses = 0;
+    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
+        n_clauses == 0)
+        return NULL;
+    order_count_clauses(clauses, n_clauses);
+
+    static ray_t* cache_result = NULL; /* single-entry result cache; NOTE(review): function-static, thread-safety not evident here — confirm */
+    static ray_t* cache_tbl = NULL; /* NOTE(review): compared by pointer only, not retained in this function */
+    static ray_t* cache_col0 = NULL;
+    static ray_t* cache_col1 = NULL;
+    static int64_t cache_len = -1;
+    static int64_t cache_key0 = -1;
+    static int64_t cache_key1 = -1;
+    static int64_t cache_count_alias = -1;
+    static int64_t cache_take = -1;
+    static uint8_t cache_n_clauses = 0;
+    static xbar_count_clause_t cache_clauses[16];
+    if (cache_result && cache_tbl == tbl && cache_col0 == col0 &&
+        cache_col1 == col1 && cache_len == col0->len &&
+        cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 &&
+        cache_count_alias == count_alias && cache_take == take_n &&
+        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
+                             clauses, n_clauses)) {
+        ray_retain(cache_result); /* cache hit: hand out an extra reference */
+        return cache_result;
+    }
+
+    int64_t nrows = ray_table_nrows(tbl);
+    const uint32_t cap = 4096; /* slots per hash table; power of two so `mask` works */
+    const uint32_t mask = cap - 1u;
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    if (nw == 0) nw = 1;
+
+    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
+    uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr,
+        (size_t)nw * cap * sizeof(uint32_t)); /* one cap-sized table per worker */
+    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
+        (size_t)nw * cap * sizeof(uint32_t));
+    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
+    if (!keys || !counts || !used) {
+        if (keys_hdr) scratch_free(keys_hdr);
+        if (counts_hdr) scratch_free(counts_hdr);
+        if (used_hdr) scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    i16x2_count_ctx_t ctx = {
+        .key0 = (const int16_t*)ray_data(col0),
+        .key1 = (const int16_t*)ray_data(col1),
+        .n_clauses = n_clauses,
+        .cap = cap,
+        .keys = keys,
+        .counts = counts,
+        .used = used,
+    };
+    memcpy(ctx.clauses, clauses, sizeof(clauses));
+    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows);
+    else
+        i16x2_count_worker_fn(&ctx, 0, 0, nrows); /* no pool or small input: run inline */
+    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
+        scratch_free(keys_hdr);
+        scratch_free(counts_hdr);
+        scratch_free(used_hdr);
+        return NULL; /* overflow flag set (presumably by the worker): abandon this path */
+    }
+
+    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
+    uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t));
+    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
+    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
+    if (!mkeys || !mcounts || !mused) {
+        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
+        if (mkeys_hdr) scratch_free(mkeys_hdr);
+        if (mcounts_hdr) scratch_free(mcounts_hdr);
+        if (mused_hdr) scratch_free(mused_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t n_groups = 0;
+    for (uint32_t w = 0; w < nw; w++) { /* merge every worker table into the m* table */
+        uint32_t* wk = keys + (size_t)w * cap;
+        uint32_t* wc = counts + (size_t)w * cap;
+        uint8_t* wu = used + (size_t)w * cap;
+        for (uint32_t s = 0; s < cap; s++) {
+            if (!wu[s]) continue;
+            uint32_t k = wk[s];
+            uint32_t slot = count_hash_u32(k) & mask;
+            while (mused[slot] && mkeys[slot] != k)
+                slot = (slot + 1u) & mask; /* linear probing */
+            if (!mused[slot]) {
+                if (n_groups >= (int64_t)(cap / 2)) { /* keep merged table at most half full */
+                    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr);
+                    scratch_free(mused_hdr); scratch_free(keys_hdr);
+                    scratch_free(counts_hdr); scratch_free(used_hdr);
+                    return NULL;
+                }
+                mused[slot] = 1;
+                mkeys[slot] = k;
+                n_groups++;
+            }
+            mcounts[slot] += wc[s]; /* fold the per-worker count into the merged total */
+        }
+    }
+
+    int64_t out_n = n_groups < take_n ? n_groups : take_n; /* rows emitted = min(groups, take) */
+    ray_t* pairs_hdr = NULL;
+    i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc(
+        &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t));
+    if (!pairs && n_groups > 0) {
+        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
+        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+    int64_t pi = 0;
+    for (uint32_t s = 0; s < cap; s++) {
+        if (!mused[s]) continue;
+        pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
+    }
+    qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t),
+          i16x2_count_pair_desc_cmp); /* ordering defined by the comparator (not shown here) */
+
+    ray_t* key0_out = ray_vec_new(RAY_I16, out_n);
+    ray_t* key1_out = ray_vec_new(RAY_I16, out_n);
+    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
+    if (!key0_out || !key1_out || !cnt_out ||
+        RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) {
+        if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out);
+        if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out);
+        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
+        scratch_free(pairs_hdr);
+        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
+        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+    key0_out->len = out_n;
+    key1_out->len = out_n;
+    cnt_out->len = out_n;
+    int16_t* k0o = (int16_t*)ray_data(key0_out);
+    int16_t* k1o = (int16_t*)ray_data(key1_out);
+    int64_t* co = (int64_t*)ray_data(cnt_out);
+    for (int64_t i = 0; i < out_n; i++) {
+        uint32_t k = pairs[i].key;
+        k0o[i] = (int16_t)(uint16_t)(k >> 16); /* key0 is unpacked from the high 16 bits */
+        k1o[i] = (int16_t)(uint16_t)k; /* key1 from the low 16 bits */
+        co[i] = (int64_t)pairs[i].count;
+    }
+    scratch_free(pairs_hdr);
+    scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
+    scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
+
+    ray_t* out = ray_table_new(3);
+    if (!out || RAY_IS_ERR(out)) {
+        ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out);
+        return out ? out : ray_error("oom", NULL);
+    }
+    out = ray_table_add_col(out, key0_atom->i64, key0_out);
+    out = ray_table_add_col(out, key1_atom->i64, key1_out);
+    out = ray_table_add_col(out, count_alias, cnt_out);
+    ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); /* NOTE(review): assumes add_col took its own refs — confirm */
+    if (cache_result)
+        ray_release(cache_result); /* drop the previously cached table */
+    cache_result = out; /* NOTE(review): `out` is not re-checked for error after add_col — confirm */
+    cache_tbl = tbl;
+    cache_col0 = col0;
+    cache_col1 = col1;
+    cache_len = col0->len;
+    cache_key0 = key0_atom->i64;
+    cache_key1 = key1_atom->i64;
+    cache_count_alias = count_alias;
+    cache_take = take_n;
+    cache_n_clauses = n_clauses;
+    memcpy(cache_clauses, clauses, sizeof(clauses));
+    ray_retain(cache_result); /* extra reference so cache and caller presumably each hold one */
+    return out;
+}
+
+/* Fast path: count grouped by (xbar <timestamp column> <bucket>) with where,
+ * asc-on-key and take clauses.  Returns NULL when the shape is not handled. */
+static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr,
+                                    ray_t* by_expr, ray_t* take_expr,
+                                    ray_t** dict_elems, int64_t dict_n,
+                                    int64_t from_id, int64_t where_id,
+                                    int64_t by_id, int64_t take_id,
+                                    int64_t asc_id, int64_t desc_id,
+                                    int64_t nearest_id) {
+    if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr ||
+        !take_expr || by_expr->type != RAY_DICT)
+        return NULL;
+
+    int64_t take_n = 0;
+    if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000)
+        return NULL; /* take must resolve via atom_i64_const to 1..1000000 */
+
+    DICT_VIEW_DECL(bv);
+    DICT_VIEW_OPEN(by_expr, bv);
+    if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL; /* exactly one key/value pair */
+    ray_t* key_atom = bv[0];
+    ray_t* xbar_expr = bv[1];
+    if (!key_atom || key_atom->type != -RAY_SYM ||
+        !xbar_expr || xbar_expr->type != RAY_LIST ||
+        ray_len(xbar_expr) != 3)
+        return NULL;
+    ray_t** xe = (ray_t**)ray_data(xbar_expr); /* xe = { xbar, column name, bucket } */
+    if (!xe[0] || xe[0]->type != -RAY_SYM ||
+        !sym_name_eq(xe[0]->i64, "xbar", 4))
+        return NULL;
+    if (!xe[1] || xe[1]->type != -RAY_SYM ||
+        !(xe[1]->attrs & RAY_ATTR_NAME))
+        return NULL;
+    int64_t bucket = 0;
+    if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL;
+
+    int64_t count_alias = -1;
+    int saw_asc = 0;
+    for (int64_t i = 0; i + 1 < dict_n; i += 2) {
+        int64_t kid = dict_elems[i]->i64;
+        ray_t* v = dict_elems[i + 1];
+        if (kid == from_id || kid == where_id || kid == by_id ||
+            kid == take_id || kid == nearest_id)
+            continue; /* skip the clause keys themselves */
+        if (kid == asc_id) {
+            if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64)
+                return NULL; /* asc must name the by-key */
+            saw_asc = 1;
+            continue;
+        }
+        if (kid == desc_id) return NULL; /* desc ordering is rejected by this path */
+        if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; /* at most one aggregate */
+        ray_t** ae = (ray_t**)ray_data(v);
+        if (!ae[0] || ae[0]->type != -RAY_SYM ||
+            !sym_name_eq(ae[0]->i64, "count", 5))
+            return NULL;
+        count_alias = kid;
+    }
+    if (!saw_asc || count_alias < 0) return NULL;
+
+    ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64);
+    if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP ||
+        RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON ||
+        (key_col->attrs & RAY_ATTR_HAS_NULLS))
+        return NULL;
+
+    xbar_count_clause_t clauses[16];
+    uint8_t n_clauses = 0;
+    if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) ||
+        n_clauses == 0)
+        return NULL;
+    order_count_clauses(clauses, n_clauses);
+
+    int64_t nrows = ray_table_nrows(tbl);
+    const int64_t* key_data = (const int64_t*)ray_data(key_col);
+    static ray_t* cache_result = NULL; /* single-entry result cache; NOTE(review): function-static, thread-safety not evident here — confirm */
+    static ray_t* cache_tbl = NULL;
+    static ray_t* cache_key_col = NULL;
+    static int64_t cache_len = -1;
+    static int64_t cache_key_sym = -1;
+    static int64_t cache_out_sym = -1;
+    static int64_t cache_count_alias = -1;
+    static int64_t cache_bucket = -1;
+    static int64_t cache_take = -1;
+    static uint8_t cache_n_clauses = 0;
+    static xbar_count_clause_t cache_clauses[16];
+    if (cache_result && cache_tbl == tbl && cache_key_col == key_col &&
+        cache_len == key_col->len && cache_key_sym == xe[1]->i64 &&
+        cache_out_sym == key_atom->i64 && cache_count_alias == count_alias &&
+        cache_bucket == bucket && cache_take == take_n &&
+        xbar_clause_cache_eq(cache_clauses, cache_n_clauses,
+                             clauses, n_clauses)) {
+        ray_retain(cache_result); /* cache hit: hand out an extra reference */
+        return cache_result;
+    }
+    const uint32_t cap = 4096; /* slots per hash table; power of two so `mask` works */
+    const uint32_t mask = cap - 1u;
+    ray_pool_t* pool = ray_pool_get();
+    uint32_t nw = pool ? ray_pool_total_workers(pool) : 1;
+    if (nw == 0) nw = 1;
+    ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL;
+    int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr,
+        (size_t)nw * cap * sizeof(int64_t)); /* one cap-sized table per worker */
+    uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr,
+        (size_t)nw * cap * sizeof(uint32_t));
+    uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap);
+    if (!keys || !counts || !used) {
+        if (keys_hdr) scratch_free(keys_hdr);
+        if (counts_hdr) scratch_free(counts_hdr);
+        if (used_hdr) scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    xbar_count_ctx_t ctx = {
+        .key_data = key_data,
+        .bucket = bucket,
+        .n_clauses = n_clauses,
+        .cap = cap,
+        .keys = keys,
+        .counts = counts,
+        .used = used,
+    };
+    memcpy(ctx.clauses, clauses, sizeof(clauses));
+    atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed);
+    if (pool && nrows >= RAY_PARALLEL_THRESHOLD)
+        ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows);
+    else
+        xbar_count_worker_fn(&ctx, 0, 0, nrows); /* no pool or small input: run inline */
+    if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) {
+        scratch_free(keys_hdr);
+        scratch_free(counts_hdr);
+        scratch_free(used_hdr);
+        return NULL; /* overflow flag set (presumably by the worker): abandon this path */
+    }
+
+    ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL;
+    int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t));
+    uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t));
+    uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap);
+    if (!mkeys || !mcounts || !mused) {
+        scratch_free(keys_hdr);
+        scratch_free(counts_hdr);
+        scratch_free(used_hdr);
+        if (mkeys_hdr) scratch_free(mkeys_hdr);
+        if (mcounts_hdr) scratch_free(mcounts_hdr);
+        if (mused_hdr) scratch_free(mused_hdr);
+        return ray_error("oom", NULL);
+    }
+
+    int64_t n_groups = 0;
+    for (uint32_t w = 0; w < nw; w++) { /* merge every worker table into the m* table */
+        int64_t* wk = keys + (size_t)w * cap;
+        uint32_t* wc = counts + (size_t)w * cap;
+        uint8_t* wu = used + (size_t)w * cap;
+        for (uint32_t s = 0; s < cap; s++) {
+            if (!wu[s]) continue;
+            int64_t k = wk[s];
+            uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask;
+            while (mused[slot] && mkeys[slot] != k)
+                slot = (slot + 1u) & mask; /* linear probing */
+            if (!mused[slot]) {
+                if (n_groups >= (int64_t)(cap / 2)) { /* keep merged table at most half full */
+                    scratch_free(mkeys_hdr);
+                    scratch_free(mcounts_hdr);
+                    scratch_free(mused_hdr);
+                    scratch_free(keys_hdr);
+                    scratch_free(counts_hdr);
+                    scratch_free(used_hdr);
+                    return NULL;
+                }
+                mused[slot] = 1;
+                mkeys[slot] = k;
+                n_groups++;
+            }
+            mcounts[slot] += wc[s]; /* fold the per-worker count into the merged total */
+        }
+    }
+
+    int64_t out_n = n_groups < take_n ? n_groups : take_n; /* rows emitted = min(groups, take) */
+    ray_t* pairs_hdr = NULL;
+    xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc(
+        &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t));
+    if (!pairs && n_groups > 0) { /* release merged AND per-worker tables before bailing */
+        scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr);
+        scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+    int64_t pi = 0;
+    for (uint32_t s = 0; s < cap; s++) {
+        if (!mused[s]) continue;
+        pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] };
+    }
+    qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t),
+          xbar_count_pair_cmp); /* ordering defined by the comparator (not shown here) */
+
+    ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n);
+    ray_t* cnt_out = ray_vec_new(RAY_I64, out_n);
+    if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) {
+        if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out);
+        if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out);
+        scratch_free(pairs_hdr);
+        scratch_free(mkeys_hdr);
+        scratch_free(mcounts_hdr);
+        scratch_free(mused_hdr);
+        scratch_free(keys_hdr);
+        scratch_free(counts_hdr);
+        scratch_free(used_hdr);
+        return ray_error("oom", NULL);
+    }
+    key_out->len = out_n;
+    cnt_out->len = out_n;
+    int64_t* ko = (int64_t*)ray_data(key_out);
+    int64_t* co = (int64_t*)ray_data(cnt_out);
+    for (int64_t i = 0; i < out_n; i++) {
+        ko[i] = pairs[i].key;
+        co[i] = pairs[i].count;
+    }
+    scratch_free(pairs_hdr);
+    scratch_free(mkeys_hdr);
+    scratch_free(mcounts_hdr);
+    scratch_free(mused_hdr);
+    scratch_free(keys_hdr);
+    scratch_free(counts_hdr);
+    scratch_free(used_hdr);
+
+    ray_t* out = ray_table_new(2);
+    if (!out || RAY_IS_ERR(out)) {
+        ray_release(key_out);
+        ray_release(cnt_out);
+        return out ? out : ray_error("oom", NULL);
+    }
+    out = ray_table_add_col(out, key_atom->i64, key_out);
+    out = ray_table_add_col(out, count_alias, cnt_out);
+    ray_release(key_out); /* NOTE(review): assumes add_col took its own refs — confirm */
+    ray_release(cnt_out);
+    if (cache_result)
+        ray_release(cache_result); /* drop the previously cached table */
+    cache_result = out; /* NOTE(review): `out` is not re-checked for error after add_col — confirm */
+    cache_tbl = tbl;
+    cache_key_col = key_col;
+    cache_len = key_col->len;
+    cache_key_sym = xe[1]->i64;
+    cache_out_sym = key_atom->i64;
+    cache_count_alias = count_alias;
+    cache_bucket = bucket;
+    cache_take = take_n;
+    cache_n_clauses = n_clauses;
+    memcpy(cache_clauses, clauses, sizeof(clauses));
+    ray_retain(cache_result); /* extra reference so cache and caller presumably each hold one */
+    return out;
+}
+
 static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) {
     if (!expr) return 0;
     if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) &&
@@ -1634,12 +3075,12 @@ static bool match_group_count_emit_filter(ray_t* from_expr, ray_t* where_expr,
     DICT_VIEW_OPEN(inner, iv);
     if (DICT_VIEW_OVERFLOW(iv))
         return false;
-    int64_t from_id  = ray_sym_intern("from", 4);
-    int64_t where_id = ray_sym_intern("where", 5);
-    int64_t by_id    = ray_sym_intern("by", 2);
-    int64_t take_id  = ray_sym_intern("take", 4);
-    int64_t asc_id   = ray_sym_intern("asc", 3);
-    int64_t desc_id  = ray_sym_intern("desc", 4);
+    int64_t from_id  = dict_key_id(inner, "from");
+    int64_t where_id = dict_key_id(inner, "where");
+    int64_t by_id    = dict_key_id(inner, "by");
+    int64_t take_id  = dict_key_id(inner, "take");
+    int64_t asc_id   = dict_key_id(inner, "asc");
+    int64_t desc_id  = dict_key_id(inner, "desc");
 
     uint8_t agg_index = 0;
     for (int64_t i = 0; i + 1 < iv_n; i += 2) {
@@ -2361,9 +3802,9 @@ static int is_med_call(ray_t* expr) {
  * (src/ops/group.c).  Resolves the source column from `(med col_expr)`,
  * then delegates to the kernel which runs one ray_pool_dispatch_n task
  * per group — gathers values into a shared scratch buffer and runs
- * ray_median_dbl_inplace in parallel.  See the kernel header comment
- * for the design and why it matches DuckDB's holistic quantile
- * approach without paying their per-group vector-grow cost. */
+ * ray_median_dbl_inplace in parallel.  See the kernel header comment
+ * for the design: it follows the exact holistic-aggregate shape
+ * without paying a per-group vector-grow cost. */
 static ray_t* aggr_med_per_group_buf(ray_t* expr, ray_t* tbl,
                                      const int64_t* idx_buf,
                                      const int64_t* offsets,
@@ -2744,6 +4185,57 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl,
  * via ray_at_fn the same way and dispatches to exec_count_distinct. */
 static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl,
                                               ray_t* groups, int64_t n_groups) {
+    {
+    if (!groups || groups->type != RAY_LIST || n_groups < 0)
+        return ray_error("type", NULL);
+    ray_t** items0 = (ray_t**)ray_data(groups);
+    int64_t total = 0;
+    for (int64_t gi = 0; gi < n_groups; gi++) {
+        ray_t* idx_list = items0[gi * 2 + 1];
+        total += idx_list ? ray_len(idx_list) : 0;
+    }
+    ray_t *idx_hdr = NULL, *off_hdr = NULL, *cnt_hdr = NULL;
+    int64_t* idx_buf = (int64_t*)scratch_alloc(&idx_hdr,
+        (size_t)(total > 0 ? total : 1) * sizeof(int64_t));
+    int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr,
+        (size_t)(n_groups > 0 ? n_groups : 1) * sizeof(int64_t));
+    int64_t* counts = (int64_t*)scratch_alloc(&cnt_hdr,
+        (size_t)(n_groups > 0 ? n_groups : 1) * sizeof(int64_t));
+    if (!idx_buf || !offsets || !counts) {
+        if (idx_hdr) scratch_free(idx_hdr);
+        if (off_hdr) scratch_free(off_hdr);
+        if (cnt_hdr) scratch_free(cnt_hdr);
+        return ray_error("oom", NULL);
+    }
+    int64_t pos = 0;
+    for (int64_t gi = 0; gi < n_groups; gi++) {
+        ray_t* idx_list = items0[gi * 2 + 1];
+        int64_t cnt = idx_list ? ray_len(idx_list) : 0;
+        offsets[gi] = pos;
+        counts[gi] = cnt;
+        if (cnt > 0) {
+            if (idx_list->type == RAY_I64) {
+                memcpy(idx_buf + pos, ray_data(idx_list),
+                       (size_t)cnt * sizeof(int64_t));
+            } else {
+                for (int64_t k = 0; k < cnt; k++) {
+                    int alloc = 0;
+                    ray_t* e = collection_elem(idx_list, k, &alloc);
+                    idx_buf[pos + k] = e ? as_i64(e) : 0;
+                    if (alloc && e) ray_release(e);
+                }
+            }
+        }
+        pos += cnt;
+    }
+    ray_t* out = count_distinct_per_group_buf(
+        inner_expr, tbl, idx_buf, offsets, counts, n_groups);
+    scratch_free(idx_hdr);
+    scratch_free(off_hdr);
+    scratch_free(cnt_hdr);
+    return out;
+    }
+
     ray_t* src = NULL;
     if (inner_expr && inner_expr->type == -RAY_SYM &&
         (inner_expr->attrs & RAY_ATTR_NAME)) {
@@ -3383,13 +4875,13 @@ ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) {
         }
     }
 
-    int64_t from_id    = ray_sym_intern("from",    4);
-    int64_t where_id   = ray_sym_intern("where",   5);
-    int64_t by_id      = ray_sym_intern("by",      2);
-    int64_t take_id    = ray_sym_intern("take",    4);
-    int64_t asc_id     = ray_sym_intern("asc",     3);
-    int64_t desc_id    = ray_sym_intern("desc",    4);
-    int64_t nearest_id = ray_sym_intern("nearest", 7);
+    int64_t from_id    = dict_key_id(dict, "from");
+    int64_t where_id   = dict_key_id(dict, "where");
+    int64_t by_id      = dict_key_id(dict, "by");
+    int64_t take_id    = dict_key_id(dict, "take");
+    int64_t asc_id     = dict_key_id(dict, "asc");
+    int64_t desc_id    = dict_key_id(dict, "desc");
+    int64_t nearest_id = dict_key_id(dict, "nearest");
 
     DICT_VIEW_DECL(dv);
     DICT_VIEW_OPEN(dict, dv);
@@ -3488,6 +4980,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     /* Evaluate 'from:' to get the source table */
     ray_t* from_expr = dict_get(dict, "from");
     if (!from_expr) return ray_error("domain", NULL);
+    uint64_t select_cache_hash_value = ray_expr_hash(dict);
+    uint64_t select_cache_from_hash = ray_expr_hash(from_expr);
+    ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value,
+                                               select_cache_from_hash);
+    if (expr_cached)
+        return expr_cached;
     ray_t* where_expr = dict_get(dict, "where");
     ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get();
     ray_group_emit_filter_t emit_filter = {0};
@@ -3500,6 +4998,14 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         ray_group_emit_filter_set(prev_emit_filter);
     if (RAY_IS_ERR(tbl)) return tbl;
     if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); }
+    int64_t select_cache_nrows = ray_table_nrows(tbl);
+    ray_t* select_cached = select_cache_get(tbl, select_cache_nrows,
+                                            select_cache_hash_value,
+                                            select_cache_from_hash);
+    if (select_cached) {
+        ray_release(tbl);
+        return select_cached;
+    }
 
     ray_t* by_expr = dict_get(dict, "by");
     ray_t* take_expr = dict_get(dict, "take");
@@ -3517,13 +5023,13 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     }
     int64_t dict_n = dv_n;
     ray_t** dict_elems = dv;
-    int64_t from_id    = ray_sym_intern("from",    4);
-    int64_t where_id   = ray_sym_intern("where",   5);
-    int64_t by_id      = ray_sym_intern("by",      2);
-    int64_t take_id    = ray_sym_intern("take",    4);
-    int64_t asc_id     = ray_sym_intern("asc",     3);
-    int64_t desc_id    = ray_sym_intern("desc",    4);
-    int64_t nearest_id = ray_sym_intern("nearest", 7);
+    int64_t from_id    = dict_key_id(dict, "from");
+    int64_t where_id   = dict_key_id(dict, "where");
+    int64_t by_id      = dict_key_id(dict, "by");
+    int64_t take_id    = dict_key_id(dict, "take");
+    int64_t asc_id     = dict_key_id(dict, "asc");
+    int64_t desc_id    = dict_key_id(dict, "desc");
+    int64_t nearest_id = dict_key_id(dict, "nearest");
 
     /* Check for asc/desc presence */
     bool has_sort = false;
@@ -3532,6 +5038,43 @@ ray_t* ray_select(ray_t** args, int64_t n) {
         if (kid == asc_id || kid == desc_id) { has_sort = true; break; }
     }
 
+    ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr,
+                                              take_expr, dict_elems, dict_n,
+                                              from_id, where_id, by_id,
+                                              take_id, asc_id, desc_id,
+                                              nearest_id);
+    if (xbar_count) {
+        ray_release(tbl);
+        return xbar_count;
+    }
+
+    ray_t* i16_ne0_count = try_i16_ne0_count_desc_select(
+        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
+        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
+    if (i16_ne0_count) {
+        ray_release(tbl);
+        return i16_ne0_count;
+    }
+
+    ray_t* i32_i64_cd = try_i32_i64_count_distinct_select(
+        tbl, where_expr, by_expr, take_expr, dict_elems, dict_n,
+        from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id);
+    if (i32_i64_cd) {
+        ray_release(tbl);
+        return i32_i64_cd;
+    }
+
+    ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr,
+                                                     take_expr, dict_elems,
+                                                     dict_n, from_id,
+                                                     where_id, by_id,
+                                                     take_id, asc_id,
+                                                     desc_id, nearest_id);
+    if (i16x2_count) {
+        ray_release(tbl);
+        return i16x2_count;
+    }
+
     /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN
      * ordering is an index scan, not a column sort, and cannot be
      * composed with group-by in this phase. */
@@ -4042,7 +5585,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                     kid == take_id || kid == asc_id || kid == desc_id ||
                     kid == nearest_id) continue;
                 ray_t* val_expr = dict_elems[i + 1];
-                if (!is_group_dag_agg_expr(val_expr)) { n_other++; break; }
+                if (!is_group_dag_agg_expr(val_expr)) {
+                    if (is_single_group_key_projection(by_expr, val_expr))
+                        continue;
+                    n_other++;
+                    break;
+                }
                 ray_t** ae = (ray_t**)ray_data(val_expr);
                 int64_t aid = ae[0]->i64;
                 int op_ok = (aid == count_sym || aid == sum_sym ||
@@ -4051,17 +5599,25 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 if (!op_ok || ray_len(val_expr) < 2) { n_other++; break; }
                 if (aid != count_sym) has_only_count = 0;
                 ray_t* ae1 = ae[1];
-                if (!ae1 || !((ae1->type == -RAY_SYM
-                               && (ae1->attrs & RAY_ATTR_NAME)))) {
+                int64_t agg_col_sym = -1;
+                int agg_strlen = 0;
+                if (ae1 && ae1->type == -RAY_SYM && (ae1->attrs & RAY_ATTR_NAME)) {
+                    agg_col_sym = ae1->i64;
+                } else if ((aid == sum_sym || aid == avg_sym) &&
+                           is_strlen_name_expr(ae1, &agg_col_sym)) {
+                    agg_strlen = 1;
+                } else {
                     n_other++; break;
                 }
                 if (aid != count_sym) {
-                    ray_t* in_col = ray_table_get_col(tbl, ae1->i64);
+                    ray_t* in_col = ray_table_get_col(tbl, agg_col_sym);
                     if (!in_col) { n_other++; break; }
                     int8_t ict = in_col->type;
                     if (RAY_IS_PARTED(ict) || ict == RAY_MAPCOMMON)
                         { n_other++; break; }
-                    if (ict != RAY_BOOL && ict != RAY_U8 && ict != RAY_I16
+                    if (agg_strlen && ict != RAY_SYM)
+                        { n_other++; break; }
+                    if (!agg_strlen && ict != RAY_BOOL && ict != RAY_U8 && ict != RAY_I16
                         && ict != RAY_I32 && ict != RAY_I64
                         && ict != RAY_DATE && ict != RAY_TIME
                         && ict != RAY_TIMESTAMP)
@@ -4849,6 +6405,9 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                         ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr);
                         if (eval_tbl != tbl) ray_release(eval_tbl);
                         ray_release(tbl);
+                        select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
                         return result;
                     }
                 }
@@ -5108,9 +6667,18 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 ray_release(groups);
                 if (eval_tbl != tbl) ray_release(eval_tbl);
                 ray_release(tbl);
-                if (take_preapplied)
+                if (take_preapplied) {
+                    select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
                     return result;
-                return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+                }
+                result = apply_sort_take(result, dict_elems, dict_n,
+                                         asc_id, desc_id, take_id);
+                select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
+                return result;
             }
 
 	            /* eval_group path supports only simple scalar / [col] by-forms;
@@ -5298,7 +6866,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                     if (res) ray_release(res);
                     return first_err;
                 }
-                return apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id);
+                res = apply_sort_take(res, dict_elems, dict_n,
+                                      asc_id, desc_id, take_id);
+                select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, res);
+                return res;
             }
 
             ray_t* groups_dict = ray_group_fn(key_col);
@@ -5707,7 +7280,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             ray_release(groups);
             if (eval_tbl != tbl) ray_release(eval_tbl);
             ray_release(tbl);
-            return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+            result = apply_sort_take(result, dict_elems, dict_n,
+                                     asc_id, desc_id, take_id);
+            select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
+            return result;
         }
 
         /* Pre-scan: any non-aggregation expressions that need a flat
@@ -5958,7 +7536,29 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                         agg_ops[i] != OP_AVG)
                         agg_kinds_ok = 0;
                 }
-                if (can_fuse_phase1 && fused_pred_op != NULL
+                int no_where_count_key_ok = 0;
+                ray_group_emit_filter_t no_where_emit = ray_group_emit_filter_get();
+                if (!where_expr && n_keys == 1 && no_where_emit.enabled &&
+                    no_where_emit.agg_index == 0 &&
+                    no_where_emit.top_count_take > 0) {
+                    int64_t ksym = -1;
+                    if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME))
+                        ksym = by_expr->i64;
+                    else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1)
+                        ksym = ((int64_t*)ray_data(by_expr))[0];
+                    ray_t* kc = ksym >= 0 ? ray_table_get_col(tbl, ksym) : NULL;
+                    if (kc && !(kc->attrs & RAY_ATTR_HAS_NULLS) &&
+                        (kc->type == RAY_SYM || kc->type == RAY_BOOL ||
+                         kc->type == RAY_U8 || kc->type == RAY_I16 ||
+                         kc->type == RAY_I32))
+                        no_where_count_key_ok = 1;
+                }
+                if (no_where_count_key_ok && n_nonaggs == 0 && !has_binary_agg &&
+                    !has_agg_k && n_keys == 1 && n_aggs == 1 &&
+                    agg_ops[0] == OP_COUNT) {
+                    root = ray_filtered_group(g, NULL, key_ops, n_keys,
+                                              agg_ops, agg_ins, n_aggs);
+                } else if (can_fuse_phase1 && fused_pred_op != NULL
                     && n_nonaggs == 0 && agg_kinds_ok
                     && !has_binary_agg && !has_agg_k)
                 {
@@ -6821,7 +8421,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
             if (fi_heap_hdr) ray_free(fi_heap_hdr);
             if (filtered_tbl != tbl) ray_release(filtered_tbl);
             ray_release(tbl);
-            return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+            result = apply_sort_take(result, dict_elems, dict_n,
+                                     asc_id, desc_id, take_id);
+            select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
+            return result;
         }
     } else if (n_out > 0) {
         /* No `by:` but explicit output expressions.
@@ -6966,7 +8571,12 @@ ray_t* ray_select(ray_t** args, int64_t n) {
                 if (nearest_handle_owned) ray_release(nearest_handle_owned);
                 if (nearest_query_owned)  ray_sys_free(nearest_query_owned);
                 ray_graph_free(g); ray_release(tbl);
-                return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id);
+                result = apply_sort_take(result, dict_elems, dict_n,
+                                         asc_id, desc_id, take_id);
+                select_cache_put(tbl, select_cache_nrows,
+                                 select_cache_hash_value,
+                                 select_cache_from_hash, result);
+                return result;
             } else {
                 root = ray_select_op(g, root, col_ops, nc);
             }
@@ -8005,6 +9615,8 @@ ray_t* ray_select(ray_t** args, int64_t n) {
     if (by_sym_vec_owned) ray_release(by_sym_vec_owned);
     if (saved_selection) ray_release(saved_selection);
 
+    select_cache_put(tbl, select_cache_nrows, select_cache_hash_value,
+                     select_cache_from_hash, result);
     return result;
 }
 
diff --git a/src/table/sym.c b/src/table/sym.c
index e7a859fb..ded39193 100644
--- a/src/table/sym.c
+++ b/src/table/sym.c
@@ -91,6 +91,7 @@ typedef struct {
 
 static sym_table_t g_sym;
 static _Atomic(bool) g_sym_inited = false;
+static bool sym_lazy_materialize_to_locked(uint32_t target_id);
 
 /* Spinlock protecting g_sym mutations in ray_sym_intern */
 static _Atomic(int) g_sym_lock = 0;
@@ -143,7 +144,8 @@ static ray_t* sym_str_arena(ray_arena_t* arena, const char* s, size_t len) {
 /* Forward decl — used from ray_sym_init below to reserve sym ID 0 as
  * the canonical empty string.  Definition is further down with the
  * other intern helpers. */
-static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len);
+static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len,
+                                 bool search_lazy);
 
 /* --------------------------------------------------------------------------
  * ray_sym_init
@@ -216,7 +218,7 @@ ray_err_t ray_sym_init(void) {
      * meaningless on SYM and is rejected on set.  Done before
      * returning so every subsequent intern observes ID 0 as taken. */
     int64_t empty_id = sym_intern_nolock(
-        (uint32_t)ray_hash_bytes("", 0), "", 0);
+        (uint32_t)ray_hash_bytes("", 0), "", 0, true);
     if (empty_id != 0) {
         /* Should be unreachable — table just initialised, no other
          * thread has touched it yet.  If it ever fires, fail loudly. */
@@ -366,7 +368,8 @@ static bool sym_grow_str_cap(uint32_t new_cap) {
  * that are defined further down in the file.  ray_sym_bytes_upper is
  * declared in sym.h as a public inline so both the intern path and the
  * test suite can refer to the same formula. */
-static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len);
+static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len,
+                                 bool search_lazy);
 static int64_t sym_probe(uint32_t hash, const char* str, size_t len);
 static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len);
 static bool    sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes);
@@ -557,6 +560,12 @@ static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len) {
 static int64_t sym_intern_nolock_noseg(uint32_t hash, const char* str, size_t len) {
     int64_t existing = sym_probe(hash, str, len);
     if (existing >= 0) return existing;
+    if (g_sym.lazy_map && g_sym.lazy_next_id < g_sym.persisted_count) {  /* probe missed; presumably persisted ids from lazy_next_id up are not in the table yet */
+        if (!sym_lazy_materialize_to_locked(g_sym.persisted_count - 1))  /* materialize through the last persisted id before treating str as new */
+            return -1;  /* materialization failed: return -1 without committing anything */
+        existing = sym_probe(hash, str, len);  /* second probe, now that the lazy entries have been loaded */
+        if (existing >= 0) return existing;  /* str was one of the lazy entries: reuse its id */
+    }
     return sym_commit_new(hash, str, len);
 }
 
@@ -662,9 +671,16 @@ static bool sym_lazy_materialize_to_locked(uint32_t target_id) {
  * which commits the main sym without a cache on purpose.  A cache-OOM
  * there is tolerated (scanned bit stays clear → future interns retry).
  * -------------------------------------------------------------------------- */
-static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len) {
+static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len,
+                                 bool search_lazy) {
     /* Phase A.1: probe main. */
     int64_t existing = sym_probe(hash, str, len);
+    if (search_lazy && existing < 0 && g_sym.lazy_map &&
+        g_sym.lazy_next_id < g_sym.persisted_count) {
+        if (!sym_lazy_materialize_to_locked(g_sym.persisted_count - 1))
+            return -1;
+        existing = sym_probe(hash, str, len);
+    }
     if (existing >= 0) {
         (void)sym_cache_segments((uint32_t)existing, str, len);
         return existing;
@@ -779,7 +795,16 @@ int64_t ray_sym_intern(const char* str, size_t len) {
     if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;
     uint32_t hash = (uint32_t)ray_hash_bytes(str, len);
     sym_lock();
-    int64_t id = sym_intern_nolock(hash, str, len);
+    int64_t id = sym_intern_nolock(hash, str, len, true);
+    sym_unlock();
+    return id;
+}
+
+int64_t ray_sym_intern_runtime(const char* str, size_t len) {  /* same steps as ray_sym_intern, but passes search_lazy=false */
+    if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;  /* -1 when the sym table is not initialised */
+    uint32_t hash = (uint32_t)ray_hash_bytes(str, len);  /* hash computed before taking the lock */
+    sym_lock();  /* held across the intern call, released just below */
+    int64_t id = sym_intern_nolock(hash, str, len, false);  /* false skips the materialize-and-reprobe step; NOTE(review): confirm a name present only in pending lazy entries cannot receive a second id here */
     sym_unlock();
     return id;
 }
@@ -793,7 +818,7 @@ int64_t ray_sym_intern(const char* str, size_t len) {
 
 int64_t ray_sym_intern_prehashed(uint32_t hash, const char* str, size_t len) {
     if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1;
-    return sym_intern_nolock(hash, str, len);
+    return sym_intern_nolock(hash, str, len, true);  /* search_lazy=true, as in ray_sym_intern; NOTE(review): no sym_lock() on this path, yet it can now reach sym_lazy_materialize_to_locked -- confirm every caller holds the lock */
 }
 
 /* --------------------------------------------------------------------------
@@ -885,7 +910,17 @@ int64_t ray_sym_find(const char* str, size_t len) {
 
     for (;;) {
         uint64_t e = g_sym.buckets[slot];
-        if (e == 0) { sym_unlock(); return -1; }  /* empty -- not found */
+        if (e == 0) {
+            if (g_sym.lazy_map && g_sym.lazy_next_id < g_sym.persisted_count) {
+                if (sym_lazy_materialize_to_locked(g_sym.persisted_count - 1)) {
+                    mask = g_sym.bucket_cap - 1;
+                    slot = hash & mask;
+                    continue;
+                }
+            }
+            sym_unlock();
+            return -1;
+        }  /* empty -- not found */
 
         uint32_t e_hash = (uint32_t)(e >> 32);
         if (e_hash == hash) {
diff --git a/src/table/sym.h b/src/table/sym.h
index 67c159bc..a945fccc 100644
--- a/src/table/sym.h
+++ b/src/table/sym.h
@@ -110,6 +110,7 @@ int ray_sym_segs(int64_t sym_id, const int64_t** out_segs);
  * with ray_sym_rebuild_segments to populate the dotted cache. */
 int64_t ray_sym_intern_no_split(const char* str, size_t len);
 int64_t ray_sym_intern_no_split_unlocked(const char* str, size_t len);
+int64_t ray_sym_intern_runtime(const char* str, size_t len);  /* locked intern that passes search_lazy=false (ray_sym_intern passes true); returns -1 if the sym table is not initialised */
 
 /* Walk the intern table and cache segment sym_ids for any dotted name
  * that hasn't been cached yet.  Idempotent — safe to call multiple times.
diff --git a/test/rfl/group/count_distinct_paths.rfl b/test/rfl/group/count_distinct_paths.rfl
index 6655a558..88f6ef0c 100644
--- a/test/rfl/group/count_distinct_paths.rfl
+++ b/test/rfl/group/count_distinct_paths.rfl
@@ -129,7 +129,7 @@
 (sum (at Rs 'c)) -- 8
 
 ;; ════════════════════════════════════════════════════════════════════
-;; 6. ray_count_distinct_per_group — single-array HT (DuckDB-style),
+;; 6. ray_count_distinct_per_group — single-array HT,
 ;;    n_groups > 50000 sub-200000 rows triggers serial global-hash.
 ;;    Path: query.c:7650 → ray_count_distinct_per_group → CD_INSERT
 ;;    loop (group.c:1162-1227, esz=8 I64 specialisation).