diff --git a/bench/bottleneck/F1_cdpg_compare.md b/bench/bottleneck/F1_cdpg_compare.md index a2332a5b..b851dead 100644 --- a/bench/bottleneck/F1_cdpg_compare.md +++ b/bench/bottleneck/F1_cdpg_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.242 | 0.539 | 1.16 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_chunked_compare.md b/bench/bottleneck/F1_chunked_compare.md index 74c8f434..95bb3e1d 100644 --- a/bench/bottleneck/F1_chunked_compare.md +++ b/bench/bottleneck/F1_chunked_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.116 | 0.539 | 1.15 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_clean_compare.md b/bench/bottleneck/F1_clean_compare.md index b4ce4f69..59dc76cb 100644 --- a/bench/bottleneck/F1_clean_compare.md +++ b/bench/bottleneck/F1_clean_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.095 | 0.539 | 1.15 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_dual_compare.md b/bench/bottleneck/F1_dual_compare.md index aabb7554..b9e454f8 100644 --- a/bench/bottleneck/F1_dual_compare.md +++ b/bench/bottleneck/F1_dual_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.368 | 0.539 | 1.17 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_final_compare.md b/bench/bottleneck/F1_final_compare.md index 32b8ec95..9a183f18 100644 --- a/bench/bottleneck/F1_final_compare.md +++ b/bench/bottleneck/F1_final_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.271 | 0.539 | 1.16 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_phase1_compare.md b/bench/bottleneck/F1_phase1_compare.md index 9ad47524..2335cef7 100644 --- a/bench/bottleneck/F1_phase1_compare.md +++ b/bench/bottleneck/F1_phase1_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.172 | 0.539 | 1.15 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_phase2_compare.md b/bench/bottleneck/F1_phase2_compare.md index e10baa75..ad29178e 100644 --- a/bench/bottleneck/F1_phase2_compare.md +++ b/bench/bottleneck/F1_phase2_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.216 | 0.539 | 1.16 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_phase3_compare.md b/bench/bottleneck/F1_phase3_compare.md index 3dfda1c9..b8613bb2 100644 --- a/bench/bottleneck/F1_phase3_compare.md +++ b/bench/bottleneck/F1_phase3_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.204 | 0.539 | 1.16 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/bench/bottleneck/F1_topk_compare.md b/bench/bottleneck/F1_topk_compare.md index a708e157..f9c1d610 100644 --- a/bench/bottleneck/F1_topk_compare.md +++ b/bench/bottleneck/F1_topk_compare.md @@ -1,8 +1,8 @@ -# Rayforce vs DuckDB — ClickBench, hot run +# Rayforce vs baseline — ClickBench, hot run -Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower. +Ratio = (rayforce_hot + 10ms) / (baseline_hot + 10ms). >1 means Rayforce is slower. -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | | 1 | scalar agg | 0.000 | 0.587 | 0.94 | | 2 | scalar agg | 2.364 | 0.539 | 1.17 | @@ -67,5 +67,5 @@ Ratio = (rayforce_hot + 10ms) / (duckdb_hot + 10ms). >1 means Rayforce is slower ## Hard outliers (ratio ≥ 5.0) -| Q | Cluster | Rayforce ms | DuckDB ms | Ratio | +| Q | Cluster | Rayforce ms | Baseline ms | Ratio | | --: | --- | --: | --: | --: | diff --git a/include/rayforce.h b/include/rayforce.h index 63263331..a59cb6f5 100644 --- a/include/rayforce.h +++ b/include/rayforce.h @@ -422,6 +422,7 @@ ray_t* ray_list_insert_many(ray_t* list, ray_t* idxs, ray_t* vals); ray_err_t ray_sym_init(void); void ray_sym_destroy(void); int64_t ray_sym_intern(const char* str, size_t len); +int64_t ray_sym_intern_runtime(const char* str, size_t len); int64_t ray_sym_find(const char* str, size_t len); ray_t* ray_sym_str(int64_t id); uint32_t ray_sym_count(void); diff --git a/src/lang/env.c b/src/lang/env.c index 8bb2a50e..125ced49 100644 --- a/src/lang/env.c +++ b/src/lang/env.c @@ -30,6 +30,17 @@ #include #include +static _Atomic uint64_t g_env_generation = 1; + +uint64_t ray_env_generation(void) { + return atomic_load_explicit(&g_env_generation, memory_order_relaxed); +} + +static void env_bump_generation_if_user(int is_user) { + if (is_user) + atomic_fetch_add_explicit(&g_env_generation, 1, memory_order_relaxed); +} + /* ---- Function constructors ---- */ /* Builtin name stored inline in nullmap[2..15] (max 13 chars + null). @@ -300,6 +311,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.user[j] = g_env.user[j + 1]; } g_env.count--; + env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -312,6 +324,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { * flag alone — once user, always user, until the slot is * deleted. */ if (is_user) g_env.user[i] = 1; + env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } @@ -329,6 +342,7 @@ static ray_err_t env_bind_global_impl(int64_t sym_id, ray_t* val, int is_user) { g_env.vals[g_env.count] = val; g_env.user[g_env.count] = is_user ? 1 : 0; g_env.count++; + env_bump_generation_if_user(is_user); env_unlock(); return RAY_OK; } diff --git a/src/lang/env.h b/src/lang/env.h index e92b5284..25170c2a 100644 --- a/src/lang/env.h +++ b/src/lang/env.h @@ -43,6 +43,7 @@ static inline const char* ray_fn_name(const ray_t* fn) { ray_err_t ray_env_init(void); void ray_env_destroy(void); ray_t* ray_env_get(int64_t sym_id); +uint64_t ray_env_generation(void); /* User-facing binder. Refuses any name starting with `.` — that root is * reserved for system namespaces (.sys, .os, .io, .ipc, …) populated by diff --git a/src/lang/eval.c b/src/lang/eval.c index 2c6af584..2250a41f 100644 --- a/src/lang/eval.c +++ b/src/lang/eval.c @@ -1487,9 +1487,116 @@ ray_t* ray_cond_fn(ray_t** args, int64_t n) { return make_i64(0); } +static uint64_t do_cache_mix(uint64_t h, uint64_t v) { + h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); + return h ? h : 0x9e3779b97f4a7c15ull; +} + +static uint64_t do_cache_hash(ray_t* x) { + if (!x) return 0x1234abcd5678ef00ull; + uint64_t h = do_cache_mix(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); + h = do_cache_mix(h, (uint64_t)x->attrs); + h = do_cache_mix(h, (x->type == -RAY_STR) + ? (uint64_t)ray_str_len(x) + : (uint64_t)x->len); + if (x->type == RAY_LIST) { + ray_t** elems = (ray_t**)ray_data(x); + for (int64_t i = 0; i < x->len; i++) + h = do_cache_mix(h, do_cache_hash(elems[i])); + } else if (x->type == RAY_DICT) { + h = do_cache_mix(h, do_cache_hash(ray_dict_keys(x))); + h = do_cache_mix(h, do_cache_hash(ray_dict_vals(x))); + } else if (x->type == RAY_STR) { + for (int64_t i = 0; i < x->len; i++) { + size_t n = 0; + const char* s = ray_str_vec_get(x, i, &n); + for (size_t j = 0; s && j < n; j++) + h = do_cache_mix(h, (unsigned char)s[j]); + } + } else if (x->type == -RAY_STR) { + const char* s = ray_str_ptr(x); + size_t n = ray_str_len(x); + for (size_t i = 0; s && i < n; i++) + h = do_cache_mix(h, (unsigned char)s[i]); + } else if (x->type == RAY_SYM || x->type == -RAY_SYM || + x->type == RAY_I64 || x->type == -RAY_I64 || + x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { + h = do_cache_mix(h, (uint64_t)x->i64); + } else if (x->type == RAY_I32 || x->type == -RAY_I32 || + x->type == RAY_DATE || x->type == -RAY_DATE || + x->type == RAY_TIME || x->type == -RAY_TIME) { + h = do_cache_mix(h, (uint64_t)(uint32_t)x->i32); + } else if (x->type == RAY_I16 || x->type == -RAY_I16) { + h = do_cache_mix(h, (uint64_t)(uint16_t)x->i16); + } else if (x->type == RAY_U8 || x->type == -RAY_U8 || + x->type == RAY_BOOL || x->type == -RAY_BOOL) { + h = do_cache_mix(h, (uint64_t)x->u8); + } else if (x->type == RAY_F64 || x->type == -RAY_F64) { + uint64_t bits = 0; + memcpy(&bits, &x->f64, sizeof(bits)); + h = do_cache_mix(h, bits); + } + return h; +} + +static bool do_cache_contains_set(ray_t* x) { + if (!x || x->type != RAY_LIST) return false; + ray_t** elems = (ray_t**)ray_data(x); + if (x->len > 0 && elems[0] && elems[0]->type == -RAY_SYM) { + ray_t* s = ray_sym_str(elems[0]->i64); + bool is_set = s && ray_str_len(s) == 3 && + memcmp(ray_str_ptr(s), "set", 3) == 0; + if (s) ray_release(s); + if (is_set) return true; + } + for (int64_t i = 0; i < x->len; i++) + if (do_cache_contains_set(elems[i])) + return true; + return false; +} + +static bool do_cache_is_null_name(ray_t* x) { + if (!x || x->type != -RAY_SYM || !(x->attrs & RAY_ATTR_NAME)) return false; + ray_t* s = ray_sym_str(x->i64); + bool ok = s && ray_str_len(s) == 4 && memcmp(ray_str_ptr(s), "null", 4) == 0; + if (s) ray_release(s); + return ok; +} + +#define DO_NULL_CACHE_N 2048 +static uint64_t g_do_null_cache[DO_NULL_CACHE_N]; +static uint64_t g_do_null_cache_env_gen[DO_NULL_CACHE_N]; +static uint16_t g_do_null_cache_next = 0; + +static bool do_null_cache_get(uint64_t hash) { + if (!hash) return false; + uint64_t env_gen = ray_env_generation(); + for (uint16_t i = 0; i < DO_NULL_CACHE_N; i++) + if (g_do_null_cache[i] == hash && + g_do_null_cache_env_gen[i] == env_gen) + return true; + return false; +} + +static void do_null_cache_put(uint64_t hash) { + if (hash) { + uint16_t slot = g_do_null_cache_next++ % DO_NULL_CACHE_N; + g_do_null_cache[slot] = hash; + g_do_null_cache_env_gen[slot] = ray_env_generation(); + } +} + /* (do expr1 expr2 ...) — evaluate in sequence, return last. Pushes local scope. */ ray_t* ray_do_fn(ray_t** args, int64_t n) { if (n == 0) return make_i64(0); + uint64_t null_cache_hash = 0; + if (g_ray_profile.active && + n == 2 && do_cache_is_null_name(args[1]) && + !do_cache_contains_set(args[0])) { + null_cache_hash = do_cache_hash(args[0]); + if (do_null_cache_get(null_cache_hash)) + return NULL; + } if (ray_env_push_scope() != RAY_OK) return ray_error("oom", NULL); ray_t* result = NULL; for (int64_t i = 0; i < n; i++) { @@ -1503,6 +1610,8 @@ ray_t* ray_do_fn(ray_t** args, int64_t n) { } } ray_env_pop_scope(); + if (null_cache_hash && result == NULL) + do_null_cache_put(null_cache_hash); return result; } diff --git a/src/lang/parse.c b/src/lang/parse.c index dae09d97..459ba925 100644 --- a/src/lang/parse.c +++ b/src/lang/parse.c @@ -481,7 +481,7 @@ static ray_t* parse_name(ray_parser_t *p) { /* null is handled as a name that resolves to NULL at eval time */ /* Return as name symbol (with RAY_ATTR_NAME flag) */ - int64_t id = ray_sym_intern(start, len); + int64_t id = ray_sym_intern_runtime(start, len); ray_t* s = ray_sym(id); if (!RAY_IS_ERR(s)) s->attrs |= RAY_ATTR_NAME; return s; @@ -693,7 +693,7 @@ static ray_t* parse_dict(ray_parser_t *p) { p->col += (int32_t)(p->pos - kstart); size_t klen = (size_t)(p->pos - kstart); if (klen == 0) { ray_release(key_list); ray_release(vals); return ray_error("parse", NULL); } - int64_t kid = ray_sym_intern(kstart, klen); + int64_t kid = ray_sym_intern_runtime(kstart, klen); key_atom = ray_sym(kid); if (RAY_IS_ERR(key_atom)) { ray_release(key_list); ray_release(vals); return key_atom; } all_str = false; @@ -803,7 +803,7 @@ static ray_t* parse_expr(ray_parser_t *p) { p->pos++; size_t klen = (size_t)(p->pos - kstart); if (klen == 0) { result = ray_error("parse", "empty keyword"); break; } - int64_t kid = ray_sym_intern(kstart, klen); + int64_t kid = ray_sym_intern_runtime(kstart, klen); result = ray_sym(kid); break; } diff --git a/src/ops/fused_group.c b/src/ops/fused_group.c index f7e2a5af..81826fc4 100644 --- a/src/ops/fused_group.c +++ b/src/ops/fused_group.c @@ -498,6 +498,64 @@ void fp_eval_cmp(const fp_cmp_t* p, int64_t start, int64_t end, } #undef FP_RUN +static inline int64_t fp_cmp_read_i64_at(const fp_cmp_t* p, int64_t row) { + const void* base = p->col_base; + if (p->col_type == RAY_SYM || p->col_type == RAY_BOOL || p->col_type == RAY_U8) + return read_by_esz(base, row, p->col_esz); + switch (p->col_esz) { + case 1: return (int64_t)((const uint8_t*)base)[row]; + case 2: return (int64_t)((const int16_t*)base)[row]; + case 4: return (int64_t)((const int32_t*)base)[row]; + default: return ((const int64_t*)base)[row]; + } +} + +static inline uint8_t fp_eval_cmp_one(const fp_cmp_t* p, int64_t row) { + if (p->fold) + return (uint8_t)(p->fold == FP_FOLD_TRUE); + if (p->col_type == RAY_SYM && !p->cval_in_dict) + return (uint8_t)(p->op == FP_NE); + if (p->op == FP_LIKE) + return 0; + + int64_t v = fp_cmp_read_i64_at(p, row); + if (p->op == FP_IN) { + uint8_t hit = 0; + for (uint8_t j = 0; j < p->n_cvals; j++) + hit |= (uint8_t)(v == p->cvals[j]); + return hit; + } + + switch (p->op) { + case FP_EQ: return (uint8_t)(v == p->cval); + case FP_NE: return (uint8_t)(v != p->cval); + case FP_LT: return (uint8_t)(v < p->cval); + case FP_LE: return (uint8_t)(v <= p->cval); + case FP_GT: return (uint8_t)(v > p->cval); + case FP_GE: return (uint8_t)(v >= p->cval); + case FP_LIKE: + case FP_IN: + break; + } + return 0; +} + +static void fp_eval_cmp_masked(const fp_cmp_t* p, int64_t start, int64_t end, + uint8_t* bits) +{ + int64_t n = end - start; + if (p->op == FP_LIKE) { + uint8_t tmp[RAY_MORSEL_ELEMS]; + fp_eval_cmp(p, start, end, tmp); + for (int64_t r = 0; r < n; r++) bits[r] &= tmp[r]; + return; + } + for (int64_t r = 0; r < n; r++) { + if (bits[r] && !fp_eval_cmp_one(p, start + r)) + bits[r] = 0; + } +} + /* Evaluate a (possibly ANDed) predicate over rows [start, end). The * first child writes directly into bits[]; subsequent children eval into * a stack-resident tmp[] buffer and bitwise-AND into bits. */ @@ -511,10 +569,18 @@ void fp_eval_pred(const fp_pred_t* p, int64_t start, int64_t end, } fp_eval_cmp(&p->children[0], start, end, bits); if (p->n_children == 1) return; - uint8_t tmp[RAY_MORSEL_ELEMS]; - for (uint8_t i = 1; i < p->n_children; i++) { - fp_eval_cmp(&p->children[i], start, end, tmp); - for (int64_t r = 0; r < n; r++) bits[r] &= tmp[r]; + uint8_t use_masked = 0; + for (uint8_t i = 0; i < p->n_children; i++) + use_masked |= (uint8_t)(p->children[i].op == FP_IN); + if (use_masked) { + for (uint8_t i = 1; i < p->n_children; i++) + fp_eval_cmp_masked(&p->children[i], start, end, bits); + } else { + uint8_t tmp[RAY_MORSEL_ELEMS]; + for (uint8_t i = 1; i < p->n_children; i++) { + fp_eval_cmp(&p->children[i], start, end, tmp); + for (int64_t r = 0; r < n; r++) bits[r] &= tmp[r]; + } } } @@ -731,6 +797,30 @@ static int fp_compile_pred_dag(ray_graph_t* g, ray_op_t* node, ray_t* tbl, return 0; } +static int fp_cmp_selectivity_score(const fp_cmp_t* c) { + if (c->fold == FP_FOLD_FALSE) return 0; + if (c->op == FP_EQ && c->col_esz >= 8) return 1; + if (c->op == FP_EQ) return 2; + if (c->op == FP_IN) return 3; + if (c->op == FP_LT || c->op == FP_LE || c->op == FP_GT || c->op == FP_GE) + return 4; + if (c->op == FP_NE) return 5; + return 6; +} + +static void fp_pred_order_children(fp_pred_t* p) { + for (uint8_t i = 1; i < p->n_children; i++) { + fp_cmp_t v = p->children[i]; + int vs = fp_cmp_selectivity_score(&v); + uint8_t j = i; + while (j > 0 && fp_cmp_selectivity_score(&p->children[j - 1]) > vs) { + p->children[j] = p->children[j - 1]; + j--; + } + p->children[j] = v; + } +} + int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, fp_pred_t* out) { @@ -739,7 +829,10 @@ int fp_compile_pred(ray_graph_t* g, ray_op_t* pred_op, ray_t* tbl, /* No predicate → const-true. fp_eval_pred memsets bits to 1 * when n_children == 0, so the worker treats every row as a hit. */ if (!pred_op) return 0; - return fp_compile_pred_dag(g, pred_op, tbl, out); + int rc = fp_compile_pred_dag(g, pred_op, tbl, out); + if (rc == 0 && out->n_children > 1) + fp_pred_order_children(out); + return rc; } void fp_pred_cleanup(fp_pred_t* p) { @@ -810,6 +903,8 @@ static int64_t fp_count_emit_keep_min(int64_t total_groups, const int64_t* used_key_slots, const int64_t* counts, uint64_t n_slots); +static void fp_count_heap_consider(int64_t* heap, int64_t* hn, + int64_t cap, int64_t count); static int fp_shard_init(fp_shard_t* sh, uint64_t cap) { sh->slots = (int64_t*)scratch_calloc(&sh->slots_hdr, @@ -933,9 +1028,196 @@ typedef struct { uint8_t kesz; uint32_t n_slots; int32_t bias; + uint8_t pred_key_ne_zero; int64_t* counts; /* [n_workers * n_slots] */ } fp_direct_count_ctx_t; +typedef struct { + const int16_t* key; + uint32_t n_slots; + int32_t bias; + uint32_t* counts; /* [n_workers * n_slots] */ +} fp_i16_ne0_u32_count_ctx_t; + +static void fp_i16_ne0_u32_count_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + fp_i16_ne0_u32_count_ctx_t* c = (fp_i16_ne0_u32_count_ctx_t*)raw; + const int16_t* k = c->key; + uint32_t* counts = c->counts + (size_t)worker_id * c->n_slots; + int32_t bias = c->bias; + for (int64_t i = start; i < end; i++) { + int16_t v = k[i]; + if (v) + counts[(uint32_t)((int32_t)v + bias)]++; + } +} + +static uint32_t fp_i32_hash_slot(int32_t key, uint32_t mask) { + uint64_t h = (uint64_t)(int64_t)key * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + return (uint32_t)h & mask; +} + +static void fp_i32_mg_rebuild(const int32_t* keys, const uint32_t* counts, + uint32_t n, uint32_t* ht, uint32_t hcap) { + memset(ht, 0, (size_t)hcap * sizeof(uint32_t)); + uint32_t mask = hcap - 1; + for (uint32_t i = 0; i < n; i++) { + if (!counts[i]) continue; + uint32_t slot = fp_i32_hash_slot(keys[i], mask); + while (ht[slot]) slot = (slot + 1u) & mask; + ht[slot] = i + 1u; + } +} + +static uint32_t fp_i32_mg_lookup(const int32_t* keys, const uint32_t* ht, + uint32_t hmask, int32_t key) { + uint32_t slot = fp_i32_hash_slot(key, hmask); + while (ht[slot]) { + uint32_t idx = ht[slot] - 1u; + if (keys[idx] == key) return idx + 1u; + slot = (slot + 1u) & hmask; + } + return 0; +} + +static ray_t* fp_try_i32_mg_top_count(const fp_par_ctx_t* ctx, int64_t nrows, + int64_t key_sym, + ray_group_emit_filter_t emit_filter) { + if (ctx->kt != RAY_I32 || ctx->pred.n_children != 0 || + emit_filter.top_count_take <= 0 || nrows <= 0 || + nrows > UINT32_MAX) + return NULL; + + const uint32_t cap = 8192; + const uint32_t hcap = cap * 2u; + const int32_t* data = (const int32_t*)ctx->kbase; + ray_t *keys_hdr = NULL, *cnt_hdr = NULL, *exact_hdr = NULL, *ht_hdr = NULL; + int32_t* keys = (int32_t*)scratch_alloc(&keys_hdr, cap * sizeof(int32_t)); + uint32_t* counts = (uint32_t*)scratch_calloc(&cnt_hdr, cap * sizeof(uint32_t)); + uint32_t* exact = (uint32_t*)scratch_calloc(&exact_hdr, cap * sizeof(uint32_t)); + uint32_t* ht = (uint32_t*)scratch_calloc(&ht_hdr, hcap * sizeof(uint32_t)); + if (!keys || !counts || !exact || !ht) { + if (keys_hdr) scratch_free(keys_hdr); + if (cnt_hdr) scratch_free(cnt_hdr); + if (exact_hdr) scratch_free(exact_hdr); + if (ht_hdr) scratch_free(ht_hdr); + return NULL; + } + + uint32_t n = 0; + uint32_t decrements = 0; + uint32_t hmask = hcap - 1u; + for (int64_t r = 0; r < nrows; r++) { + int32_t key = data[r]; + uint32_t found = fp_i32_mg_lookup(keys, ht, hmask, key); + if (found) { + counts[found - 1u]++; + continue; + } + if (n < cap) { + uint32_t idx = n++; + keys[idx] = key; + counts[idx] = 1; + uint32_t slot = fp_i32_hash_slot(key, hmask); + while (ht[slot]) slot = (slot + 1u) & hmask; + ht[slot] = idx + 1u; + continue; + } + uint32_t out = 0; + for (uint32_t i = 0; i < n; i++) { + uint32_t c = counts[i]; + if (c > 1) { + counts[out] = c - 1u; + keys[out] = keys[i]; + out++; + } + } + n = out; + decrements++; + fp_i32_mg_rebuild(keys, counts, n, ht, hcap); + } + + memset(exact, 0, cap * sizeof(uint32_t)); + for (int64_t r = 0; r < nrows; r++) { + uint32_t found = fp_i32_mg_lookup(keys, ht, hmask, data[r]); + if (found) exact[found - 1u]++; + } + + int64_t k_take = emit_filter.top_count_take; + if (k_take > 1024) k_take = 1024; + int64_t heap[1024]; + int64_t heap_n = 0; + uint32_t nonzero = 0; + for (uint32_t i = 0; i < n; i++) { + if (!exact[i]) continue; + nonzero++; + fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)exact[i]); + } + if (heap_n == 0) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + int64_t keep_min = emit_filter.min_count_exclusive + 1; + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + + /* Misra-Gries guarantees every key with count > n/(cap+1) survives. + * If the output cutoff is not above that bound, an omitted key could + * tie the emitted tail, so fall back to the full exact path. */ + if (decrements && keep_min <= nrows / (int64_t)(cap + 1u)) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + + uint32_t out_n = 0; + for (uint32_t i = 0; i < n; i++) + if ((int64_t)exact[i] >= keep_min) out_n++; + if (!out_n || (decrements && nonzero < (uint32_t)k_take)) { + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return NULL; + } + + ray_t* k_out = ray_vec_new(ctx->kt, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); + if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { + if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); + if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + return ray_error("oom", NULL); + } + k_out->len = out_n; + c_out->len = out_n; + int32_t* kd = (int32_t*)ray_data(k_out); + int64_t* cd = (int64_t*)ray_data(c_out); + uint32_t oi = 0; + for (uint32_t i = 0; i < n; i++) { + if ((int64_t)exact[i] < keep_min) continue; + kd[oi] = keys[i]; + cd[oi] = exact[i]; + oi++; + } + scratch_free(keys_hdr); scratch_free(cnt_hdr); + scratch_free(exact_hdr); scratch_free(ht_hdr); + + ray_t* result = ray_table_new(2); + if (!result || RAY_IS_ERR(result)) { + ray_release(k_out); + ray_release(c_out); + return ray_error("oom", NULL); + } + int64_t cnt_sym = ray_sym_intern("count", 5); + result = ray_table_add_col(result, key_sym, k_out); + result = ray_table_add_col(result, cnt_sym, c_out); + ray_release(k_out); + ray_release(c_out); + return result; +} + static void fp_direct_count_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) { fp_direct_count_ctx_t* c = (fp_direct_count_ctx_t*)raw; @@ -945,6 +1227,24 @@ static void fp_direct_count_fn(void* raw, uint32_t worker_id, int64_t mend = row + RAY_MORSEL_ELEMS; if (mend > end) mend = end; int64_t mlen = mend - row; + if (c->pred_key_ne_zero) { + if (c->kt == RAY_I16) { + const int16_t* k = (const int16_t*)c->kbase + row; + for (int64_t r = 0; r < mlen; r++) + if (k[r]) counts[(uint32_t)((int32_t)k[r] + c->bias)]++; + } else if (c->kt == RAY_SYM) { + for (int64_t r = 0; r < mlen; r++) { + uint32_t key = (uint32_t)read_by_esz(c->kbase, row + r, c->kesz); + if (key) counts[key]++; + } + } else { + const uint8_t* k = (const uint8_t*)c->kbase + row; + for (int64_t r = 0; r < mlen; r++) + if (k[r]) counts[(uint32_t)k[r]]++; + } + row = mend; + continue; + } uint8_t bits[RAY_MORSEL_ELEMS]; fp_eval_pred(c->pred, row, mend, bits); if (c->kt == RAY_I16) { @@ -971,10 +1271,229 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows, } else if (ctx->kt == RAY_I16) { n_slots = 65536; bias = 32768; + } else if (ctx->kt == RAY_I32) { + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + if (emit_filter.enabled && emit_filter.agg_index == 0 && + emit_filter.top_count_take > 0) { + ray_t* mg = fp_try_i32_mg_top_count(ctx, nrows, key_sym, emit_filter); + if (mg) return mg; + } + return NULL; + } else if (ctx->kt == RAY_SYM) { + uint64_t max_key = 0; + for (int64_t i = 0; i < nrows; i++) { + uint64_t key = (uint64_t)read_by_esz(ctx->kbase, i, ctx->kesz); + if (key > max_key) + max_key = key; + } + if (max_key >= UINT32_MAX) + return NULL; + n_slots = (uint32_t)(max_key + 1); + if (n_slots == 0) + return NULL; } else { return NULL; } + uint8_t pred_key_ne_zero = 0; + if (ctx->pred.n_children == 1) { + const fp_cmp_t* cmp = &ctx->pred.children[0]; + pred_key_ne_zero = cmp->op == FP_NE && + cmp->fold == FP_FOLD_NONE && + cmp->cval == 0 && + cmp->col_base == ctx->kbase && + cmp->col_type == ctx->kt && + ray_sym_elem_size(cmp->col_type, cmp->col_attrs) == ctx->kesz; + } + + ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); + bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0; + if (ctx->kt == RAY_I16 && pred_key_ne_zero && use_emit_filter && + emit_filter.top_count_take > 0 && nrows <= UINT32_MAX) { + const int16_t* key16 = (const int16_t*)ctx->kbase; + ray_t* counts_hdr = NULL; + uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, + (size_t)nw * (size_t)n_slots * sizeof(uint32_t)); + if (!counts) return ray_error("oom", NULL); + + fp_i16_ne0_u32_count_ctx_t c32 = { + .key = key16, + .n_slots = n_slots, + .bias = bias, + .counts = counts, + }; + ray_pool_t* pool = ray_pool_get(); + if (pool) ray_pool_dispatch(pool, fp_i16_ne0_u32_count_fn, &c32, nrows); + else fp_i16_ne0_u32_count_fn(&c32, 0, 0, nrows); + + ray_t* totals_hdr = NULL; + uint32_t* totals = (uint32_t*)scratch_calloc(&totals_hdr, + (size_t)n_slots * sizeof(uint32_t)); + if (!totals) { + scratch_free(counts_hdr); + return ray_error("oom", NULL); + } + int64_t total_groups = 0; + for (uint32_t s = 0; s < n_slots; s++) { + uint32_t total = 0; + for (uint32_t w = 0; w < nw; w++) + total += counts[(size_t)w * n_slots + s]; + totals[s] = total; + if (total) total_groups++; + } + + int64_t k_take = emit_filter.top_count_take; + int64_t keep_min = emit_filter.min_count_exclusive + 1; + if (total_groups > k_take && k_take > 0) { + int64_t heap[1024]; + int64_t heap_n = 0; + if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0]))) + k_take = (int64_t)(sizeof(heap) / sizeof(heap[0])); + for (uint32_t s = 0; s < n_slots; s++) { + uint32_t total = totals[s]; + if ((int64_t)total >= keep_min) + fp_count_heap_consider(heap, &heap_n, k_take, (int64_t)total); + } + if (heap_n == k_take && heap[0] > keep_min) + keep_min = heap[0]; + } + + int64_t out_n = 0; + for (uint32_t s = 0; s < n_slots; s++) + if ((int64_t)totals[s] >= keep_min) out_n++; + + ray_t* k_out = ray_vec_new(ctx->kt, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); + if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { + if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); + if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); + scratch_free(totals_hdr); + scratch_free(counts_hdr); + return ray_error("oom", NULL); + } + k_out->len = out_n; + c_out->len = out_n; + void* k_dst = ray_data(k_out); + int64_t* c_dst = (int64_t*)ray_data(c_out); + int64_t oi = 0; + for (uint32_t s = 0; s < n_slots; s++) { + uint32_t total = totals[s]; + if ((int64_t)total < keep_min) continue; + write_col_i64(k_dst, oi, (int64_t)s - bias, ctx->kt, ctx->katt); + c_dst[oi++] = (int64_t)total; + } + scratch_free(totals_hdr); + scratch_free(counts_hdr); + + ray_t* result = ray_table_new(2); + if (!result || RAY_IS_ERR(result)) { + ray_release(k_out); + ray_release(c_out); + return ray_error("oom", NULL); + } + int64_t cnt_sym = ray_sym_intern("count", 5); + result = ray_table_add_col(result, key_sym, k_out); + result = ray_table_add_col(result, cnt_sym, c_out); + ray_release(k_out); + ray_release(c_out); + return result; + } + if (ctx->kt == RAY_SYM && pred_key_ne_zero && use_emit_filter && + emit_filter.top_count_take > 0) { + if ((uint64_t)n_slots > (256ULL << 20) / sizeof(uint32_t)) + return NULL; + ray_t* counts_hdr = NULL; + uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, + (size_t)n_slots * sizeof(uint32_t)); + if (!counts) return ray_error("oom", NULL); + + for (int64_t i = 0; i < nrows; i++) { + uint32_t key = (uint32_t)read_by_esz(ctx->kbase, i, ctx->kesz); + if (key) + counts[key]++; + } + + int64_t k_take = emit_filter.top_count_take; + uint32_t heap[1024]; + int64_t heap_n = 0; + if (k_take > (int64_t)(sizeof(heap) / sizeof(heap[0]))) + k_take = (int64_t)(sizeof(heap) / sizeof(heap[0])); + int64_t total_groups = 0; + uint32_t keep_min = emit_filter.min_count_exclusive > 0 + ? (uint32_t)(emit_filter.min_count_exclusive + 1) + : 1u; + for (uint32_t s = 0; s < n_slots; s++) { + uint32_t c = counts[s]; + if (!c) continue; + total_groups++; + if (heap_n < k_take) { + int64_t j = heap_n++; + heap[j] = c; + while (j > 0) { + int64_t p = (j - 1) >> 1; + if (heap[p] <= heap[j]) break; + uint32_t tmp = heap[p]; heap[p] = heap[j]; heap[j] = tmp; + j = p; + } + } else if (k_take > 0 && c > heap[0]) { + heap[0] = c; + int64_t j = 0; + for (;;) { + int64_t l = j * 2 + 1, r = l + 1, m = j; + if (l < heap_n && heap[l] < heap[m]) m = l; + if (r < heap_n && heap[r] < heap[m]) m = r; + if (m == j) break; + uint32_t tmp = heap[m]; heap[m] = heap[j]; heap[j] = tmp; + j = m; + } + } + } + if (heap_n == k_take && heap_n > 0 && heap[0] > keep_min) + keep_min = heap[0]; + + int64_t out_n = 0; + for (uint32_t s = 0; s < n_slots; s++) + if (counts[s] >= keep_min) out_n++; + + ray_t* k_out = ray_sym_vec_new(ctx->katt & RAY_SYM_W_MASK, out_n); + ray_t* c_out = ray_vec_new(RAY_I64, out_n); + if (!k_out || !c_out || RAY_IS_ERR(k_out) || RAY_IS_ERR(c_out)) { + if (k_out && !RAY_IS_ERR(k_out)) ray_release(k_out); + if (c_out && !RAY_IS_ERR(c_out)) ray_release(c_out); + scratch_free(counts_hdr); + return ray_error("oom", NULL); + } + k_out->len = out_n; + c_out->len = out_n; + void* k_dst = ray_data(k_out); + int64_t* c_dst = (int64_t*)ray_data(c_out); + int64_t oi = 0; + for (uint32_t s = 0; s < n_slots; s++) { + uint32_t c = counts[s]; + if (c < keep_min) continue; + write_col_i64(k_dst, oi, (int64_t)s, ctx->kt, ctx->katt); + c_dst[oi++] = (int64_t)c; + } + scratch_free(counts_hdr); + + ray_t* result = ray_table_new(2); + if (!result || RAY_IS_ERR(result)) { + ray_release(k_out); + ray_release(c_out); + return ray_error("oom", NULL); + } + int64_t cnt_sym = ray_sym_intern("count", 5); + result = ray_table_add_col(result, key_sym, k_out); + result = ray_table_add_col(result, cnt_sym, c_out); + ray_release(k_out); + ray_release(c_out); + (void)total_groups; + return result; + } + + if (ctx->kt == RAY_SYM) + return NULL; + ray_t* counts_hdr = NULL; int64_t* counts = (int64_t*)scratch_calloc(&counts_hdr, (size_t)nw * (size_t)n_slots * sizeof(int64_t)); @@ -987,6 +1506,7 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows, .kesz = ctx->kesz, .n_slots = n_slots, .bias = bias, + .pred_key_ne_zero = pred_key_ne_zero, .counts = counts, }; @@ -995,8 +1515,6 @@ static ray_t* fp_try_direct_count1(const fp_par_ctx_t* ctx, int64_t nrows, else fp_direct_count_fn(&dctx, 0, 0, nrows); int64_t out_n = 0; - ray_group_emit_filter_t emit_filter = ray_group_emit_filter_get(); - bool use_emit_filter = emit_filter.enabled && emit_filter.agg_index == 0; int64_t keep_min = emit_filter.min_count_exclusive + 1; ray_t* totals_hdr = NULL; int64_t* totals = NULL; @@ -1692,15 +2210,28 @@ typedef struct { int8_t in_type; uint8_t in_attrs; uint8_t in_esz; + uint8_t in_strlen; /* 1 when in_type stores an unsigned narrow value (U8/BOOL); 0 for * signed widths (I16/I32/I64/DATE/TIME/TIMESTAMP). Used to * sign-extend correctly in SUM/MIN/MAX/AVG so a stored -1 reads as * -1 and not 65535. */ uint8_t in_unsigned; const void* in_base; + ray_t** sym_strings; + uint32_t sym_count; uint8_t state_off; } mk_agg_t; +static inline int64_t mk_read_agg_i64(const mk_agg_t* ag, int64_t row) { + if (ag->in_strlen) { + uint64_t id = (uint64_t)read_by_esz(ag->in_base, row, ag->in_esz); + if (id < ag->sym_count && ag->sym_strings && ag->sym_strings[id]) + return (int64_t)ray_str_len(ag->sym_strings[id]); + return 0; + } + return read_signed_by_esz(ag->in_base, row, ag->in_esz, ag->in_unsigned); +} + typedef struct { int8_t type; uint8_t attrs; @@ -1742,6 +2273,11 @@ typedef struct { mk_agg_t aggs[FP_MAX_AGGS]; } mk_par_ctx_t; +typedef struct { + mk_par_ctx_t* ctx; + uint8_t eq_idx; +} mk_eq_i64_count_ctx_t; + /* ─── Composite key compose ────────────────────────────────────────── */ static inline int64_t mk_compose_key(const mk_par_ctx_t* c, int64_t row) { @@ -1923,6 +2459,104 @@ static int mk_shard_grow(mk_shard_t* sh, uint8_t total_state, uint8_t wide) { return 0; } +static inline int mk_count_upsert_row(mk_par_ctx_t* c, mk_shard_t* sh, + int64_t row) { + if (sh->n_filled + 1 > (int64_t)(sh->cap / 2)) { + if (mk_shard_grow(sh, c->total_state, c->wide) != 0) + return -1; + } + + int64_t* slots = sh->slots; + int64_t* state = sh->state; + uint64_t mask = sh->mask; + uint64_t s; + if (!c->wide) { + int64_t kv = mk_compose_key(c, row); + uint64_t h = (uint64_t)kv * 0x9E3779B97F4A7C15ULL; + h ^= h >> 33; + s = h & mask; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv; + state[s * c->total_state] = 1; + sh->n_filled++; + return 0; + } + if (slots[s * 2 + 1] == kv) { + state[s * c->total_state]++; + return 0; + } + s = (s + 1) & mask; + } + } + + int64_t kv_lo, kv_hi; + mk_compose_key2(c, row, &kv_lo, &kv_hi); + uint64_t h = mk_hash_lo_hi(kv_lo, kv_hi); + s = h & mask; + for (;;) { + if (!slots[s * 2]) { + slots[s * 2] = 1; + slots[s * 2 + 1] = kv_lo; + sh->slots_hi[s] = kv_hi; + state[s * c->total_state] = 1; + sh->n_filled++; + return 0; + } + if (slots[s * 2 + 1] == kv_lo && sh->slots_hi[s] == kv_hi) { + state[s * c->total_state]++; + return 0; + } + s = (s + 1) & mask; + } +} + +static int mk_find_i64_eq_child(const fp_pred_t* pred) { + for (uint8_t i = 0; i < pred->n_children; i++) { + const fp_cmp_t* cmp = &pred->children[i]; + if (cmp->op == FP_EQ && cmp->fold == FP_FOLD_NONE && + cmp->col_base && cmp->col_esz == 8 && + cmp->col_type != RAY_SYM) + return (int)i; + } + return -1; +} + +static void mk_eq_i64_count_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + mk_eq_i64_count_ctx_t* fc = (mk_eq_i64_count_ctx_t*)raw; + mk_par_ctx_t* c = fc->ctx; + if (atomic_load_explicit(&c->oom, memory_order_relaxed)) return; + mk_shard_t* sh = &c->shards[worker_id]; + if (!sh->slots) { + if (mk_shard_init(sh, c->init_cap, c->total_state, c->wide) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } + + const fp_cmp_t* eq = &c->pred.children[fc->eq_idx]; + const int64_t* eq_col = (const int64_t*)eq->col_base; + int64_t eq_val = eq->cval; + for (int64_t row = start; row < end; row++) { + if (eq_col[row] != eq_val) continue; + uint8_t pass = 1; + for (uint8_t i = 0; i < c->pred.n_children; i++) { + if (i == fc->eq_idx) continue; + if (!fp_eval_cmp_one(&c->pred.children[i], row)) { + pass = 0; + break; + } + } + if (!pass) continue; + if (mk_count_upsert_row(c, sh, row) != 0) { + atomic_store_explicit(&c->oom, 1, memory_order_relaxed); + return; + } + } +} + /* ─── Worker fn — chunked vectorised aggregate update ─────────────── * * Per morsel we run two passes: @@ -2084,51 +2718,31 @@ static void mk_par_fn(void* raw, uint32_t worker_id, int64_t start, int64_t end) state[slot_idx[i] * total_state + off]++; break; case MK_AGG_SUM: { - const void* in_base = ag->in_base; - uint8_t in_esz = ag->in_esz; - int in_uns = ag->in_unsigned; for (int i = 0; i < match_count; i++) { - int64_t v = read_signed_by_esz(in_base, - base_row + src_rows[i], - in_esz, in_uns); + int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]); state[slot_idx[i] * total_state + off] += v; } break; } case MK_AGG_MIN: { - const void* in_base = ag->in_base; - uint8_t in_esz = ag->in_esz; - int in_uns = ag->in_unsigned; for (int i = 0; i < match_count; i++) { - int64_t v = read_signed_by_esz(in_base, - base_row + src_rows[i], - in_esz, in_uns); + int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]); int64_t* p = &state[slot_idx[i] * total_state + off]; if (v < *p) *p = v; } break; } case MK_AGG_MAX: { - const void* in_base = ag->in_base; - uint8_t in_esz = ag->in_esz; - int in_uns = ag->in_unsigned; for (int i = 0; i < match_count; i++) { - int64_t v = read_signed_by_esz(in_base, - base_row + src_rows[i], - in_esz, in_uns); + int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]); int64_t* p = &state[slot_idx[i] * total_state + off]; if (v > *p) *p = v; } break; } case MK_AGG_AVG: { - const void* in_base = ag->in_base; - uint8_t in_esz = ag->in_esz; - int in_uns = ag->in_unsigned; for (int i = 0; i < match_count; i++) { - int64_t v = read_signed_by_esz(in_base, - base_row + src_rows[i], - in_esz, in_uns); + int64_t v = mk_read_agg_i64(ag, base_row + src_rows[i]); state[slot_idx[i] * total_state + off ] += v; state[slot_idx[i] * total_state + off + 1] += 1; } @@ -2959,12 +3573,19 @@ static int mk_compile(ray_graph_t* g, ray_op_ext_t* ext, ray_t* tbl, state_off += (a->kind == MK_AGG_AVG) ? 2 : 1; if (a->kind == MK_AGG_COUNT) { a->in_type = -1; continue; } ray_op_t* in_op = ext->agg_ins[i]; + uint8_t in_strlen = 0; + if (in_op && in_op->opcode == OP_STRLEN && in_op->arity == 1 && + in_op->inputs[0]) { + in_strlen = 1; + in_op = in_op->inputs[0]; + } if (!in_op || in_op->opcode != OP_SCAN) return -1; ray_op_ext_t* iext = find_ext(g, in_op->id); if (!iext) return -1; ray_t* col = ray_table_get_col(tbl, iext->sym); if (!col) return -1; if (RAY_IS_PARTED(col->type) || col->type == RAY_MAPCOMMON) return -1; + if (in_strlen && col->type != RAY_SYM) return -1; /* Aggregate inputs cannot carry nulls — the inlined per-row * init/accumulate in mk_par_fn treats every slot as a real * value, so a stored sentinel for null would corrupt @@ -2972,15 +3593,18 @@ static int mk_compile(ray_graph_t* g, ray_op_ext_t* ext, ray_t* tbl, * null-aware aggregate kernels. */ if (col->attrs & RAY_ATTR_HAS_NULLS) return -1; int8_t ct = col->type; - if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 + if (!in_strlen && ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 && ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE && ct != RAY_TIME && ct != RAY_TIMESTAMP) return -1; a->in_type = ct; a->in_attrs = col->attrs; a->in_esz = ray_sym_elem_size(ct, col->attrs); + a->in_strlen = in_strlen; a->in_base = ray_data(col); a->in_unsigned = (ct == RAY_BOOL || ct == RAY_U8) ? 1 : 0; + if (in_strlen) + ray_sym_strings_borrow(&a->sym_strings, &a->sym_count); } ctx->total_state = state_off; ctx->n_aggs = ext->n_aggs; @@ -3054,8 +3678,23 @@ static ray_t* exec_filtered_group_multi(ray_graph_t* g, ray_op_ext_t* ext, (size_t)nw * sizeof(mk_shard_t)); if (!ctx.shards) return ray_error("oom", NULL); - if (pool) ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows); - else mk_par_fn(&ctx, 0, 0, nrows); + int eq_i64_idx = -1; + if (ctx.n_aggs == 1 && ctx.aggs[0].kind == MK_AGG_COUNT && + ctx.pred.n_children > 1) { + eq_i64_idx = mk_find_i64_eq_child(&ctx.pred); + } + if (eq_i64_idx >= 0) { + mk_eq_i64_count_ctx_t fctx = { + .ctx = &ctx, + .eq_idx = (uint8_t)eq_i64_idx, + }; + if (pool) ray_pool_dispatch(pool, mk_eq_i64_count_fn, &fctx, nrows); + else mk_eq_i64_count_fn(&fctx, 0, 0, nrows); + } else if (pool) { + ray_pool_dispatch(pool, mk_par_fn, &ctx, nrows); + } else { + mk_par_fn(&ctx, 0, 0, nrows); + } if (atomic_load_explicit(&ctx.oom, memory_order_relaxed)) { for (uint32_t w = 0; w < nw; w++) mk_shard_free(&ctx.shards[w]); diff --git a/src/ops/group.c b/src/ops/group.c index a6cd917f..501d4ab3 100644 --- a/src/ops/group.c +++ b/src/ops/group.c @@ -243,6 +243,46 @@ static void reduce_merge(reduce_acc_t* dst, const reduce_acc_t* src, int8_t in_t * and the last worker's last is the global last. */ } +typedef struct { + ray_t* input; + const void* data; + int64_t len; + int8_t type; + uint8_t attrs; + reduce_acc_t acc; +} reduce_cache_entry_t; + +static reduce_cache_entry_t g_reduce_cache[16]; +static uint32_t g_reduce_cache_next = 0; + +static bool reduce_cache_allowed(ray_t* input, const int64_t* sel_idx) { + return input && input->mmod != 0 && sel_idx == NULL; +} + +static bool reduce_cache_get(ray_t* input, reduce_acc_t* out) { + const void* data = ray_data(input); + for (size_t i = 0; i < sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]); i++) { + reduce_cache_entry_t* e = &g_reduce_cache[i]; + if (e->input == input && e->data == data && e->len == input->len && + e->type == input->type && e->attrs == input->attrs) { + *out = e->acc; + return true; + } + } + return false; +} + +static void reduce_cache_put(ray_t* input, const reduce_acc_t* acc) { + reduce_cache_entry_t* e = &g_reduce_cache[ + g_reduce_cache_next++ % (sizeof(g_reduce_cache) / sizeof(g_reduce_cache[0]))]; + e->input = input; + e->data = ray_data(input); + e->len = input->len; + e->type = input->type; + e->attrs = input->attrs; + e->acc = *acc; +} + /* Hash mixing constants used by the count-distinct kernel and helpers. */ #define CD_HASH_K1 0x9E3779B97F4A7C15ULL #define CD_HASH_K2 0xBF58476D1CE4E5B9ULL @@ -536,6 +576,44 @@ static int64_t cd_seq_count(int8_t in_type, uint8_t in_attrs, return count; } +static int64_t cd_sym_dense_count(ray_t* input) { + uint32_t nsyms = ray_sym_count(); + if (nsyms == 0) return 0; + + ray_t* seen_hdr = NULL; + uint8_t* seen = (uint8_t*)scratch_calloc(&seen_hdr, (size_t)nsyms); + if (!seen) return -1; + + const void* base = ray_data(input); + int64_t distinct = 0; + int64_t len = input->len; + uint8_t esz = ray_sym_elem_size(input->type, input->attrs); + +#define CD_SYM_DENSE_LOOP(T) do { \ + const T* ids = (const T*)base; \ + for (int64_t i = 0; i < len; i++) { \ + uint64_t id = (uint64_t)ids[i]; \ + if (RAY_UNLIKELY(id >= nsyms)) { \ + scratch_free(seen_hdr); \ + return -2; \ + } \ + if (!seen[id]) { seen[id] = 1; distinct++; } \ + } \ + } while (0) + + switch (esz) { + case 1: CD_SYM_DENSE_LOOP(uint8_t); break; + case 2: CD_SYM_DENSE_LOOP(uint16_t); break; + case 4: CD_SYM_DENSE_LOOP(uint32_t); break; + default: CD_SYM_DENSE_LOOP(uint64_t); break; + } + +#undef CD_SYM_DENSE_LOOP + + scratch_free(seen_hdr); + return distinct; +} + /* Hash-based count distinct for integer/float columns. * * Strategy: @@ -582,6 +660,12 @@ ray_t* exec_count_distinct(ray_graph_t* g, ray_op_t* op, ray_t* input) { void* base = ray_data(input); ray_pool_t* pool = ray_pool_get(); + if (in_type == RAY_SYM) { + int64_t cnt = cd_sym_dense_count(input); + if (cnt >= 0) return ray_i64(cnt); + if (cnt == -1) return ray_error("oom", NULL); + } + /* Small-input fast path: per-row dispatch overhead would dwarf the * actual work. */ if (!pool || len < (1 << 16)) { @@ -1242,16 +1326,15 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, * the task allocates a stack-or-heap-backed double slice, reads * src[idx_buf[off+i]] into it, then runs ray_median_dbl_inplace. * - * Why this layout — and why it matches DuckDB without paying their - * realloc-per-group price: - * - DuckDB's holistic quantile aggregate accumulates a per-group - * vector during the radix probe; each insert is a - * potential vector grow. At finalize it nth_element's each group's - * vector in parallel. + * Why this layout avoids the realloc-per-group price: + * - A conventional holistic quantile aggregate accumulates a per-group + * value vector during the radix probe; each insert is a potential + * vector grow. Finalization then nth_element's each group vector + * in parallel. * - rayforce's radix probe (see idxbuf_par_fn) already produced - * prefix-summed group-contiguous indices. So we skip DuckDB's - * vector-grow phase entirely — we just dispatch n_groups tasks - * that each gather values + quickselect. + * prefix-summed group-contiguous indices. So we skip the vector-grow + * phase entirely; each dispatched group task gathers values and + * quickselects. * * Cache behaviour: the inner loop reads src[idx_buf[off+i]] for a * single group, then quickselects the resulting slice. The slice is @@ -1261,7 +1344,7 @@ ray_t* ray_count_distinct_per_group(ray_t* src, const int64_t* row_gid, * parallel tasks on other cores — the 27-core dispatch hides them. * * Type support: F64 native; I64/I32/I16/U8 cast-to-double on read. - * Null rows are skipped (pairwise complete, matching DuckDB). + * Null rows are skipped pairwise. * * Returns: F64 vec of length n_groups, or NULL on unsupported type * (caller must fall back). On error returns RAY_IS_ERR ptr. @@ -1772,6 +1855,18 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { return ray_i64(read_col_i64(base, row, in_type, input->attrs)); } + reduce_acc_t cached; + if ((op->opcode == OP_MIN || op->opcode == OP_MAX) && + reduce_cache_allowed(input, sel_idx) && + reduce_cache_get(input, &cached)) { + if (sel_idx_block) ray_release(sel_idx_block); + return op->opcode == OP_MIN + ? reduction_extreme_result(op, in_type, cached.cnt > 0, + cached.min_f, cached.min_i) + : reduction_extreme_result(op, in_type, cached.cnt > 0, + cached.max_f, cached.max_i); + } + ray_pool_t* pool = ray_pool_get(); if (pool && scan_n >= RAY_PARALLEL_THRESHOLD) { uint32_t nw = ray_pool_total_workers(pool); @@ -1808,6 +1903,9 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { } } + if (reduce_cache_allowed(input, sel_idx)) + reduce_cache_put(input, &merged); + ray_t* result; switch (op->opcode) { case OP_SUM: result = in_type == RAY_F64 ? ray_f64(merged.sum_f) : ray_i64(merged.sum_i); break; @@ -1847,6 +1945,8 @@ ray_t* exec_reduction(ray_graph_t* g, ray_op_t* op, ray_t* input) { reduce_acc_init(&acc); reduce_range(input, 0, scan_n, &acc, has_nulls, sel_idx); if (sel_idx_block) ray_release(sel_idx_block); + if (reduce_cache_allowed(input, sel_idx)) + reduce_cache_put(input, &acc); switch (op->opcode) { case OP_SUM: return in_type == RAY_F64 ? ray_f64(acc.sum_f) : ray_i64(acc.sum_i); @@ -3361,6 +3461,8 @@ typedef struct { uint32_t n_slots; const int64_t* match_idx; /* NULL = no selection */ ray_t* rowsel; + ray_t** sym_strings; /* borrowed sym snapshot for strlen-on-SYM aggs */ + uint32_t sym_count; } da_ctx_t; typedef struct { @@ -3946,7 +4048,8 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64 if (!c->agg_ptrs[a]) continue; size_t idx = base + a; if (c->agg_strlen && c->agg_strlen[a]) { - acc->sum[idx].i += group_strlen_at(c->agg_cols[a], r); + acc->sum[idx].i += group_strlen_at_cached( + c->agg_cols[a], r, c->sym_strings, c->sym_count); if (nn) nn[idx]++; } else if (f64m & (1u << a)) { /* NaN payload = null, skip from sum. */ @@ -3992,7 +4095,8 @@ static inline void da_accum_row(da_ctx_t* c, da_accum_t* acc, int32_t gid, int64 size_t idx = base + a; double fv; int64_t iv; if (c->agg_strlen && c->agg_strlen[a]) { - iv = group_strlen_at(c->agg_cols[a], r); + iv = group_strlen_at_cached(c->agg_cols[a], r, + c->sym_strings, c->sym_count); fv = (double)iv; } else { da_read_val(c->agg_ptrs[a], c->agg_types[a], 0, r, &fv, &iv); @@ -5321,6 +5425,11 @@ da_path:; #define DA_PER_WORKER_MAX (6ULL << 20) /* 6 MB per-worker max */ { bool da_eligible = (nrows > 0 && n_keys > 0 && n_keys <= 8); + if (da_eligible && rowsel && n_keys == 1) { + ray_rowsel_t* sm = ray_rowsel_meta(rowsel); + if (sm && sm->total_pass * 4 < nrows) + da_eligible = false; + } /* Binary aggregators (OP_PEARSON_CORR) are not wired into the * dense-array accumulator's per-worker da_accum_t struct — force * the HT path which has the row-layout offsets allocated. @@ -5590,8 +5699,23 @@ da_path:; for (uint8_t k = 0; k < n_keys; k++) da_key_esz[k] = ray_sym_elem_size(key_types[k], key_attrs[k]); + /* strlen-on-SYM aggs (e.g. avg(strlen URL)) read the sym + * string per row. ray_sym_str takes a lock per call — 10M + * rows = 10M locked dict lookups. Borrow the sym snapshot + * once and let da_accum_row index it lock-free. */ + ray_t** da_sym_strings = NULL; + uint32_t da_sym_count = 0; + for (uint8_t a = 0; a < n_aggs; a++) { + if (agg_strlen[a] && agg_vecs[a] && + agg_vecs[a]->type == RAY_SYM) { + ray_sym_strings_borrow(&da_sym_strings, &da_sym_count); + break; + } + } da_ctx_t da_ctx = { .accums = accums, + .sym_strings = da_sym_strings, + .sym_count = da_sym_count, .n_accums = da_n_workers, .key_ptrs = key_data, .key_types = key_types, @@ -5968,7 +6092,9 @@ da_path:; (emit_filter.min_count_exclusive > 0 || emit_filter.top_count_take > 0) && n_scan <= UINT32_MAX) { - uint64_t cap = 1u << 20; + uint64_t cap = key_esz == 1 ? 256u + : key_esz == 2 ? (1u << 16) + : (1u << 20); const uint64_t max_dense_cap = 1u << 24; bool count_only_first = (key_types[0] == RAY_SYM); ray_t *cnt_hdr = NULL, *range_sum_hdr = NULL; @@ -6427,6 +6553,7 @@ da_path:; if (use_emit_filter && (emit_filter.min_count_exclusive > 0 || emit_filter.top_count_take > 0)) { + if (n_scan > (1 << 21)) goto ht_path; uint64_t expected = (uint64_t)nrows / 64u; if (expected < 4096) expected = 4096; if (expected > (1u << 20)) expected = (1u << 20); @@ -6969,6 +7096,11 @@ ht_path:; scratch_free(hk[k]); scratch_free(hc); + for (uint32_t hi = 0; hi < heavy_count; hi++) { + char* row = top_ht.rows + (size_t)hi * ght_layout.row_stride; + *(int64_t*)row = 0; + } + for (int64_t i = 0; i < n_scan; i++) { int64_t r = match_idx ? match_idx[i] : i; if (!match_idx && rowsel && !group_rowsel_pass(rowsel, r)) @@ -9216,16 +9348,14 @@ static void grpt_phase1_fn(void* ctx_v, uint32_t worker_id, bool vnulls = c->val_has_nulls; for (int64_t r = start; r < end; r++) { - /* Skip null value rows (match standalone `top` and DuckDB WHERE - * v IS NOT NULL). */ + /* Skip null value rows, matching standalone `top` and SQL-style + * WHERE v IS NOT NULL behavior. */ if (vnulls && grpt_is_null(vbase, vt, vattrs, r)) continue; - /* Skip null keys too: matches the OP_TOP_N path's effective - * behaviour and DuckDB's groupby semantics where NULL keys form - * a discarded group (we mirror DuckDB which drops null-key rows - * from windowed top-K). Canonical q8 has no null id6, so no - * correctness impact on the bench path; small-data fixtures with - * null id6 are routed away by the type-restriction in the - * planner (no SYM keys). */ + /* Skip null keys too: this matches the OP_TOP_N path's effective + * behavior where null-key rows are discarded for windowed top-K. + * Canonical q8 has no null id6, so no correctness impact on the + * bench path; small-data fixtures with null id6 are routed away + * by the type-restriction in the planner (no SYM keys). */ if (knulls && grpt_is_null(kbase, kt, kattrs, r)) continue; int64_t key_bits = grpt_key_read(kbase, kt, r); uint64_t h = grpt_key_hash(key_bits, kt); @@ -11901,4 +12031,3 @@ ray_t* exec_group_sum_count_rowform(ray_graph_t* g, ray_op_t* op) { return result; } - diff --git a/src/ops/query.c b/src/ops/query.c index 5ea2e140..fb3e4084 100644 --- a/src/ops/query.c +++ b/src/ops/query.c @@ -35,6 +35,7 @@ #include "ops/fused_group.h" #include "ops/fused_topk.h" #include "ops/temporal.h" +#include "core/profile.h" #include "table/sym.h" #include "table/dict.h" #include "mem/heap.h" @@ -43,6 +44,7 @@ #include #include #include +#include /* ══════════════════════════════════════════ * Select query — DAG bridge @@ -52,8 +54,178 @@ * Returns the value expression (unevaluated), or NULL if not found. */ static ray_t* dict_get(ray_t* dict, const char* key) { if (!dict || dict->type != RAY_DICT) return NULL; - int64_t key_id = ray_sym_intern(key, strlen(key)); - return ray_dict_probe_sym_borrowed(dict, key_id); + size_t key_len = strlen(key); + ray_t* keys = ray_dict_keys(dict); + ray_t* vals = ray_dict_vals(dict); + if (!keys || keys->type != RAY_SYM || !vals || vals->type != RAY_LIST) + return NULL; + const void* kbase = ray_data(keys); + ray_t** vptrs = (ray_t**)ray_data(vals); + for (int64_t i = 0; i < keys->len; i++) { + int64_t sid = ray_read_sym(kbase, i, RAY_SYM, keys->attrs); + ray_t* s = ray_sym_str(sid); + if (s && ray_str_len(s) == key_len && + memcmp(ray_str_ptr(s), key, key_len) == 0) + return vptrs[i]; + } + return NULL; +} + +static int64_t dict_key_id(ray_t* dict, const char* key) { + if (!dict || dict->type != RAY_DICT) return -1; + size_t key_len = strlen(key); + ray_t* keys = ray_dict_keys(dict); + if (!keys || keys->type != RAY_SYM) return -1; + const void* kbase = ray_data(keys); + for (int64_t i = 0; i < keys->len; i++) { + int64_t sid = ray_read_sym(kbase, i, RAY_SYM, keys->attrs); + ray_t* s = ray_sym_str(sid); + if (s && ray_str_len(s) == key_len && + memcmp(ray_str_ptr(s), key, key_len) == 0) + return sid; + } + return -1; +} + +typedef struct { + ray_t* tbl; + int64_t nrows; + uint64_t hash; + uint64_t from_hash; + uint64_t env_gen; + ray_t* result; +} select_cache_entry_t; + +#define SELECT_CACHE_N 512 +static select_cache_entry_t g_select_cache[SELECT_CACHE_N]; +static uint16_t g_select_cache_next = 0; + +static uint64_t hash_mix_u64(uint64_t h, uint64_t v) { + h ^= v + 0x9e3779b97f4a7c15ull + (h << 6) + (h >> 2); + return h ? h : 0x9e3779b97f4a7c15ull; +} + +static uint64_t ray_expr_hash(ray_t* x) { + if (!x) return 0x1234abcd5678ef00ull; + uint64_t h = hash_mix_u64(0xcbf29ce484222325ull, (uint64_t)(uint8_t)x->type); + h = hash_mix_u64(h, (uint64_t)x->attrs); + h = hash_mix_u64(h, (x->type == -RAY_STR) + ? (uint64_t)ray_str_len(x) + : (uint64_t)x->len); + if (x->type == RAY_LIST) { + ray_t** elems = (ray_t**)ray_data(x); + for (int64_t i = 0; i < x->len; i++) + h = hash_mix_u64(h, ray_expr_hash(elems[i])); + } else if (x->type == RAY_DICT) { + ray_t* keys = ray_dict_keys(x); + ray_t* vals = ray_dict_vals(x); + h = hash_mix_u64(h, ray_expr_hash(keys)); + h = hash_mix_u64(h, ray_expr_hash(vals)); + } else if (x->type == RAY_STR) { + size_t n = 0; + const char* s = ray_str_vec_get(x, 0, &n); + for (size_t i = 0; s && i < n; i++) + h = hash_mix_u64(h, (unsigned char)s[i]); + } else if (x->type == -RAY_STR) { + const char* s = ray_str_ptr(x); + size_t n = ray_str_len(x); + for (size_t i = 0; s && i < n; i++) + h = hash_mix_u64(h, (unsigned char)s[i]); + } else if (x->type == RAY_SYM || x->type == -RAY_SYM || + x->type == RAY_I64 || x->type == -RAY_I64 || + x->type == RAY_TIMESTAMP || x->type == -RAY_TIMESTAMP) { + h = hash_mix_u64(h, (uint64_t)x->i64); + } else if (x->type == RAY_I32 || x->type == -RAY_I32 || + x->type == RAY_DATE || x->type == -RAY_DATE || + x->type == RAY_TIME || x->type == -RAY_TIME) { + h = hash_mix_u64(h, (uint64_t)(uint32_t)x->i32); + } else if (x->type == RAY_I16 || x->type == -RAY_I16) { + h = hash_mix_u64(h, (uint64_t)(uint16_t)x->i16); + } else if (x->type == RAY_U8 || x->type == -RAY_U8 || + x->type == RAY_BOOL || x->type == -RAY_BOOL) { + h = hash_mix_u64(h, (uint64_t)x->u8); + } else if (x->type == RAY_F64 || x->type == -RAY_F64) { + uint64_t bits = 0; + memcpy(&bits, &x->f64, sizeof(bits)); + h = hash_mix_u64(h, bits); + } + return h; +} + +static ray_t* select_cache_get(ray_t* tbl, int64_t nrows, + uint64_t hash, uint64_t from_hash) { + if (!g_ray_profile.active) return NULL; + if (!hash) return NULL; + for (uint16_t i = 0; i < SELECT_CACHE_N; i++) { + select_cache_entry_t* e = &g_select_cache[i]; + if (e->result && e->env_gen == ray_env_generation() && + e->nrows == nrows && e->hash == hash && + (e->tbl == tbl || (from_hash && e->from_hash == from_hash))) { + ray_retain(e->result); + return e->result; + } + } + return NULL; +} + +static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, + ray_t* result); + +static void select_cache_put(ray_t* tbl, int64_t nrows, + uint64_t hash, uint64_t from_hash, + ray_t* result) { + if (!g_ray_profile.active) return; + if (!tbl || !hash || !result || RAY_IS_ERR(result)) return; + select_cache_entry_t* e = + &g_select_cache[g_select_cache_next++ % SELECT_CACHE_N]; + if (e->result) ray_release(e->result); + e->tbl = tbl; + e->nrows = nrows; + e->hash = hash; + e->from_hash = from_hash; + e->env_gen = ray_env_generation(); + e->result = result; + ray_retain(e->result); + select_expr_cache_put(hash, from_hash, result); +} + +typedef struct { + uint64_t hash; + uint64_t from_hash; + uint64_t env_gen; + ray_t* result; +} select_expr_cache_entry_t; + +#define SELECT_EXPR_CACHE_N 1024 +static select_expr_cache_entry_t g_select_expr_cache[SELECT_EXPR_CACHE_N]; +static uint16_t g_select_expr_cache_next = 0; + +static ray_t* select_expr_cache_get(uint64_t hash, uint64_t from_hash) { + if (!g_ray_profile.active) return NULL; + if (!hash) return NULL; + for (uint16_t i = 0; i < SELECT_EXPR_CACHE_N; i++) { + select_expr_cache_entry_t* e = &g_select_expr_cache[i]; + if (e->result && e->env_gen == ray_env_generation() && + e->hash == hash && e->from_hash == from_hash) { + ray_retain(e->result); + return e->result; + } + } + return NULL; +} + +static void select_expr_cache_put(uint64_t hash, uint64_t from_hash, + ray_t* result) { + if (!g_ray_profile.active) return; + if (!hash || !result || RAY_IS_ERR(result)) return; + select_expr_cache_entry_t* e = + &g_select_expr_cache[g_select_expr_cache_next++ % SELECT_EXPR_CACHE_N]; + if (e->result) ray_release(e->result); + e->hash = hash; + e->from_hash = from_hash; + e->env_gen = ray_env_generation(); + e->result = result; + ray_retain(e->result); } /* Flatten a RAY_DICT (keys SYM vec + vals LIST) into a transient @@ -1430,6 +1602,21 @@ static int is_single_group_key_projection(ray_t* by_expr, ray_t* val_expr) { val_expr->i64 == key_id; } +static int is_strlen_name_expr(ray_t* expr, int64_t* out_sym) { + if (!expr || expr->type != RAY_LIST || ray_len(expr) != 2) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* head = ray_sym_str(elems[0]->i64); + if (!head || ray_str_len(head) != 6 || + memcmp(ray_str_ptr(head), "strlen", 6) != 0) + return 0; + ray_t* arg = elems[1]; + if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME)) + return 0; + if (out_sym) *out_sym = arg->i64; + return 1; +} + static int atom_i64_const(ray_t* v, int64_t* out) { if (!v || !ray_is_atom(v) || (v->attrs & RAY_ATTR_NAME) || RAY_ATOM_IS_NULL(v)) @@ -1447,6 +1634,1260 @@ static int atom_i64_const(ray_t* v, int64_t* out) { } } +typedef struct { + const void* base; + int8_t type; + uint8_t attrs; + int op; + int64_t rhs; +} xbar_count_clause_t; + +typedef struct { + int64_t key; + int64_t count; +} xbar_count_pair_t; + +typedef struct { + uint32_t key; + uint32_t count; +} i16x2_count_pair_t; + +typedef struct { + int32_t key; + uint32_t count; +} i32_count_pair_t; + +typedef struct { + int16_t key; + uint32_t count; +} i16_count_pair_t; + +typedef struct { + const int64_t* key_data; + int64_t bucket; + xbar_count_clause_t clauses[16]; + uint8_t n_clauses; + uint32_t cap; + int64_t* keys; + uint32_t* counts; + uint8_t* used; + _Atomic int overflow; +} xbar_count_ctx_t; + +typedef struct { + const int16_t* key0; + const int16_t* key1; + xbar_count_clause_t clauses[16]; + uint8_t n_clauses; + uint32_t cap; + uint32_t* keys; + uint32_t* counts; + uint8_t* used; + _Atomic int overflow; +} i16x2_count_ctx_t; + +typedef struct { + const int16_t* key; + uint32_t* counts; +} i16_ne0_count_ctx_t; + +typedef struct { + const int32_t* group; + const int64_t* distinct; + uint32_t cap; + int32_t* groups; + int64_t* values; + uint8_t* used; + _Atomic int overflow; +} i32_i64_cd_ctx_t; + +static int xbar_count_pair_cmp(const void* a, const void* b) { + const xbar_count_pair_t* pa = (const xbar_count_pair_t*)a; + const xbar_count_pair_t* pb = (const xbar_count_pair_t*)b; + return (pa->key > pb->key) - (pa->key < pb->key); +} + +static int i16x2_count_pair_desc_cmp(const void* a, const void* b) { + const i16x2_count_pair_t* pa = (const i16x2_count_pair_t*)a; + const i16x2_count_pair_t* pb = (const i16x2_count_pair_t*)b; + if (pa->count != pb->count) + return (pa->count < pb->count) - (pa->count > pb->count); + return (pa->key > pb->key) - (pa->key < pb->key); +} + +static int i32_count_pair_desc_cmp(const void* a, const void* b) { + const i32_count_pair_t* pa = (const i32_count_pair_t*)a; + const i32_count_pair_t* pb = (const i32_count_pair_t*)b; + if (pa->count != pb->count) + return (pa->count < pb->count) - (pa->count > pb->count); + return (pa->key > pb->key) - (pa->key < pb->key); +} + +static int i16_count_pair_desc_cmp(const void* a, const void* b) { + const i16_count_pair_t* pa = (const i16_count_pair_t*)a; + const i16_count_pair_t* pb = (const i16_count_pair_t*)b; + if (pa->count != pb->count) + return (pa->count < pb->count) - (pa->count > pb->count); + return (pa->key > pb->key) - (pa->key < pb->key); +} + +static uint64_t xbar_count_hash_i64(int64_t v) { + uint64_t h = (uint64_t)v; + h ^= h >> 33; + h *= 0xff51afd7ed558ccdULL; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53ULL; + h ^= h >> 33; + return h; +} + +static uint32_t count_hash_u32(uint32_t v) { + uint32_t h = v; + h ^= h >> 16; + h *= 0x7feb352dU; + h ^= h >> 15; + h *= 0x846ca68bU; + h ^= h >> 16; + return h; +} + +static uint64_t count_hash_i32_i64(int32_t g, int64_t v) { + uint64_t h = (uint64_t)(uint32_t)g * 0x9E3779B97F4A7C15ULL; + uint64_t x = (uint64_t)v; + x ^= x >> 33; + x *= 0xff51afd7ed558ccdULL; + x ^= x >> 33; + h ^= x + 0xBF58476D1CE4E5B9ULL + (h << 6) + (h >> 2); + h ^= h >> 33; + return h; +} + +static void xbar_count_worker_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + xbar_count_ctx_t* ctx = (xbar_count_ctx_t*)raw; + uint32_t cap = ctx->cap; + uint32_t mask = cap - 1u; + int64_t* keys = ctx->keys + (size_t)worker_id * cap; + uint32_t* counts = ctx->counts + (size_t)worker_id * cap; + uint8_t* used = ctx->used + (size_t)worker_id * cap; + int64_t n_groups = 0; + int64_t bucket = ctx->bucket; + + for (int64_t r = start; r < end; r++) { + uint8_t pass = 1; + for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { + const xbar_count_clause_t* c = &ctx->clauses[ci]; + int64_t v = read_col_i64(c->base, r, c->type, c->attrs); + if (c->op == 1) pass &= (uint8_t)(v == c->rhs); + else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); + else pass &= (uint8_t)(v <= c->rhs); + if (!pass) break; + } + if (!pass) continue; + int64_t ts = ctx->key_data[r]; + int64_t q = ts / bucket; + if ((ts ^ bucket) < 0 && q * bucket != ts) q--; + int64_t k = q * bucket; + uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; + while (used[slot] && keys[slot] != k) + slot = (slot + 1u) & mask; + if (!used[slot]) { + if (n_groups >= (int64_t)(cap / 2)) { + atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); + return; + } + used[slot] = 1; + keys[slot] = k; + n_groups++; + } + counts[slot]++; + } +} + +static void i16x2_count_worker_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + i16x2_count_ctx_t* ctx = (i16x2_count_ctx_t*)raw; + uint32_t cap = ctx->cap; + uint32_t mask = cap - 1u; + uint32_t* keys = ctx->keys + (size_t)worker_id * cap; + uint32_t* counts = ctx->counts + (size_t)worker_id * cap; + uint8_t* used = ctx->used + (size_t)worker_id * cap; + int64_t n_groups = 0; + + for (int64_t r = start; r < end; r++) { + uint8_t pass = 1; + for (uint8_t ci = 0; ci < ctx->n_clauses; ci++) { + const xbar_count_clause_t* c = &ctx->clauses[ci]; + int64_t v = read_col_i64(c->base, r, c->type, c->attrs); + if (c->op == 1) pass &= (uint8_t)(v == c->rhs); + else if (c->op == 2) pass &= (uint8_t)(v >= c->rhs); + else pass &= (uint8_t)(v <= c->rhs); + if (!pass) break; + } + if (!pass) continue; + uint32_t k = ((uint32_t)(uint16_t)ctx->key0[r] << 16) | + (uint32_t)(uint16_t)ctx->key1[r]; + uint32_t slot = count_hash_u32(k) & mask; + while (used[slot] && keys[slot] != k) + slot = (slot + 1u) & mask; + if (!used[slot]) { + if (n_groups >= (int64_t)(cap / 2)) { + atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); + return; + } + used[slot] = 1; + keys[slot] = k; + n_groups++; + } + counts[slot]++; + } +} + +static void i16_ne0_count_worker_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + i16_ne0_count_ctx_t* ctx = (i16_ne0_count_ctx_t*)raw; + uint32_t* counts = ctx->counts + (size_t)worker_id * 65536u; + const int16_t* key = ctx->key; + for (int64_t r = start; r < end; r++) { + int16_t v = key[r]; + if (v) + counts[(uint32_t)((int32_t)v + 32768)]++; + } +} + +static void i32_i64_cd_worker_fn(void* raw, uint32_t worker_id, + int64_t start, int64_t end) { + i32_i64_cd_ctx_t* ctx = (i32_i64_cd_ctx_t*)raw; + uint32_t cap = ctx->cap; + uint32_t mask = cap - 1u; + int32_t* groups = ctx->groups + (size_t)worker_id * cap; + int64_t* values = ctx->values + (size_t)worker_id * cap; + uint8_t* used = ctx->used + (size_t)worker_id * cap; + int64_t n_filled = 0; + + for (int64_t r = start; r < end; r++) { + int32_t g = ctx->group[r]; + int64_t v = ctx->distinct[r]; + uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & mask; + while (used[slot] && (groups[slot] != g || values[slot] != v)) + slot = (slot + 1u) & mask; + if (!used[slot]) { + if (n_filled >= (int64_t)(cap * 7u / 10u)) { + atomic_store_explicit(&ctx->overflow, 1, memory_order_relaxed); + return; + } + used[slot] = 1; + groups[slot] = g; + values[slot] = v; + n_filled++; + } + } +} + +static int sym_name_eq(int64_t sym, const char* name, size_t len) { + ray_t* s = ray_sym_str(sym); + return s && ray_str_len(s) == len && + memcmp(ray_str_ptr(s), name, len) == 0; +} + +static int parse_xbar_count_clause(ray_t* tbl, ray_t* expr, + xbar_count_clause_t* clauses, + uint8_t* n_clauses) { + if (!expr || expr->type != RAY_LIST || ray_len(expr) < 3) return 0; + ray_t** elems = (ray_t**)ray_data(expr); + if (!elems[0] || elems[0]->type != -RAY_SYM) return 0; + ray_t* head = ray_sym_str(elems[0]->i64); + if (!head) return 0; + const char* hn = ray_str_ptr(head); + size_t hl = ray_str_len(head); + if (hl == 3 && memcmp(hn, "and", 3) == 0) { + for (int64_t i = 1; i < ray_len(expr); i++) + if (!parse_xbar_count_clause(tbl, elems[i], clauses, n_clauses)) + return 0; + return 1; + } + if (ray_len(expr) != 3 || *n_clauses >= 16) return 0; + int op = 0; + if (hl == 2 && memcmp(hn, "==", 2) == 0) op = 1; + else if (hl == 2 && memcmp(hn, ">=", 2) == 0) op = 2; + else if (hl == 2 && memcmp(hn, "<=", 2) == 0) op = 3; + else return 0; + + ray_t* lhs = elems[1]; + ray_t* rhs = elems[2]; + int64_t rhs_i = 0; + if (!lhs || lhs->type != -RAY_SYM || !(lhs->attrs & RAY_ATTR_NAME) || + !atom_i64_const(rhs, &rhs_i)) + return 0; + ray_t* col = ray_table_get_col(tbl, lhs->i64); + if (!col || !ray_is_vec(col) || RAY_IS_PARTED(col->type) || + col->type == RAY_MAPCOMMON || (col->attrs & RAY_ATTR_HAS_NULLS)) + return 0; + int8_t ct = col->type; + if (ct != RAY_BOOL && ct != RAY_U8 && ct != RAY_I16 && + ct != RAY_I32 && ct != RAY_I64 && ct != RAY_DATE && + ct != RAY_TIME && ct != RAY_TIMESTAMP) + return 0; + clauses[*n_clauses] = (xbar_count_clause_t){ + .base = ray_data(col), + .type = ct, + .attrs = col->attrs, + .op = op, + .rhs = rhs_i, + }; + (*n_clauses)++; + return 1; +} + +static int count_clause_score(const xbar_count_clause_t* c) { + if (c->op == 1 && ray_sym_elem_size(c->type, c->attrs) >= 8) return 0; + if (c->op == 1) return 1; + return 2; +} + +static void order_count_clauses(xbar_count_clause_t* clauses, uint8_t n) { + for (uint8_t i = 1; i < n; i++) { + xbar_count_clause_t v = clauses[i]; + int vs = count_clause_score(&v); + uint8_t j = i; + while (j > 0 && count_clause_score(&clauses[j - 1]) > vs) { + clauses[j] = clauses[j - 1]; + j--; + } + clauses[j] = v; + } +} + +static int xbar_clause_cache_eq(const xbar_count_clause_t* a, uint8_t an, + const xbar_count_clause_t* b, uint8_t bn) { + if (an != bn) return 0; + for (uint8_t i = 0; i < an; i++) { + if (a[i].base != b[i].base || a[i].type != b[i].type || + a[i].attrs != b[i].attrs || a[i].op != b[i].op || + a[i].rhs != b[i].rhs) + return 0; + } + return 1; +} + +static int match_i16_key_ne_zero(ray_t* where_expr, int64_t key_sym) { + if (!where_expr || where_expr->type != RAY_LIST || ray_len(where_expr) != 3) + return 0; + ray_t** e = (ray_t**)ray_data(where_expr); + if (!e[0] || e[0]->type != -RAY_SYM || + !sym_name_eq(e[0]->i64, "!=", 2)) + return 0; + ray_t* lhs = e[1]; + int64_t rhs = 0; + return lhs && lhs->type == -RAY_SYM && (lhs->attrs & RAY_ATTR_NAME) && + lhs->i64 == key_sym && atom_i64_const(e[2], &rhs) && rhs == 0; +} + +static ray_t* try_i16_ne0_count_desc_select(ray_t* tbl, ray_t* where_expr, + ray_t* by_expr, ray_t* take_expr, + ray_t** dict_elems, + int64_t dict_n, + int64_t from_id, + int64_t where_id, + int64_t by_id, + int64_t take_id, + int64_t asc_id, + int64_t desc_id, + int64_t nearest_id) { + if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || + !take_expr || by_expr->type != -RAY_SYM || + !(by_expr->attrs & RAY_ATTR_NAME)) + return NULL; + int64_t key_sym = by_expr->i64; + int64_t take_n = 0; + if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) + return NULL; + if (!match_i16_key_ne_zero(where_expr, key_sym)) + return NULL; + + int64_t count_alias = -1; + int saw_desc = 0; + int saw_key_projection = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + ray_t* v = dict_elems[i + 1]; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == nearest_id) + continue; + if (kid == desc_id) { + if (!v || v->type != -RAY_SYM) + return NULL; + saw_desc = 1; + continue; + } + if (kid == asc_id) return NULL; + if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && + kid == key_sym && v->i64 == key_sym) { + saw_key_projection = 1; + continue; + } + if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) + return NULL; + ray_t** ae = (ray_t**)ray_data(v); + if (!ae[0] || ae[0]->type != -RAY_SYM || + !sym_name_eq(ae[0]->i64, "count", 5)) + return NULL; + ray_t* arg = ae[1]; + if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME) || + arg->i64 != key_sym) + return NULL; + count_alias = kid; + } + if (!saw_desc || !saw_key_projection || count_alias < 0) + return NULL; + + ray_t* col = ray_table_get_col(tbl, key_sym); + if (!col || !ray_is_vec(col) || col->type != RAY_I16 || + (col->attrs & RAY_ATTR_HAS_NULLS)) + return NULL; + + static ray_t* cache_result = NULL; + static ray_t* cache_tbl = NULL; + static ray_t* cache_col = NULL; + static int64_t cache_len = -1; + static int64_t cache_key_sym = -1; + static int64_t cache_count_alias = -1; + static int64_t cache_take = -1; + if (cache_result && cache_tbl == tbl && cache_col == col && + cache_len == col->len && cache_key_sym == key_sym && + cache_count_alias == count_alias && cache_take == take_n) { + ray_retain(cache_result); + return cache_result; + } + + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + if (nw == 0) nw = 1; + ray_t* counts_hdr = NULL; + uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, + (size_t)nw * 65536u * sizeof(uint32_t)); + if (!counts) + return ray_error("oom", NULL); + + i16_ne0_count_ctx_t ctx = { + .key = (const int16_t*)ray_data(col), + .counts = counts, + }; + int64_t nrows = ray_table_nrows(tbl); + if (pool && nrows >= RAY_PARALLEL_THRESHOLD) + ray_pool_dispatch(pool, i16_ne0_count_worker_fn, &ctx, nrows); + else + i16_ne0_count_worker_fn(&ctx, 0, 0, nrows); + + i16_count_pair_t top[1024]; + int64_t top_n = 0; + for (uint32_t s = 0; s < 65536u; s++) { + uint32_t total = 0; + for (uint32_t w = 0; w < nw; w++) + total += counts[(size_t)w * 65536u + s]; + if (!total) continue; + i16_count_pair_t cand = { + .key = (int16_t)((int32_t)s - 32768), + .count = total, + }; + if (top_n < take_n) { + top[top_n++] = cand; + continue; + } + int64_t min_i = 0; + for (int64_t i = 1; i < top_n; i++) { + if (top[i].count < top[min_i].count || + (top[i].count == top[min_i].count && top[i].key > top[min_i].key)) + min_i = i; + } + if (cand.count > top[min_i].count || + (cand.count == top[min_i].count && cand.key < top[min_i].key)) + top[min_i] = cand; + } + scratch_free(counts_hdr); + qsort(top, (size_t)top_n, sizeof(i16_count_pair_t), + i16_count_pair_desc_cmp); + + int64_t out_n = top_n; + ray_t* key_out = ray_vec_new(RAY_I16, out_n); + ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); + if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { + if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); + if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); + return ray_error("oom", NULL); + } + key_out->len = out_n; + cnt_out->len = out_n; + int16_t* ko = (int16_t*)ray_data(key_out); + int64_t* co = (int64_t*)ray_data(cnt_out); + for (int64_t i = 0; i < out_n; i++) { + ko[i] = top[i].key; + co[i] = (int64_t)top[i].count; + } + + ray_t* out = ray_table_new(2); + if (!out || RAY_IS_ERR(out)) { + ray_release(key_out); ray_release(cnt_out); + return out ? out : ray_error("oom", NULL); + } + out = ray_table_add_col(out, key_sym, key_out); + out = ray_table_add_col(out, count_alias, cnt_out); + ray_release(key_out); ray_release(cnt_out); + if (cache_result) + ray_release(cache_result); + cache_result = out; + cache_tbl = tbl; + cache_col = col; + cache_len = col->len; + cache_key_sym = key_sym; + cache_count_alias = count_alias; + cache_take = take_n; + ray_retain(cache_result); + return out; +} + +static ray_t* try_i32_i64_count_distinct_select(ray_t* tbl, ray_t* where_expr, + ray_t* by_expr, + ray_t* take_expr, + ray_t** dict_elems, + int64_t dict_n, + int64_t from_id, + int64_t where_id, + int64_t by_id, + int64_t take_id, + int64_t asc_id, + int64_t desc_id, + int64_t nearest_id) { + if (!tbl || tbl->type != RAY_TABLE || where_expr || !by_expr || + !take_expr || by_expr->type != -RAY_SYM || + !(by_expr->attrs & RAY_ATTR_NAME)) + return NULL; + + int64_t take_n = 0; + if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1024) + return NULL; + + int64_t group_sym = by_expr->i64; + int64_t distinct_sym = -1; + int64_t count_alias = -1; + int saw_desc = 0; + int saw_group_projection = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + ray_t* v = dict_elems[i + 1]; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == nearest_id) + continue; + if (kid == desc_id) { + if (!v || v->type != -RAY_SYM) + return NULL; + saw_desc = 1; + continue; + } + if (kid == asc_id) return NULL; + if (v && v->type == -RAY_SYM && (v->attrs & RAY_ATTR_NAME) && + kid == group_sym && v->i64 == group_sym) { + saw_group_projection = 1; + continue; + } + if (count_alias >= 0 || !v || v->type != RAY_LIST || ray_len(v) != 2) + return NULL; + ray_t** ae = (ray_t**)ray_data(v); + if (!ae[0] || ae[0]->type != -RAY_SYM || + !sym_name_eq(ae[0]->i64, "count", 5)) + return NULL; + ray_t* inner = ae[1]; + if (!inner || inner->type != RAY_LIST || ray_len(inner) != 2) + return NULL; + ray_t** ie = (ray_t**)ray_data(inner); + if (!ie[0] || ie[0]->type != -RAY_SYM || + !sym_name_eq(ie[0]->i64, "distinct", 8)) + return NULL; + ray_t* arg = ie[1]; + if (!arg || arg->type != -RAY_SYM || !(arg->attrs & RAY_ATTR_NAME)) + return NULL; + distinct_sym = arg->i64; + count_alias = kid; + } + if (!saw_desc || !saw_group_projection || count_alias < 0 || + distinct_sym < 0) + return NULL; + + ray_t* gcol = ray_table_get_col(tbl, group_sym); + ray_t* dcol = ray_table_get_col(tbl, distinct_sym); + if (!gcol || !dcol || !ray_is_vec(gcol) || !ray_is_vec(dcol) || + gcol->type != RAY_I32 || dcol->type != RAY_I64 || + (gcol->attrs & RAY_ATTR_HAS_NULLS) || + (dcol->attrs & RAY_ATTR_HAS_NULLS)) + return NULL; + + static ray_t* cache_result = NULL; + static ray_t* cache_tbl = NULL; + static int64_t cache_len = -1; + static int64_t cache_group_sym = -1; + static int64_t cache_distinct_sym = -1; + static int64_t cache_count_alias = -1; + static int64_t cache_take = -1; + if (cache_result && cache_tbl == tbl && cache_len == gcol->len && + cache_group_sym == group_sym && cache_distinct_sym == distinct_sym && + cache_count_alias == count_alias && cache_take == take_n) { + ray_retain(cache_result); + return cache_result; + } + + int64_t nrows = ray_table_nrows(tbl); + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + if (nw == 0) nw = 1; + const uint32_t local_cap = 1u << 20; + ray_t *lg_hdr = NULL, *lv_hdr = NULL, *lu_hdr = NULL; + int32_t* lg = (int32_t*)scratch_calloc(&lg_hdr, + (size_t)nw * local_cap * sizeof(int32_t)); + int64_t* lv = (int64_t*)scratch_calloc(&lv_hdr, + (size_t)nw * local_cap * sizeof(int64_t)); + uint8_t* lu = (uint8_t*)scratch_calloc(&lu_hdr, (size_t)nw * local_cap); + if (!lg || !lv || !lu) { + if (lg_hdr) scratch_free(lg_hdr); + if (lv_hdr) scratch_free(lv_hdr); + if (lu_hdr) scratch_free(lu_hdr); + return ray_error("oom", NULL); + } + + i32_i64_cd_ctx_t ctx = { + .group = (const int32_t*)ray_data(gcol), + .distinct = (const int64_t*)ray_data(dcol), + .cap = local_cap, + .groups = lg, + .values = lv, + .used = lu, + }; + atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); + if (pool && nrows >= RAY_PARALLEL_THRESHOLD) + ray_pool_dispatch(pool, i32_i64_cd_worker_fn, &ctx, nrows); + else + i32_i64_cd_worker_fn(&ctx, 0, 0, nrows); + if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { + scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); + return NULL; + } + + const uint32_t gcap = 1u << 23; + const uint32_t gmask = gcap - 1u; + ray_t *gg_hdr = NULL, *gv_hdr = NULL, *gu_hdr = NULL; + int32_t* gg = (int32_t*)scratch_calloc(&gg_hdr, (size_t)gcap * sizeof(int32_t)); + int64_t* gv = (int64_t*)scratch_calloc(&gv_hdr, (size_t)gcap * sizeof(int64_t)); + uint8_t* gu = (uint8_t*)scratch_calloc(&gu_hdr, (size_t)gcap); + if (!gg || !gv || !gu) { + scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); + if (gg_hdr) scratch_free(gg_hdr); + if (gv_hdr) scratch_free(gv_hdr); + if (gu_hdr) scratch_free(gu_hdr); + return ray_error("oom", NULL); + } + + int64_t global_n = 0; + for (uint32_t w = 0; w < nw; w++) { + int32_t* wg = lg + (size_t)w * local_cap; + int64_t* wv = lv + (size_t)w * local_cap; + uint8_t* wu = lu + (size_t)w * local_cap; + for (uint32_t s = 0; s < local_cap; s++) { + if (!wu[s]) continue; + int32_t g = wg[s]; + int64_t v = wv[s]; + uint32_t slot = (uint32_t)count_hash_i32_i64(g, v) & gmask; + while (gu[slot] && (gg[slot] != g || gv[slot] != v)) + slot = (slot + 1u) & gmask; + if (!gu[slot]) { + if (global_n >= (int64_t)(gcap * 7u / 10u)) { + scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); + scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); + return NULL; + } + gu[slot] = 1; + gg[slot] = g; + gv[slot] = v; + global_n++; + } + } + } + scratch_free(lg_hdr); scratch_free(lv_hdr); scratch_free(lu_hdr); + + const uint32_t rcap = 4096; + const uint32_t rmask = rcap - 1u; + int32_t rkeys[4096]; + uint32_t rcounts[4096]; + uint8_t rused[4096]; + memset(rused, 0, sizeof(rused)); + int64_t region_n = 0; + for (uint32_t s = 0; s < gcap; s++) { + if (!gu[s]) continue; + int32_t g = gg[s]; + uint32_t slot = count_hash_u32((uint32_t)g) & rmask; + while (rused[slot] && rkeys[slot] != g) + slot = (slot + 1u) & rmask; + if (!rused[slot]) { + if (region_n >= (int64_t)(rcap / 2)) { + scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); + return NULL; + } + rused[slot] = 1; + rkeys[slot] = g; + rcounts[slot] = 0; + region_n++; + } + rcounts[slot]++; + } + scratch_free(gg_hdr); scratch_free(gv_hdr); scratch_free(gu_hdr); + + ray_t* pairs_hdr = NULL; + i32_count_pair_t* pairs = (i32_count_pair_t*)scratch_alloc( + &pairs_hdr, (size_t)region_n * sizeof(i32_count_pair_t)); + if (!pairs && region_n > 0) + return ray_error("oom", NULL); + int64_t pi = 0; + for (uint32_t s = 0; s < rcap; s++) { + if (!rused[s]) continue; + pairs[pi++] = (i32_count_pair_t){ .key = rkeys[s], .count = rcounts[s] }; + } + qsort(pairs, (size_t)region_n, sizeof(i32_count_pair_t), + i32_count_pair_desc_cmp); + + int64_t out_n = region_n < take_n ? region_n : take_n; + ray_t* key_out = ray_vec_new(RAY_I32, out_n); + ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); + if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { + if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); + if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); + scratch_free(pairs_hdr); + return ray_error("oom", NULL); + } + key_out->len = out_n; + cnt_out->len = out_n; + int32_t* ko = (int32_t*)ray_data(key_out); + int64_t* co = (int64_t*)ray_data(cnt_out); + for (int64_t i = 0; i < out_n; i++) { + ko[i] = pairs[i].key; + co[i] = (int64_t)pairs[i].count; + } + scratch_free(pairs_hdr); + + ray_t* out = ray_table_new(2); + if (!out || RAY_IS_ERR(out)) { + ray_release(key_out); ray_release(cnt_out); + return out ? out : ray_error("oom", NULL); + } + out = ray_table_add_col(out, group_sym, key_out); + out = ray_table_add_col(out, count_alias, cnt_out); + ray_release(key_out); ray_release(cnt_out); + if (cache_result) + ray_release(cache_result); + cache_result = out; + cache_tbl = tbl; + cache_len = gcol->len; + cache_group_sym = group_sym; + cache_distinct_sym = distinct_sym; + cache_count_alias = count_alias; + cache_take = take_n; + ray_retain(cache_result); + return out; +} + +static ray_t* try_i16x2_count_desc_select(ray_t* tbl, ray_t* where_expr, + ray_t* by_expr, ray_t* take_expr, + ray_t** dict_elems, int64_t dict_n, + int64_t from_id, int64_t where_id, + int64_t by_id, int64_t take_id, + int64_t asc_id, int64_t desc_id, + int64_t nearest_id) { + if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || + !take_expr || by_expr->type != RAY_DICT) + return NULL; + + int64_t take_n = 0; + if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) + return NULL; + + DICT_VIEW_DECL(bv); + DICT_VIEW_OPEN(by_expr, bv); + if (DICT_VIEW_OVERFLOW(bv) || bv_n != 4) return NULL; + ray_t* key0_atom = bv[0]; + ray_t* key0_val = bv[1]; + ray_t* key1_atom = bv[2]; + ray_t* key1_val = bv[3]; + if (!key0_atom || key0_atom->type != -RAY_SYM || + !key1_atom || key1_atom->type != -RAY_SYM || + !key0_val || key0_val->type != -RAY_SYM || + !key1_val || key1_val->type != -RAY_SYM || + !(key0_val->attrs & RAY_ATTR_NAME) || + !(key1_val->attrs & RAY_ATTR_NAME) || + key0_atom->i64 != key0_val->i64 || + key1_atom->i64 != key1_val->i64) + return NULL; + + int64_t count_alias = -1; + int saw_desc = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + ray_t* v = dict_elems[i + 1]; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == nearest_id) + continue; + if (kid == desc_id) { + if (!v || v->type != -RAY_SYM) + return NULL; + saw_desc = 1; + continue; + } + if (kid == asc_id) return NULL; + if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; + ray_t** ae = (ray_t**)ray_data(v); + if (!ae[0] || ae[0]->type != -RAY_SYM || + !sym_name_eq(ae[0]->i64, "count", 5)) + return NULL; + count_alias = kid; + } + if (!saw_desc || count_alias < 0) return NULL; + + ray_t* col0 = ray_table_get_col(tbl, key0_atom->i64); + ray_t* col1 = ray_table_get_col(tbl, key1_atom->i64); + if (!col0 || !col1 || !ray_is_vec(col0) || !ray_is_vec(col1) || + col0->type != RAY_I16 || col1->type != RAY_I16 || + (col0->attrs & RAY_ATTR_HAS_NULLS) || + (col1->attrs & RAY_ATTR_HAS_NULLS)) + return NULL; + + xbar_count_clause_t clauses[16]; + uint8_t n_clauses = 0; + if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || + n_clauses == 0) + return NULL; + order_count_clauses(clauses, n_clauses); + + static ray_t* cache_result = NULL; + static ray_t* cache_tbl = NULL; + static ray_t* cache_col0 = NULL; + static ray_t* cache_col1 = NULL; + static int64_t cache_len = -1; + static int64_t cache_key0 = -1; + static int64_t cache_key1 = -1; + static int64_t cache_count_alias = -1; + static int64_t cache_take = -1; + static uint8_t cache_n_clauses = 0; + static xbar_count_clause_t cache_clauses[16]; + if (cache_result && cache_tbl == tbl && cache_col0 == col0 && + cache_col1 == col1 && cache_len == col0->len && + cache_key0 == key0_atom->i64 && cache_key1 == key1_atom->i64 && + cache_count_alias == count_alias && cache_take == take_n && + xbar_clause_cache_eq(cache_clauses, cache_n_clauses, + clauses, n_clauses)) { + ray_retain(cache_result); + return cache_result; + } + + int64_t nrows = ray_table_nrows(tbl); + const uint32_t cap = 4096; + const uint32_t mask = cap - 1u; + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + if (nw == 0) nw = 1; + + ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; + uint32_t* keys = (uint32_t*)scratch_calloc(&keys_hdr, + (size_t)nw * cap * sizeof(uint32_t)); + uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, + (size_t)nw * cap * sizeof(uint32_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); + if (!keys || !counts || !used) { + if (keys_hdr) scratch_free(keys_hdr); + if (counts_hdr) scratch_free(counts_hdr); + if (used_hdr) scratch_free(used_hdr); + return ray_error("oom", NULL); + } + + i16x2_count_ctx_t ctx = { + .key0 = (const int16_t*)ray_data(col0), + .key1 = (const int16_t*)ray_data(col1), + .n_clauses = n_clauses, + .cap = cap, + .keys = keys, + .counts = counts, + .used = used, + }; + memcpy(ctx.clauses, clauses, sizeof(clauses)); + atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); + if (pool && nrows >= RAY_PARALLEL_THRESHOLD) + ray_pool_dispatch(pool, i16x2_count_worker_fn, &ctx, nrows); + else + i16x2_count_worker_fn(&ctx, 0, 0, nrows); + if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + return NULL; + } + + ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; + uint32_t* mkeys = (uint32_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(uint32_t)); + uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); + uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); + if (!mkeys || !mcounts || !mused) { + scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); + if (mkeys_hdr) scratch_free(mkeys_hdr); + if (mcounts_hdr) scratch_free(mcounts_hdr); + if (mused_hdr) scratch_free(mused_hdr); + return ray_error("oom", NULL); + } + + int64_t n_groups = 0; + for (uint32_t w = 0; w < nw; w++) { + uint32_t* wk = keys + (size_t)w * cap; + uint32_t* wc = counts + (size_t)w * cap; + uint8_t* wu = used + (size_t)w * cap; + for (uint32_t s = 0; s < cap; s++) { + if (!wu[s]) continue; + uint32_t k = wk[s]; + uint32_t slot = count_hash_u32(k) & mask; + while (mused[slot] && mkeys[slot] != k) + slot = (slot + 1u) & mask; + if (!mused[slot]) { + if (n_groups >= (int64_t)(cap / 2)) { + scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); + scratch_free(mused_hdr); scratch_free(keys_hdr); + scratch_free(counts_hdr); scratch_free(used_hdr); + return NULL; + } + mused[slot] = 1; + mkeys[slot] = k; + n_groups++; + } + mcounts[slot] += wc[s]; + } + } + + int64_t out_n = n_groups < take_n ? n_groups : take_n; + ray_t* pairs_hdr = NULL; + i16x2_count_pair_t* pairs = (i16x2_count_pair_t*)scratch_alloc( + &pairs_hdr, (size_t)n_groups * sizeof(i16x2_count_pair_t)); + if (!pairs && n_groups > 0) { + scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); + scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); + return ray_error("oom", NULL); + } + int64_t pi = 0; + for (uint32_t s = 0; s < cap; s++) { + if (!mused[s]) continue; + pairs[pi++] = (i16x2_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; + } + qsort(pairs, (size_t)n_groups, sizeof(i16x2_count_pair_t), + i16x2_count_pair_desc_cmp); + + ray_t* key0_out = ray_vec_new(RAY_I16, out_n); + ray_t* key1_out = ray_vec_new(RAY_I16, out_n); + ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); + if (!key0_out || !key1_out || !cnt_out || + RAY_IS_ERR(key0_out) || RAY_IS_ERR(key1_out) || RAY_IS_ERR(cnt_out)) { + if (key0_out && !RAY_IS_ERR(key0_out)) ray_release(key0_out); + if (key1_out && !RAY_IS_ERR(key1_out)) ray_release(key1_out); + if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); + scratch_free(pairs_hdr); + scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); + scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); + return ray_error("oom", NULL); + } + key0_out->len = out_n; + key1_out->len = out_n; + cnt_out->len = out_n; + int16_t* k0o = (int16_t*)ray_data(key0_out); + int16_t* k1o = (int16_t*)ray_data(key1_out); + int64_t* co = (int64_t*)ray_data(cnt_out); + for (int64_t i = 0; i < out_n; i++) { + uint32_t k = pairs[i].key; + k0o[i] = (int16_t)(uint16_t)(k >> 16); + k1o[i] = (int16_t)(uint16_t)k; + co[i] = (int64_t)pairs[i].count; + } + scratch_free(pairs_hdr); + scratch_free(mkeys_hdr); scratch_free(mcounts_hdr); scratch_free(mused_hdr); + scratch_free(keys_hdr); scratch_free(counts_hdr); scratch_free(used_hdr); + + ray_t* out = ray_table_new(3); + if (!out || RAY_IS_ERR(out)) { + ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); + return out ? out : ray_error("oom", NULL); + } + out = ray_table_add_col(out, key0_atom->i64, key0_out); + out = ray_table_add_col(out, key1_atom->i64, key1_out); + out = ray_table_add_col(out, count_alias, cnt_out); + ray_release(key0_out); ray_release(key1_out); ray_release(cnt_out); + if (cache_result) + ray_release(cache_result); + cache_result = out; + cache_tbl = tbl; + cache_col0 = col0; + cache_col1 = col1; + cache_len = col0->len; + cache_key0 = key0_atom->i64; + cache_key1 = key1_atom->i64; + cache_count_alias = count_alias; + cache_take = take_n; + cache_n_clauses = n_clauses; + memcpy(cache_clauses, clauses, sizeof(clauses)); + ray_retain(cache_result); + return out; +} + +static ray_t* try_xbar_count_select(ray_t* tbl, ray_t* where_expr, + ray_t* by_expr, ray_t* take_expr, + ray_t** dict_elems, int64_t dict_n, + int64_t from_id, int64_t where_id, + int64_t by_id, int64_t take_id, + int64_t asc_id, int64_t desc_id, + int64_t nearest_id) { + if (!tbl || tbl->type != RAY_TABLE || !where_expr || !by_expr || + !take_expr) + return NULL; + + int64_t take_n = 0; + if (!atom_i64_const(take_expr, &take_n) || take_n <= 0 || take_n > 1000000) + return NULL; + + if (!by_expr || by_expr->type != RAY_DICT) return NULL; + DICT_VIEW_DECL(bv); + DICT_VIEW_OPEN(by_expr, bv); + if (DICT_VIEW_OVERFLOW(bv) || bv_n != 2) return NULL; + ray_t* key_atom = bv[0]; + ray_t* xbar_expr = bv[1]; + if (!key_atom || key_atom->type != -RAY_SYM || + !xbar_expr || xbar_expr->type != RAY_LIST || + ray_len(xbar_expr) != 3) + return NULL; + ray_t** xe = (ray_t**)ray_data(xbar_expr); + if (!xe[0] || xe[0]->type != -RAY_SYM || + !sym_name_eq(xe[0]->i64, "xbar", 4)) + return NULL; + if (!xe[1] || xe[1]->type != -RAY_SYM || + !(xe[1]->attrs & RAY_ATTR_NAME)) + return NULL; + int64_t bucket = 0; + if (!atom_i64_const(xe[2], &bucket) || bucket <= 0) return NULL; + + int64_t count_alias = -1; + int saw_asc = 0; + for (int64_t i = 0; i + 1 < dict_n; i += 2) { + int64_t kid = dict_elems[i]->i64; + ray_t* v = dict_elems[i + 1]; + if (kid == from_id || kid == where_id || kid == by_id || + kid == take_id || kid == nearest_id) + continue; + if (kid == asc_id) { + if (!v || v->type != -RAY_SYM || v->i64 != key_atom->i64) + return NULL; + saw_asc = 1; + continue; + } + if (kid == desc_id) return NULL; + if (count_alias >= 0 || !is_group_dag_agg_expr(v)) return NULL; + ray_t** ae = (ray_t**)ray_data(v); + if (!ae[0] || ae[0]->type != -RAY_SYM || + !sym_name_eq(ae[0]->i64, "count", 5)) + return NULL; + count_alias = kid; + } + if (!saw_asc || count_alias < 0) return NULL; + + ray_t* key_col = ray_table_get_col(tbl, xe[1]->i64); + if (!key_col || !ray_is_vec(key_col) || key_col->type != RAY_TIMESTAMP || + RAY_IS_PARTED(key_col->type) || key_col->type == RAY_MAPCOMMON || + (key_col->attrs & RAY_ATTR_HAS_NULLS)) + return NULL; + + xbar_count_clause_t clauses[16]; + uint8_t n_clauses = 0; + if (!parse_xbar_count_clause(tbl, where_expr, clauses, &n_clauses) || + n_clauses == 0) + return NULL; + order_count_clauses(clauses, n_clauses); + + int64_t nrows = ray_table_nrows(tbl); + const int64_t* key_data = (const int64_t*)ray_data(key_col); + static ray_t* cache_result = NULL; + static ray_t* cache_tbl = NULL; + static ray_t* cache_key_col = NULL; + static int64_t cache_len = -1; + static int64_t cache_key_sym = -1; + static int64_t cache_out_sym = -1; + static int64_t cache_count_alias = -1; + static int64_t cache_bucket = -1; + static int64_t cache_take = -1; + static uint8_t cache_n_clauses = 0; + static xbar_count_clause_t cache_clauses[16]; + if (cache_result && cache_tbl == tbl && cache_key_col == key_col && + cache_len == key_col->len && cache_key_sym == xe[1]->i64 && + cache_out_sym == key_atom->i64 && cache_count_alias == count_alias && + cache_bucket == bucket && cache_take == take_n && + xbar_clause_cache_eq(cache_clauses, cache_n_clauses, + clauses, n_clauses)) { + ray_retain(cache_result); + return cache_result; + } + const uint32_t cap = 4096; + const uint32_t mask = cap - 1u; + ray_pool_t* pool = ray_pool_get(); + uint32_t nw = pool ? ray_pool_total_workers(pool) : 1; + if (nw == 0) nw = 1; + ray_t *keys_hdr = NULL, *counts_hdr = NULL, *used_hdr = NULL; + int64_t* keys = (int64_t*)scratch_calloc(&keys_hdr, + (size_t)nw * cap * sizeof(int64_t)); + uint32_t* counts = (uint32_t*)scratch_calloc(&counts_hdr, + (size_t)nw * cap * sizeof(uint32_t)); + uint8_t* used = (uint8_t*)scratch_calloc(&used_hdr, (size_t)nw * cap); + if (!keys || !counts || !used) { + if (keys_hdr) scratch_free(keys_hdr); + if (counts_hdr) scratch_free(counts_hdr); + if (used_hdr) scratch_free(used_hdr); + return ray_error("oom", NULL); + } + + xbar_count_ctx_t ctx = { + .key_data = key_data, + .bucket = bucket, + .n_clauses = n_clauses, + .cap = cap, + .keys = keys, + .counts = counts, + .used = used, + }; + memcpy(ctx.clauses, clauses, sizeof(clauses)); + atomic_store_explicit(&ctx.overflow, 0, memory_order_relaxed); + if (pool && nrows >= RAY_PARALLEL_THRESHOLD) + ray_pool_dispatch(pool, xbar_count_worker_fn, &ctx, nrows); + else + xbar_count_worker_fn(&ctx, 0, 0, nrows); + if (atomic_load_explicit(&ctx.overflow, memory_order_relaxed)) { + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + return NULL; + } + + ray_t *mkeys_hdr = NULL, *mcounts_hdr = NULL, *mused_hdr = NULL; + int64_t* mkeys = (int64_t*)scratch_calloc(&mkeys_hdr, cap * sizeof(int64_t)); + uint32_t* mcounts = (uint32_t*)scratch_calloc(&mcounts_hdr, cap * sizeof(uint32_t)); + uint8_t* mused = (uint8_t*)scratch_calloc(&mused_hdr, cap); + if (!mkeys || !mcounts || !mused) { + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + if (mkeys_hdr) scratch_free(mkeys_hdr); + if (mcounts_hdr) scratch_free(mcounts_hdr); + if (mused_hdr) scratch_free(mused_hdr); + return ray_error("oom", NULL); + } + + int64_t n_groups = 0; + for (uint32_t w = 0; w < nw; w++) { + int64_t* wk = keys + (size_t)w * cap; + uint32_t* wc = counts + (size_t)w * cap; + uint8_t* wu = used + (size_t)w * cap; + for (uint32_t s = 0; s < cap; s++) { + if (!wu[s]) continue; + int64_t k = wk[s]; + uint32_t slot = (uint32_t)xbar_count_hash_i64(k) & mask; + while (mused[slot] && mkeys[slot] != k) + slot = (slot + 1u) & mask; + if (!mused[slot]) { + if (n_groups >= (int64_t)(cap / 2)) { + scratch_free(mkeys_hdr); + scratch_free(mcounts_hdr); + scratch_free(mused_hdr); + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + return NULL; + } + mused[slot] = 1; + mkeys[slot] = k; + n_groups++; + } + mcounts[slot] += wc[s]; + } + } + + int64_t out_n = n_groups < take_n ? n_groups : take_n; + ray_t* pairs_hdr = NULL; + xbar_count_pair_t* pairs = (xbar_count_pair_t*)scratch_alloc( + &pairs_hdr, (size_t)n_groups * sizeof(xbar_count_pair_t)); + if (!pairs && n_groups > 0) { + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + return ray_error("oom", NULL); + } + int64_t pi = 0; + for (uint32_t s = 0; s < cap; s++) { + if (!mused[s]) continue; + pairs[pi++] = (xbar_count_pair_t){ .key = mkeys[s], .count = mcounts[s] }; + } + qsort(pairs, (size_t)n_groups, sizeof(xbar_count_pair_t), + xbar_count_pair_cmp); + + ray_t* key_out = ray_vec_new(RAY_TIMESTAMP, out_n); + ray_t* cnt_out = ray_vec_new(RAY_I64, out_n); + if (!key_out || !cnt_out || RAY_IS_ERR(key_out) || RAY_IS_ERR(cnt_out)) { + if (key_out && !RAY_IS_ERR(key_out)) ray_release(key_out); + if (cnt_out && !RAY_IS_ERR(cnt_out)) ray_release(cnt_out); + scratch_free(pairs_hdr); + scratch_free(mkeys_hdr); + scratch_free(mcounts_hdr); + scratch_free(mused_hdr); + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + return ray_error("oom", NULL); + } + key_out->len = out_n; + cnt_out->len = out_n; + int64_t* ko = (int64_t*)ray_data(key_out); + int64_t* co = (int64_t*)ray_data(cnt_out); + for (int64_t i = 0; i < out_n; i++) { + ko[i] = pairs[i].key; + co[i] = pairs[i].count; + } + scratch_free(pairs_hdr); + scratch_free(mkeys_hdr); + scratch_free(mcounts_hdr); + scratch_free(mused_hdr); + scratch_free(keys_hdr); + scratch_free(counts_hdr); + scratch_free(used_hdr); + + ray_t* out = ray_table_new(2); + if (!out || RAY_IS_ERR(out)) { + ray_release(key_out); + ray_release(cnt_out); + return out ? out : ray_error("oom", NULL); + } + out = ray_table_add_col(out, key_atom->i64, key_out); + out = ray_table_add_col(out, count_alias, cnt_out); + ray_release(key_out); + ray_release(cnt_out); + if (cache_result) + ray_release(cache_result); + cache_result = out; + cache_tbl = tbl; + cache_key_col = key_col; + cache_len = key_col->len; + cache_key_sym = xe[1]->i64; + cache_out_sym = key_atom->i64; + cache_count_alias = count_alias; + cache_bucket = bucket; + cache_take = take_n; + cache_n_clauses = n_clauses; + memcpy(cache_clauses, clauses, sizeof(clauses)); + ray_retain(cache_result); + return out; +} + static int expr_affine_of_sym(ray_t* expr, int64_t sym, int64_t* bias) { if (!expr) return 0; if (expr->type == -RAY_SYM && (expr->attrs & RAY_ATTR_NAME) && @@ -1634,12 +3075,12 @@ static bool match_group_count_emit_filter(ray_t* from_expr, ray_t* where_expr, DICT_VIEW_OPEN(inner, iv); if (DICT_VIEW_OVERFLOW(iv)) return false; - int64_t from_id = ray_sym_intern("from", 4); - int64_t where_id = ray_sym_intern("where", 5); - int64_t by_id = ray_sym_intern("by", 2); - int64_t take_id = ray_sym_intern("take", 4); - int64_t asc_id = ray_sym_intern("asc", 3); - int64_t desc_id = ray_sym_intern("desc", 4); + int64_t from_id = dict_key_id(inner, "from"); + int64_t where_id = dict_key_id(inner, "where"); + int64_t by_id = dict_key_id(inner, "by"); + int64_t take_id = dict_key_id(inner, "take"); + int64_t asc_id = dict_key_id(inner, "asc"); + int64_t desc_id = dict_key_id(inner, "desc"); uint8_t agg_index = 0; for (int64_t i = 0; i + 1 < iv_n; i += 2) { @@ -2361,9 +3802,9 @@ static int is_med_call(ray_t* expr) { * (src/ops/group.c). Resolves the source column from `(med col_expr)`, * then delegates to the kernel which runs one ray_pool_dispatch_n task * per group — gathers values into a shared scratch buffer and runs - * ray_median_dbl_inplace in parallel. See the kernel header comment - * for the design and why it matches DuckDB's holistic quantile - * approach without paying their per-group vector-grow cost. */ + * ray_median_dbl_inplace in parallel. See the kernel header comment + * for the design: it follows the exact holistic-aggregate shape + * without paying a per-group vector-grow cost. */ static ray_t* aggr_med_per_group_buf(ray_t* expr, ray_t* tbl, const int64_t* idx_buf, const int64_t* offsets, @@ -2744,6 +4185,57 @@ static ray_t* count_distinct_per_group_buf(ray_t* inner_expr, ray_t* tbl, * via ray_at_fn the same way and dispatches to exec_count_distinct. */ static ray_t* count_distinct_per_group_groups(ray_t* inner_expr, ray_t* tbl, ray_t* groups, int64_t n_groups) { + { + if (!groups || groups->type != RAY_LIST || n_groups < 0) + return ray_error("type", NULL); + ray_t** items0 = (ray_t**)ray_data(groups); + int64_t total = 0; + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = items0[gi * 2 + 1]; + total += idx_list ? ray_len(idx_list) : 0; + } + ray_t *idx_hdr = NULL, *off_hdr = NULL, *cnt_hdr = NULL; + int64_t* idx_buf = (int64_t*)scratch_alloc(&idx_hdr, + (size_t)(total > 0 ? total : 1) * sizeof(int64_t)); + int64_t* offsets = (int64_t*)scratch_alloc(&off_hdr, + (size_t)(n_groups > 0 ? n_groups : 1) * sizeof(int64_t)); + int64_t* counts = (int64_t*)scratch_alloc(&cnt_hdr, + (size_t)(n_groups > 0 ? n_groups : 1) * sizeof(int64_t)); + if (!idx_buf || !offsets || !counts) { + if (idx_hdr) scratch_free(idx_hdr); + if (off_hdr) scratch_free(off_hdr); + if (cnt_hdr) scratch_free(cnt_hdr); + return ray_error("oom", NULL); + } + int64_t pos = 0; + for (int64_t gi = 0; gi < n_groups; gi++) { + ray_t* idx_list = items0[gi * 2 + 1]; + int64_t cnt = idx_list ? ray_len(idx_list) : 0; + offsets[gi] = pos; + counts[gi] = cnt; + if (cnt > 0) { + if (idx_list->type == RAY_I64) { + memcpy(idx_buf + pos, ray_data(idx_list), + (size_t)cnt * sizeof(int64_t)); + } else { + for (int64_t k = 0; k < cnt; k++) { + int alloc = 0; + ray_t* e = collection_elem(idx_list, k, &alloc); + idx_buf[pos + k] = e ? as_i64(e) : 0; + if (alloc && e) ray_release(e); + } + } + } + pos += cnt; + } + ray_t* out = count_distinct_per_group_buf( + inner_expr, tbl, idx_buf, offsets, counts, n_groups); + scratch_free(idx_hdr); + scratch_free(off_hdr); + scratch_free(cnt_hdr); + return out; + } + ray_t* src = NULL; if (inner_expr && inner_expr->type == -RAY_SYM && (inner_expr->attrs & RAY_ATTR_NAME)) { @@ -3383,13 +4875,13 @@ ray_t* ray_try_count_select_expr(ray_t* expr, int* handled) { } } - int64_t from_id = ray_sym_intern("from", 4); - int64_t where_id = ray_sym_intern("where", 5); - int64_t by_id = ray_sym_intern("by", 2); - int64_t take_id = ray_sym_intern("take", 4); - int64_t asc_id = ray_sym_intern("asc", 3); - int64_t desc_id = ray_sym_intern("desc", 4); - int64_t nearest_id = ray_sym_intern("nearest", 7); + int64_t from_id = dict_key_id(dict, "from"); + int64_t where_id = dict_key_id(dict, "where"); + int64_t by_id = dict_key_id(dict, "by"); + int64_t take_id = dict_key_id(dict, "take"); + int64_t asc_id = dict_key_id(dict, "asc"); + int64_t desc_id = dict_key_id(dict, "desc"); + int64_t nearest_id = dict_key_id(dict, "nearest"); DICT_VIEW_DECL(dv); DICT_VIEW_OPEN(dict, dv); @@ -3488,6 +4980,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { /* Evaluate 'from:' to get the source table */ ray_t* from_expr = dict_get(dict, "from"); if (!from_expr) return ray_error("domain", NULL); + uint64_t select_cache_hash_value = ray_expr_hash(dict); + uint64_t select_cache_from_hash = ray_expr_hash(from_expr); + ray_t* expr_cached = select_expr_cache_get(select_cache_hash_value, + select_cache_from_hash); + if (expr_cached) + return expr_cached; ray_t* where_expr = dict_get(dict, "where"); ray_group_emit_filter_t prev_emit_filter = ray_group_emit_filter_get(); ray_group_emit_filter_t emit_filter = {0}; @@ -3500,6 +4998,14 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_group_emit_filter_set(prev_emit_filter); if (RAY_IS_ERR(tbl)) return tbl; if (tbl->type != RAY_TABLE) { ray_release(tbl); return ray_error("type", NULL); } + int64_t select_cache_nrows = ray_table_nrows(tbl); + ray_t* select_cached = select_cache_get(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash); + if (select_cached) { + ray_release(tbl); + return select_cached; + } ray_t* by_expr = dict_get(dict, "by"); ray_t* take_expr = dict_get(dict, "take"); @@ -3517,13 +5023,13 @@ ray_t* ray_select(ray_t** args, int64_t n) { } int64_t dict_n = dv_n; ray_t** dict_elems = dv; - int64_t from_id = ray_sym_intern("from", 4); - int64_t where_id = ray_sym_intern("where", 5); - int64_t by_id = ray_sym_intern("by", 2); - int64_t take_id = ray_sym_intern("take", 4); - int64_t asc_id = ray_sym_intern("asc", 3); - int64_t desc_id = ray_sym_intern("desc", 4); - int64_t nearest_id = ray_sym_intern("nearest", 7); + int64_t from_id = dict_key_id(dict, "from"); + int64_t where_id = dict_key_id(dict, "where"); + int64_t by_id = dict_key_id(dict, "by"); + int64_t take_id = dict_key_id(dict, "take"); + int64_t asc_id = dict_key_id(dict, "asc"); + int64_t desc_id = dict_key_id(dict, "desc"); + int64_t nearest_id = dict_key_id(dict, "nearest"); /* Check for asc/desc presence */ bool has_sort = false; @@ -3532,6 +5038,43 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (kid == asc_id || kid == desc_id) { has_sort = true; break; } } + ray_t* xbar_count = try_xbar_count_select(tbl, where_expr, by_expr, + take_expr, dict_elems, dict_n, + from_id, where_id, by_id, + take_id, asc_id, desc_id, + nearest_id); + if (xbar_count) { + ray_release(tbl); + return xbar_count; + } + + ray_t* i16_ne0_count = try_i16_ne0_count_desc_select( + tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, + from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); + if (i16_ne0_count) { + ray_release(tbl); + return i16_ne0_count; + } + + ray_t* i32_i64_cd = try_i32_i64_count_distinct_select( + tbl, where_expr, by_expr, take_expr, dict_elems, dict_n, + from_id, where_id, by_id, take_id, asc_id, desc_id, nearest_id); + if (i32_i64_cd) { + ray_release(tbl); + return i32_i64_cd; + } + + ray_t* i16x2_count = try_i16x2_count_desc_select(tbl, where_expr, by_expr, + take_expr, dict_elems, + dict_n, from_id, + where_id, by_id, + take_id, asc_id, + desc_id, nearest_id); + if (i16x2_count) { + ray_release(tbl); + return i16x2_count; + } + /* `nearest` is mutually exclusive with `asc`/`desc`/`by` — ANN * ordering is an index scan, not a column sort, and cannot be * composed with group-by in this phase. */ @@ -4042,7 +5585,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { kid == take_id || kid == asc_id || kid == desc_id || kid == nearest_id) continue; ray_t* val_expr = dict_elems[i + 1]; - if (!is_group_dag_agg_expr(val_expr)) { n_other++; break; } + if (!is_group_dag_agg_expr(val_expr)) { + if (is_single_group_key_projection(by_expr, val_expr)) + continue; + n_other++; + break; + } ray_t** ae = (ray_t**)ray_data(val_expr); int64_t aid = ae[0]->i64; int op_ok = (aid == count_sym || aid == sum_sym || @@ -4051,17 +5599,25 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (!op_ok || ray_len(val_expr) < 2) { n_other++; break; } if (aid != count_sym) has_only_count = 0; ray_t* ae1 = ae[1]; - if (!ae1 || !((ae1->type == -RAY_SYM - && (ae1->attrs & RAY_ATTR_NAME)))) { + int64_t agg_col_sym = -1; + int agg_strlen = 0; + if (ae1 && ae1->type == -RAY_SYM && (ae1->attrs & RAY_ATTR_NAME)) { + agg_col_sym = ae1->i64; + } else if ((aid == sum_sym || aid == avg_sym) && + is_strlen_name_expr(ae1, &agg_col_sym)) { + agg_strlen = 1; + } else { n_other++; break; } if (aid != count_sym) { - ray_t* in_col = ray_table_get_col(tbl, ae1->i64); + ray_t* in_col = ray_table_get_col(tbl, agg_col_sym); if (!in_col) { n_other++; break; } int8_t ict = in_col->type; if (RAY_IS_PARTED(ict) || ict == RAY_MAPCOMMON) { n_other++; break; } - if (ict != RAY_BOOL && ict != RAY_U8 && ict != RAY_I16 + if (agg_strlen && ict != RAY_SYM) + { n_other++; break; } + if (!agg_strlen && ict != RAY_BOOL && ict != RAY_U8 && ict != RAY_I16 && ict != RAY_I32 && ict != RAY_I64 && ict != RAY_DATE && ict != RAY_TIME && ict != RAY_TIMESTAMP) @@ -4849,6 +6405,9 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_free(vals_hdr); ray_free(null_hdr); ray_free(cnt_hdr); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); return result; } } @@ -5108,9 +6667,18 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - if (take_preapplied) + if (take_preapplied) { + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); return result; - return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); + } + result = apply_sort_take(result, dict_elems, dict_n, + asc_id, desc_id, take_id); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); + return result; } /* eval_group path supports only simple scalar / [col] by-forms; @@ -5298,7 +6866,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (res) ray_release(res); return first_err; } - return apply_sort_take(res, dict_elems, dict_n, asc_id, desc_id, take_id); + res = apply_sort_take(res, dict_elems, dict_n, + asc_id, desc_id, take_id); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, res); + return res; } ray_t* groups_dict = ray_group_fn(key_col); @@ -5707,7 +7280,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { ray_release(groups); if (eval_tbl != tbl) ray_release(eval_tbl); ray_release(tbl); - return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); + result = apply_sort_take(result, dict_elems, dict_n, + asc_id, desc_id, take_id); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); + return result; } /* Pre-scan: any non-aggregation expressions that need a flat @@ -5958,7 +7536,29 @@ ray_t* ray_select(ray_t** args, int64_t n) { agg_ops[i] != OP_AVG) agg_kinds_ok = 0; } - if (can_fuse_phase1 && fused_pred_op != NULL + int no_where_count_key_ok = 0; + ray_group_emit_filter_t no_where_emit = ray_group_emit_filter_get(); + if (!where_expr && n_keys == 1 && no_where_emit.enabled && + no_where_emit.agg_index == 0 && + no_where_emit.top_count_take > 0) { + int64_t ksym = -1; + if (by_expr->type == -RAY_SYM && (by_expr->attrs & RAY_ATTR_NAME)) + ksym = by_expr->i64; + else if (by_expr->type == RAY_SYM && ray_len(by_expr) == 1) + ksym = ((int64_t*)ray_data(by_expr))[0]; + ray_t* kc = ksym >= 0 ? ray_table_get_col(tbl, ksym) : NULL; + if (kc && !(kc->attrs & RAY_ATTR_HAS_NULLS) && + (kc->type == RAY_SYM || kc->type == RAY_BOOL || + kc->type == RAY_U8 || kc->type == RAY_I16 || + kc->type == RAY_I32)) + no_where_count_key_ok = 1; + } + if (no_where_count_key_ok && n_nonaggs == 0 && !has_binary_agg && + !has_agg_k && n_keys == 1 && n_aggs == 1 && + agg_ops[0] == OP_COUNT) { + root = ray_filtered_group(g, NULL, key_ops, n_keys, + agg_ops, agg_ins, n_aggs); + } else if (can_fuse_phase1 && fused_pred_op != NULL && n_nonaggs == 0 && agg_kinds_ok && !has_binary_agg && !has_agg_k) { @@ -6821,7 +8421,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (fi_heap_hdr) ray_free(fi_heap_hdr); if (filtered_tbl != tbl) ray_release(filtered_tbl); ray_release(tbl); - return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); + result = apply_sort_take(result, dict_elems, dict_n, + asc_id, desc_id, take_id); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); + return result; } } else if (n_out > 0) { /* No `by:` but explicit output expressions. @@ -6966,7 +8571,12 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (nearest_handle_owned) ray_release(nearest_handle_owned); if (nearest_query_owned) ray_sys_free(nearest_query_owned); ray_graph_free(g); ray_release(tbl); - return apply_sort_take(result, dict_elems, dict_n, asc_id, desc_id, take_id); + result = apply_sort_take(result, dict_elems, dict_n, + asc_id, desc_id, take_id); + select_cache_put(tbl, select_cache_nrows, + select_cache_hash_value, + select_cache_from_hash, result); + return result; } else { root = ray_select_op(g, root, col_ops, nc); } @@ -8005,6 +9615,8 @@ ray_t* ray_select(ray_t** args, int64_t n) { if (by_sym_vec_owned) ray_release(by_sym_vec_owned); if (saved_selection) ray_release(saved_selection); + select_cache_put(tbl, select_cache_nrows, select_cache_hash_value, + select_cache_from_hash, result); return result; } diff --git a/src/table/sym.c b/src/table/sym.c index e7a859fb..ded39193 100644 --- a/src/table/sym.c +++ b/src/table/sym.c @@ -91,6 +91,7 @@ typedef struct { static sym_table_t g_sym; static _Atomic(bool) g_sym_inited = false; +static bool sym_lazy_materialize_to_locked(uint32_t target_id); /* Spinlock protecting g_sym mutations in ray_sym_intern */ static _Atomic(int) g_sym_lock = 0; @@ -143,7 +144,8 @@ static ray_t* sym_str_arena(ray_arena_t* arena, const char* s, size_t len) { /* Forward decl — used from ray_sym_init below to reserve sym ID 0 as * the canonical empty string. Definition is further down with the * other intern helpers. */ -static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len); +static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len, + bool search_lazy); /* -------------------------------------------------------------------------- * ray_sym_init @@ -216,7 +218,7 @@ ray_err_t ray_sym_init(void) { * meaningless on SYM and is rejected on set. Done before * returning so every subsequent intern observes ID 0 as taken. */ int64_t empty_id = sym_intern_nolock( - (uint32_t)ray_hash_bytes("", 0), "", 0); + (uint32_t)ray_hash_bytes("", 0), "", 0, true); if (empty_id != 0) { /* Should be unreachable — table just initialised, no other * thread has touched it yet. If it ever fires, fail loudly. */ @@ -366,7 +368,8 @@ static bool sym_grow_str_cap(uint32_t new_cap) { * that are defined further down in the file. ray_sym_bytes_upper is * declared in sym.h as a public inline so both the intern path and the * test suite can refer to the same formula. */ -static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len); +static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len, + bool search_lazy); static int64_t sym_probe(uint32_t hash, const char* str, size_t len); static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len); static bool sym_reserve_capacity(uint32_t new_sym_count, size_t arena_bytes); @@ -557,6 +560,12 @@ static int64_t sym_commit_new(uint32_t hash, const char* str, size_t len) { static int64_t sym_intern_nolock_noseg(uint32_t hash, const char* str, size_t len) { int64_t existing = sym_probe(hash, str, len); if (existing >= 0) return existing; + if (g_sym.lazy_map && g_sym.lazy_next_id < g_sym.persisted_count) { + if (!sym_lazy_materialize_to_locked(g_sym.persisted_count - 1)) + return -1; + existing = sym_probe(hash, str, len); + if (existing >= 0) return existing; + } return sym_commit_new(hash, str, len); } @@ -662,9 +671,16 @@ static bool sym_lazy_materialize_to_locked(uint32_t target_id) { * which commits the main sym without a cache on purpose. A cache-OOM * there is tolerated (scanned bit stays clear → future interns retry). * -------------------------------------------------------------------------- */ -static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len) { +static int64_t sym_intern_nolock(uint32_t hash, const char* str, size_t len, + bool search_lazy) { /* Phase A.1: probe main. */ int64_t existing = sym_probe(hash, str, len); + if (search_lazy && existing < 0 && g_sym.lazy_map && + g_sym.lazy_next_id < g_sym.persisted_count) { + if (!sym_lazy_materialize_to_locked(g_sym.persisted_count - 1)) + return -1; + existing = sym_probe(hash, str, len); + } if (existing >= 0) { (void)sym_cache_segments((uint32_t)existing, str, len); return existing; @@ -779,7 +795,16 @@ int64_t ray_sym_intern(const char* str, size_t len) { if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1; uint32_t hash = (uint32_t)ray_hash_bytes(str, len); sym_lock(); - int64_t id = sym_intern_nolock(hash, str, len); + int64_t id = sym_intern_nolock(hash, str, len, true); + sym_unlock(); + return id; +} + +int64_t ray_sym_intern_runtime(const char* str, size_t len) { + if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1; + uint32_t hash = (uint32_t)ray_hash_bytes(str, len); + sym_lock(); + int64_t id = sym_intern_nolock(hash, str, len, false); sym_unlock(); return id; } @@ -793,7 +818,7 @@ int64_t ray_sym_intern(const char* str, size_t len) { int64_t ray_sym_intern_prehashed(uint32_t hash, const char* str, size_t len) { if (!atomic_load_explicit(&g_sym_inited, memory_order_acquire)) return -1; - return sym_intern_nolock(hash, str, len); + return sym_intern_nolock(hash, str, len, true); } /* -------------------------------------------------------------------------- @@ -885,7 +910,17 @@ int64_t ray_sym_find(const char* str, size_t len) { for (;;) { uint64_t e = g_sym.buckets[slot]; - if (e == 0) { sym_unlock(); return -1; } /* empty -- not found */ + if (e == 0) { + if (g_sym.lazy_map && g_sym.lazy_next_id < g_sym.persisted_count) { + if (sym_lazy_materialize_to_locked(g_sym.persisted_count - 1)) { + mask = g_sym.bucket_cap - 1; + slot = hash & mask; + continue; + } + } + sym_unlock(); + return -1; + } /* empty -- not found */ uint32_t e_hash = (uint32_t)(e >> 32); if (e_hash == hash) { diff --git a/src/table/sym.h b/src/table/sym.h index 67c159bc..a945fccc 100644 --- a/src/table/sym.h +++ b/src/table/sym.h @@ -110,6 +110,7 @@ int ray_sym_segs(int64_t sym_id, const int64_t** out_segs); * with ray_sym_rebuild_segments to populate the dotted cache. */ int64_t ray_sym_intern_no_split(const char* str, size_t len); int64_t ray_sym_intern_no_split_unlocked(const char* str, size_t len); +int64_t ray_sym_intern_runtime(const char* str, size_t len); /* Walk the intern table and cache segment sym_ids for any dotted name * that hasn't been cached yet. Idempotent — safe to call multiple times. diff --git a/test/rfl/group/count_distinct_paths.rfl b/test/rfl/group/count_distinct_paths.rfl index 6655a558..88f6ef0c 100644 --- a/test/rfl/group/count_distinct_paths.rfl +++ b/test/rfl/group/count_distinct_paths.rfl @@ -129,7 +129,7 @@ (sum (at Rs 'c)) -- 8 ;; ════════════════════════════════════════════════════════════════════ -;; 6. ray_count_distinct_per_group — single-array HT (DuckDB-style), +;; 6. ray_count_distinct_per_group — single-array HT, ;; n_groups > 50000 sub-200000 rows triggers serial global-hash. ;; Path: query.c:7650 → ray_count_distinct_per_group → CD_INSERT ;; loop (group.c:1162-1227, esz=8 I64 specialisation).