diff --git a/expected/explicit_threshold_cap.out b/expected/explicit_threshold_cap.out new file mode 100644 index 0000000..b3cdfd9 --- /dev/null +++ b/expected/explicit_threshold_cap.out @@ -0,0 +1,260 @@ +-- ---------------------------------------------------------------- +-- Regression tests for explicit threshold enforcement. +-- Verify that auto-threshold capping, buffer capacity guards, +-- and deserialization validation work correctly. +-- ---------------------------------------------------------------- +SELECT hll_set_output_version(1); + hll_set_output_version +------------------------ + 1 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 1: Crafted binary with out-of-range log2m is rejected. +-- byte0=0x11 (v1, EMPTY), byte1=0xf4 (regwidth=8, log2m=20), +-- byte2=0x3f (sparseon=0, expthresh=auto). +-- Must be rejected at deserialization time. +-- ---------------------------------------------------------------- +SELECT '\x11f43f'::hll; +ERROR: log2m modifier must be between 0 and 17 +LINE 1: SELECT '\x11f43f'::hll; + ^ +-- Explicit type with out-of-range params also rejected. +-- byte0=0x12 (v1, EXPLICIT), same params, one 8-byte element. +SELECT '\x12f43f0000000000000001'::hll; +ERROR: log2m modifier must be between 0 and 17 +LINE 1: SELECT '\x12f43f0000000000000001'::hll; + ^ +-- Malformed header variant: byte2=0x7f (sparseon=1, expthresh=auto). +SELECT '\x12f47f0000000000000001'::hll; +ERROR: log2m modifier must be between 0 and 17 +LINE 1: SELECT '\x12f47f0000000000000001'::hll; + ^ +-- ---------------------------------------------------------------- +-- Test 2: Crafted binary with out-of-range regwidth is rejected. +-- byte1=0xeb (regwidth=8, log2m=11). 
+-- ---------------------------------------------------------------- +SELECT '\x11eb3f'::hll; +ERROR: regwidth modifier must be between 0 and 7 +LINE 1: SELECT '\x11eb3f'::hll; + ^ +-- ---------------------------------------------------------------- +-- Test 3: Valid parameters, verify no behavioral change. +-- Standard log2m=11, regwidth=5 with auto threshold. +-- ---------------------------------------------------------------- +SELECT hll_cardinality( + hll_add( + hll_add( + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + hll_cardinality +----------------- + 3 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 4: Union with valid small parameters. +-- ---------------------------------------------------------------- +SELECT hll_cardinality( + hll_union( + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(1,0)), + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(2,0)) + ) +); + hll_cardinality +----------------- + 2 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 5: Promotion at explicit threshold boundary. +-- Use expthresh=2: two elements stay explicit, third promotes. +-- ---------------------------------------------------------------- +-- Two elements: should be explicit (type 2). +SELECT hll_type( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)) +); + hll_type +---------- + 2 +(1 row) + +-- Three elements: should promote to compressed (type 4). +SELECT hll_type( + hll_add( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + hll_type +---------- + 4 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 6: Promotion via union at threshold boundary. +-- Two explicit HLLs with expthresh=2, union forces promotion. 
+-- ---------------------------------------------------------------- +SELECT hll_type( + hll_union( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(3,0)) + ) +); + hll_type +---------- + 4 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 7: Valid max-boundary parameters (log2m=17, regwidth=7). +-- These are at the limit but within valid range. +-- ---------------------------------------------------------------- +SELECT hll_cardinality( + hll_add( + hll_add( + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + hll_cardinality +----------------- + 3 +(1 row) + +-- Union with max-boundary parameters. +SELECT hll_cardinality( + hll_union( + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(1,0)), + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(2,0)) + ) +); + hll_cardinality +----------------- + 2 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 8: Crafted binary with valid max-boundary parameters. +-- byte1=0xcb (regwidth=7, log2m=11), byte2=0x3f (auto, sparse=0). +-- Should be accepted. +-- ---------------------------------------------------------------- +SELECT hll_cardinality( + hll_add('\x11cb3f'::hll, hll_hash_integer(1,0)) +); + hll_cardinality +----------------- + 1 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 9: Malformed header rejected through hll_union() path. +-- The exact PoC header \x12f47f (log2m=20, regwidth=8, auto) +-- must be rejected even when used as input to hll_union(). 
+-- ---------------------------------------------------------------- +SELECT hll_union( + '\x12f47f0000000000000001'::hll, + '\x12f47f0000000000000002'::hll +); +ERROR: log2m modifier must be between 0 and 17 +LINE 2: '\x12f47f0000000000000001'::hll, + ^ +-- ---------------------------------------------------------------- +-- Test 10: Malformed header rejected through hll_union_agg() path. +-- ---------------------------------------------------------------- +SELECT hll_union_agg(v::hll) FROM (VALUES + ('\x12f47f0000000000000001'::hll), + ('\x12f47f0000000000000002'::hll) +) AS t(v); +ERROR: log2m modifier must be between 0 and 17 +LINE 2: ('\x12f47f0000000000000001'::hll), + ^ +-- ---------------------------------------------------------------- +-- Test 11: Near-capacity union with standard parameters. +-- Build two large explicit HLLs with log2m=11, regwidth=5, +-- auto threshold. Auto threshold = ((5*2048+7)/8)/8 = 160. +-- Each HLL has 100 unique elements, union produces 200 which +-- exceeds the threshold and must promote to compressed. +-- ---------------------------------------------------------------- +SELECT hll_type( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(1, 100) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(101, 200) AS g(i)) + ) +); + hll_type +---------- + 4 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 12: Near-capacity union via hll_union_agg aggregate. +-- Build 21 explicit HLLs of at most 10 elements each with auto threshold. +-- Union should promote to compressed. 
+-- ---------------------------------------------------------------- +SELECT hll_type( + (SELECT hll_union_agg(h) FROM ( + SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) AS h + FROM generate_series(1, 200) AS g(i) + GROUP BY i / 10 + ) sub) +); + hll_type +---------- + 4 +(1 row) + +-- ---------------------------------------------------------------- +-- Test 13: hll_send on union result with valid parameters. +-- Ensure the serialized output is well-formed (no leaked memory). +-- ---------------------------------------------------------------- +SELECT length(hll_send( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(1, 100) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(101, 200) AS g(i)) + ) +)) > 0 AS send_ok; + send_ok +--------- + t +(1 row) + +-- ---------------------------------------------------------------- +-- Test 14: Maximum valid params (log2m=17, regwidth=7) with +-- enough elements to exceed auto threshold (14336) via union. +-- Must promote to compressed without overflow. +-- ---------------------------------------------------------------- +SELECT hll_type( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 17, 7, -1, 0) + FROM generate_series(1, 10000) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 17, 7, -1, 0) + FROM generate_series(5001, 15000) AS g(i)) + ) +); + hll_type +---------- + 4 +(1 row) + diff --git a/sql/explicit_threshold_cap.sql b/sql/explicit_threshold_cap.sql new file mode 100644 index 0000000..8f6adef --- /dev/null +++ b/sql/explicit_threshold_cap.sql @@ -0,0 +1,210 @@ +-- ---------------------------------------------------------------- +-- Regression tests for explicit threshold enforcement. +-- Verify that auto-threshold capping, buffer capacity guards, +-- and deserialization validation work correctly. 
+-- ---------------------------------------------------------------- + +SELECT hll_set_output_version(1); + +-- ---------------------------------------------------------------- +-- Test 1: Crafted binary with out-of-range log2m is rejected. +-- byte0=0x11 (v1, EMPTY), byte1=0xf4 (regwidth=8, log2m=20), +-- byte2=0x3f (sparseon=0, expthresh=auto). +-- Must be rejected at deserialization time. +-- ---------------------------------------------------------------- + +SELECT '\x11f43f'::hll; + +-- Explicit type with out-of-range params also rejected. +-- byte0=0x12 (v1, EXPLICIT), same params, one 8-byte element. +SELECT '\x12f43f0000000000000001'::hll; + +-- Malformed header variant: byte2=0x7f (sparseon=1, expthresh=auto). +SELECT '\x12f47f0000000000000001'::hll; + +-- ---------------------------------------------------------------- +-- Test 2: Crafted binary with out-of-range regwidth is rejected. +-- byte1=0xeb (regwidth=8, log2m=11). +-- ---------------------------------------------------------------- + +SELECT '\x11eb3f'::hll; + +-- ---------------------------------------------------------------- +-- Test 3: Valid parameters, verify no behavioral change. +-- Standard log2m=11, regwidth=5 with auto threshold. +-- ---------------------------------------------------------------- + +SELECT hll_cardinality( + hll_add( + hll_add( + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + +-- ---------------------------------------------------------------- +-- Test 4: Union with valid small parameters. +-- ---------------------------------------------------------------- + +SELECT hll_cardinality( + hll_union( + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(1,0)), + hll_add(hll_empty(11,5,-1,1), + hll_hash_integer(2,0)) + ) +); + +-- ---------------------------------------------------------------- +-- Test 5: Promotion at explicit threshold boundary. +-- Use expthresh=2: two elements stay explicit, third promotes. 
+-- ---------------------------------------------------------------- + +-- Two elements: should be explicit (type 2). +SELECT hll_type( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)) +); + +-- Three elements: should promote to compressed (type 4). +SELECT hll_type( + hll_add( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + +-- ---------------------------------------------------------------- +-- Test 6: Promotion via union at threshold boundary. +-- Two explicit HLLs with expthresh=2, union forces promotion. +-- ---------------------------------------------------------------- + +SELECT hll_type( + hll_union( + hll_add( + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_add(hll_empty(11,5,2,0), + hll_hash_integer(3,0)) + ) +); + +-- ---------------------------------------------------------------- +-- Test 7: Valid max-boundary parameters (log2m=17, regwidth=7). +-- These are at the limit but within valid range. +-- ---------------------------------------------------------------- + +SELECT hll_cardinality( + hll_add( + hll_add( + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(1,0)), + hll_hash_integer(2,0)), + hll_hash_integer(3,0)) +); + +-- Union with max-boundary parameters. +SELECT hll_cardinality( + hll_union( + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(1,0)), + hll_add(hll_empty(17,7,-1,0), + hll_hash_integer(2,0)) + ) +); + +-- ---------------------------------------------------------------- +-- Test 8: Crafted binary with valid max-boundary parameters. +-- byte1=0xcb (regwidth=7, log2m=11), byte2=0x3f (auto, sparse=0). +-- Should be accepted. 
+-- ---------------------------------------------------------------- + +SELECT hll_cardinality( + hll_add('\x11cb3f'::hll, hll_hash_integer(1,0)) +); + +-- ---------------------------------------------------------------- +-- Test 9: Malformed header rejected through hll_union() path. +-- The exact PoC header \x12f47f (log2m=20, regwidth=8, auto) +-- must be rejected even when used as input to hll_union(). +-- ---------------------------------------------------------------- + +SELECT hll_union( + '\x12f47f0000000000000001'::hll, + '\x12f47f0000000000000002'::hll +); + +-- ---------------------------------------------------------------- +-- Test 10: Malformed header rejected through hll_union_agg() path. +-- ---------------------------------------------------------------- + +SELECT hll_union_agg(v::hll) FROM (VALUES + ('\x12f47f0000000000000001'::hll), + ('\x12f47f0000000000000002'::hll) +) AS t(v); + +-- ---------------------------------------------------------------- +-- Test 11: Near-capacity union with standard parameters. +-- Build two large explicit HLLs with log2m=11, regwidth=5, +-- auto threshold. Auto threshold = ((5*2048+7)/8)/8 = 160. +-- Each HLL has 100 unique elements, union produces 200 which +-- exceeds the threshold and must promote to compressed. +-- ---------------------------------------------------------------- + +SELECT hll_type( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(1, 100) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(101, 200) AS g(i)) + ) +); + +-- ---------------------------------------------------------------- +-- Test 12: Near-capacity union via hll_union_agg aggregate. +-- Build 21 explicit HLLs of at most 10 elements each with auto threshold. +-- Union should promote to compressed. 
+-- ---------------------------------------------------------------- + +SELECT hll_type( + (SELECT hll_union_agg(h) FROM ( + SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) AS h + FROM generate_series(1, 200) AS g(i) + GROUP BY i / 10 + ) sub) +); + +-- ---------------------------------------------------------------- +-- Test 13: hll_send on union result with valid parameters. +-- Ensure the serialized output is well-formed (no leaked memory). +-- ---------------------------------------------------------------- + +SELECT length(hll_send( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(1, 100) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 11, 5, -1, 0) + FROM generate_series(101, 200) AS g(i)) + ) +)) > 0 AS send_ok; + +-- ---------------------------------------------------------------- +-- Test 14: Maximum valid params (log2m=17, regwidth=7) with +-- enough elements to exceed auto threshold (14336) via union. +-- Must promote to compressed without overflow. +-- ---------------------------------------------------------------- + +SELECT hll_type( + hll_union( + (SELECT hll_add_agg(hll_hash_integer(i,0), 17, 7, -1, 0) + FROM generate_series(1, 10000) AS g(i)), + (SELECT hll_add_agg(hll_hash_integer(i,0), 17, 7, -1, 0) + FROM generate_series(5001, 15000) AS g(i)) + ) +); diff --git a/src/hll.c b/src/hll.c index 551fc8a..5f50701 100644 --- a/src/hll.c +++ b/src/hll.c @@ -515,6 +515,8 @@ static int32 encode_expthresh(int64 expthresh) return integer_log2(expthresh) + 1; } +static size_t mse_nelem_max(void); +static void check_modifiers(int32 log2m, int32 regwidth, int64 expthresh, int32 sparseon); // If expthresh == -1 (auto select expthresh) determine // the expthresh to use from nbits and nregs. // @@ -531,7 +533,9 @@ expthresh_value(int64 expthresh, size_t nbits, size_t nregs) // registers that fits in the same space as the compressed // encoding. 
size_t cmpsz = ((nbits * nregs) + 7) / 8; - return cmpsz / sizeof(uint64_t); + size_t result = cmpsz / sizeof(uint64_t); + size_t max_elems = mse_nelem_max(); + return (result > max_elems) ? max_elems : result; } } @@ -1212,7 +1216,11 @@ multiset_add(multiset_t * o_msp, uint64_t element) size_t expval = expthresh_value(o_msp->ms_expthresh, o_msp->ms_nbits, o_msp->ms_nregs); - Assert(expval <= mse_nelem_max()); + if (expval > mse_nelem_max()) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("explicit threshold %zu exceeds maximum buffer capacity %zu", + expval, mse_nelem_max()))); switch (o_msp->ms_type) { @@ -1251,7 +1259,7 @@ multiset_add(multiset_t * o_msp, uint64_t element) } // Is the explicit multiset full? - if (msep->mse_nelem == expval) + if (msep->mse_nelem >= expval || msep->mse_nelem >= mse_nelem_max()) { // Convert it to compressed. explicit_to_compressed(o_msp); @@ -1322,7 +1330,7 @@ explicit_union(multiset_t * o_msp, ms_explicit_t const * i_msep) element_compare)) continue; - if (msep->mse_nelem < expval) + if (msep->mse_nelem < expval && msep->mse_nelem < mse_nelem_max()) { // Add the element at the end. msep->mse_elems[msep->mse_nelem++] = element; @@ -1404,6 +1412,9 @@ multiset_unpack(multiset_t * o_msp, } unpack_header(o_msp, i_bitp, vers, type); + + check_modifiers(o_msp->ms_log2nregs, o_msp->ms_nbits, + o_msp->ms_expthresh, o_msp->ms_sparseon); } else { @@ -1441,6 +1452,9 @@ multiset_unpack(multiset_t * o_msp, unpack_header(o_msp, i_bitp, vers, type); + check_modifiers(o_msp->ms_log2nregs, o_msp->ms_nbits, + o_msp->ms_expthresh, o_msp->ms_sparseon); + msep->mse_nelem = nelem; for (size_t ii = 0; ii < nelem; ++ii) {