From 507850b9b6878dcc82f637325461838dc15cb391 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Wed, 20 May 2026 17:39:06 +0800 Subject: [PATCH 1/7] add docs for aggregation function datasketches_hll_union_agg --- .../datasketches_hll_union_agg.md | 95 +++++++++++++++++++ .../datasketches_hll_union_agg.md | 94 ++++++++++++++++++ sidebars.ts | 1 + 3 files changed, 190 insertions(+) create mode 100644 docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md diff --git a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 0000000000000..a3c1b99f8fab0 --- /dev/null +++ b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,95 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "en", + "description": "The datasketches_hll_union_agg function is an aggregate function used to union multiple Apache DataSketches HLL sketches and return the estimated cardinality of the union." +} +--- + +## Description + +`datasketches_hll_union_agg` is an aggregate function used to **union** multiple Apache DataSketches **HLL** (`hll_sketch`) serialized values and return the **estimated cardinality** (approximate distinct count / NDV) after union. + +This function expects the input to be **serialized bytes of a DataSketches HLL sketch** (for example, generated by `hll_sketch.serialize_compact()` in the DataSketches library). It does not accept arbitrary strings. + +Aliases: + +- `ds_hll_union_count` +- `ds_cardinality` + +## Syntax + +```sql +datasketches_hll_union_agg() +``` + +## Parameters + +| Parameter | Description | +| -- | -- | +| `` | The serialized bytes of an Apache DataSketches HLL sketch. 
Supported types: STRING / VARCHAR / BINARY / VARBINARY. NULL values are ignored. Empty strings are treated as invalid input and will throw an error. | + +## Return Value + +Returns a BIGINT cardinality estimate value. +If there is no valid data in the group, returns 0. +If the input bytes cannot be deserialized as a valid DataSketches HLL sketch (including empty string), an error is thrown. + +## Example + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- The sketch bytes are inserted via Base64 decoding. +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 17 | ++-------------------------------+ +``` + +```sql +-- aliases +SELECT + datasketches_hll_union_agg(sk), + ds_hll_union_count(sk), + ds_cardinality(sk) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```sql +-- empty input returns 0 +SELECT datasketches_hll_union_agg(sk) +FROM test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 0 | ++-------------------------------+ +``` + +```sql +-- empty string is invalid and will throw +SELECT datasketches_hll_union_agg(''); +``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 
0000000000000..4dc43635ad09c --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,94 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "zh-CN", + "description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(近似去重数)。" +} +--- + +## 描述 + +`datasketches_hll_union_agg` 函数是一种聚合函数,用于对多个 **Apache DataSketches HLL sketch(hll_sketch)** 的序列化结果进行 **union 合并**,并返回合并后基数的**估算值**(近似去重数 / NDV)。 + +该函数的输入不是普通字符串,而是 **DataSketches HLL sketch 的序列化字节串**(例如由 DataSketches 的 `hll_sketch.serialize_compact()` 生成)。 + +别名: + +- `ds_hll_union_count` +- `ds_cardinality` + +## 语法 + +```sql +datasketches_hll_union_agg() +``` + +## 参数 + +| 参数 | 说明 | +| -- | -- | +| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / BINARY / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | + +## 返回值 + +返回 BIGINT 类型的基数估算值。 +如果组内没有合法数据则返回 0 。 +若输入字节串无法反序列化为合法的 DataSketches HLL sketch(包括空字符串),将报错。 + +## 举例 + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- 通过 from_base64() 将 Base64 文本解码为 sketch 字节串后写入 +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 17 | ++-------------------------------+ +``` + +```sql +-- 别名用法 +SELECT + datasketches_hll_union_agg(sk), + ds_hll_union_count(sk), + ds_cardinality(sk) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```sql +-- 组内无合法数据返回 0 +SELECT datasketches_hll_union_agg(sk) +FROM 
test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++-------------------------------+ +| datasketches_hll_union_agg(sk) | ++-------------------------------+ +| 0 | ++-------------------------------+ +``` + +```sql +-- 空字符串属于非法输入,将报错 +SELECT datasketches_hll_union_agg(''); +``` \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 9ee4e8fca691b..c37d871fe21a2 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -1973,6 +1973,7 @@ const sidebars: SidebarsConfig = { 'sql-manual/sql-functions/aggregate-functions/count-by-enum', 'sql-manual/sql-functions/aggregate-functions/covar', 'sql-manual/sql-functions/aggregate-functions/covar-samp', + 'sql-manual/sql-functions/aggregate-functions/datasketches-hll-union-agg', 'sql-manual/sql-functions/aggregate-functions/group-array-intersect', 'sql-manual/sql-functions/aggregate-functions/group-array-union', 'sql-manual/sql-functions/aggregate-functions/group-bit-and', From 99932793e8eb2c1a39d3876630874df15a917acb Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Thu, 21 May 2026 21:17:39 +0800 Subject: [PATCH 2/7] improve docs --- .../datasketches_hll_union_agg.md | 44 ++++++++++-------- .../datasketches_hll_union_agg.md | 45 +++++++++++-------- 2 files changed, 51 insertions(+), 38 deletions(-) diff --git a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index a3c1b99f8fab0..fad8da4c38614 100644 --- a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -1,8 +1,7 @@ ---- { - "title": "DATASKETCHES_HLL_UNION_AGG", - "language": "en", - "description": "The datasketches_hll_union_agg function is an aggregate function used to union multiple Apache DataSketches HLL sketches and return the estimated cardinality of the union." 
+"title": "DATASKETCHES_HLL_UNION_AGG", +"language": "en", +"description": "The datasketches_hll_union_agg function is an aggregate function used to union multiple Apache DataSketches HLL sketches and return the estimated cardinality of the union as a DOUBLE value." } --- @@ -14,8 +13,8 @@ This function expects the input to be **serialized bytes of a DataSketches HLL s Aliases: -- `ds_hll_union_count` -- `ds_cardinality` +- `ds_hll_estimate` +- `datasketches_hll_estimate` ## Syntax @@ -31,9 +30,9 @@ datasketches_hll_union_agg() ## Return Value -Returns a BIGINT cardinality estimate value. -If there is no valid data in the group, returns 0. -If the input bytes cannot be deserialized as a valid DataSketches HLL sketch (including empty string), an error is thrown. +Returns a DOUBLE (Float64) cardinality estimate value. +If there is no valid data in the group (for example, all values are NULL or there are no input rows), returns 0. +If the input bytes cannot be deserialized as a valid DataSketches HLL sketch (including empty string), an error is thrown (typically with error code `CORRUPTION`). ## Example @@ -54,23 +53,25 @@ INSERT INTO test_datasketches_hll_union_agg_tbl VALUES ``` ```sql -SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +-- The function returns DOUBLE, so use ROUND/CAST if you want an integer display.
+SELECT CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) +FROM test_datasketches_hll_union_agg_tbl; ``` ```text -+-------------------------------+ -| datasketches_hll_union_agg(sk) | -+-------------------------------+ -| 17 | -+-------------------------------+ ++------------------------------------------------------+ +| CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | ++------------------------------------------------------+ +| 17 | ++------------------------------------------------------+ ``` ```sql -- aliases SELECT - datasketches_hll_union_agg(sk), - ds_hll_union_count(sk), - ds_cardinality(sk) + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT), + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT), + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) FROM test_datasketches_hll_union_agg_tbl; ``` @@ -89,7 +90,12 @@ WHERE sk IS NULL; +-------------------------------+ ``` +```sql +-- invalid sketch bytes will throw +SELECT datasketches_hll_union_agg(from_base64('AA==')); +``` + ```sql -- empty string is invalid and will throw SELECT datasketches_hll_union_agg(''); -``` +``` \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index 4dc43635ad09c..f464ffe38e525 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -1,8 +1,7 @@ ---- { - "title": "DATASKETCHES_HLL_UNION_AGG", - "language": "zh-CN", - "description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(近似去重数)。" +"title": "DATASKETCHES_HLL_UNION_AGG", +"language": "zh-CN", +"description": 
"datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(DOUBLE)。" } --- @@ -14,8 +13,8 @@ 别名: -- `ds_hll_union_count` -- `ds_cardinality` +- `ds_hll_estimate` +- `datasketches_hll_estimate` ## 语法 @@ -31,9 +30,9 @@ datasketches_hll_union_agg() ## 返回值 -返回 BIGINT 类型的基数估算值。 -如果组内没有合法数据则返回 0 。 -若输入字节串无法反序列化为合法的 DataSketches HLL sketch(包括空字符串),将报错。 +返回 DOUBLE(Float64)类型的基数估算值。 +如果没有合法数据(例如全为 NULL,或表为空)则返回 0。 +若输入字节串无法反序列化为合法的 DataSketches HLL sketch(包括空字符串),将报错(通常错误码为 `CORRUPTION`)。 ## 举例 @@ -42,7 +41,8 @@ datasketches_hll_union_agg() CREATE TABLE test_datasketches_hll_union_agg_tbl ( id INT, sk STRING -) DISTRIBUTED BY HASH(id) BUCKETS 1 +) +DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1"); -- 通过 from_base64() 将 Base64 文本解码为 sketch 字节串后写入 @@ -53,23 +53,25 @@ INSERT INTO test_datasketches_hll_union_agg_tbl VALUES ``` ```sql -SELECT datasketches_hll_union_agg(sk) FROM test_datasketches_hll_union_agg_tbl; +-- 该函数返回 DOUBLE,如需以整数形式展示可配合 ROUND/CAST +SELECT CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) +FROM test_datasketches_hll_union_agg_tbl; ``` ```text -+-------------------------------+ -| datasketches_hll_union_agg(sk) | -+-------------------------------+ -| 17 | -+-------------------------------+ ++------------------------------------------------------+ +| CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | ++------------------------------------------------------+ +| 17 | ++------------------------------------------------------+ ``` ```sql -- 别名用法 SELECT - datasketches_hll_union_agg(sk), - ds_hll_union_count(sk), - ds_cardinality(sk) + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT), + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT), + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) FROM test_datasketches_hll_union_agg_tbl; ``` @@ -88,6 +90,11 @@ WHERE sk IS NULL; +-------------------------------+ ``` +```sql +-- 非法 sketch 字节串将报错 +SELECT 
datasketches_hll_union_agg(from_base64('AA==')); +``` + ```sql -- 空字符串属于非法输入,将报错 SELECT datasketches_hll_union_agg(''); From 989cfc9716f9d3e14c69e7c3370e2a32480787d3 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Mon, 1 Jun 2026 13:54:52 +0800 Subject: [PATCH 3/7] fix case result & comments --- .../datasketches_hll_union_agg.md | 39 +++++++++++----- .../datasketches_hll_union_agg.md | 45 +++++++++++++------ 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index fad8da4c38614..bf59a24fcd31b 100644 --- a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -1,3 +1,4 @@ +--- { "title": "DATASKETCHES_HLL_UNION_AGG", "language": "en", @@ -59,22 +60,30 @@ FROM test_datasketches_hll_union_agg_tbl; ``` ```text -+------------------------------------------------------+ ++-------------------------------------------------------+ | CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | -+------------------------------------------------------+ -| 17 | -+------------------------------------------------------+ ++-------------------------------------------------------+ +| 17 | ++-------------------------------------------------------+ ``` ```sql -- aliases SELECT - CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT), - CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT), - CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) AS v1, + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT) AS v2, + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) AS v3 FROM test_datasketches_hll_union_agg_tbl; ``` +```text ++------+------+------+ +| v1 | v2 | v3 | ++------+------+------+ +| 17 | 17 | 17 | ++------+------+------+ +``` + ```sql -- empty input returns 0 
SELECT datasketches_hll_union_agg(sk) @@ -83,11 +92,11 @@ WHERE sk IS NULL; ``` ```text -+-------------------------------+ ++--------------------------------+ | datasketches_hll_union_agg(sk) | -+-------------------------------+ -| 0 | -+-------------------------------+ ++--------------------------------+ +| 0 | ++--------------------------------+ ``` ```sql @@ -95,7 +104,15 @@ WHERE sk IS NULL; SELECT datasketches_hll_union_agg(from_base64('AA==')); ``` +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: Attempt to deserialize unknown object type +``` + ```sql -- empty string is invalid and will throw SELECT datasketches_hll_union_agg(''); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: empty input. ``` \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index f464ffe38e525..ba67311d4895e 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -1,7 +1,8 @@ +--- { -"title": "DATASKETCHES_HLL_UNION_AGG", -"language": "zh-CN", -"description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(DOUBLE)。" + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "zh-CN", + "description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(DOUBLE)。" } --- @@ -59,22 +60,30 @@ FROM test_datasketches_hll_union_agg_tbl; ``` ```text 
-+------------------------------------------------------+ ++-------------------------------------------------------+ | CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | -+------------------------------------------------------+ -| 17 | -+------------------------------------------------------+ ++-------------------------------------------------------+ +| 17 | ++-------------------------------------------------------+ ``` ```sql -- 别名用法 SELECT - CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT), - CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT), - CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) AS v1, + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT) AS v2, + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) AS v3 FROM test_datasketches_hll_union_agg_tbl; ``` +```text ++------+------+------+ +| v1 | v2 | v3 | ++------+------+------+ +| 17 | 17 | 17 | ++------+------+------+ +``` + ```sql -- 组内无合法数据返回 0 SELECT datasketches_hll_union_agg(sk) @@ -83,11 +92,11 @@ WHERE sk IS NULL; ``` ```text -+-------------------------------+ ++--------------------------------+ | datasketches_hll_union_agg(sk) | -+-------------------------------+ -| 0 | -+-------------------------------+ ++--------------------------------+ +| 0 | ++--------------------------------+ ``` ```sql @@ -95,7 +104,15 @@ WHERE sk IS NULL; SELECT datasketches_hll_union_agg(from_base64('AA==')); ``` +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: Attempt to deserialize unknown object type +``` + ```sql -- 空字符串属于非法输入,将报错 SELECT datasketches_hll_union_agg(''); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: empty input. 
``` \ No newline at end of file From a43aba8d2516cb2ae699aff686c5d13c637da2e0 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Mon, 1 Jun 2026 15:50:41 +0800 Subject: [PATCH 4/7] add 4.x docs --- .../datasketches_hll_union_agg.md | 120 ++++++++++++++++++ .../datasketches_hll_union_agg.md | 120 ++++++++++++++++++ versioned_sidebars/version-4.x-sidebars.json | 3 +- 3 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md create mode 100644 versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 0000000000000..75bf76c308508 --- /dev/null +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,120 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "zh-CN", + "description": "datasketches_hll_union_agg 函数是一种聚合函数,用于对多个 Apache DataSketches HLL sketch 的序列化结果进行 union 合并,并返回合并后基数的估算值(DOUBLE)。" +} +--- + +> 从 4.1.2 版本开始支持。 + +## 描述 + +`datasketches_hll_union_agg` 函数是一种聚合函数,用于对多个 **Apache DataSketches HLL sketch(hll_sketch)** 的序列化结果进行 **union 合并**,并返回合并后基数的**估算值**(近似去重数 / NDV)。 + +该函数的输入不是普通字符串,而是 **DataSketches HLL sketch 的序列化字节串**(例如由 DataSketches 的 `hll_sketch.serialize_compact()` 生成)。 + +## 别名 + +- `ds_hll_estimate` +- `datasketches_hll_estimate` + +## 语法 + +```sql +datasketches_hll_union_agg() +``` + +## 参数 + +| 参数 | 说明 | +| -- | -- | +| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / BINARY / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | + +## 返回值 + +返回 DOUBLE(Float64)类型的基数估算值。 
+如果没有合法数据(例如全为 NULL,或表为空)则返回 0。 +若输入字节串无法反序列化为合法的 DataSketches HLL sketch(包括空字符串),将报错(通常错误码为 `CORRUPTION`)。 + +## 举例 + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- 通过 from_base64() 将 Base64 文本解码为 sketch 字节串后写入 +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +-- 该函数返回 DOUBLE,如需以整数形式展示可配合 ROUND/CAST +SELECT CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------------------------------+ +| CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | ++-------------------------------------------------------+ +| 17 | ++-------------------------------------------------------+ +``` + +```sql +-- 别名用法 +SELECT + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) AS v1, + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT) AS v2, + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) AS v3 +FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++------+------+------+ +| v1 | v2 | v3 | ++------+------+------+ +| 17 | 17 | 17 | ++------+------+------+ +``` + +```sql +-- 组内无合法数据返回 0 +SELECT datasketches_hll_union_agg(sk) +FROM test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++--------------------------------+ +| datasketches_hll_union_agg(sk) | ++--------------------------------+ +| 0 | ++--------------------------------+ +``` + +```sql +-- 非法 sketch 字节串将报错 +SELECT datasketches_hll_union_agg(from_base64('AA==')); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: Attempt to deserialize unknown object type +``` + +```sql +-- 空字符串属于非法输入,将报错 +SELECT 
datasketches_hll_union_agg(''); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: empty input. +``` \ No newline at end of file diff --git a/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md new file mode 100644 index 0000000000000..919b1efbbfc14 --- /dev/null +++ b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -0,0 +1,120 @@ +--- +{ + "title": "DATASKETCHES_HLL_UNION_AGG", + "language": "en", + "description": "The datasketches_hll_union_agg function is an aggregate function used to union multiple Apache DataSketches HLL sketches and return the estimated cardinality of the union as a DOUBLE value." +} +--- + +> Supported since version 4.1.2. + +## Description + +`datasketches_hll_union_agg` is an aggregate function used to **union** multiple Apache DataSketches **HLL** (`hll_sketch`) serialized values and return the **estimated cardinality** (approximate distinct count / NDV) after union. + +This function expects the input to be **serialized bytes of a DataSketches HLL sketch** (for example, generated by `hll_sketch.serialize_compact()` in the DataSketches library). It does not accept arbitrary strings. + +## Alias + +- `ds_hll_estimate` +- `datasketches_hll_estimate` + +## Syntax + +```sql +datasketches_hll_union_agg() +``` + +## Parameters + +| Parameter | Description | +| -- | -- | +| `` | The serialized bytes of an Apache DataSketches HLL sketch. Supported types: STRING / VARCHAR / BINARY / VARBINARY. NULL values are ignored. Empty strings are treated as invalid input and will throw an error. | + +## Return Value + +Returns a DOUBLE (Float64) cardinality estimate value. +If there is no valid data in the group (for example, all values are NULL or there are no input rows), returns 0.
+If the input bytes cannot be deserialized as a valid DataSketches HLL sketch (including empty string), an error is thrown (typically with error code `CORRUPTION`). + +## Example + +```sql +-- setup +CREATE TABLE test_datasketches_hll_union_agg_tbl ( + id INT, + sk STRING +) +DISTRIBUTED BY HASH(id) BUCKETS 1 +PROPERTIES ("replication_num" = "1"); + +-- The sketch bytes are inserted via Base64 decoding. +INSERT INTO test_datasketches_hll_union_agg_tbl VALUES + (1, from_base64('AgEHCAMIBwjL18IEK/L7BoYv+Q11gWYHgbxdBntl5gj8LUIK')), + (2, from_base64('AwEHCAUIAAkKAAAAIjvrBcS1nwfGGWoEyHokBO8t9wc1qTEENkcJB7hWqQxZf9QNnuSbGA==')), + (3, NULL); +``` + +```sql +-- The function returns DOUBLE, so use ROUND/CAST if you want an integer display. +SELECT CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) +FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++-------------------------------------------------------+ +| CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) | ++-------------------------------------------------------+ +| 17 | ++-------------------------------------------------------+ +``` + +```sql +-- aliases +SELECT + CAST(ROUND(datasketches_hll_union_agg(sk)) AS BIGINT) AS v1, + CAST(ROUND(ds_hll_estimate(sk)) AS BIGINT) AS v2, + CAST(ROUND(datasketches_hll_estimate(sk)) AS BIGINT) AS v3 +FROM test_datasketches_hll_union_agg_tbl; +``` + +```text ++------+------+------+ +| v1 | v2 | v3 | ++------+------+------+ +| 17 | 17 | 17 | ++------+------+------+ +``` + +```sql +-- empty input returns 0 +SELECT datasketches_hll_union_agg(sk) +FROM test_datasketches_hll_union_agg_tbl +WHERE sk IS NULL; +``` + +```text ++--------------------------------+ +| datasketches_hll_union_agg(sk) | ++--------------------------------+ +| 0 | ++--------------------------------+ +``` + +```sql +-- invalid sketch bytes will throw +SELECT datasketches_hll_union_agg(from_base64('AA==')); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL 
sketch data corrupted when add: Attempt to deserialize unknown object type +``` + +```sql +-- empty string is invalid and will throw +SELECT datasketches_hll_union_agg(''); +``` + +```text +ERROR 1105 (HY000): errCode = 2, detailMessage = (127.0.0.1)[CORRUPTION]HLL sketch data corrupted when add: empty input. +``` \ No newline at end of file diff --git a/versioned_sidebars/version-4.x-sidebars.json b/versioned_sidebars/version-4.x-sidebars.json index a329bbc91ae9c..c55fd7a26e093 100644 --- a/versioned_sidebars/version-4.x-sidebars.json +++ b/versioned_sidebars/version-4.x-sidebars.json @@ -2168,6 +2168,7 @@ "sql-manual/sql-functions/aggregate-functions/count-by-enum", "sql-manual/sql-functions/aggregate-functions/covar", "sql-manual/sql-functions/aggregate-functions/covar-samp", + "sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg", "sql-manual/sql-functions/aggregate-functions/group-array-intersect", "sql-manual/sql-functions/aggregate-functions/group-array-union", "sql-manual/sql-functions/aggregate-functions/group-bit-and", @@ -2841,4 +2842,4 @@ ] } ] -} +} \ No newline at end of file From 06bda0d225563dbcff8618ad9050ae232fa86180 Mon Sep 17 00:00:00 2001 From: nooneuse Date: Mon, 1 Jun 2026 15:53:10 +0800 Subject: [PATCH 5/7] Update version-4.x-sidebars.json --- versioned_sidebars/version-4.x-sidebars.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/versioned_sidebars/version-4.x-sidebars.json b/versioned_sidebars/version-4.x-sidebars.json index c55fd7a26e093..57c240ad928d0 100644 --- a/versioned_sidebars/version-4.x-sidebars.json +++ b/versioned_sidebars/version-4.x-sidebars.json @@ -2842,4 +2842,4 @@ ] } ] -} \ No newline at end of file +} From e2cd04aa075f0cd3fd9452ca2617f52e609bac11 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Mon, 1 Jun 2026 19:23:15 +0800 Subject: [PATCH 6/7] fix sidebar.ts --- sidebars.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sidebars.ts b/sidebars.ts index 
4451e3223fb5f..8edfc219570bc 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -1997,7 +1997,7 @@ const sidebars: SidebarsConfig = { 'sql-manual/sql-functions/aggregate-functions/count-by-enum', 'sql-manual/sql-functions/aggregate-functions/covar', 'sql-manual/sql-functions/aggregate-functions/covar-samp', - 'sql-manual/sql-functions/aggregate-functions/datasketches-hll-union-agg', + 'sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg', 'sql-manual/sql-functions/aggregate-functions/exponential-moving-average', 'sql-manual/sql-functions/aggregate-functions/group-array-intersect', 'sql-manual/sql-functions/aggregate-functions/group-array-union', From df94deb0e5b91b9216acabdf0ad461241cdb8ed7 Mon Sep 17 00:00:00 2001 From: yuanyuhao Date: Tue, 2 Jun 2026 11:36:42 +0800 Subject: [PATCH 7/7] fix docs support datatypes --- .../aggregate-functions/datasketches_hll_union_agg.md | 2 +- .../aggregate-functions/datasketches_hll_union_agg.md | 2 +- .../aggregate-functions/datasketches_hll_union_agg.md | 2 +- .../aggregate-functions/datasketches_hll_union_agg.md | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index bf59a24fcd31b..90eb8de35488a 100644 --- a/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/docs/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -27,7 +27,7 @@ datasketches_hll_union_agg() | Parameter | Description | | -- | -- | -| `` | The serialized bytes of an Apache DataSketches HLL sketch. Supported types: STRING / VARCHAR / BINARY / VARBINARY. NULL values are ignored. Empty strings are treated as invalid input and will throw an error. | +| `` | The serialized bytes of an Apache DataSketches HLL sketch. Supported types: STRING / VARCHAR / VARBINARY. NULL values are ignored. 
Empty strings are treated as invalid input and will throw an error. | ## Return Value diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index ba67311d4895e..b4e54b4817f0d 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -27,7 +27,7 @@ datasketches_hll_union_agg() | 参数 | 说明 | | -- | -- | -| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / BINARY / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | +| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | ## 返回值 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index 75bf76c308508..5d1960fa90781 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -29,7 +29,7 @@ datasketches_hll_union_agg() | 参数 | 说明 | | -- | -- | -| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / BINARY / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | +| `` | DataSketches HLL sketch 的序列化字节串。支持类型:STRING / VARCHAR / VARBINARY。NULL 会被忽略;空字符串属于非法输入,将报错。 | ## 返回值 diff --git a/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md 
b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md index 919b1efbbfc14..9c2d1dea1f059 100644 --- a/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md +++ b/versioned_docs/version-4.x/sql-manual/sql-functions/aggregate-functions/datasketches_hll_union_agg.md @@ -29,7 +29,7 @@ datasketches_hll_union_agg() | Parameter | Description | | -- | -- | -| `` | The serialized bytes of an Apache DataSketches HLL sketch. Supported types: STRING / VARCHAR / BINARY / VARBINARY. NULL values are ignored. Empty strings are treated as invalid input and will throw an error. | +| `` | The serialized bytes of an Apache DataSketches HLL sketch. Supported types: STRING / VARCHAR / VARBINARY. NULL values are ignored. Empty strings are treated as invalid input and will throw an error. | ## Return Value