From f4dd7a03c1868330f97deff3e218d687fbc4ea79 Mon Sep 17 00:00:00 2001 From: nooneuse Date: Sat, 30 May 2026 18:36:41 +0800 Subject: [PATCH] Add datasketches HLL sketch aggregate functions (#63143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? > An aggregate function is required to process user data containing Datasketches HLL sketches. In many data aggregation scenarios, users pre‑aggregate detailed data in Hive using the sketching techniques provided by Apache Datasketches, and then analyze the resulting sketches across various OLAP engines. Compared with the HLL union aggregate functions natively offered by these engines, there are two key differences in using Datasketches HLL sketches: firstly, the use cases differ; and secondly, HLL sketches can be used seamlessly across different engines—for example, simultaneously in ES, Doris, and ClickHouse. Such requirements are common in many production environments. Issue Number: - #63142(https://github.com/apache/doris/issues/63142) - #26416 - #56246 Summary: Implemented a built-in aggregate function that integrates the Datasketches HLL sketch. This aggregate function cannot rely on the Java UDF environment. Because Strings in the Java UDF environment are encoded as UTF-8, which corrupts the binary data of sketches, the serialization/deserialization operations for sketches must be implemented on the BE side. (Additionally, since Apache Datasketches has been added to the contrib directory via a git submodule, it will become very easy to add other sketches such as theta sketch in the future.) 
**see**: https://github.com/apache/doris/issues/63142 **use case**: see regression test & https://github.com/apache/doris/issues/63142 --------- Co-authored-by: yuanyuhao --- .gitmodules | 3 + ...te_function_datasketches_hll_union_agg.cpp | 44 + ...gate_function_datasketches_hll_union_agg.h | 243 ++++ .../aggregate_function_simple_factory.cpp | 3 + .../agg_datasketches_hll_union_agg_test.cpp | 1097 +++++++++++++++++ build.sh | 9 + contrib/datasketches-cpp | 1 + .../catalog/BuiltinAggregateFunctions.java | 3 + .../agg/DataSketchesHllUnionAgg.java | 113 ++ .../visitor/AggregateFunctionVisitor.java | 5 + .../test_datasketches_hll_union_agg.out | 28 + .../test_datasketches_hll_union_agg.groovy | 170 +++ run-be-ut.sh | 14 +- 13 files changed, 1732 insertions(+), 1 deletion(-) create mode 100644 be/src/exprs/aggregate/aggregate_function_datasketches_hll_union_agg.cpp create mode 100644 be/src/exprs/aggregate/aggregate_function_datasketches_hll_union_agg.h create mode 100644 be/test/exprs/aggregate/agg_datasketches_hll_union_agg_test.cpp create mode 160000 contrib/datasketches-cpp create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/agg/DataSketchesHllUnionAgg.java create mode 100644 regression-test/data/query_p0/sql_functions/aggregate_functions/test_datasketches_hll_union_agg.out create mode 100644 regression-test/suites/query_p0/sql_functions/aggregate_functions/test_datasketches_hll_union_agg.groovy diff --git a/.gitmodules b/.gitmodules index 54c1a8a36366af..eb8e703aa8af98 100644 --- a/.gitmodules +++ b/.gitmodules @@ -29,3 +29,6 @@ path = contrib/openblas url = https://github.com/apache/doris-thirdparty.git branch = openblas +[submodule "contrib/datasketches-cpp"] + path = contrib/datasketches-cpp + url = https://github.com/apache/datasketches-cpp.git diff --git a/be/src/exprs/aggregate/aggregate_function_datasketches_hll_union_agg.cpp b/be/src/exprs/aggregate/aggregate_function_datasketches_hll_union_agg.cpp new file mode 
100644 index 00000000000000..c9b7013e7a960f --- /dev/null +++ b/be/src/exprs/aggregate/aggregate_function_datasketches_hll_union_agg.cpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/aggregate/aggregate_function_datasketches_hll_union_agg.h" + +#include + +#include "core/data_type/data_type.h" +#include "core/data_type/define_primitive_type.h" +#include "exec/common/hash_table/hash.h" // IWYU pragma: keep +#include "exprs/aggregate/aggregate_function_simple_factory.h" +#include "exprs/aggregate/helpers.h" +namespace doris { +template