diff --git a/common/include/serde.hpp b/common/include/serde.hpp index ad20fe63..c4e46d7d 100644 --- a/common/include/serde.hpp +++ b/common/include/serde.hpp @@ -132,6 +132,11 @@ struct serde::value>::type> { /// ItemsSketch with ArrayOfStringsSerDe in Java. /// The length of each string is stored as a 32-bit integer (historically), /// which may be too wasteful. Treat this as an example. +/// +/// This implementation treats std::string as an arbitrary byte container. +/// It does not check whether string contents are valid UTF-8. +/// +/// Use a UTF-8-validating SerDe when cross-language portability is required. template<> struct serde { /// @copydoc serde::serialize diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp index 0aa9514c..87ee174e 100644 --- a/fi/include/frequent_items_sketch.hpp +++ b/fi/include/frequent_items_sketch.hpp @@ -44,6 +44,11 @@ enum frequent_items_error_type { * Based on Java implementation here: * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java * @author Alexander Saydakov + * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. */ template< typename T, @@ -74,6 +79,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item for which the weight should be increased (lvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. 
@@ -82,6 +89,8 @@ class frequent_items_sketch { /** * Update this sketch with an item and a positive weight (frequency count). + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item for which the weight should be increased (rvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. @@ -91,6 +100,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (lvalue) */ void merge(const frequent_items_sketch& other); @@ -98,6 +109,8 @@ class frequent_items_sketch { /** * This function merges the other sketch into this one. * The other sketch may be of a different size. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to be merged into this (rvalue) */ void merge(frequent_items_sketch&& other); diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index 904587a1..d672c419 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -46,6 +46,11 @@ namespace kll_constants { * and nearly optimal accuracy per retained item. * See Optimal Quantile Approximation in Streams. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

This is a stochastic streaming sketch that enables near real-time analysis of the * approximate distribution of items from a very large stream in a single pass, requiring only * that the items are comparable. @@ -56,7 +61,7 @@ namespace kll_constants { *

As of May 2020, this implementation produces serialized sketches which are binary-compatible * with the equivalent Java implementation only when template parameter T = float * (32-bit single precision values). - * + * *

Given an input stream of N items, the natural rank of any specific * item is defined as its index (1 to N) in inclusive mode * or (0 to N-1) in exclusive mode @@ -225,6 +230,8 @@ class kll_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -232,6 +239,8 @@ class kll_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp index b1e2e3c1..e995e3e3 100644 --- a/quantiles/include/quantiles_sketch.hpp +++ b/quantiles/include/quantiles_sketch.hpp @@ -47,6 +47,11 @@ namespace quantiles_constants { * The analysis is obtained using get_rank() and get_quantile() functions, * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF(). * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Consider a large stream of one million values such as packet sizes coming into a network node. * The natural rank of any specific size value is its index in the hypothetical sorted * array of values. @@ -206,6 +211,8 @@ class quantiles_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -213,6 +220,8 @@ class quantiles_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp index 21ccac0c..52295bd2 100755 --- a/req/include/req_sketch.hpp +++ b/req/include/req_sketch.hpp @@ -35,6 +35,11 @@ namespace datasketches { * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty, * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * *

Reference: https://arxiv.org/abs/2004.01668

* *

This implementation differs from the algorithm described in the paper in the following:

@@ -179,6 +184,8 @@ class req_sketch { /** * Updates this sketch with the given data item. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item from a stream of items */ template @@ -186,6 +193,8 @@ class req_sketch { /** * Merges another sketch into this one. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param other sketch to merge into this one */ template diff --git a/sampling/include/ebpps_sketch.hpp b/sampling/include/ebpps_sketch.hpp index 038b5a30..615d37b8 100644 --- a/sampling/include/ebpps_sketch.hpp +++ b/sampling/include/ebpps_sketch.hpp @@ -50,6 +50,11 @@ namespace ebpps_constants { * The sample may be smaller than k and the resulting size of the sample potentially includes * a probabilistic component, meaning the resulting sample size is not always constant. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * * @author Jon Malkin */ template< @@ -71,6 +76,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -79,6 +86,8 @@ class ebpps_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). 
* @param item an item from a stream of items * @param weight the weight of the item */ @@ -87,6 +96,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(const ebpps_sketch& sketch); @@ -94,6 +105,8 @@ class ebpps_sketch { /** * Merges the provided sketch into the current one. * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sketch the sketch to merge into the current object */ void merge(ebpps_sketch&& sketch); diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 1324883c..6b157caa 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -57,6 +57,11 @@ namespace var_opt_constants { * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for * subset sum estimation. * + * Sketch that may retain string values. + * For sketches containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. + * * author Kevin Lang * author Jon Malkin */ @@ -111,6 +116,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an lvalue. + * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ @@ -119,6 +126,8 @@ class var_opt_sketch { /** * Updates this sketch with the given data item with the given weight. * This method takes an rvalue. 
+ * If cross-language portability is required, callers should ensure that + * the input string uses a compatible encoding (valid UTF-8). * @param item an item from a stream of items * @param weight the weight of the item */ diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp index 0e4f76d8..68d1ac4b 100644 --- a/sampling/include/var_opt_union.hpp +++ b/sampling/include/var_opt_union.hpp @@ -65,13 +65,17 @@ class var_opt_union { /** * Updates this union with the given sketch * This method takes an lvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sk a sketch to add to the union */ void update(const var_opt_sketch& sk); - + /** * Updates this union with the given sketch * This method takes an rvalue. + * If sketches contain strings, callers are responsible for ensuring that + * both sketches were built using compatible string encodings. * @param sk a sketch to add to the union */ void update(var_opt_sketch&& sk); diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index cbfd9f11..7b636a78 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -46,6 +46,11 @@ struct pair_extract_key { /** * Base class for Tuple sketch. * This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key. + * + * Summary that may retain string values. + * For Summary containing strings, cross-language portability depends on + * using compatible string encodings. This class does not by itself enforce + * UTF-8 validity for all string inputs. */ template< typename Summary, @@ -253,6 +258,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given string. 
+ * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key string to update the sketch with * @param value to update the sketch with */ @@ -261,6 +269,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 64-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key uint64_t to update the sketch with * @param value to update the sketch with */ @@ -269,6 +280,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 64-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key int64_t to update the sketch with * @param value to update the sketch with */ @@ -277,6 +291,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint32_t to update the sketch with * @param value to update the sketch with @@ -286,6 +303,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 32-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. 
* @param key int32_t to update the sketch with * @param value to update the sketch with @@ -295,6 +315,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint16_t to update the sketch with * @param value to update the sketch with @@ -304,6 +327,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 16-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key int16_t to update the sketch with * @param value to update the sketch with @@ -313,6 +339,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given unsigned 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key uint8_t to update the sketch with * @param value to update the sketch with @@ -322,6 +351,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given signed 8-bit integer. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. 
* @param key int8_t to update the sketch with * @param value to update the sketch with @@ -331,6 +363,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given double-precision floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key double to update the sketch with * @param value to update the sketch with @@ -340,6 +375,9 @@ class update_tuple_sketch: public tuple_sketch { /** * Update this sketch with a given floating point value. + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * For compatibility with Java implementation. * @param key float to update the sketch with * @param value to update the sketch with @@ -357,6 +395,9 @@ class update_tuple_sketch: public tuple_sketch { * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected + * If the summary contains strings and cross-language portability is required, + * callers should ensure that any strings in the summary + * use a compatible encoding (valid UTF-8). * @param key pointer to the data * @param length of the data in bytes * @param value to update the sketch with