Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions common/include/serde.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
/// ItemsSketch<String> with ArrayOfStringsSerDe in Java.
/// The length of each string is stored as a 32-bit integer (historically),
/// which may be too wasteful. Treat this as an example.
///
/// This implementation treats std::string as an arbitrary byte container.
/// It does not check whether string contents are valid UTF-8.
///
/// Use a UTF-8-validating SerDe when cross-language portability is required.
template<>
struct serde<std::string> {
/// @copydoc serde::serialize
Expand Down
13 changes: 13 additions & 0 deletions fi/include/frequent_items_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ enum frequent_items_error_type {
* Based on Java implementation here:
* https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java
* @author Alexander Saydakov
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*/
template<
typename T,
Expand Down Expand Up @@ -74,6 +79,8 @@ class frequent_items_sketch {

/**
* Update this sketch with an item and a positive weight (frequency count).
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item for which the weight should be increased (lvalue)
* @param weight the amount by which the weight of the item should be increased
* A count of zero is a no-op, and a negative count will throw an exception.
Expand All @@ -82,6 +89,8 @@ class frequent_items_sketch {

/**
* Update this sketch with an item and a positive weight (frequency count).
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item for which the weight should be increased (rvalue)
* @param weight the amount by which the weight of the item should be increased
* A count of zero is a no-op, and a negative count will throw an exception.
Expand All @@ -91,13 +100,17 @@ class frequent_items_sketch {
/**
* This function merges the other sketch into this one.
* The other sketch may be of a different size.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param other sketch to be merged into this (lvalue)
*/
void merge(const frequent_items_sketch& other);

/**
* This function merges the other sketch into this one.
* The other sketch may be of a different size.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param other sketch to be merged into this (rvalue)
*/
void merge(frequent_items_sketch&& other);
Expand Down
11 changes: 10 additions & 1 deletion kll/include/kll_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ namespace kll_constants {
* and nearly optimal accuracy per retained item.
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*
* <p>This is a stochastic streaming sketch that enables near real-time analysis of the
* approximate distribution of items from a very large stream in a single pass, requiring only
* that the items are comparable.
Expand All @@ -56,7 +61,7 @@ namespace kll_constants {
* <p>As of May 2020, this implementation produces serialized sketches which are binary-compatible
* with the equivalent Java implementation only when template parameter T = float
* (32-bit single precision values).
*
*
* <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
* item is defined as its index <i>(1 to N)</i> in inclusive mode
* or <i>(0 to N-1)</i> in exclusive mode
Expand Down Expand Up @@ -225,13 +230,17 @@ class kll_sketch {

/**
* Updates this sketch with the given data item.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item from a stream of items
*/
template<typename FwdT>
void update(FwdT&& item);

/**
* Merges another sketch into this one.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param other sketch to merge into this one
*/
template<typename FwdSk>
Expand Down
9 changes: 9 additions & 0 deletions quantiles/include/quantiles_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ namespace quantiles_constants {
* The analysis is obtained using get_rank() and get_quantile() functions,
* the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*
* <p>Consider a large stream of one million values such as packet sizes coming into a network node.
* The natural rank of any specific size value is its index in the hypothetical sorted
* array of values.
Expand Down Expand Up @@ -206,13 +211,17 @@ class quantiles_sketch {

/**
* Updates this sketch with the given data item.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item from a stream of items
*/
template<typename FwdT>
void update(FwdT&& item);

/**
* Merges another sketch into this one.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param other sketch to merge into this one
*/
template<typename FwdSk>
Expand Down
9 changes: 9 additions & 0 deletions req/include/req_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ namespace datasketches {
* "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty,
* Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý.
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*
* <p>Reference: https://arxiv.org/abs/2004.01668</p>
*
* <p>This implementation differs from the algorithm described in the paper in the following:</p>
Expand Down Expand Up @@ -179,13 +184,17 @@ class req_sketch {

/**
* Updates this sketch with the given data item.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item from a stream of items
*/
template<typename FwdT>
void update(FwdT&& item);

/**
* Merges another sketch into this one.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param other sketch to merge into this one
*/
template<typename FwdSk>
Expand Down
13 changes: 13 additions & 0 deletions sampling/include/ebpps_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ namespace ebpps_constants {
* The sample may be smaller than k and the resulting size of the sample potentially includes
* a probabilistic component, meaning the resulting sample size is not always constant.
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*
* @author Jon Malkin
*/
template<
Expand All @@ -71,6 +76,8 @@ class ebpps_sketch {
/**
* Updates this sketch with the given data item with the given weight.
* This method takes an lvalue.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item an item from a stream of items
* @param weight the weight of the item
*/
Expand All @@ -79,6 +86,8 @@ class ebpps_sketch {
/**
* Updates this sketch with the given data item with the given weight.
* This method takes an rvalue.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item an item from a stream of items
* @param weight the weight of the item
*/
Expand All @@ -87,13 +96,17 @@ class ebpps_sketch {
/**
* Merges the provided sketch into the current one.
* This method takes an lvalue.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param sketch the sketch to merge into the current object
*/
void merge(const ebpps_sketch<T, A>& sketch);

/**
* Merges the provided sketch into the current one.
* This method takes an rvalue.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param sketch the sketch to merge into the current object
*/
void merge(ebpps_sketch<T, A>&& sketch);
Expand Down
9 changes: 9 additions & 0 deletions sampling/include/var_opt_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ namespace var_opt_constants {
* optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for
* subset sum estimation.
*
* This sketch may retain string values.
* For sketches containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*
* @author Kevin Lang
* @author Jon Malkin
*/
Expand Down Expand Up @@ -111,6 +116,8 @@ class var_opt_sketch {
/**
* Updates this sketch with the given data item with the given weight.
* This method takes an lvalue.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item an item from a stream of items
* @param weight the weight of the item
*/
Expand All @@ -119,6 +126,8 @@ class var_opt_sketch {
/**
* Updates this sketch with the given data item with the given weight.
* This method takes an rvalue.
* If the item is a string and cross-language portability is required,
* callers should ensure that it uses a compatible encoding (valid UTF-8).
* @param item an item from a stream of items
* @param weight the weight of the item
*/
Expand Down
6 changes: 5 additions & 1 deletion sampling/include/var_opt_union.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,17 @@ class var_opt_union {
/**
* Updates this union with the given sketch.
* This method takes an lvalue.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param sk a sketch to add to the union
*/
void update(const var_opt_sketch<T, A>& sk);

/**
* Updates this union with the given sketch.
* This method takes an rvalue.
* If sketches contain strings, callers are responsible for ensuring that
* both sketches were built using compatible string encodings.
* @param sk a sketch to add to the union
*/
void update(var_opt_sketch<T, A>&& sk);
Expand Down
41 changes: 41 additions & 0 deletions tuple/include/tuple_sketch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ struct pair_extract_key {
/**
* Base class for Tuple sketch.
* This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key.
*
* The Summary type may retain string values.
* For summaries containing strings, cross-language portability depends on
* using compatible string encodings. This class does not itself check that
* string inputs are valid UTF-8.
*/
template<
typename Summary,
Expand Down Expand Up @@ -253,6 +258,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given string.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* @param key string to update the sketch with
* @param value to update the sketch with
*/
Expand All @@ -261,6 +269,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given unsigned 64-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* @param key uint64_t to update the sketch with
* @param value to update the sketch with
*/
Expand All @@ -269,6 +280,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given signed 64-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* @param key int64_t to update the sketch with
* @param value to update the sketch with
*/
Expand All @@ -277,6 +291,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given unsigned 32-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key uint32_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -286,6 +303,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given signed 32-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key int32_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -295,6 +315,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given unsigned 16-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key uint16_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -304,6 +327,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given signed 16-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key int16_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -313,6 +339,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given unsigned 8-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key uint8_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -322,6 +351,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given signed 8-bit integer.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key int8_t to update the sketch with
* @param value to update the sketch with
Expand All @@ -331,6 +363,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given double-precision floating point value.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key double to update the sketch with
* @param value to update the sketch with
Expand All @@ -340,6 +375,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {

/**
* Update this sketch with a given floating point value.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* This overload is provided for compatibility with the Java implementation.
* @param key float to update the sketch with
* @param value to update the sketch with
Expand All @@ -357,6 +395,9 @@ class update_tuple_sketch: public tuple_sketch<Summary, Allocator> {
* Otherwise two sketches that should represent overlapping sets will be disjoint.
* For instance, for signed 32-bit values call update(int32_t) method above,
* which does widening conversion to int64_t, if compatibility with Java is expected.
* If the summary contains strings and cross-language portability is required,
* callers should ensure that any strings in the summary
* use a compatible encoding (valid UTF-8).
* @param key pointer to the data
* @param length of the data in bytes
* @param value to update the sketch with
Expand Down
Loading