Skip to content

Commit 271b90f

Browse files
author
Mahesh Pai
committed
Bugfix: tdigest const_iterator returns dangling reference causing incorrect values
1 parent 5b1e968 commit 271b90f

3 files changed

Lines changed: 275 additions & 1 deletion

File tree

tdigest/include/tdigest.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ template<typename T, typename A>
316316
class tdigest<T, A>::const_iterator {
317317
public:
318318
using iterator_category = std::input_iterator_tag;
319-
using value_type = std::pair<const T&, const W>;
319+
using value_type = std::pair<T, W>;
320320
using difference_type = void;
321321
using pointer = const return_value_holder<value_type>;
322322
using reference = const value_type;

tdigest/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ target_sources(tdigest_test
3939
PRIVATE
4040
tdigest_test.cpp
4141
tdigest_custom_allocator_test.cpp
42+
tdigest_iterator_test.cpp
4243
)
4344

4445
if (SERDE_COMPAT)
Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include <catch2/catch.hpp>
21+
#include <memory>
22+
#include <map>
23+
#include <vector>
24+
25+
#include "tdigest.hpp"
26+
27+
namespace datasketches {
28+
29+
TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") {
30+
tdigest_double td(100);
31+
32+
// Insert 10 distinct values
33+
for (int i = 0; i < 10; i++) {
34+
td.update(static_cast<double>(i));
35+
}
36+
37+
// Collect all centroids via iteration
38+
std::map<double, uint64_t> centroids;
39+
for (const auto&& centroid : td) {
40+
centroids[centroid.first] = centroid.second;
41+
}
42+
43+
// Should have collected all 10 distinct values
44+
REQUIRE(centroids.size() == 10);
45+
46+
// Verify each value was captured correctly
47+
for (int i = 0; i < 10; i++) {
48+
REQUIRE(centroids.count(static_cast<double>(i)) == 1);
49+
REQUIRE(centroids[static_cast<double>(i)] == 1);
50+
}
51+
}
52+
53+
TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") {
54+
// This test reproduces the bug scenario found in ClickHouse
55+
auto td = std::make_unique<tdigest_double>(100);
56+
57+
// Insert distinct values
58+
for (int i = 0; i < 10; i++) {
59+
td->update(static_cast<double>(i));
60+
}
61+
62+
// Use explicit begin/end iterators
63+
auto it = td->begin();
64+
auto end_it = td->end();
65+
66+
std::vector<double> means;
67+
std::vector<uint64_t> weights;
68+
69+
while (it != end_it) {
70+
// Before the fix, accessing it->first would return garbage or same value repeatedly
71+
double mean = it->first;
72+
uint64_t weight = it->second;
73+
means.push_back(mean);
74+
weights.push_back(weight);
75+
++it;
76+
}
77+
78+
// Should have collected 10 centroids
79+
REQUIRE(means.size() == 10);
80+
REQUIRE(weights.size() == 10);
81+
82+
// All means should be distinct (not all zeros or garbage)
83+
std::set<double> unique_means(means.begin(), means.end());
84+
REQUIRE(unique_means.size() == 10);
85+
86+
// Verify all expected values are present
87+
for (int i = 0; i < 10; i++) {
88+
REQUIRE(unique_means.count(static_cast<double>(i)) == 1);
89+
}
90+
}
91+
92+
TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") {
93+
tdigest_double td(100);
94+
95+
for (int i = 0; i < 5; i++) {
96+
td.update(static_cast<double>(i * 10));
97+
}
98+
99+
std::vector<std::pair<double, uint64_t>> collected;
100+
101+
// Test structured bindings
102+
for (auto it = td.begin(); it != td.end(); ++it) {
103+
auto [mean, weight] = *it;
104+
collected.emplace_back(mean, weight);
105+
}
106+
107+
REQUIRE(collected.size() == 5);
108+
109+
// Verify distinct values were collected
110+
std::set<double> means;
111+
for (const auto& [m, w] : collected) {
112+
means.insert(m);
113+
REQUIRE(w == 1); // Each value inserted once
114+
}
115+
116+
REQUIRE(means.size() == 5);
117+
for (int i = 0; i < 5; i++) {
118+
REQUIRE(means.count(static_cast<double>(i * 10)) == 1);
119+
}
120+
}
121+
122+
TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") {
123+
tdigest_double td(100);
124+
125+
// Insert values
126+
for (int i = 1; i <= 10; i++) {
127+
td.update(static_cast<double>(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100
128+
}
129+
130+
// Access via operator->
131+
std::map<double, uint64_t> centroids;
132+
auto end_it = td.end();
133+
for (auto it = td.begin(); it != end_it; ++it) {
134+
// operator-> should return valid values
135+
centroids[it->first] = it->second;
136+
}
137+
138+
REQUIRE(centroids.size() == 10);
139+
140+
// Verify the squared values
141+
for (int i = 1; i <= 10; i++) {
142+
double expected = static_cast<double>(i * i);
143+
REQUIRE(centroids.count(expected) == 1);
144+
}
145+
}
146+
147+
TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") {
148+
tdigest_double td(100);
149+
150+
// Insert values
151+
for (double d = 0.0; d < 10.0; d += 1.0) {
152+
td.update(d);
153+
}
154+
155+
size_t count = 0;
156+
std::set<double> seen_means;
157+
158+
// This pattern was working in simple tests but failing in optimized builds
159+
for (const auto&& centroid : td) {
160+
seen_means.insert(centroid.first);
161+
count++;
162+
}
163+
164+
REQUIRE(count == 10);
165+
REQUIRE(seen_means.size() == 10);
166+
167+
// Verify all values from 0 to 9 are present
168+
for (int i = 0; i < 10; i++) {
169+
REQUIRE(seen_means.count(static_cast<double>(i)) == 1);
170+
}
171+
}
172+
173+
TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") {
174+
tdigest_double td(100);
175+
176+
td.update(1.0);
177+
td.update(2.0);
178+
td.update(3.0);
179+
180+
auto it = td.begin();
181+
182+
// Store the pair
183+
auto pair1 = *it;
184+
double mean1 = pair1.first;
185+
186+
++it;
187+
188+
// Store another pair
189+
auto pair2 = *it;
190+
double mean2 = pair2.first;
191+
192+
++it;
193+
194+
auto pair3 = *it;
195+
double mean3 = pair3.first;
196+
197+
// All three means should be distinct
198+
REQUIRE(mean1 != mean2);
199+
REQUIRE(mean2 != mean3);
200+
REQUIRE(mean1 != mean3);
201+
202+
// And they should match our input values
203+
std::set<double> means = {mean1, mean2, mean3};
204+
REQUIRE(means.count(1.0) == 1);
205+
REQUIRE(means.count(2.0) == 1);
206+
REQUIRE(means.count(3.0) == 1);
207+
}
208+
209+
TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") {
210+
tdigest_double td(100);
211+
212+
// Empty sketch should have begin() == end()
213+
REQUIRE(td.begin() == td.end());
214+
215+
// Range-based for should not execute
216+
size_t count = 0;
217+
for (const auto&& centroid : td) {
218+
(void)centroid; // Silence unused warning
219+
count++;
220+
}
221+
REQUIRE(count == 0);
222+
}
223+
224+
TEST_CASE("tdigest iterator: single value", "[tdigest]") {
225+
tdigest_double td(100);
226+
td.update(42.0);
227+
228+
size_t count = 0;
229+
double captured_mean = 0.0;
230+
uint64_t captured_weight = 0;
231+
232+
for (const auto&& centroid : td) {
233+
captured_mean = centroid.first;
234+
captured_weight = centroid.second;
235+
count++;
236+
}
237+
238+
REQUIRE(count == 1);
239+
REQUIRE(captured_mean == 42.0);
240+
REQUIRE(captured_weight == 1);
241+
}
242+
243+
TEST_CASE("tdigest iterator: large dataset", "[tdigest]") {
244+
tdigest_double td(100);
245+
246+
// Insert 1000 distinct values
247+
for (int i = 0; i < 1000; i++) {
248+
td.update(static_cast<double>(i));
249+
}
250+
251+
// Iterator should provide compressed centroids (not all 1000)
252+
size_t centroid_count = 0;
253+
std::set<double> unique_means;
254+
uint64_t total_weight = 0;
255+
256+
for (const auto&& centroid : td) {
257+
unique_means.insert(centroid.first);
258+
total_weight += centroid.second;
259+
centroid_count++;
260+
}
261+
262+
// Should have fewer centroids than input values due to compression
263+
REQUIRE(centroid_count < 1000);
264+
REQUIRE(centroid_count > 0);
265+
266+
// Total weight should equal number of input values
267+
REQUIRE(total_weight == 1000);
268+
269+
// All means should be unique (no duplicates)
270+
REQUIRE(unique_means.size() == centroid_count);
271+
}
272+
273+
} // namespace datasketches

0 commit comments

Comments
 (0)