/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate/src/hyperloglog.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! # HyperLogLog |
19 | | //! |
20 | | //! `hyperloglog` is a module that contains a modified version |
21 | | //! of [Redis's implementation](https://github.com/redis/redis/blob/4930d19e70c391750479951022e207e19111eb55/src/hyperloglog.c) |
22 | | //! with some modifications based on strong assumptions about its usage |
23 | | //! within DataFusion, so that the aggregate functions built on it can |
24 | | //! be implemented efficiently. |
25 | | //! |
26 | | //! Specifically, like Redis's version, this HLL structure uses |
27 | | //! 2**14 = 16384 registers, which means the standard error is |
28 | | //! 1.04/(16384**0.5) = 0.8125%. Unlike Redis, each register occupies |
29 | | //! a full [`u8`] instead of being packed behind a raw `int*`, which avoids |
30 | | //! the tricky bit shifting techniques used in the original version. |
31 | | //! This results in a memory usage increase from 12 KiB to 16 KiB. |
32 | | //! Also, only the dense representation is adopted, so there is no automatic |
33 | | //! conversion between representations, largely to simplify the code. |
34 | | //! |
35 | | //! This module also borrows some code structure from [pdatastructs.rs](https://github.com/crepererum/pdatastructs.rs/blob/3997ed50f6b6871c9e53c4c5e0f48f431405fc63/src/hyperloglog.rs). |
36 | | |
37 | | use ahash::RandomState; |
38 | | use std::hash::Hash; |
39 | | use std::marker::PhantomData; |
40 | | |
/// Precision parameter: the number of hash bits that select a register.
/// The greater `HLL_P` is, the more registers there are and the smaller the error.
const HLL_P: usize = 14;
/// The number of hash bits left over once the register index has been taken;
/// these are the bits whose run of zeros is measured.
const HLL_Q: usize = u64::BITS as usize - HLL_P;
/// Total number of registers, i.e. `2^HLL_P`.
const NUM_REGISTERS: usize = 1 << HLL_P;
/// Mask that extracts the register index from a hash value.
const HLL_P_MASK: u64 = NUM_REGISTERS as u64 - 1;
48 | | |
49 | | #[derive(Clone, Debug)] |
50 | | pub(crate) struct HyperLogLog<T> |
51 | | where |
52 | | T: Hash + ?Sized, |
53 | | { |
54 | | registers: [u8; NUM_REGISTERS], |
55 | | phantom: PhantomData<T>, |
56 | | } |
57 | | |
58 | | /// Fixed seed for the hashing so that values are consistent across runs |
59 | | /// |
60 | | /// Note that when we later move on to have serialized HLL register binaries |
61 | | /// shared across cluster, this SEED will have to be consistent across all |
62 | | /// parties otherwise we might have corruption. So ideally for later this seed |
63 | | /// shall be part of the serialized form (or stay unchanged across versions). |
64 | | const SEED: RandomState = RandomState::with_seeds( |
65 | | 0x885f6cab121d01a3_u64, |
66 | | 0x71e4379f2976ad8f_u64, |
67 | | 0xbf30173dd28a8816_u64, |
68 | | 0x0eaea5d736d733a4_u64, |
69 | | ); |
70 | | |
71 | | impl<T> Default for HyperLogLog<T> |
72 | | where |
73 | | T: Hash + ?Sized, |
74 | | { |
75 | 0 | fn default() -> Self { |
76 | 0 | Self::new() |
77 | 0 | } |
78 | | } |
79 | | |
80 | | impl<T> HyperLogLog<T> |
81 | | where |
82 | | T: Hash + ?Sized, |
83 | | { |
84 | | /// Creates a new, empty HyperLogLog. |
85 | 0 | pub fn new() -> Self { |
86 | 0 | let registers = [0; NUM_REGISTERS]; |
87 | 0 | Self::new_with_registers(registers) |
88 | 0 | } |
89 | | |
90 | | /// Creates a HyperLogLog from already populated registers |
91 | | /// note that this method should not be invoked in untrusted environment |
92 | | /// because the internal structure of registers are not examined. |
93 | 0 | pub(crate) fn new_with_registers(registers: [u8; NUM_REGISTERS]) -> Self { |
94 | 0 | Self { |
95 | 0 | registers, |
96 | 0 | phantom: PhantomData, |
97 | 0 | } |
98 | 0 | } |
99 | | |
100 | | /// choice of hash function: ahash is already an dependency |
101 | | /// and it fits the requirements of being a 64bit hash with |
102 | | /// reasonable performance. |
103 | | #[inline] |
104 | 0 | fn hash_value(&self, obj: &T) -> u64 { |
105 | 0 | SEED.hash_one(obj) |
106 | 0 | } |
107 | | |
108 | | /// Adds an element to the HyperLogLog. |
109 | 0 | pub fn add(&mut self, obj: &T) { |
110 | 0 | let hash = self.hash_value(obj); |
111 | 0 | let index = (hash & HLL_P_MASK) as usize; |
112 | 0 | let p = ((hash >> HLL_P) | (1_u64 << HLL_Q)).trailing_zeros() + 1; |
113 | 0 | self.registers[index] = self.registers[index].max(p as u8); |
114 | 0 | } |
115 | | |
116 | | /// Get the register histogram (each value in register index into |
117 | | /// the histogram; u32 is enough because we only have 2**14=16384 registers |
118 | | #[inline] |
119 | 0 | fn get_histogram(&self) -> [u32; HLL_Q + 2] { |
120 | 0 | let mut histogram = [0; HLL_Q + 2]; |
121 | | // hopefully this can be unrolled |
122 | 0 | for r in self.registers { |
123 | 0 | histogram[r as usize] += 1; |
124 | 0 | } |
125 | 0 | histogram |
126 | 0 | } |
127 | | |
128 | | /// Merge the other [`HyperLogLog`] into this one |
129 | 0 | pub fn merge(&mut self, other: &HyperLogLog<T>) { |
130 | 0 | assert!( |
131 | 0 | self.registers.len() == other.registers.len(), |
132 | 0 | "unexpected got unequal register size, expect {}, got {}", |
133 | 0 | self.registers.len(), |
134 | 0 | other.registers.len() |
135 | | ); |
136 | 0 | for i in 0..self.registers.len() { |
137 | 0 | self.registers[i] = self.registers[i].max(other.registers[i]); |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | | /// Guess the number of unique elements seen by the HyperLogLog. |
142 | 0 | pub fn count(&self) -> usize { |
143 | 0 | let histogram = self.get_histogram(); |
144 | 0 | let m = NUM_REGISTERS as f64; |
145 | 0 | let mut z = m * hll_tau((m - histogram[HLL_Q + 1] as f64) / m); |
146 | 0 | for i in histogram[1..=HLL_Q].iter().rev() { |
147 | 0 | z += *i as f64; |
148 | 0 | z *= 0.5; |
149 | 0 | } |
150 | 0 | z += m * hll_sigma(histogram[0] as f64 / m); |
151 | 0 | (0.5 / 2_f64.ln() * m * m / z).round() as usize |
152 | 0 | } |
153 | | } |
154 | | |
/// Helper function sigma as defined in
/// "New cardinality estimation algorithms for HyperLogLog sketches"
/// Otmar Ertl, arXiv:1702.01284
///
/// Returns infinity for `x == 1`; otherwise accumulates
/// `x + x^2 + 2 * x^4 + 4 * x^8 + ...` until adding another term no longer
/// changes the floating point sum.
#[inline]
fn hll_sigma(x: f64) -> f64 {
    if x == 1.0 {
        return f64::INFINITY;
    }

    // `power` is x raised to successive powers of two, `weight` doubles with
    // every term, and `sum` is the running value of the series.
    let mut power = x;
    let mut weight = 1.0;
    let mut sum = x;
    loop {
        power *= power;
        let previous = sum;
        sum += power * weight;
        weight *= 2.0;
        if sum == previous {
            return sum;
        }
    }
}
178 | | |
/// Helper function tau as defined in
/// "New cardinality estimation algorithms for HyperLogLog sketches"
/// Otmar Ertl, arXiv:1702.01284
///
/// Returns zero for `x == 0` and `x == 1`; otherwise starts from `1 - x`,
/// repeatedly takes the square root of `x` while subtracting the halving-
/// weighted term `(1 - x)^2`, stops once the floating point value no longer
/// changes, and returns a third of the result.
#[inline]
fn hll_tau(x: f64) -> f64 {
    if x == 0.0 || x == 1.0 {
        return 0.0;
    }

    // `root` is x under repeated square roots, `weight` halves with every
    // term, and `acc` is the running value of the series.
    let mut root = x;
    let mut weight = 1.0;
    let mut acc = 1.0 - x;
    loop {
        root = root.sqrt();
        weight *= 0.5;
        let previous = acc;
        acc -= (1.0 - root).powi(2) * weight;
        if acc == previous {
            return acc / 3.0;
        }
    }
}
202 | | |
203 | | impl<T> AsRef<[u8]> for HyperLogLog<T> |
204 | | where |
205 | | T: Hash + ?Sized, |
206 | | { |
207 | 0 | fn as_ref(&self) -> &[u8] { |
208 | 0 | &self.registers |
209 | 0 | } |
210 | | } |
211 | | |
212 | | impl<T> Extend<T> for HyperLogLog<T> |
213 | | where |
214 | | T: Hash, |
215 | | { |
216 | 0 | fn extend<S: IntoIterator<Item = T>>(&mut self, iter: S) { |
217 | 0 | for elem in iter { |
218 | 0 | self.add(&elem); |
219 | 0 | } |
220 | 0 | } |
221 | | } |
222 | | |
223 | | impl<'a, T> Extend<&'a T> for HyperLogLog<T> |
224 | | where |
225 | | T: 'a + Hash + ?Sized, |
226 | | { |
227 | 0 | fn extend<S: IntoIterator<Item = &'a T>>(&mut self, iter: S) { |
228 | 0 | for elem in iter { |
229 | 0 | self.add(elem); |
230 | 0 | } |
231 | 0 | } |
232 | | } |
233 | | |
#[cfg(test)]
mod tests {
    use super::{HyperLogLog, NUM_REGISTERS};

    /// Asserts that `got` is within the tolerated relative error of
    /// `expected`.
    ///
    /// `expected` must be non-zero: the relative error divides by it.
    fn compare_with_delta(got: usize, expected: usize) {
        let expected = expected as f64;
        let diff = (got as f64) - expected;
        let diff = diff.abs() / expected;
        // times 6 because we want the tests to be stable
        // so we allow a rather large margin of error
        // this is adopted from redis's unit test version as well
        let margin = 1.04 / ((NUM_REGISTERS as f64).sqrt()) * 6.0;
        assert!(
            diff <= margin,
            // `margin` is a fraction, so scale it before labelling it a percentage
            "{} is not within {}% of {}, i.e. inside ({}, {})",
            got,
            margin * 100.0,
            expected,
            expected * (1.0 - margin),
            expected * (1.0 + margin)
        );
    }

    /// Adds the values `0..$SIZE` of integer type `$T` and checks the estimate.
    macro_rules! sized_number_test {
        ($SIZE: expr, $T: ty) => {{
            let mut hll = HyperLogLog::<$T>::new();
            for i in 0..$SIZE {
                hll.add(&i);
            }
            compare_with_delta(hll.count(), $SIZE);
        }};
    }

    /// Runs `sized_number_test` for the 64 and 128 bit integer types.
    macro_rules! typed_large_number_test {
        ($SIZE: expr) => {{
            sized_number_test!($SIZE, u64);
            sized_number_test!($SIZE, u128);
            sized_number_test!($SIZE, i64);
            sized_number_test!($SIZE, i128);
        }};
    }

    /// Runs `sized_number_test` for the 16 and 32 bit integer types as well
    /// as the large ones; `$SIZE` therefore has to fit into 16 bits.
    macro_rules! typed_number_test {
        ($SIZE: expr) => {{
            sized_number_test!($SIZE, u16);
            sized_number_test!($SIZE, u32);
            sized_number_test!($SIZE, i16);
            sized_number_test!($SIZE, i32);
            typed_large_number_test!($SIZE);
        }};
    }

    #[test]
    fn test_empty() {
        let hll = HyperLogLog::<u64>::new();
        assert_eq!(hll.count(), 0);
    }

    #[test]
    fn test_one() {
        let mut hll = HyperLogLog::<u64>::new();
        hll.add(&1);
        assert_eq!(hll.count(), 1);
    }

    #[test]
    fn test_number_100() {
        typed_number_test!(100);
    }

    #[test]
    fn test_number_1k() {
        typed_number_test!(1_000);
    }

    #[test]
    fn test_number_10k() {
        typed_number_test!(10_000);
    }

    #[test]
    fn test_number_100k() {
        typed_large_number_test!(100_000);
    }

    #[test]
    fn test_number_1m() {
        typed_large_number_test!(1_000_000);
    }

    /// Exercises an unsized element type (`[u8]`).
    #[test]
    fn test_u8() {
        let mut hll = HyperLogLog::<[u8]>::new();
        for i in 0..1000 {
            let s = i.to_string();
            let b = s.as_bytes();
            hll.add(b);
        }
        compare_with_delta(hll.count(), 1000);
    }

    #[test]
    fn test_string() {
        let mut hll = HyperLogLog::<String>::new();
        hll.extend((0..1000).map(|i| i.to_string()));
        compare_with_delta(hll.count(), 1000);
    }

    #[test]
    fn test_empty_merge() {
        let mut hll = HyperLogLog::<u64>::new();
        hll.merge(&HyperLogLog::<u64>::new());
        assert_eq!(hll.count(), 0);
    }

    #[test]
    fn test_merge_overlapped() {
        let mut hll = HyperLogLog::<String>::new();
        hll.extend((0..1000).map(|i| i.to_string()));

        let mut other = HyperLogLog::<String>::new();
        other.extend((0..1000).map(|i| i.to_string()));

        hll.merge(&other);
        compare_with_delta(hll.count(), 1000);
    }

    /// Merging two sketches over disjoint inputs must give exactly the same
    /// registers as one sketch that saw all of the inputs.
    #[test]
    fn test_merge_disjoint() {
        let mut hll = HyperLogLog::<String>::new();
        hll.extend((0..500).map(|i| i.to_string()));

        let mut other = HyperLogLog::<String>::new();
        other.extend((500..1000).map(|i| i.to_string()));

        hll.merge(&other);
        compare_with_delta(hll.count(), 1000);

        let mut whole = HyperLogLog::<String>::new();
        whole.extend((0..1000).map(|i| i.to_string()));
        let merged: &[u8] = hll.as_ref();
        let single: &[u8] = whole.as_ref();
        assert_eq!(merged, single);
    }

    #[test]
    fn test_repetition() {
        let mut hll = HyperLogLog::<u32>::new();
        for i in 0..1_000_000 {
            hll.add(&(i % 1000));
        }
        compare_with_delta(hll.count(), 1000);
    }
}