Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/common/src/stats.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! This module provides data structures to represent statistics
19
20
use std::fmt::{self, Debug, Display};
21
22
use crate::{Result, ScalarValue};
23
24
use arrow_schema::{Schema, SchemaRef};
25
26
/// Represents a value with a degree of certainty. `Precision` is used to
27
/// propagate information about the precision of statistical values.
28
#[derive(Clone, PartialEq, Eq, Default, Copy)]
29
pub enum Precision<T: Debug + Clone + PartialEq + Eq + PartialOrd> {
30
    /// The exact value is known
31
    Exact(T),
32
    /// The value is not known exactly, but is likely close to this value
33
    Inexact(T),
34
    /// Nothing is known about the value
35
    #[default]
36
    Absent,
37
}
38
39
impl<T: Debug + Clone + PartialEq + Eq + PartialOrd> Precision<T> {
40
    /// If we have some value (exact or inexact), it returns that value.
41
    /// Otherwise, it returns `None`.
42
823
    pub fn get_value(&self) -> Option<&T> {
43
823
        match self {
44
705
            Precision::Exact(
value10
) | Precision::Inexact(
value695
) => Some(value),
45
118
            Precision::Absent => None,
46
        }
47
823
    }
48
49
    /// Transform the value in this [`Precision`] object, if one exists, using
50
    /// the given function. Preserves the exactness state.
51
238
    pub fn map<U, F>(self, f: F) -> Precision<U>
52
238
    where
53
238
        F: Fn(T) -> U,
54
238
        U: Debug + Clone + PartialEq + Eq + PartialOrd,
55
238
    {
56
238
        match self {
57
4
            Precision::Exact(val) => Precision::Exact(f(val)),
58
172
            Precision::Inexact(val) => Precision::Inexact(f(val)),
59
62
            _ => Precision::<U>::Absent,
60
        }
61
238
    }
62
63
    /// Returns `Some(true)` if we have an exact value, `Some(false)` if we
64
    /// have an inexact value, and `None` if there is no value.
65
42
    pub fn is_exact(&self) -> Option<bool> {
66
42
        match self {
67
7
            Precision::Exact(_) => Some(true),
68
35
            Precision::Inexact(_) => Some(false),
69
0
            _ => None,
70
        }
71
42
    }
72
73
    /// Returns the maximum of two (possibly inexact) values, conservatively
74
    /// propagating exactness information. If one of the input values is
75
    /// [`Precision::Absent`], the result is `Absent` too.
76
48
    pub fn max(&self, other: &Precision<T>) -> Precision<T> {
77
48
        match (self, other) {
78
2
            (Precision::Exact(a), Precision::Exact(b)) => {
79
2
                Precision::Exact(if a >= b { 
a.clone()1
} else {
b.clone()1
})
80
            }
81
4
            (Precision::Inexact(a), Precision::Exact(b))
82
0
            | (Precision::Exact(a), Precision::Inexact(b))
83
41
            | (Precision::Inexact(a), Precision::Inexact(b)) => {
84
45
                Precision::Inexact(if a >= b { 
a.clone()22
} else {
b.clone()23
})
85
            }
86
1
            (_, _) => Precision::Absent,
87
        }
88
48
    }
89
90
    /// Returns the minimum of two (possibly inexact) values, conservatively
91
    /// propagating exactness information. If one of the input values is
92
    /// [`Precision::Absent`], the result is `Absent` too.
93
3
    pub fn min(&self, other: &Precision<T>) -> Precision<T> {
94
3
        match (self, other) {
95
2
            (Precision::Exact(a), Precision::Exact(b)) => {
96
2
                Precision::Exact(if a >= b { 
b.clone()0
} else { a.clone() })
97
            }
98
0
            (Precision::Inexact(a), Precision::Exact(b))
99
0
            | (Precision::Exact(a), Precision::Inexact(b))
100
0
            | (Precision::Inexact(a), Precision::Inexact(b)) => {
101
0
                Precision::Inexact(if a >= b { b.clone() } else { a.clone() })
102
            }
103
1
            (_, _) => Precision::Absent,
104
        }
105
3
    }
106
107
    /// Demotes the precision state from exact to inexact (if present).
108
206
    pub fn to_inexact(self) -> Self {
109
206
        match self {
110
16
            Precision::Exact(value) => Precision::Inexact(value),
111
190
            _ => self,
112
        }
113
206
    }
114
}
115
116
impl Precision<usize> {
117
    /// Calculates the sum of two (possibly inexact) [`usize`] values,
118
    /// conservatively propagating exactness information. If one of the input
119
    /// values is [`Precision::Absent`], the result is `Absent` too.
120
7
    pub fn add(&self, other: &Precision<usize>) -> Precision<usize> {
121
7
        match (self, other) {
122
3
            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a + b),
123
0
            (Precision::Inexact(a), Precision::Exact(b))
124
0
            | (Precision::Exact(a), Precision::Inexact(b))
125
2
            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a + b),
126
2
            (_, _) => Precision::Absent,
127
        }
128
7
    }
129
130
    /// Calculates the difference of two (possibly inexact) [`usize`] values,
131
    /// conservatively propagating exactness information. If one of the input
132
    /// values is [`Precision::Absent`], the result is `Absent` too.
133
2
    pub fn sub(&self, other: &Precision<usize>) -> Precision<usize> {
134
2
        match (self, other) {
135
0
            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a - b),
136
0
            (Precision::Inexact(a), Precision::Exact(b))
137
0
            | (Precision::Exact(a), Precision::Inexact(b))
138
2
            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a - b),
139
0
            (_, _) => Precision::Absent,
140
        }
141
2
    }
142
143
    /// Calculates the multiplication of two (possibly inexact) [`usize`] values,
144
    /// conservatively propagating exactness information. If one of the input
145
    /// values is [`Precision::Absent`], the result is `Absent` too.
146
13
    pub fn multiply(&self, other: &Precision<usize>) -> Precision<usize> {
147
13
        match (self, other) {
148
8
            (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a * b),
149
0
            (Precision::Inexact(a), Precision::Exact(b))
150
0
            | (Precision::Exact(a), Precision::Inexact(b))
151
0
            | (Precision::Inexact(a), Precision::Inexact(b)) => Precision::Inexact(a * b),
152
5
            (_, _) => Precision::Absent,
153
        }
154
13
    }
155
156
    /// Return the estimate of applying a filter with estimated selectivity
157
    /// `selectivity` to this Precision. A selectivity of `1.0` means that all
158
    /// rows are selected. A selectivity of `0.5` means half the rows are
159
    /// selected. Will always return inexact statistics.
160
72
    pub fn with_estimated_selectivity(self, selectivity: f64) -> Self {
161
72
        self.map(|v| 
((v as f64 * selectivity).ceil()) as usize44
)
162
72
            .to_inexact()
163
72
    }
164
}
165
166
impl Precision<ScalarValue> {
167
    /// Calculates the sum of two (possibly inexact) [`ScalarValue`] values,
168
    /// conservatively propagating exactness information. If one of the input
169
    /// values is [`Precision::Absent`], the result is `Absent` too.
170
0
    pub fn add(&self, other: &Precision<ScalarValue>) -> Precision<ScalarValue> {
171
0
        match (self, other) {
172
0
            (Precision::Exact(a), Precision::Exact(b)) => {
173
0
                if let Ok(result) = a.add(b) {
174
0
                    Precision::Exact(result)
175
                } else {
176
0
                    Precision::Absent
177
                }
178
            }
179
0
            (Precision::Inexact(a), Precision::Exact(b))
180
0
            | (Precision::Exact(a), Precision::Inexact(b))
181
0
            | (Precision::Inexact(a), Precision::Inexact(b)) => {
182
0
                if let Ok(result) = a.add(b) {
183
0
                    Precision::Inexact(result)
184
                } else {
185
0
                    Precision::Absent
186
                }
187
            }
188
0
            (_, _) => Precision::Absent,
189
        }
190
0
    }
191
}
192
193
impl<T: fmt::Debug + Clone + PartialEq + Eq + PartialOrd> Debug for Precision<T> {
194
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
195
0
        match self {
196
0
            Precision::Exact(inner) => write!(f, "Exact({:?})", inner),
197
0
            Precision::Inexact(inner) => write!(f, "Inexact({:?})", inner),
198
0
            Precision::Absent => write!(f, "Absent"),
199
        }
200
0
    }
201
}
202
203
impl<T: fmt::Debug + Clone + PartialEq + Eq + PartialOrd> Display for Precision<T> {
204
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
205
0
        match self {
206
0
            Precision::Exact(inner) => write!(f, "Exact({:?})", inner),
207
0
            Precision::Inexact(inner) => write!(f, "Inexact({:?})", inner),
208
0
            Precision::Absent => write!(f, "Absent"),
209
        }
210
0
    }
211
}
212
213
/// Statistics for a relation
214
/// Fields are optional and can be inexact because the sources
215
/// sometimes provide approximate estimates for performance reasons
216
/// and the output of transformations is not always predictable.
217
#[derive(Debug, Clone, PartialEq, Eq)]
218
pub struct Statistics {
219
    /// The number of table rows.
220
    pub num_rows: Precision<usize>,
221
    /// Total bytes of the table rows.
222
    pub total_byte_size: Precision<usize>,
223
    /// Statistics on a column level. It contains a [`ColumnStatistics`] for
224
    /// each field in the schema of the table to which the [`Statistics`] refer.
225
    pub column_statistics: Vec<ColumnStatistics>,
226
}
227
228
impl Statistics {
229
    /// Returns a [`Statistics`] instance for the given schema by assigning
230
    /// unknown statistics to each column in the schema.
231
4
    pub fn new_unknown(schema: &Schema) -> Self {
232
4
        Self {
233
4
            num_rows: Precision::Absent,
234
4
            total_byte_size: Precision::Absent,
235
4
            column_statistics: Statistics::unknown_column(schema),
236
4
        }
237
4
    }
238
239
    /// Returns an unbounded `ColumnStatistics` for each field in the schema.
240
27
    pub fn unknown_column(schema: &Schema) -> Vec<ColumnStatistics> {
241
27
        schema
242
27
            .fields()
243
27
            .iter()
244
27
            .map(|_| ColumnStatistics::new_unknown())
245
27
            .collect()
246
27
    }
247
248
    /// If the exactness of a [`Statistics`] instance is lost, this function relaxes
249
    /// the exactness of all information by converting them to [`Precision::Inexact`].
250
3
    pub fn to_inexact(mut self) -> Self {
251
3
        self.num_rows = self.num_rows.to_inexact();
252
3
        self.total_byte_size = self.total_byte_size.to_inexact();
253
3
        self.column_statistics = self
254
3
            .column_statistics
255
3
            .into_iter()
256
3
            .map(|s| s.to_inexact())
257
3
            .collect();
258
3
        self
259
3
    }
260
261
    /// Calculates the statistics after `fetch` and `skip` operations apply.
262
    /// Here, `self` denotes per-partition statistics. Use the `n_partitions`
263
    /// parameter to compute global statistics in a multi-partition setting.
264
17
    pub fn with_fetch(
265
17
        mut self,
266
17
        schema: SchemaRef,
267
17
        fetch: Option<usize>,
268
17
        skip: usize,
269
17
        n_partitions: usize,
270
17
    ) -> Result<Self> {
271
17
        let fetch_val = fetch.unwrap_or(usize::MAX);
272
17
273
17
        self.num_rows = match self {
274
            Statistics {
275
9
                num_rows: Precision::Exact(nr),
276
                ..
277
            }
278
            | Statistics {
279
8
                num_rows: Precision::Inexact(nr),
280
                ..
281
            } => {
282
                // Here, the inexact case gives us an upper bound on the number of rows.
283
17
                if nr <= skip {
284
                    // All input data will be skipped:
285
2
                    Precision::Exact(0)
286
15
                } else if nr <= fetch_val && 
skip == 06
{
287
                    // If the input does not reach the `fetch` globally, and `skip`
288
                    // is zero (meaning the input and output are identical), return
289
                    // input stats as is.
290
                    // TODO: Can input stats still be used, but adjusted, when `skip`
291
                    //       is non-zero?
292
2
                    return Ok(self);
293
13
                } else if nr - skip <= fetch_val {
294
                    // After `skip` input rows are skipped, the remaining rows are
295
                    // less than or equal to the `fetch` values, so `num_rows` must
296
                    // equal the remaining rows.
297
6
                    check_num_rows(
298
6
                        (nr - skip).checked_mul(n_partitions),
299
6
                        // We know that we have an estimate for the number of rows:
300
6
                        self.num_rows.is_exact().unwrap(),
301
6
                    )
302
                } else {
303
                    // At this point we know that we were given a `fetch` value
304
                    // as the `None` case would go into the branch above. Since
305
                    // the input has more rows than `fetch + skip`, the number
306
                    // of rows will be the `fetch`, but we won't be able to
307
                    // predict the other statistics.
308
7
                    check_num_rows(
309
7
                        fetch_val.checked_mul(n_partitions),
310
7
                        // We know that we have an estimate for the number of rows:
311
7
                        self.num_rows.is_exact().unwrap(),
312
7
                    )
313
                }
314
            }
315
            Statistics {
316
                num_rows: Precision::Absent,
317
                ..
318
0
            } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false),
319
        };
320
15
        self.column_statistics = Statistics::unknown_column(&schema);
321
15
        self.total_byte_size = Precision::Absent;
322
15
        Ok(self)
323
17
    }
324
}
325
326
/// Creates an estimate of the number of rows in the output using the given
327
/// optional value and exactness flag.
328
13
fn check_num_rows(value: Option<usize>, is_exact: bool) -> Precision<usize> {
329
13
    if let Some(value) = value {
330
13
        if is_exact {
331
7
            Precision::Exact(value)
332
        } else {
333
            // If the input stats are inexact, so are the output stats.
334
6
            Precision::Inexact(value)
335
        }
336
    } else {
337
        // If the estimate is not available (e.g. due to an overflow), we
338
        // cannot produce a reliable estimate.
339
0
        Precision::Absent
340
    }
341
13
}
342
343
impl Display for Statistics {
344
0
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
345
0
        // string of column statistics
346
0
        let column_stats = self
347
0
            .column_statistics
348
0
            .iter()
349
0
            .enumerate()
350
0
            .map(|(i, cs)| {
351
0
                let s = format!("(Col[{}]:", i);
352
0
                let s = if cs.min_value != Precision::Absent {
353
0
                    format!("{} Min={}", s, cs.min_value)
354
                } else {
355
0
                    s
356
                };
357
0
                let s = if cs.max_value != Precision::Absent {
358
0
                    format!("{} Max={}", s, cs.max_value)
359
                } else {
360
0
                    s
361
                };
362
0
                let s = if cs.null_count != Precision::Absent {
363
0
                    format!("{} Null={}", s, cs.null_count)
364
                } else {
365
0
                    s
366
                };
367
0
                let s = if cs.distinct_count != Precision::Absent {
368
0
                    format!("{} Distinct={}", s, cs.distinct_count)
369
                } else {
370
0
                    s
371
                };
372
373
0
                s + ")"
374
0
            })
375
0
            .collect::<Vec<_>>()
376
0
            .join(",");
377
0
378
0
        write!(
379
0
            f,
380
0
            "Rows={}, Bytes={}, [{}]",
381
0
            self.num_rows, self.total_byte_size, column_stats
382
0
        )?;
383
384
0
        Ok(())
385
0
    }
386
}
387
388
/// Statistics for a column within a relation
389
#[derive(Clone, Debug, PartialEq, Eq, Default)]
390
pub struct ColumnStatistics {
391
    /// Number of null values on column
392
    pub null_count: Precision<usize>,
393
    /// Maximum value of column
394
    pub max_value: Precision<ScalarValue>,
395
    /// Minimum value of column
396
    pub min_value: Precision<ScalarValue>,
397
    /// Number of distinct values
398
    pub distinct_count: Precision<usize>,
399
}
400
401
impl ColumnStatistics {
402
    /// Column contains a single non-null value (e.g. a constant).
403
21
    pub fn is_singleton(&self) -> bool {
404
21
        match (&self.min_value, &self.max_value) {
405
            // Min and max values are the same and not infinity.
406
4
            (Precision::Exact(min), Precision::Exact(max)) => {
407
4
                !min.is_null() && !max.is_null() && (min == max)
408
            }
409
17
            (_, _) => false,
410
        }
411
21
    }
412
413
    /// Returns a [`ColumnStatistics`] instance having all [`Precision::Absent`] parameters.
414
61
    pub fn new_unknown() -> Self {
415
61
        Self {
416
61
            null_count: Precision::Absent,
417
61
            max_value: Precision::Absent,
418
61
            min_value: Precision::Absent,
419
61
            distinct_count: Precision::Absent,
420
61
        }
421
61
    }
422
423
    /// If the exactness of a [`ColumnStatistics`] instance is lost, this
424
    /// function relaxes the exactness of all information by converting them
425
    /// to [`Precision::Inexact`].
426
3
    pub fn to_inexact(mut self) -> Self {
427
3
        self.null_count = self.null_count.to_inexact();
428
3
        self.max_value = self.max_value.to_inexact();
429
3
        self.min_value = self.min_value.to_inexact();
430
3
        self.distinct_count = self.distinct_count.to_inexact();
431
3
        self
432
3
    }
433
}
434
435
#[cfg(test)]
436
mod tests {
437
    use super::*;
438
439
    #[test]
440
    fn test_get_value() {
441
        let exact_precision = Precision::Exact(42);
442
        let inexact_precision = Precision::Inexact(23);
443
        let absent_precision = Precision::<i32>::Absent;
444
445
        assert_eq!(*exact_precision.get_value().unwrap(), 42);
446
        assert_eq!(*inexact_precision.get_value().unwrap(), 23);
447
        assert_eq!(absent_precision.get_value(), None);
448
    }
449
450
    #[test]
451
    fn test_map() {
452
        let exact_precision = Precision::Exact(42);
453
        let inexact_precision = Precision::Inexact(23);
454
        let absent_precision = Precision::Absent;
455
456
        let squared = |x| x * x;
457
458
        assert_eq!(exact_precision.map(squared), Precision::Exact(1764));
459
        assert_eq!(inexact_precision.map(squared), Precision::Inexact(529));
460
        assert_eq!(absent_precision.map(squared), Precision::Absent);
461
    }
462
463
    #[test]
464
    fn test_is_exact() {
465
        let exact_precision = Precision::Exact(42);
466
        let inexact_precision = Precision::Inexact(23);
467
        let absent_precision = Precision::<i32>::Absent;
468
469
        assert_eq!(exact_precision.is_exact(), Some(true));
470
        assert_eq!(inexact_precision.is_exact(), Some(false));
471
        assert_eq!(absent_precision.is_exact(), None);
472
    }
473
474
    #[test]
475
    fn test_max() {
476
        let precision1 = Precision::Exact(42);
477
        let precision2 = Precision::Inexact(23);
478
        let precision3 = Precision::Exact(30);
479
        let absent_precision = Precision::Absent;
480
481
        assert_eq!(precision1.max(&precision2), Precision::Inexact(42));
482
        assert_eq!(precision1.max(&precision3), Precision::Exact(42));
483
        assert_eq!(precision2.max(&precision3), Precision::Inexact(30));
484
        assert_eq!(precision1.max(&absent_precision), Precision::Absent);
485
    }
486
487
    #[test]
488
    fn test_min() {
489
        let precision1 = Precision::Exact(42);
490
        let precision2 = Precision::Inexact(23);
491
        let precision3 = Precision::Exact(30);
492
        let absent_precision = Precision::Absent;
493
494
        assert_eq!(precision1.min(&precision2), Precision::Inexact(23));
495
        assert_eq!(precision1.min(&precision3), Precision::Exact(30));
496
        assert_eq!(precision2.min(&precision3), Precision::Inexact(23));
497
        assert_eq!(precision1.min(&absent_precision), Precision::Absent);
498
    }
499
500
    #[test]
501
    fn test_to_inexact() {
502
        let exact_precision = Precision::Exact(42);
503
        let inexact_precision = Precision::Inexact(42);
504
        let absent_precision = Precision::<i32>::Absent;
505
506
        assert_eq!(exact_precision.to_inexact(), inexact_precision);
507
        assert_eq!(inexact_precision.to_inexact(), inexact_precision);
508
        assert_eq!(absent_precision.to_inexact(), absent_precision);
509
    }
510
511
    #[test]
512
    fn test_add() {
513
        let precision1 = Precision::Exact(42);
514
        let precision2 = Precision::Inexact(23);
515
        let precision3 = Precision::Exact(30);
516
        let absent_precision = Precision::Absent;
517
518
        assert_eq!(precision1.add(&precision2), Precision::Inexact(65));
519
        assert_eq!(precision1.add(&precision3), Precision::Exact(72));
520
        assert_eq!(precision2.add(&precision3), Precision::Inexact(53));
521
        assert_eq!(precision1.add(&absent_precision), Precision::Absent);
522
    }
523
524
    #[test]
525
    fn test_sub() {
526
        let precision1 = Precision::Exact(42);
527
        let precision2 = Precision::Inexact(23);
528
        let precision3 = Precision::Exact(30);
529
        let absent_precision = Precision::Absent;
530
531
        assert_eq!(precision1.sub(&precision2), Precision::Inexact(19));
532
        assert_eq!(precision1.sub(&precision3), Precision::Exact(12));
533
        assert_eq!(precision1.sub(&absent_precision), Precision::Absent);
534
    }
535
536
    #[test]
537
    fn test_multiply() {
538
        let precision1 = Precision::Exact(6);
539
        let precision2 = Precision::Inexact(3);
540
        let precision3 = Precision::Exact(5);
541
        let absent_precision = Precision::Absent;
542
543
        assert_eq!(precision1.multiply(&precision2), Precision::Inexact(18));
544
        assert_eq!(precision1.multiply(&precision3), Precision::Exact(30));
545
        assert_eq!(precision2.multiply(&precision3), Precision::Inexact(15));
546
        assert_eq!(precision1.multiply(&absent_precision), Precision::Absent);
547
    }
548
549
    #[test]
550
    fn test_precision_cloning() {
551
        // Precision<usize> is copy
552
        let precision: Precision<usize> = Precision::Exact(42);
553
        let p2 = precision;
554
        assert_eq!(precision, p2);
555
556
        // Precision<ScalarValue> is not copy (requires .clone())
557
        let precision: Precision<ScalarValue> =
558
            Precision::Exact(ScalarValue::Int64(Some(42)));
559
        // Clippy would complain about this if it were Copy
560
        #[allow(clippy::redundant_clone)]
561
        let p2 = precision.clone();
562
        assert_eq!(precision, p2);
563
    }
564
}