Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/common/src/utils/memory.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! This module provides a function to estimate the memory size of a HashTable prior to allocation
19
20
use crate::{DataFusionError, Result};
21
22
/// Estimates the memory size required for a hash table prior to allocation.
23
///
24
/// # Parameters
25
/// - `num_elements`: The number of elements expected in the hash table.
26
/// - `fixed_size`: A fixed overhead size associated with the collection
27
///    (e.g., HashSet or HashTable).
28
/// - `T`: The type of elements stored in the hash table.
29
///
30
/// # Details
31
/// This function calculates the estimated memory size by considering:
32
/// - An overestimation of buckets to keep approximately 1/8 of them empty.
33
/// - The total memory size is computed as:
34
///   - The size of each entry (`T`) multiplied by the estimated number of
35
///     buckets.
36
///   - One byte overhead for each bucket.
37
///   - The fixed size overhead of the collection.
38
/// - If the estimation overflows `usize`, a [`DataFusionError`] is returned.
39
///
40
/// # Examples
41
/// ---
42
///
43
/// ## From within a struct
44
///
45
/// ```rust
46
/// # use datafusion_common::utils::memory::estimate_memory_size;
47
/// # use datafusion_common::Result;
48
///
49
/// struct MyStruct<T> {
50
///     values: Vec<T>,
51
///     other_data: usize,
52
/// }
53
///
54
/// impl<T> MyStruct<T> {
55
///     fn size(&self) -> Result<usize> {
56
///         let num_elements = self.values.len();
57
///         let fixed_size = std::mem::size_of_val(self) +
58
///           std::mem::size_of_val(&self.values);
59
///
60
///         estimate_memory_size::<T>(num_elements, fixed_size)
61
///     }
62
/// }
63
/// ```
64
/// ---
65
/// ## With a simple collection
66
///
67
/// ```rust
68
/// # use datafusion_common::utils::memory::estimate_memory_size;
69
/// # use std::collections::HashMap;
70
///
71
/// let num_rows = 100;
72
/// let fixed_size = std::mem::size_of::<HashMap<u64, u64>>();
73
/// let estimated_hashtable_size =
74
///   estimate_memory_size::<(u64, u64)>(num_rows,fixed_size)
75
///     .expect("Size estimation failed");
76
/// ```
77
1.71k
pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result<usize> {
78
1.71k
    // For the majority of cases hashbrown overestimates the bucket quantity
79
1.71k
    // to keep ~1/8 of them empty. We take this factor into account by
80
1.71k
    // multiplying the number of elements with a fixed ratio of 8/7 (~1.14).
81
1.71k
    // This formula leads to overallocation for small tables (< 8 elements)
82
1.71k
    // but should be fine overall.
83
1.71k
    num_elements
84
1.71k
        .checked_mul(8) // `None` when `num_elements * 8` does not fit in `usize`
85
1.71k
        .and_then(|overestimate| {
86
1.71k
            let estimated_buckets = (overestimate / 7).next_power_of_two(); // cannot overflow: usize::MAX / 7 is below 2^(BITS - 2)
87
1.71k
            // Estimate = size of entry (`T`) * number of buckets
88
1.71k
            //          + 1 byte for each bucket
89
1.71k
            //          + fixed size of collection (HashSet/HashTable); any overflowing step yields `None`
90
1.71k
            std::mem::size_of::<T>()
91
1.71k
                .checked_mul(estimated_buckets)
?0
92
1.71k
                .checked_add(estimated_buckets)
?0
93
1.71k
                .checked_add(fixed_size)
94
1.71k
        })
95
1.71k
        .ok_or_else(|| { // a `None` from any checked step above is reported as an Execution error
96
0
            DataFusionError::Execution(
97
0
                "usize overflow while estimating the number of buckets".to_string(),
98
0
            )
99
1.71k
        })
100
1.71k
}
101
102
#[cfg(test)]
103
mod tests {
104
    use std::collections::HashSet;
105
106
    use super::estimate_memory_size;
107
108
    #[test]
109
    fn test_estimate_memory() {
110
        // fixed size (bytes): 48; the expected totals below assume this value
111
        let fixed_size = std::mem::size_of::<HashSet<u32>>();
112
113
        // estimated buckets: 16 = (8 * 8 / 7).next_power_of_two(), i.e. 9 rounded up
114
        let num_elements = 8;
115
        // size (bytes): 128 = 16 * 4 (u32 entries) + 16 (1 byte per bucket) + 48 (fixed)
116
        let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
117
        assert_eq!(estimated, 128);
118
119
        // estimated buckets: 64 = (40 * 8 / 7).next_power_of_two(), i.e. 45 rounded up
120
        let num_elements = 40;
121
        // size (bytes): 368 = 64 * 4 (u32 entries) + 64 (1 byte per bucket) + 48 (fixed)
122
        let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
123
        assert_eq!(estimated, 368);
124
    }
125
126
    #[test]
127
    fn test_estimate_memory_overflow() {
128
        let num_elements = usize::MAX; // usize::MAX * 8 overflows, so the estimate must fail
129
        let fixed_size = std::mem::size_of::<HashSet<u32>>();
130
        let estimated = estimate_memory_size::<u32>(num_elements, fixed_size);
131
132
        assert!(estimated.is_err());
133
    }
134
}