/Users/andrewlamb/Software/datafusion/datafusion/common/src/utils/memory.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! This module provides a function to estimate the memory size of a HashTable prior to allocation |
19 | | |
20 | | use crate::{DataFusionError, Result}; |
21 | | |
22 | | /// Estimates the memory size required for a hash table prior to allocation. |
23 | | /// |
24 | | /// # Parameters |
25 | | /// - `num_elements`: The number of elements expected in the hash table. |
26 | | /// - `fixed_size`: A fixed overhead size associated with the collection |
27 | | /// (e.g., HashSet or HashTable). |
28 | | /// - `T`: The type of elements stored in the hash table. |
29 | | /// |
30 | | /// # Details |
31 | | /// This function calculates the estimated memory size by considering: |
32 | | /// - An overestimation of buckets to keep approximately 1/8 of them empty. |
33 | | /// - The total memory size is computed as: |
34 | | /// - The size of each entry (`T`) multiplied by the estimated number of |
35 | | /// buckets. |
36 | | /// - One byte overhead for each bucket. |
37 | | /// - The fixed size overhead of the collection. |
38 | | /// - If the estimation overflows, we return a [`DataFusionError`] |
39 | | /// |
40 | | /// # Examples |
41 | | /// --- |
42 | | /// |
43 | | /// ## From within a struct |
44 | | /// |
45 | | /// ```rust |
46 | | /// # use datafusion_common::utils::memory::estimate_memory_size; |
47 | | /// # use datafusion_common::Result; |
48 | | /// |
49 | | /// struct MyStruct<T> { |
50 | | /// values: Vec<T>, |
51 | | /// other_data: usize, |
52 | | /// } |
53 | | /// |
54 | | /// impl<T> MyStruct<T> { |
55 | | /// fn size(&self) -> Result<usize> { |
56 | | /// let num_elements = self.values.len(); |
57 | | /// let fixed_size = std::mem::size_of_val(self) + |
58 | | /// std::mem::size_of_val(&self.values); |
59 | | /// |
60 | | /// estimate_memory_size::<T>(num_elements, fixed_size) |
61 | | /// } |
62 | | /// } |
63 | | /// ``` |
64 | | /// --- |
65 | | /// ## With a simple collection |
66 | | /// |
67 | | /// ```rust |
68 | | /// # use datafusion_common::utils::memory::estimate_memory_size; |
69 | | /// # use std::collections::HashMap; |
70 | | /// |
71 | | /// let num_rows = 100; |
72 | | /// let fixed_size = std::mem::size_of::<HashMap<u64, u64>>(); |
73 | | /// let estimated_hashtable_size = |
74 | | /// estimate_memory_size::<(u64, u64)>(num_rows,fixed_size) |
75 | | /// .expect("Size estimation failed"); |
76 | | /// ``` |
77 | 1.71k | pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result<usize> { |
78 | 1.71k | // For the majority of cases hashbrown overestimates the bucket quantity |
79 | 1.71k | // to keep ~1/8 of them empty. We take this factor into account by |
80 | 1.71k | // multiplying the number of elements with a fixed ratio of 8/7 (~1.14). |
81 | 1.71k | // This formula leads to overallocation for small tables (< 8 elements) |
82 | 1.71k | // but should be fine overall. |
83 | 1.71k | num_elements |
84 | 1.71k | .checked_mul(8) |
85 | 1.71k | .and_then(|overestimate| { |
86 | 1.71k | let estimated_buckets = (overestimate / 7).next_power_of_two(); |
87 | 1.71k | // + size of entry * number of buckets |
88 | 1.71k | // + 1 byte for each bucket |
89 | 1.71k | // + fixed size of collection (HashSet/HashTable) |
90 | 1.71k | std::mem::size_of::<T>() |
91 | 1.71k | .checked_mul(estimated_buckets)?0 |
92 | 1.71k | .checked_add(estimated_buckets)?0 |
93 | 1.71k | .checked_add(fixed_size) |
94 | 1.71k | }) |
95 | 1.71k | .ok_or_else(|| { |
96 | 0 | DataFusionError::Execution( |
97 | 0 | "usize overflow while estimating the number of buckets".to_string(), |
98 | 0 | ) |
99 | 1.71k | }) |
100 | 1.71k | } |
101 | | |
#[cfg(test)]
mod tests {
    use std::collections::HashSet;
    use std::mem::size_of;

    use super::estimate_memory_size;

    /// Checks the estimate against the documented formula:
    /// `buckets * size_of::<T>() + buckets + fixed_size`.
    ///
    /// Expected values are derived from `size_of` rather than hard-coded so
    /// the test does not depend on the size of `HashSet<u32>` on the target
    /// (the original expectations assumed 48 bytes).
    #[test]
    fn test_estimate_memory() {
        let fixed_size = size_of::<HashSet<u32>>();
        let entry_size = size_of::<u32>();

        // estimated buckets: 1 = (0 * 8 / 7).next_power_of_two()
        let estimated = estimate_memory_size::<u32>(0, fixed_size).unwrap();
        assert_eq!(estimated, entry_size + 1 + fixed_size);

        // estimated buckets: 16 = (8 * 8 / 7).next_power_of_two()
        let num_elements = 8;
        // size (bytes): 16 * 4 + 16 + fixed_size
        let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
        assert_eq!(estimated, 16 * entry_size + 16 + fixed_size);

        // estimated buckets: 64 = (40 * 8 / 7).next_power_of_two()
        let num_elements = 40;
        // size (bytes): 64 * 4 + 64 + fixed_size
        let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
        assert_eq!(estimated, 64 * entry_size + 64 + fixed_size);

        // Zero-sized entries still pay the one-byte-per-bucket overhead.
        let estimated = estimate_memory_size::<()>(8, 0).unwrap();
        assert_eq!(estimated, 16);
    }

    /// Each checked step of the estimation must surface overflow as an error.
    #[test]
    fn test_estimate_memory_overflow() {
        let fixed_size = size_of::<HashSet<u32>>();

        // Overflow while scaling the element count by 8.
        let estimated = estimate_memory_size::<u32>(usize::MAX, fixed_size);
        assert!(estimated.is_err());

        // The scaling succeeds, but entry size * bucket count overflows.
        let estimated = estimate_memory_size::<u32>(usize::MAX / 8, fixed_size);
        assert!(estimated.is_err());

        // The table itself fits, but adding the fixed overhead overflows.
        let estimated = estimate_memory_size::<u32>(8, usize::MAX);
        assert!(estimated.is_err());
    }
}