/Users/andrewlamb/Software/datafusion/datafusion/execution/src/disk_manager.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Manages files generated during query execution. Each temporary file is |
19 | | //! placed in a directory picked at random from those configured through [`DiskManagerConfig`]. |
20 | | |
21 | | use datafusion_common::{resources_datafusion_err, DataFusionError, Result}; |
22 | | use log::debug; |
23 | | use parking_lot::Mutex; |
24 | | use rand::{thread_rng, Rng}; |
25 | | use std::path::{Path, PathBuf}; |
26 | | use std::sync::Arc; |
27 | | use tempfile::{Builder, NamedTempFile, TempDir}; |
28 | | |
29 | | /// Configuration for temporary disk access |
30 | | #[derive(Debug, Clone)] |
31 | | pub enum DiskManagerConfig { |
32 | | /// Use the provided [DiskManager] instance |
33 | | Existing(Arc<DiskManager>), |
34 | | |
35 | | /// Create a new [DiskManager] that creates temporary files within |
36 | | /// a temporary directory chosen by the OS |
37 | | NewOs, |
38 | | |
39 | | /// Create a new [DiskManager] that creates temporary files within |
40 | | /// the specified directories |
41 | | NewSpecified(Vec<PathBuf>), |
42 | | |
43 | | /// Disable disk manager, attempts to create temporary files will error |
44 | | Disabled, |
45 | | } |
46 | | |
47 | | impl Default for DiskManagerConfig { |
48 | 937 | fn default() -> Self { |
49 | 937 | Self::NewOs |
50 | 937 | } |
51 | | } |
52 | | |
53 | | impl DiskManagerConfig { |
54 | | /// Create temporary files in a temporary directory chosen by the OS |
55 | 0 | pub fn new() -> Self { |
56 | 0 | Self::default() |
57 | 0 | } |
58 | | |
59 | | /// Create temporary files using the provided disk manager |
60 | 0 | pub fn new_existing(existing: Arc<DiskManager>) -> Self { |
61 | 0 | Self::Existing(existing) |
62 | 0 | } |
63 | | |
64 | | /// Create temporary files in the specified directories |
65 | 0 | pub fn new_specified(paths: Vec<PathBuf>) -> Self { |
66 | 0 | Self::NewSpecified(paths) |
67 | 0 | } |
68 | | } |
69 | | |
70 | | /// Manages files generated during query execution, e.g. spill files generated |
71 | | /// while processing dataset larger than available memory. |
72 | | #[derive(Debug)] |
73 | | pub struct DiskManager { |
74 | | /// TempDirs to put temporary files in. |
75 | | /// |
76 | | /// If `Some(vec![])` a new OS specified temporary directory will be created |
77 | | /// If `None` an error will be returned (configured not to spill) |
78 | | local_dirs: Mutex<Option<Vec<Arc<TempDir>>>>, |
79 | | } |
80 | | |
81 | | impl DiskManager { |
82 | | /// Create a DiskManager given the configuration |
83 | 939 | pub fn try_new(config: DiskManagerConfig) -> Result<Arc<Self>> { |
84 | 939 | match config { |
85 | 0 | DiskManagerConfig::Existing(manager) => Ok(manager), |
86 | 937 | DiskManagerConfig::NewOs => Ok(Arc::new(Self { |
87 | 937 | local_dirs: Mutex::new(Some(vec![])), |
88 | 937 | })), |
89 | 0 | DiskManagerConfig::NewSpecified(conf_dirs) => { |
90 | 0 | let local_dirs = create_local_dirs(conf_dirs)?; |
91 | 0 | debug!( |
92 | 0 | "Created local dirs {:?} as DataFusion working directory", |
93 | | local_dirs |
94 | | ); |
95 | 0 | Ok(Arc::new(Self { |
96 | 0 | local_dirs: Mutex::new(Some(local_dirs)), |
97 | 0 | })) |
98 | | } |
99 | 2 | DiskManagerConfig::Disabled => Ok(Arc::new(Self { |
100 | 2 | local_dirs: Mutex::new(None), |
101 | 2 | })), |
102 | | } |
103 | 939 | } |
104 | | |
105 | | /// Return true if this disk manager supports creating temporary |
106 | | /// files. If this returns false, any call to `create_tmp_file` |
107 | | /// will error. |
108 | 822 | pub fn tmp_files_enabled(&self) -> bool { |
109 | 822 | self.local_dirs.lock().is_some() |
110 | 822 | } |
111 | | |
112 | | /// Return a temporary file from a randomized choice in the configured locations |
113 | | /// |
114 | | /// If the file can not be created for some reason, returns an |
115 | | /// error message referencing the request description |
116 | 47 | pub fn create_tmp_file( |
117 | 47 | &self, |
118 | 47 | request_description: &str, |
119 | 47 | ) -> Result<RefCountedTempFile> { |
120 | 47 | let mut guard = self.local_dirs.lock(); |
121 | 47 | let local_dirs = guard.as_mut().ok_or_else(|| { |
122 | 0 | resources_datafusion_err!( |
123 | 0 | "Memory Exhausted while {request_description} (DiskManager is disabled)" |
124 | 0 | ) |
125 | 47 | })?0 ; |
126 | | |
127 | | // Create a temporary directory if needed |
128 | 47 | if local_dirs.is_empty() { |
129 | 10 | let tempdir = tempfile::tempdir().map_err(DataFusionError::IoError)?0 ; |
130 | | |
131 | 10 | debug!( |
132 | 0 | "Created directory '{:?}' as DataFusion tempfile directory for {}", |
133 | 0 | tempdir.path().to_string_lossy(), |
134 | | request_description, |
135 | | ); |
136 | | |
137 | 10 | local_dirs.push(Arc::new(tempdir)); |
138 | 37 | } |
139 | | |
140 | 47 | let dir_index = thread_rng().gen_range(0..local_dirs.len()); |
141 | 47 | Ok(RefCountedTempFile { |
142 | 47 | parent_temp_dir: Arc::clone(&local_dirs[dir_index]), |
143 | 47 | tempfile: Builder::new() |
144 | 47 | .tempfile_in(local_dirs[dir_index].as_ref()) |
145 | 47 | .map_err(DataFusionError::IoError)?0 , |
146 | | }) |
147 | 47 | } |
148 | | } |
149 | | |
150 | | /// A wrapper around a [`NamedTempFile`] that also contains |
151 | | /// a reference to its parent temporary directory |
152 | | #[derive(Debug)] |
153 | | pub struct RefCountedTempFile { |
154 | | /// The reference to the directory in which temporary files are created to ensure |
155 | | /// it is not cleaned up prior to the NamedTempFile |
156 | | #[allow(dead_code)] |
157 | | parent_temp_dir: Arc<TempDir>, |
158 | | tempfile: NamedTempFile, |
159 | | } |
160 | | |
161 | | impl RefCountedTempFile { |
162 | 217 | pub fn path(&self) -> &Path { |
163 | 217 | self.tempfile.path() |
164 | 217 | } |
165 | | |
166 | 0 | pub fn inner(&self) -> &NamedTempFile { |
167 | 0 | &self.tempfile |
168 | 0 | } |
169 | | } |
170 | | |
171 | | /// Setup local dirs by creating one new dir in each of the given dirs |
172 | 0 | fn create_local_dirs(local_dirs: Vec<PathBuf>) -> Result<Vec<Arc<TempDir>>> { |
173 | 0 | local_dirs |
174 | 0 | .iter() |
175 | 0 | .map(|root| { |
176 | 0 | if !std::path::Path::new(root).exists() { |
177 | 0 | std::fs::create_dir(root)?; |
178 | 0 | } |
179 | 0 | Builder::new() |
180 | 0 | .prefix("datafusion-") |
181 | 0 | .tempdir_in(root) |
182 | 0 | .map_err(DataFusionError::IoError) |
183 | 0 | }) |
184 | 0 | .map(|result| result.map(Arc::new)) |
185 | 0 | .collect() |
186 | 0 | } |
187 | | |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lazy_temp_dir_creation() -> Result<()> {
        // A default configuration should not create temp files until requested
        let config = DiskManagerConfig::new();
        let dm = DiskManager::try_new(config)?;

        assert_eq!(0, local_dir_snapshot(&dm).len());

        // can still create a tempfile however:
        let actual = dm.create_tmp_file("Testing")?;

        // Now the tempdir has been created on demand
        assert_eq!(1, local_dir_snapshot(&dm).len());

        // the returned tempfile file should be in the temp directory
        let local_dirs = local_dir_snapshot(&dm);
        assert_path_in_dirs(actual.path(), local_dirs.iter().map(|p| p.as_path()));

        Ok(())
    }

    /// Returns the paths of the directories currently held by `dm`
    /// (empty when the manager is disabled or nothing has been created yet)
    fn local_dir_snapshot(dm: &DiskManager) -> Vec<PathBuf> {
        dm.local_dirs
            .lock()
            .iter()
            .flatten()
            .map(|p| p.path().into())
            .collect()
    }

    #[test]
    fn file_in_right_dir() -> Result<()> {
        let local_dir1 = TempDir::new()?;
        let local_dir2 = TempDir::new()?;
        let local_dir3 = TempDir::new()?;
        let local_dirs = vec![local_dir1.path(), local_dir2.path(), local_dir3.path()];
        let config = DiskManagerConfig::new_specified(
            local_dirs.iter().map(|p| p.into()).collect(),
        );

        let dm = DiskManager::try_new(config)?;
        assert!(dm.tmp_files_enabled());
        let actual = dm.create_tmp_file("Testing")?;

        // the file should be in one of the specified local directories
        assert_path_in_dirs(actual.path(), local_dirs.into_iter());

        Ok(())
    }

    #[test]
    fn test_disabled_disk_manager() {
        let config = DiskManagerConfig::Disabled;
        let manager = DiskManager::try_new(config).unwrap();
        assert!(!manager.tmp_files_enabled());
        assert_eq!(
            manager.create_tmp_file("Testing").unwrap_err().strip_backtrace(),
            "Resources exhausted: Memory Exhausted while Testing (DiskManager is disabled)",
        )
    }

    #[test]
    fn test_disk_manager_create_spill_folder() {
        // Put the missing root inside a scratch directory so the test does
        // not leave a `DOESNT_EXIST` folder in the working directory.
        let scratch = TempDir::new().unwrap();
        let missing_root = scratch.path().join("DOESNT_EXIST");
        assert!(!missing_root.exists());

        let config = DiskManagerConfig::new_specified(vec![missing_root.clone()]);
        let dm = DiskManager::try_new(config).unwrap();

        // the missing root must have been created on demand
        assert!(missing_root.exists());

        let actual = dm.create_tmp_file("Testing").unwrap();
        assert_path_in_dirs(actual.path(), std::iter::once(missing_root.as_path()));
    }

    /// Asserts that `file_path` is found anywhere in any of `dir` directories
    fn assert_path_in_dirs<'a>(
        file_path: &'a Path,
        dirs: impl Iterator<Item = &'a Path>,
    ) {
        let dirs: Vec<&Path> = dirs.collect();

        let found = dirs.iter().any(|dir_path| {
            file_path
                .ancestors()
                .any(|candidate_path| *dir_path == candidate_path)
        });

        assert!(found, "Can't find {file_path:?} in dirs: {dirs:?}");
    }

    #[test]
    fn test_temp_file_still_alive_after_disk_manager_dropped() -> Result<()> {
        // Test for the case using OS arranged temporary directory
        let config = DiskManagerConfig::new();
        let dm = DiskManager::try_new(config)?;
        let temp_file = dm.create_tmp_file("Testing")?;
        let temp_file_path = temp_file.path().to_owned();
        assert!(temp_file_path.exists());

        drop(dm);
        assert!(temp_file_path.exists());

        drop(temp_file);
        assert!(!temp_file_path.exists());

        // Test for the case using specified directories
        let local_dir1 = TempDir::new()?;
        let local_dir2 = TempDir::new()?;
        let local_dir3 = TempDir::new()?;
        let local_dirs = [local_dir1.path(), local_dir2.path(), local_dir3.path()];
        let config = DiskManagerConfig::new_specified(
            local_dirs.iter().map(|p| p.into()).collect(),
        );
        let dm = DiskManager::try_new(config)?;
        let temp_file = dm.create_tmp_file("Testing")?;
        let temp_file_path = temp_file.path().to_owned();
        assert!(temp_file_path.exists());

        drop(dm);
        assert!(temp_file_path.exists());

        drop(temp_file);
        assert!(!temp_file_path.exists());

        Ok(())
    }
}