Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/execution/src/disk_manager.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Manages files generated during query execution. Each file is created in
19
//! a randomly chosen directory from those available to the `DiskManager`.
20
21
use datafusion_common::{resources_datafusion_err, DataFusionError, Result};
22
use log::debug;
23
use parking_lot::Mutex;
24
use rand::{thread_rng, Rng};
25
use std::path::{Path, PathBuf};
26
use std::sync::Arc;
27
use tempfile::{Builder, NamedTempFile, TempDir};
28
29
/// Configuration for temporary disk access
30
#[derive(Debug, Clone)]
31
pub enum DiskManagerConfig {
32
    /// Use the provided [DiskManager] instance
33
    Existing(Arc<DiskManager>),
34
35
    /// Create a new [DiskManager] that creates temporary files within
36
    /// a temporary directory chosen by the OS
37
    NewOs,
38
39
    /// Create a new [DiskManager] that creates temporary files within
40
    /// the specified directories
41
    NewSpecified(Vec<PathBuf>),
42
43
    /// Disable disk manager, attempts to create temporary files will error
44
    Disabled,
45
}
46
47
impl Default for DiskManagerConfig {
48
937
    fn default() -> Self {
49
937
        Self::NewOs
50
937
    }
51
}
52
53
impl DiskManagerConfig {
54
    /// Create temporary files in a temporary directory chosen by the OS
55
0
    pub fn new() -> Self {
56
0
        Self::default()
57
0
    }
58
59
    /// Create temporary files using the provided disk manager
60
0
    pub fn new_existing(existing: Arc<DiskManager>) -> Self {
61
0
        Self::Existing(existing)
62
0
    }
63
64
    /// Create temporary files in the specified directories
65
0
    pub fn new_specified(paths: Vec<PathBuf>) -> Self {
66
0
        Self::NewSpecified(paths)
67
0
    }
68
}
69
70
/// Manages files generated during query execution, e.g. spill files generated
71
/// while processing datasets larger than available memory.
72
#[derive(Debug)]
73
pub struct DiskManager {
74
    /// TempDirs to put temporary files in.
75
    ///
76
    /// If `Some(vec![])`, an OS-chosen temporary directory is created on first use.
77
    /// If `None`, an error will be returned (configured not to spill).
78
    local_dirs: Mutex<Option<Vec<Arc<TempDir>>>>,
79
}
80
81
impl DiskManager {
82
    /// Create a DiskManager given the configuration
83
939
    pub fn try_new(config: DiskManagerConfig) -> Result<Arc<Self>> {
84
939
        match config {
85
0
            DiskManagerConfig::Existing(manager) => Ok(manager),
86
937
            DiskManagerConfig::NewOs => Ok(Arc::new(Self {
87
937
                local_dirs: Mutex::new(Some(vec![])),
88
937
            })),
89
0
            DiskManagerConfig::NewSpecified(conf_dirs) => {
90
0
                let local_dirs = create_local_dirs(conf_dirs)?;
91
0
                debug!(
92
0
                    "Created local dirs {:?} as DataFusion working directory",
93
                    local_dirs
94
                );
95
0
                Ok(Arc::new(Self {
96
0
                    local_dirs: Mutex::new(Some(local_dirs)),
97
0
                }))
98
            }
99
2
            DiskManagerConfig::Disabled => Ok(Arc::new(Self {
100
2
                local_dirs: Mutex::new(None),
101
2
            })),
102
        }
103
939
    }
104
105
    /// Return true if this disk manager supports creating temporary
106
    /// files. If this returns false, any call to `create_tmp_file`
107
    /// will error.
108
822
    pub fn tmp_files_enabled(&self) -> bool {
109
822
        self.local_dirs.lock().is_some()
110
822
    }
111
112
    /// Return a temporary file from a randomized choice in the configured locations
113
    ///
114
    /// Returns an error if the file cannot be created. If the disk manager is
115
    /// disabled, the error message references the request description.
116
47
    pub fn create_tmp_file(
117
47
        &self,
118
47
        request_description: &str,
119
47
    ) -> Result<RefCountedTempFile> {
120
47
        let mut guard = self.local_dirs.lock();
121
47
        let local_dirs = guard.as_mut().ok_or_else(|| {
122
0
            resources_datafusion_err!(
123
0
                "Memory Exhausted while {request_description} (DiskManager is disabled)"
124
0
            )
125
47
        })
?0
;
126
127
        // Create a temporary directory if needed
128
47
        if local_dirs.is_empty() {
129
10
            let tempdir = tempfile::tempdir().map_err(DataFusionError::IoError)
?0
;
130
131
10
            debug!(
132
0
                "Created directory '{:?}' as DataFusion tempfile directory for {}",
133
0
                tempdir.path().to_string_lossy(),
134
                request_description,
135
            );
136
137
10
            local_dirs.push(Arc::new(tempdir));
138
37
        }
139
140
47
        let dir_index = thread_rng().gen_range(0..local_dirs.len());
141
47
        Ok(RefCountedTempFile {
142
47
            parent_temp_dir: Arc::clone(&local_dirs[dir_index]),
143
47
            tempfile: Builder::new()
144
47
                .tempfile_in(local_dirs[dir_index].as_ref())
145
47
                .map_err(DataFusionError::IoError)
?0
,
146
        })
147
47
    }
148
}
149
150
/// A wrapper around a [`NamedTempFile`] that also contains
151
/// a reference to its parent temporary directory
152
#[derive(Debug)]
153
pub struct RefCountedTempFile {
154
    /// The reference to the directory in which temporary files are created to ensure
155
    /// it is not cleaned up prior to the NamedTempFile
156
    #[allow(dead_code)]
157
    parent_temp_dir: Arc<TempDir>,
158
    tempfile: NamedTempFile,
159
}
160
161
impl RefCountedTempFile {
162
217
    pub fn path(&self) -> &Path {
163
217
        self.tempfile.path()
164
217
    }
165
166
0
    pub fn inner(&self) -> &NamedTempFile {
167
0
        &self.tempfile
168
0
    }
169
}
170
171
/// Set up local dirs by creating one new dir in each of the given dirs
172
0
fn create_local_dirs(local_dirs: Vec<PathBuf>) -> Result<Vec<Arc<TempDir>>> {
173
0
    local_dirs
174
0
        .iter()
175
0
        .map(|root| {
176
0
            if !std::path::Path::new(root).exists() {
177
0
                std::fs::create_dir(root)?;
178
0
            }
179
0
            Builder::new()
180
0
                .prefix("datafusion-")
181
0
                .tempdir_in(root)
182
0
                .map_err(DataFusionError::IoError)
183
0
        })
184
0
        .map(|result| result.map(Arc::new))
185
0
        .collect()
186
0
}
187
188
#[cfg(test)]
189
mod tests {
190
    use super::*;
191
192
    #[test]
193
    fn lazy_temp_dir_creation() -> Result<()> {
194
        // A default configuration should not create a temp dir until requested
195
        let config = DiskManagerConfig::new();
196
        let dm = DiskManager::try_new(config)?;
197
198
        assert_eq!(0, local_dir_snapshot(&dm).len());
199
200
        // can still create a tempfile however:
201
        let actual = dm.create_tmp_file("Testing")?;
202
203
        // Now the tempdir has been created on demand
204
        assert_eq!(1, local_dir_snapshot(&dm).len());
205
206
        // the returned tempfile file should be in the temp directory
207
        let local_dirs = local_dir_snapshot(&dm);
208
        assert_path_in_dirs(actual.path(), local_dirs.iter().map(|p| p.as_path()));
209
210
        Ok(())
211
    }
212
213
    fn local_dir_snapshot(dm: &DiskManager) -> Vec<PathBuf> {
214
        dm.local_dirs
215
            .lock()
216
            .iter()
217
            .flatten()
218
            .map(|p| p.path().into())
219
            .collect()
220
    }
221
222
    #[test]
223
    fn file_in_right_dir() -> Result<()> {
224
        let local_dir1 = TempDir::new()?;
225
        let local_dir2 = TempDir::new()?;
226
        let local_dir3 = TempDir::new()?;
227
        let local_dirs = vec![local_dir1.path(), local_dir2.path(), local_dir3.path()];
228
        let config = DiskManagerConfig::new_specified(
229
            local_dirs.iter().map(|p| p.into()).collect(),
230
        );
231
232
        let dm = DiskManager::try_new(config)?;
233
        assert!(dm.tmp_files_enabled());
234
        let actual = dm.create_tmp_file("Testing")?;
235
236
        // the file should be in one of the specified local directories
237
        assert_path_in_dirs(actual.path(), local_dirs.into_iter());
238
239
        Ok(())
240
    }
241
242
    #[test]
243
    fn test_disabled_disk_manager() {
244
        let config = DiskManagerConfig::Disabled;
245
        let manager = DiskManager::try_new(config).unwrap();
246
        assert!(!manager.tmp_files_enabled());
247
        assert_eq!(
248
            manager.create_tmp_file("Testing").unwrap_err().strip_backtrace(),
249
            "Resources exhausted: Memory Exhausted while Testing (DiskManager is disabled)",
250
        )
251
    }
252
253
    #[test]
254
    fn test_disk_manager_create_spill_folder() {
255
        let config = DiskManagerConfig::new_specified(vec!["DOESNT_EXIST".into()]);
256
257
        DiskManager::try_new(config)
258
            .unwrap()
259
            .create_tmp_file("Testing")
260
            .unwrap();
261
    }
262
263
    /// Asserts that `file_path` is located under at least one of the `dirs` directories
264
    fn assert_path_in_dirs<'a>(
265
        file_path: &'a Path,
266
        dirs: impl Iterator<Item = &'a Path>,
267
    ) {
268
        let dirs: Vec<&Path> = dirs.collect();
269
270
        let found = dirs.iter().any(|dir_path| {
271
            file_path
272
                .ancestors()
273
                .any(|candidate_path| *dir_path == candidate_path)
274
        });
275
276
        assert!(found, "Can't find {file_path:?} in dirs: {dirs:?}");
277
    }
278
279
    #[test]
280
    fn test_temp_file_still_alive_after_disk_manager_dropped() -> Result<()> {
281
        // Test for the case using OS arranged temporary directory
282
        let config = DiskManagerConfig::new();
283
        let dm = DiskManager::try_new(config)?;
284
        let temp_file = dm.create_tmp_file("Testing")?;
285
        let temp_file_path = temp_file.path().to_owned();
286
        assert!(temp_file_path.exists());
287
288
        drop(dm);
289
        assert!(temp_file_path.exists());
290
291
        drop(temp_file);
292
        assert!(!temp_file_path.exists());
293
294
        // Test for the case using specified directories
295
        let local_dir1 = TempDir::new()?;
296
        let local_dir2 = TempDir::new()?;
297
        let local_dir3 = TempDir::new()?;
298
        let local_dirs = [local_dir1.path(), local_dir2.path(), local_dir3.path()];
299
        let config = DiskManagerConfig::new_specified(
300
            local_dirs.iter().map(|p| p.into()).collect(),
301
        );
302
        let dm = DiskManager::try_new(config)?;
303
        let temp_file = dm.create_tmp_file("Testing")?;
304
        let temp_file_path = temp_file.path().to_owned();
305
        assert!(temp_file_path.exists());
306
307
        drop(dm);
308
        assert!(temp_file_path.exists());
309
310
        drop(temp_file);
311
        assert!(!temp_file_path.exists());
312
313
        Ok(())
314
    }
315
}