Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/execution/src/cache/cache_unit.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::sync::Arc;
19
20
use crate::cache::CacheAccessor;
21
22
use datafusion_common::Statistics;
23
24
use dashmap::DashMap;
25
use object_store::path::Path;
26
use object_store::ObjectMeta;
27
28
/// Collected statistics for files, keyed by file path
29
/// A cached entry is invalidated when the file size or last modification time has changed
30
#[derive(Default)]
31
pub struct DefaultFileStatisticsCache {
32
    statistics: DashMap<Path, (ObjectMeta, Arc<Statistics>)>,
33
}
34
35
impl CacheAccessor<Path, Arc<Statistics>> for DefaultFileStatisticsCache {
36
    type Extra = ObjectMeta;
37
38
    /// Get `Statistics` for file location.
39
    fn get(&self, k: &Path) -> Option<Arc<Statistics>> {
40
        self.statistics
41
            .get(k)
42
0
            .map(|s| Some(Arc::clone(&s.value().1)))
43
            .unwrap_or(None)
44
    }
45
46
    /// Get `Statistics` for file location. Returns None if file has changed or not found.
47
    fn get_with_extra(&self, k: &Path, e: &Self::Extra) -> Option<Arc<Statistics>> {
48
        self.statistics
49
            .get(k)
50
0
            .map(|s| {
51
0
                let (saved_meta, statistics) = s.value();
52
0
                if saved_meta.size != e.size
53
0
                    || saved_meta.last_modified != e.last_modified
54
                {
55
                    // file has changed
56
0
                    None
57
                } else {
58
0
                    Some(Arc::clone(statistics))
59
                }
60
0
            })
61
            .unwrap_or(None)
62
    }
63
64
    /// Save collected file statistics
65
    fn put(&self, _key: &Path, _value: Arc<Statistics>) -> Option<Arc<Statistics>> {
66
        panic!("Put cache in DefaultFileStatisticsCache without Extra not supported.")
67
    }
68
69
    fn put_with_extra(
70
        &self,
71
        key: &Path,
72
        value: Arc<Statistics>,
73
        e: &Self::Extra,
74
    ) -> Option<Arc<Statistics>> {
75
        self.statistics
76
            .insert(key.clone(), (e.clone(), value))
77
0
            .map(|x| x.1)
78
    }
79
80
    fn remove(&mut self, k: &Path) -> Option<Arc<Statistics>> {
81
0
        self.statistics.remove(k).map(|x| x.1 .1)
82
    }
83
84
    fn contains_key(&self, k: &Path) -> bool {
85
        self.statistics.contains_key(k)
86
    }
87
88
    fn len(&self) -> usize {
89
        self.statistics.len()
90
    }
91
92
    fn clear(&self) {
93
        self.statistics.clear()
94
    }
95
    fn name(&self) -> String {
96
        "DefaultFileStatisticsCache".to_string()
97
    }
98
}
99
100
/// Collected files metadata for listing files.
101
/// Cached entries are not invalidated until the user calls remove or clear.
102
#[derive(Default)]
103
pub struct DefaultListFilesCache {
104
    statistics: DashMap<Path, Arc<Vec<ObjectMeta>>>,
105
}
106
107
impl CacheAccessor<Path, Arc<Vec<ObjectMeta>>> for DefaultListFilesCache {
    type Extra = ObjectMeta;

    /// Returns the file listing stored under `k`, or `None` if there is none.
    fn get(&self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
        let entry = self.statistics.get(k)?;
        Some(Arc::clone(entry.value()))
    }

    /// Not supported by this cache.
    ///
    /// # Panics
    ///
    /// Always panics.
    fn get_with_extra(
        &self,
        _k: &Path,
        _e: &Self::Extra,
    ) -> Option<Arc<Vec<ObjectMeta>>> {
        panic!("Not supported DefaultListFilesCache get_with_extra")
    }

    /// Stores `value` under `key`, returning the listing it replaced, if any.
    fn put(
        &self,
        key: &Path,
        value: Arc<Vec<ObjectMeta>>,
    ) -> Option<Arc<Vec<ObjectMeta>>> {
        let owned_key = key.clone();
        self.statistics.insert(owned_key, value)
    }

    /// Not supported by this cache.
    ///
    /// # Panics
    ///
    /// Always panics.
    fn put_with_extra(
        &self,
        _key: &Path,
        _value: Arc<Vec<ObjectMeta>>,
        _e: &Self::Extra,
    ) -> Option<Arc<Vec<ObjectMeta>>> {
        panic!("Not supported DefaultListFilesCache put_with_extra")
    }

    /// Removes the listing stored under `k` and returns it if it was present.
    fn remove(&mut self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
        let (_path, files) = self.statistics.remove(k)?;
        Some(files)
    }

    /// Returns true if a listing is stored under `k`.
    fn contains_key(&self, k: &Path) -> bool {
        self.statistics.contains_key(k)
    }

    /// Number of listings currently stored.
    fn len(&self) -> usize {
        self.statistics.len()
    }

    /// Removes every stored listing.
    fn clear(&self) {
        self.statistics.clear();
    }

    /// Name of this cache implementation.
    fn name(&self) -> String {
        String::from("DefaultListFilesCache")
    }
}
159
160
#[cfg(test)]
mod tests {
    use crate::cache::cache_unit::{DefaultFileStatisticsCache, DefaultListFilesCache};
    use crate::cache::CacheAccessor;
    use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
    use chrono::DateTime;
    use datafusion_common::Statistics;
    use object_store::path::Path;
    use object_store::ObjectMeta;

    /// Builds the `ObjectMeta` fixture shared by the tests in this module.
    fn test_meta() -> ObjectMeta {
        ObjectMeta {
            location: Path::from("test"),
            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
                .unwrap()
                .into(),
            size: 1024,
            e_tag: None,
            version: None,
        }
    }

    #[test]
    fn test_statistics_cache() {
        let meta = test_meta();
        let cache = DefaultFileStatisticsCache::default();

        // nothing has been cached yet
        assert!(cache.get_with_extra(&meta.location, &meta).is_none());

        let schema = Schema::new(vec![Field::new(
            "test_column",
            DataType::Timestamp(TimeUnit::Second, None),
            false,
        )]);
        cache.put_with_extra(
            &meta.location,
            Statistics::new_unknown(&schema).into(),
            &meta,
        );
        assert!(cache.get_with_extra(&meta.location, &meta).is_some());

        // file size changed
        let resized = ObjectMeta {
            size: 2048,
            ..meta.clone()
        };
        assert!(cache.get_with_extra(&resized.location, &resized).is_none());

        // file last_modified changed
        let touched = ObjectMeta {
            last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:40:00+02:00")
                .unwrap()
                .into(),
            ..meta.clone()
        };
        assert!(cache.get_with_extra(&touched.location, &touched).is_none());

        // different file
        let other_file = ObjectMeta {
            location: Path::from("test2"),
            ..meta
        };
        assert!(cache
            .get_with_extra(&other_file.location, &other_file)
            .is_none());
    }

    #[test]
    fn test_list_file_cache() {
        let meta = test_meta();
        let cache = DefaultListFilesCache::default();

        // nothing has been cached yet
        assert!(cache.get(&meta.location).is_none());

        cache.put(&meta.location, vec![meta.clone()].into());
        let cached = cache.get(&meta.location).unwrap();
        assert_eq!(cached.first().unwrap().clone(), meta.clone());
    }
}