-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[feat] Introduce cacheManager in session ctx and make StatisticsCache share in session #7570
Changes from 3 commits
be03887
04a2f06
b81230d
382888f
a66ee12
0160108
8060a76
aaa912d
c5c162a
83ba66a
b09479e
650d419
784666f
d77ad00
cac0180
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use crate::cache::CacheAccessor; | ||
use datafusion_common::{Result, Statistics}; | ||
use object_store::path::Path; | ||
use object_store::ObjectMeta; | ||
use std::sync::Arc; | ||
|
||
pub type FileStaticCache = Arc<dyn CacheAccessor<Path, Statistics, Extra = ObjectMeta>>; | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#[derive(Default)] | ||
pub struct CacheManager { | ||
file_statistic_cache: Option<FileStaticCache>, | ||
} | ||
|
||
impl CacheManager { | ||
pub fn try_new(config: &CacheManagerConfig) -> Result<Arc<Self>> { | ||
let mut manager = CacheManager::default(); | ||
if let Some(cc) = &config.table_files_statistics_cache { | ||
manager.file_statistic_cache = Some(cc.clone()) | ||
} | ||
Ok(Arc::new(manager)) | ||
} | ||
|
||
pub fn get_file_statistic_cache(&self) -> Option<FileStaticCache> { | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.file_statistic_cache.clone() | ||
} | ||
} | ||
|
||
#[derive(Clone, Default)] | ||
pub struct CacheManagerConfig { | ||
/// Enable cache of files statistics when listing files. | ||
/// Avoid get same file statistics repeatedly in same datafusion session. | ||
/// Default is disable. Fow now only supports Parquet files. | ||
pub table_files_statistics_cache: Option<FileStaticCache>, | ||
} | ||
|
||
impl CacheManagerConfig { | ||
pub fn enable_table_files_statistics_cache(mut self, cache: FileStaticCache) -> Self { | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we possibly have the field names match -- here it is called There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ops, this make sense |
||
self.table_files_statistics_cache = Some(cache); | ||
self | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use crate::cache::CacheAccessor; | ||
use dashmap::DashMap; | ||
use datafusion_common::Statistics; | ||
use object_store::path::Path; | ||
use object_store::ObjectMeta; | ||
|
||
/// Collected statistics for files | ||
/// Cache is invalided when file size or last modification has changed | ||
#[derive(Default)] | ||
pub struct FileStatisticsCache { | ||
statistics: DashMap<Path, (ObjectMeta, Statistics)>, | ||
} | ||
|
||
impl CacheAccessor<Path, Statistics> for FileStatisticsCache { | ||
type Extra = ObjectMeta; | ||
|
||
/// Get `Statistics` for file location. | ||
fn get(&self, k: &Path) -> Option<Statistics> { | ||
self.statistics | ||
.get(k) | ||
.map(|s| Some(s.value().1.clone())) | ||
.unwrap_or(None) | ||
} | ||
|
||
/// Get `Statistics` for file location. Returns None if file has changed or not found. | ||
fn get_with_extra(&self, k: &Path, e: &Self::Extra) -> Option<Statistics> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as written this is going to copy the statistics (though I realize that is what this PR did previously) -- maybe we could use something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes , thanks for point this out |
||
self.statistics | ||
.get(k) | ||
.map(|s| { | ||
let (saved_meta, statistics) = s.value(); | ||
if saved_meta.size != e.size | ||
|| saved_meta.last_modified != e.last_modified | ||
{ | ||
// file has changed | ||
None | ||
} else { | ||
Some(statistics.clone()) | ||
} | ||
}) | ||
.unwrap_or(None) | ||
} | ||
|
||
/// Save collected file statistics | ||
fn put(&self, _key: &Path, _value: Statistics) -> Option<Statistics> { | ||
panic!("Put cache in FileStatisticsCache without Extra not supported.") | ||
} | ||
|
||
fn put_with_extra( | ||
&self, | ||
key: &Path, | ||
value: Statistics, | ||
e: &Self::Extra, | ||
) -> Option<Statistics> { | ||
self.statistics | ||
.insert(key.clone(), (e.clone(), value)) | ||
.map(|x| x.1) | ||
} | ||
|
||
fn evict(&self, k: &Path) -> bool { | ||
self.statistics.remove(k).is_some() | ||
} | ||
|
||
fn contains_key(&self, k: &Path) -> bool { | ||
self.statistics.contains_key(k) | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.statistics.len() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use crate::cache::cache_unit::FileStatisticsCache; | ||
use crate::cache::CacheAccessor; | ||
use chrono::DateTime; | ||
use datafusion_common::Statistics; | ||
use object_store::path::Path; | ||
use object_store::ObjectMeta; | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#[test] | ||
fn test_statistics_cache() { | ||
let meta = ObjectMeta { | ||
location: Path::from("test"), | ||
last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00") | ||
.unwrap() | ||
.into(), | ||
size: 1024, | ||
e_tag: None, | ||
}; | ||
|
||
let cache = FileStatisticsCache::default(); | ||
assert!(cache.get_with_extra(&meta.location, &meta).is_none()); | ||
|
||
cache.put_with_extra(&meta.location, Statistics::default(), &meta); | ||
assert!(cache.get_with_extra(&meta.location, &meta).is_some()); | ||
|
||
// file size changed | ||
let mut meta2 = meta.clone(); | ||
meta2.size = 2048; | ||
assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); | ||
|
||
// file last_modified changed | ||
let mut meta2 = meta.clone(); | ||
meta2.last_modified = DateTime::parse_from_rfc3339("2022-09-27T22:40:00+02:00") | ||
.unwrap() | ||
.into(); | ||
assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); | ||
|
||
// different file | ||
let mut meta2 = meta; | ||
meta2.location = Path::from("test2"); | ||
assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
pub mod cache_manager; | ||
pub mod cache_unit; | ||
|
||
// The cache accessor, users usually working on this interface while manipulating caches | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pub trait CacheAccessor<K, V>: Send + Sync { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another classic API to add here would be "clear()" to clear all the values There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice abstract/trait |
||
// Extra info but not part of the cache key or cache value. | ||
type Extra: Clone; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you explain what the usecase for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Like in default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see -- that makes sense -- it might help to document the rationale in statistics |
||
|
||
/// Get value from cache. | ||
fn get(&self, k: &K) -> Option<V>; | ||
/// Get value from cache. | ||
fn get_with_extra(&self, k: &K, e: &Self::Extra) -> Option<V>; | ||
/// Put value into cache. Returns the old value associated with the key if there was one. | ||
fn put(&self, key: &K, value: V) -> Option<V>; | ||
/// Put value into cache. Returns the old value associated with the key if there was one. | ||
fn put_with_extra(&self, key: &K, value: V, e: &Self::Extra) -> Option<V>; | ||
/// Remove an entry from the cache, returning `true` if they existed in the cache. | ||
fn evict(&self, k: &K) -> bool; | ||
Ted-Jiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// Check if the cache contains a specific key. | ||
fn contains_key(&self, k: &K) -> bool; | ||
/// Fetch the total number of cache entries. | ||
fn len(&self) -> usize; | ||
/// Check if the Cache collection is empty or not. | ||
fn is_empty(&self) -> bool { | ||
self.len() == 0 | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you implement the
with_cache
API, this can look likeThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
More clear way ! 👍