fix: make rattler_index::index concurrency safe #955
Changes to `SparseRepoData`:
@@ -5,7 +5,9 @@
 use std::{
     collections::{HashSet, VecDeque},
-    fmt, io,
+    fmt,
+    fs::OpenOptions,
+    io,
     marker::PhantomData,
     path::Path,
 };
@@ -26,6 +28,8 @@ use serde_json::value::RawValue;
 use superslice::Ext;
 use thiserror::Error;

+use crate::utils::LockedFile;
+
 /// A struct to enable loading records from a `repodata.json` file on demand.
 /// Since most of the time you don't need all the records from the
 /// `repodata.json` this can help provide some significant speedups.
@@ -43,6 +47,9 @@ pub struct SparseRepoData {
     /// A function that can be used to patch the package record after it has
     /// been parsed. This is mainly used to add `pip` to `python` if desired
     patch_record_fn: Option<fn(&mut PackageRecord)>,
+
+    /// memmap2 blocks file from being modified so wrap the repodata file with a lock
+    _lock: Option<LockedFile>,
 }

 enum SparseRepoDataInner {
@@ -104,20 +111,39 @@ impl SparseRepoData {
         path: impl AsRef<Path>,
         patch_function: Option<fn(&mut PackageRecord)>,
     ) -> Result<Self, io::Error> {
-        let file = fs::File::open(path.as_ref().to_owned())?;
-        let memory_map = unsafe { memmap2::Mmap::map(&file) }?;
-        Ok(SparseRepoData {
-            inner: SparseRepoDataInner::Memmapped(
-                MemmappedSparseRepoDataInnerTryBuilder {
-                    memory_map,
-                    repo_data_builder: |memory_map| serde_json::from_slice(memory_map.as_ref()),
-                }
-                .try_build()?,
-            ),
-            subdir: subdir.into(),
-            channel,
-            patch_record_fn: patch_function,
-        })
+        if path.as_ref().exists() {
+            let lock_file_path = path.as_ref().with_extension("lock");
+            if !lock_file_path.exists() {
+                OpenOptions::new()
+                    .read(true)
+                    .write(true)
+                    .create(true)
+                    .truncate(false)
+                    .open(&lock_file_path)?;
+            }
+            let lock_file = LockedFile::open_ro(lock_file_path, "repodata cache")
+                .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;
+            let file = fs::File::open(path.as_ref().to_owned())?;
+            let memory_map = unsafe { memmap2::Mmap::map(&file) }?;
+            Ok(SparseRepoData {
+                inner: SparseRepoDataInner::Memmapped(
+                    MemmappedSparseRepoDataInnerTryBuilder {
+                        memory_map,
+                        repo_data_builder: |memory_map| serde_json::from_slice(memory_map.as_ref()),
+                    }
+                    .try_build()?,
+                ),
+                subdir: subdir.into(),
+                channel,
+                patch_record_fn: patch_function,
+                _lock: Some(lock_file),
+            })
+        } else {
+            Err(io::Error::new(
+                io::ErrorKind::NotFound,
+                format!("file not found: {:?}", path.as_ref()),
+            ))
+        }
     }

     /// Construct an instance of self from a bytes and a [`Channel`].

Review comment (on the `Ok(SparseRepoData { .. })` construction): Perhaps you can move this out of the if statement? First acquire the lock, store it in a variable and then do the rest of the operations. This reduces the amount of nested code.
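A minimal sketch of the shape that suggestion points at: return early when the file is missing, acquire the lock and store it in a variable, and only then build the struct, so the happy path is no longer nested inside the `if`. This is an illustration of the reviewer's suggested restructuring, not the merged code; the enclosing function's name and its `channel`/`subdir` parameters are not visible in the hunk and are assumed here.

```rust
// Assumed to live inside `impl SparseRepoData`; name and full parameter list
// are not shown in the hunk above and are filled in for illustration only.
pub fn from_file(
    channel: Channel,
    subdir: impl Into<String>,
    path: impl AsRef<Path>,
    patch_function: Option<fn(&mut PackageRecord)>,
) -> Result<Self, io::Error> {
    let path = path.as_ref();

    // Early return for the error case instead of an `if/else` around everything.
    if !path.exists() {
        return Err(io::Error::new(
            io::ErrorKind::NotFound,
            format!("file not found: {path:?}"),
        ));
    }

    // Acquire the read-only lock first and keep it in a variable ...
    let lock_file_path = path.with_extension("lock");
    if !lock_file_path.exists() {
        OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(false)
            .open(&lock_file_path)?;
    }
    let lock_file = LockedFile::open_ro(lock_file_path, "repodata cache")
        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?;

    // ... then do the rest of the operations, identical to the hunk above but
    // one indentation level shallower.
    let file = fs::File::open(path)?;
    let memory_map = unsafe { memmap2::Mmap::map(&file) }?;
    Ok(SparseRepoData {
        inner: SparseRepoDataInner::Memmapped(
            MemmappedSparseRepoDataInnerTryBuilder {
                memory_map,
                repo_data_builder: |memory_map| serde_json::from_slice(memory_map.as_ref()),
            }
            .try_build()?,
        ),
        subdir: subdir.into(),
        channel,
        patch_record_fn: patch_function,
        _lock: Some(lock_file),
    })
}
```

The behaviour is unchanged; only the nesting differs.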
@@ -141,6 +167,7 @@ impl SparseRepoData {
             channel,
             subdir: subdir.into(),
             patch_record_fn: patch_function,
+            _lock: None,
         })
     }

Changes to the Python bindings (`py_fetch_repo_data`):

@@ -67,7 +67,9 @@ pub fn py_fetch_repo_data<'a>(
             .into_iter()
             .map(|(cache, chan)| {
                 let path = cache_path.to_string_lossy().into_owned();
-                PySparseRepoData::new(chan, path, cache.repo_data_json_path)
+                let repo_data_json_path = cache.repo_data_json_path.clone();
+                drop(cache);
+                PySparseRepoData::new(chan, path, repo_data_json_path)
             })
             .collect::<Result<Vec<_>, _>>(),
         Err(e) => Err(PyRattlerError::from(e).into()),

Review comment (on `drop(cache)`): It's quite unclear why this is needed. It would be nice if the lock-file in the …
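For context on that `drop(cache)`, here is a minimal sketch of the interaction the reply below describes, assuming the cached result of `fetch_repo_data` still holds the read-write lock that `SparseRepoData` then tries to take read-only. An in-process `std::sync::RwLock` stands in for the advisory lock file; the real code uses `LockedFile`, but the blocking behaviour is analogous.

```rust
use std::sync::RwLock;

fn main() {
    // Stand-in for the lock file next to the repodata cache.
    let lock_file = RwLock::new(());

    // fetch_repo_data: the cache handle keeps a read-write lock on the cache.
    let cache = lock_file.write().unwrap();

    // PySparseRepoData::new -> SparseRepoData wants a read-only lock on the
    // same lock file. While `cache` is alive this can never be granted; a
    // blocking acquire would simply hang.
    assert!(lock_file.try_read().is_err());

    // Dropping the cache handle first (the `drop(cache)` in the diff above)
    // releases the rw lock ...
    drop(cache);

    // ... so the ro lock for the memory map can now be taken.
    assert!(lock_file.try_read().is_ok());
    println!("ro lock acquired only after the fetch lock was dropped");
}
```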
Review comment: Wouldn't it be much nicer to pass in the lock-file rather than reacquiring?
Reply: I was reacquiring because `fetch_repo_data` acquires a rw lock, while I was hoping to use a ro lock in `SparseRepoData` so that multiple processes could read the same file without blocking. When testing on Windows yesterday I stopped getting deadlocks after dropping the lock in the Python bindings, but after the test timed out again I set up an environment on a Linux VM and I'm still getting deadlocks.

After reexamining this change, I don't think this will work. The scenario I'm thinking of: say there are two files, A and B. Process 1 holds the ro lock on A while waiting for the rw lock on B, and process 2 holds the ro lock on B (the very lock process 1 is waiting to acquire rw) while waiting for the rw lock on A, so neither can proceed.
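To make the circular wait concrete, here is a small self-contained illustration. It uses two in-process `std::sync::RwLock`s as stand-ins for the two lock files (the real code uses advisory file locks via `LockedFile` across processes); the shape of the wait cycle is the same.

```rust
use std::sync::{Arc, Barrier, RwLock};
use std::thread;

fn main() {
    // lock_a / lock_b stand in for the `.lock` files next to two repodata caches.
    let lock_a = Arc::new(RwLock::new(()));
    let lock_b = Arc::new(RwLock::new(()));
    let sync = Arc::new(Barrier::new(2));

    let p1 = {
        let (a, b, sync) = (lock_a.clone(), lock_b.clone(), sync.clone());
        thread::spawn(move || {
            let _ro_a = a.read().unwrap(); // process 1 holds ro(A)
            sync.wait(); // both processes now hold their ro lock
            // Process 1 wants rw(B). A blocking `write()` would wait forever,
            // because process 2 never drops ro(B); `try_write` just makes the
            // impossibility observable without hanging the example.
            assert!(b.try_write().is_err());
            println!("process 1: rw(B) unavailable while process 2 holds ro(B)");
            sync.wait(); // keep ro(A) alive until process 2 has also tried
        })
    };

    let p2 = {
        let (a, b, sync) = (lock_a.clone(), lock_b.clone(), sync.clone());
        thread::spawn(move || {
            let _ro_b = b.read().unwrap(); // process 2 holds ro(B)
            sync.wait();
            // Process 2 wants rw(A): the wait-for graph now has a cycle.
            assert!(a.try_write().is_err());
            println!("process 2: rw(A) unavailable while process 1 holds ro(A)");
            sync.wait();
        })
    };

    p1.join().unwrap();
    p2.join().unwrap();
}
```

With blocking `write()` calls instead of `try_write()`, both sides would simply hang, which is the kind of timeout the deadlocking tests show.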
Two possible alternatives I'm thinking of are:

1. Have the `fetch_repo_data` calls drop all of their locks at the same time, before any locks for the `SparseRepoData` are acquired.
2. Use `from_bytes` instead of the memory-mapped option in `SparseRepoData` (a rough sketch of this follows below).
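A rough sketch of option 2: read the cached file into memory and skip the memory map entirely, so nothing keeps the file pinned and no lock file has to be held while the data is in use. The helper name, the exact `from_bytes` signature, its error type, and the import paths are all assumptions here, not the real rattler API.

```rust
// Assumed imports, following the usual rattler crate layout; adjust as needed.
use std::{fs, io, path::Path};

use rattler_conda_types::{Channel, PackageRecord};
use rattler_repodata_gateway::sparse::SparseRepoData;

/// Hypothetical helper: parse a cached `repodata.json` from an in-memory copy
/// instead of a memory map. A concurrent writer can replace the file on disk
/// afterwards without affecting the bytes we already read, so no lock file is
/// needed while the `SparseRepoData` is alive.
fn load_without_mmap(
    channel: Channel,
    subdir: &str,
    path: &Path,
    patch_function: Option<fn(&mut PackageRecord)>,
) -> Result<SparseRepoData, io::Error> {
    let bytes = fs::read(path)?;
    // `from_bytes` argument order and error type assumed to mirror the
    // constructor shown in the diff above.
    SparseRepoData::from_bytes(channel, subdir, bytes, patch_function)
        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
}
```

The trade-off is an extra full read and allocation per cache file, in exchange for never holding a lock while records are being queried.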