Skip to content

Commit

Permalink
docs: doc comments for remote registry
Browse files Browse the repository at this point in the history
  • Loading branch information
weihanglo committed Jun 9, 2023
1 parent 1bc13e9 commit 067c747
Showing 1 changed file with 88 additions and 21 deletions.
109 changes: 88 additions & 21 deletions src/cargo/sources/registry/remote.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
//! Access to a Git index based registry. See [`RemoteRegistry`] for details.
use crate::core::{GitReference, PackageId, SourceId};
use crate::sources::git;
use crate::sources::git::fetch::RemoteKind;
Expand All @@ -21,29 +23,73 @@ use std::task::{ready, Poll};
/// A remote registry is a registry that lives at a remote URL (such as
/// crates.io). The git index is cloned locally, and `.crate` files are
/// downloaded as needed and cached locally.
///
/// This type is primarily accessed through the [`RegistryData`] trait.
///
/// See the [module-level documentation](super) for the index format and layout.
///
/// ## History of Git-based index registry
///
/// Using Git to host this index used to be quite efficient. The full index can
/// be stored efficiently locally on disk, and once it is downloaded, all
/// queries of a registry can happen locally and needn't touch the network.
/// Git-based index was a reasonable design choice at the time when HTTP/2
/// was just introduced.
///
/// However, the full index keeps growing as crates.io grows. It becomes
/// relatively big and slows down the first use of Cargo. Git (specifically
/// libgit2) is not efficient at handling huge amounts of small files either.
/// On the other hand, newer protocols like HTTP/2 are prevalent and capable to
/// serve a bunch of tiny files. Today, it is encourged to use [`HttpRegistry`],
/// which is the default from 1.70.0. That being said, Cargo will continue
/// supporting Git-based index for a pretty long while.
///
/// [`HttpRegistry`]: super::http_remote::HttpRegistry
pub struct RemoteRegistry<'cfg> {
/// Path to the registry index (`$CARGO_HOME/registry/index/$REG-HASH`).
index_path: Filesystem,
/// Path to the cache of `.crate` files (`$CARGO_HOME/registry/path/$REG-HASH`).
/// Path to the cache of `.crate` files (`$CARGO_HOME/registry/cache/$REG-HASH`).
cache_path: Filesystem,
/// The unique identifier of this registry source.
source_id: SourceId,
/// This reference is stored so that when a registry needs update, it knows
/// where to fetch from.
index_git_ref: GitReference,
config: &'cfg Config,
/// A Git [tree object] to help this registry find crate metadata from the
/// underlying Git repository.
///
/// This is stored here to prevent Git from repeatly creating a tree object
/// during each call into `load()`.
///
/// [tree object]: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_tree_objects
tree: RefCell<Option<git2::Tree<'static>>>,
/// A Git repository that contains the actual index we want.
repo: LazyCell<git2::Repository>,
/// The current HEAD commit of the underlying Git repository.
head: Cell<Option<git2::Oid>>,
/// This stores sha value of the current HEAD commit for convenience.
current_sha: Cell<Option<InternedString>>,
needs_update: bool, // Does this registry need to be updated?
/// Whether this registry needs to update package informations.
///
/// See [`RemoteRegistry::mark_updated`] on how to make sure a registry
/// index is updated only once per session.
needs_update: bool,
/// Disables status messages.
quiet: bool,
}

impl<'cfg> RemoteRegistry<'cfg> {
/// Creates a Git-rebased remote registry for `source_id`.
///
/// * `name` --- Name of a path segment where `.crate` tarballs and the
/// registry index are stored. Expect to be unique.
pub fn new(source_id: SourceId, config: &'cfg Config, name: &str) -> RemoteRegistry<'cfg> {
RemoteRegistry {
index_path: config.registry_index_path().join(name),
cache_path: config.registry_cache_path().join(name),
source_id,
config,
// TODO: we should probably make this configurable
index_git_ref: GitReference::DefaultBranch,
tree: RefCell::new(None),
repo: LazyCell::new(),
Expand All @@ -54,17 +100,16 @@ impl<'cfg> RemoteRegistry<'cfg> {
}
}

/// Creates intermediate dirs and initialize the repository.
fn repo(&self) -> CargoResult<&git2::Repository> {
self.repo.try_borrow_with(|| {
let path = self.config.assert_package_cache_locked(&self.index_path);

// Fast path without a lock
if let Ok(repo) = git2::Repository::open(&path) {
trace!("opened a repo without a lock");
return Ok(repo);
}

// Ok, now we need to lock and try the whole thing over again.
trace!("acquiring registry index lock");
match git2::Repository::open(&path) {
Ok(repo) => Ok(repo),
Expand Down Expand Up @@ -97,6 +142,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
})
}

/// Get the object ID of the HEAD commit from the underlying Git repository.
fn head(&self) -> CargoResult<git2::Oid> {
if self.head.get().is_none() {
let repo = self.repo()?;
Expand All @@ -106,6 +152,8 @@ impl<'cfg> RemoteRegistry<'cfg> {
Ok(self.head.get().unwrap())
}

/// Returns a [`git2::Tree`] object of the current HEAD commit of the
/// underlying Git repository.
fn tree(&self) -> CargoResult<Ref<'_, git2::Tree<'_>>> {
{
let tree = self.tree.borrow();
Expand All @@ -117,6 +165,7 @@ impl<'cfg> RemoteRegistry<'cfg> {
let commit = repo.find_commit(self.head()?)?;
let tree = commit.tree()?;

// SAFETY:
// Unfortunately in libgit2 the tree objects look like they've got a
// reference to the repository object which means that a tree cannot
// outlive the repository that it came from. Here we want to cache this
Expand All @@ -134,6 +183,9 @@ impl<'cfg> RemoteRegistry<'cfg> {
Ok(Ref::map(self.tree.borrow(), |s| s.as_ref().unwrap()))
}

/// Gets the current version of the registry index.
///
/// It is usually sha of the HEAD commit from the underlying Git repository.
fn current_version(&self) -> Option<InternedString> {
if let Some(sha) = self.current_sha.get() {
return Some(sha);
Expand All @@ -143,10 +195,16 @@ impl<'cfg> RemoteRegistry<'cfg> {
Some(sha)
}

/// Whether the registry is up-to-date. See [`Self::mark_updated`] for more.
fn is_updated(&self) -> bool {
self.config.updated_sources().contains(&self.source_id)
}

/// Marks this registry as up-to-date.
///
/// This makes sure the index is only updated once per session since it is
/// an expensive operation. This generally only happens when the resolver
/// is run multiple times, such as during `cargo publish`.
fn mark_updated(&self) {
self.config.updated_sources().insert(self.source_id);
}
Expand All @@ -156,7 +214,7 @@ const LAST_UPDATED_FILE: &str = ".last-updated";

impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
fn prepare(&self) -> CargoResult<()> {
self.repo()?; // create intermediate dirs and initialize the repo
self.repo()?;
Ok(())
}

Expand All @@ -168,13 +226,20 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
self.config.assert_package_cache_locked(path)
}

// `index_version` Is a string representing the version of the file used to construct the cached copy.
// Older versions of Cargo used the single value of the hash of the HEAD commit as a `index_version`.
// This is technically correct but a little too conservative. If a new commit is fetched all cached
// files need to be regenerated even if a particular file was not changed.
// However if an old cargo has written such a file we still know how to read it, as long as we check for that hash value.
//
// Cargo now uses a hash of the file's contents as provided by git.
/// Read the general concept for `load()` on [`RegistryData::load`].
///
/// `index_version` is a string representing the version of the file used
/// to construct the cached copy.
///
/// Older versions of Cargo used the single value of the hash of the HEAD
/// commit as a `index_version`. This is technically correct but a little
/// too conservative. If a new commit is fetched all cached files need to
/// be regenerated even if a particular file was not changed.
///
/// However if an old cargo has written such a file we still know how to
/// read it, as long as we check for that hash value.
///
/// Cargo now uses a hash of the file's contents as provided by git.
fn load(
&mut self,
_root: &Path,
Expand All @@ -187,7 +252,8 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
// Check if the cache is valid.
let git_commit_hash = self.current_version();
if index_version.is_some() && index_version == git_commit_hash.as_deref() {
// This file was written by an old version of cargo, but it is still up-to-date.
// This file was written by an old version of cargo, but it is
// still up-to-date.
return Poll::Ready(Ok(LoadResponse::CacheValid));
}
// Note that the index calls this method and the filesystem is locked
Expand Down Expand Up @@ -224,8 +290,8 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
match load_helper(&self, path, index_version) {
Ok(result) => Poll::Ready(Ok(result)),
Err(_) if !self.is_updated() => {
// If git returns an error and we haven't updated the repo, return
// pending to allow an update to try again.
// If git returns an error and we haven't updated the repo,
// return pending to allow an update to try again.
self.needs_update = true;
Poll::Pending
}
Expand Down Expand Up @@ -265,9 +331,6 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {

self.needs_update = false;

// Make sure the index is only updated once per session since it is an
// expensive operation. This generally only happens when the resolver
// is run multiple times, such as during `cargo publish`.
if self.is_updated() {
return Ok(());
}
Expand Down Expand Up @@ -321,8 +384,11 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
Ok(())
}

/// Read the general concept for `invalidate_cache()` on
/// [`RegistryData::invalidate_cache`].
///
/// To fully invalidate, undo [`RemoteRegistry::mark_updated`]'s work.
fn invalidate_cache(&mut self) {
// To fully invalidate, undo `mark_updated`s work
self.needs_update = true;
}

Expand Down Expand Up @@ -365,9 +431,10 @@ impl<'cfg> RegistryData for RemoteRegistry<'cfg> {
}
}

/// Implemented to just be sure to drop `tree` field before our other fields.
/// See SAFETY inside [`RemoteRegistry::tree()`] for more.
impl<'cfg> Drop for RemoteRegistry<'cfg> {
fn drop(&mut self) {
// Just be sure to drop this before our other fields
self.tree.borrow_mut().take();
}
}

0 comments on commit 067c747

Please sign in to comment.