Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

git_backend: on gc(), remove unreachable no-gc refs and compact them #2839

Merged
merged 5 commits into from
Jan 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### New features

* New `jj op abandon` command is added to clean up the operation history. If GC
is implemented, Git refs and commit objects can be compacted.
* New `jj op abandon` command is added to clean up the operation history. Git
refs and commit objects can be further compacted by `jj util gc`.

* `jj util gc` now removes unreachable operation and view objects.
* `jj util gc` now removes unreachable operation, view, and Git objects.

* `jj branch rename` will now warn if the renamed branch has a remote branch, since
those will have to be manually renamed outside of `jj`.
Expand Down
6 changes: 4 additions & 2 deletions cli/examples/custom-backend/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use std::any::Any;
use std::io::Read;
use std::path::Path;
use std::time::SystemTime;

use async_trait::async_trait;
use jj_cli::cli_util::{CliRunner, CommandError, CommandHelper};
Expand All @@ -24,6 +25,7 @@ use jj_lib::backend::{
Conflict, ConflictId, FileId, SigningFn, SymlinkId, Tree, TreeId,
};
use jj_lib::git_backend::GitBackend;
use jj_lib::index::Index;
use jj_lib::repo::StoreFactories;
use jj_lib::repo_path::RepoPath;
use jj_lib::settings::UserSettings;
Expand Down Expand Up @@ -171,7 +173,7 @@ impl Backend for JitBackend {
self.inner.write_commit(contents, sign_with)
}

fn gc(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
self.inner.gc()
fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()> {
self.inner.gc(index, keep_newer)
}
}
4 changes: 1 addition & 3 deletions cli/src/commands/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,9 +136,7 @@ fn cmd_util_gc(
let repo = workspace_command.repo();
repo.op_store()
.gc(slice::from_ref(repo.op_id()), keep_newer)?;
repo.store()
.gc()
.map_err(|err| user_error(err.to_string()))?;
repo.store().gc(repo.index(), keep_newer)?;
Ok(())
}

Expand Down
11 changes: 8 additions & 3 deletions lib/src/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ use std::collections::BTreeMap;
use std::fmt::Debug;
use std::io::Read;
use std::result::Result;
use std::time::SystemTime;
use std::vec::Vec;

use async_trait::async_trait;
use thiserror::Error;

use crate::content_hash::ContentHash;
use crate::index::Index;
use crate::merge::Merge;
use crate::object_id::{id_type, ObjectId};
use crate::repo_path::{RepoPath, RepoPathComponent, RepoPathComponentBuf};
Expand Down Expand Up @@ -217,7 +219,7 @@ pub enum BackendError {
object_type: &'static str,
source: Box<dyn std::error::Error + Send + Sync>,
},
#[error("Error: {0}")]
#[error(transparent)]
Other(Box<dyn std::error::Error + Send + Sync>),
}

Expand Down Expand Up @@ -450,6 +452,9 @@ pub trait Backend: Send + Sync + Debug {
) -> BackendResult<(CommitId, Commit)>;

/// Perform garbage collection.
// TODO: pass in the set of commits to keep here
fn gc(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>>;
///
/// All commits found in the `index` won't be removed. In addition to that,
/// objects created after `keep_newer` will be preserved. This mitigates a
/// risk of deleting new commits created concurrently by another process.
fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()>;
}
125 changes: 114 additions & 11 deletions lib/src/git_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ use std::collections::HashSet;
use std::fmt::{Debug, Error, Formatter};
use std::io::{Cursor, Read};
use std::path::{Path, PathBuf};
use std::process::ExitStatus;
use std::process::{Command, ExitStatus};
use std::sync::{Arc, Mutex, MutexGuard};
use std::time::SystemTime;
use std::{fs, io, str};

use async_trait::async_trait;
Expand All @@ -37,6 +38,7 @@ use crate::backend::{
TreeValue,
};
use crate::file_util::{IoResultExt as _, PathError};
use crate::index::Index;
use crate::lock::FileLock;
use crate::merge::{Merge, MergeBuilder};
use crate::object_id::ObjectId;
Expand Down Expand Up @@ -605,6 +607,103 @@ fn to_no_gc_ref_update(id: &CommitId) -> gix::refs::transaction::RefEdit {
}
}

fn to_ref_deletion(git_ref: gix::refs::Reference) -> gix::refs::transaction::RefEdit {
let expected = gix::refs::transaction::PreviousValue::ExistingMustMatch(git_ref.target);
gix::refs::transaction::RefEdit {
change: gix::refs::transaction::Change::Delete {
expected,
log: gix::refs::transaction::RefLog::AndReference,
},
name: git_ref.name,
deref: false,
}
}

/// Recreates `refs/jj/keep` refs for the `new_heads`, and removes the other
/// unreachable and non-head refs.
fn recreate_no_gc_refs(
git_repo: &gix::Repository,
new_heads: impl IntoIterator<Item = CommitId>,
keep_newer: SystemTime,
) -> Result<(), BackendError> {
// Calculate diff between existing no-gc refs and new heads.
let new_heads: HashSet<CommitId> = new_heads.into_iter().collect();
let mut no_gc_refs_to_keep_count: usize = 0;
let mut no_gc_refs_to_delete: Vec<gix::refs::Reference> = Vec::new();
let git_references = git_repo
.references()
.map_err(|err| BackendError::Other(err.into()))?;
let no_gc_refs_iter = git_references
.prefixed(NO_GC_REF_NAMESPACE)
.map_err(|err| BackendError::Other(err.into()))?;
for git_ref in no_gc_refs_iter {
let git_ref = git_ref.map_err(BackendError::Other)?.detach();
let oid = git_ref.target.try_id().ok_or_else(|| {
let name = git_ref.name.as_bstr();
BackendError::Other(format!("Symbolic no-gc ref found: {name}").into())
})?;
let id = CommitId::from_bytes(oid.as_bytes());
let name_good = git_ref.name.as_bstr()[NO_GC_REF_NAMESPACE.len()..] == id.hex();
if new_heads.contains(&id) && name_good {
no_gc_refs_to_keep_count += 1;
continue;
}
// Check timestamp of loose ref, but this is still racy on re-import
// because:
// - existing packed ref won't be demoted to loose ref
// - existing loose ref won't be touched
//
// TODO: might be better to switch to a dummy merge, where new no-gc ref
// will always have a unique name. Doing that with the current
// ref-per-head strategy would increase the number of the no-gc refs.
// https://github.com/martinvonz/jj/pull/2659#issuecomment-1837057782
yuja marked this conversation as resolved.
Show resolved Hide resolved
let loose_ref_path = git_repo.path().join(git_ref.name.to_path());
if let Ok(metadata) = loose_ref_path.metadata() {
let mtime = metadata.modified().expect("unsupported platform?");
if mtime > keep_newer {
tracing::trace!(?git_ref, "not deleting new");
no_gc_refs_to_keep_count += 1;
continue;
}
}
// Also deletes no-gc ref of random name created by old jj.
tracing::trace!(?git_ref, ?name_good, "will delete");
no_gc_refs_to_delete.push(git_ref);
}
tracing::info!(
new_heads_count = new_heads.len(),
no_gc_refs_to_keep_count,
no_gc_refs_to_delete_count = no_gc_refs_to_delete.len(),
"collected reachable refs"
);

// It's slow to delete packed refs one by one, so update refs all at once.
let ref_edits = itertools::chain(
no_gc_refs_to_delete.into_iter().map(to_ref_deletion),
new_heads.iter().map(to_no_gc_ref_update),
);
git_repo
.edit_references(ref_edits)
.map_err(|err| BackendError::Other(err.into()))?;

Ok(())
}

fn run_git_gc(git_dir: &Path) -> Result<(), GitGcError> {
let mut git = Command::new("git");
git.arg("--git-dir=."); // turn off discovery
git.arg("gc");
// Don't specify it by GIT_DIR/--git-dir. On Windows, the "\\?\" path might
// not be supported by git.
git.current_dir(git_dir);
// TODO: pass output to UI layer instead of printing directly here
let status = git.status().map_err(GitGcError::GcCommand)?;
if !status.success() {
return Err(GitGcError::GcCommandErrorStatus(status));
}
Ok(())
}

fn validate_git_object_id(id: &impl ObjectId) -> Result<gix::ObjectId, BackendError> {
if id.as_bytes().len() != HASH_LENGTH {
return Err(BackendError::InvalidHashLength {
Expand Down Expand Up @@ -1066,16 +1165,20 @@ impl Backend for GitBackend {
Ok((id, contents))
}

fn gc(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
let mut git = std::process::Command::new("git");
git.env("GIT_DIR", self.git_repo_path());
git.args(["gc"]);
// TODO: pass output to UI layer instead of printing directly here
let status = git.status().map_err(GitGcError::GcCommand)?;
if !status.success() {
return Err(Box::new(GitGcError::GcCommandErrorStatus(status)));
}
Ok(())
#[tracing::instrument(skip(self, index))]
fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()> {
let git_repo = self.lock_git_repo();
let new_heads = index
.all_heads_for_gc()
.map_err(|err| BackendError::Other(err.into()))?
.filter(|id| *id != self.root_commit_id);
recreate_no_gc_refs(&git_repo, new_heads, keep_newer)?;
// TODO: remove unreachable entries from extras table if segment file
// mtime <= keep_newer? (it won't be consistent with no-gc refs
// preserved by the keep_newer timestamp though)
// TODO: remove unreachable extras table segments
// TODO: pass in keep_newer to "git gc" command
run_git_gc(self.git_repo_path()).map_err(|err| BackendError::Other(err.into()))
}
}

Expand Down
4 changes: 3 additions & 1 deletion lib/src/local_backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use std::fs;
use std::fs::File;
use std::io::{Read, Write};
use std::path::{Path, PathBuf};
use std::time::SystemTime;

use async_trait::async_trait;
use blake2::{Blake2b512, Digest};
Expand All @@ -33,6 +34,7 @@ use crate::backend::{
};
use crate::content_hash::blake2b_hash;
use crate::file_util::persist_content_addressed_temp_file;
use crate::index::Index;
use crate::merge::MergeBuilder;
use crate::object_id::ObjectId;
use crate::repo_path::{RepoPath, RepoPathComponentBuf};
Expand Down Expand Up @@ -299,7 +301,7 @@ impl Backend for LocalBackend {
Ok((id, commit))
}

fn gc(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
fn gc(&self, _index: &dyn Index, _keep_newer: SystemTime) -> BackendResult<()> {
Ok(())
}
}
Expand Down
6 changes: 4 additions & 2 deletions lib/src/store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::io::Read;
use std::sync::{Arc, RwLock};
use std::time::SystemTime;

use pollster::FutureExt;

Expand All @@ -27,6 +28,7 @@ use crate::backend::{
SymlinkId, TreeId,
};
use crate::commit::Commit;
use crate::index::Index;
use crate::merge::{Merge, MergedTreeValue};
use crate::merged_tree::MergedTree;
use crate::repo_path::{RepoPath, RepoPathBuf};
Expand Down Expand Up @@ -266,7 +268,7 @@ impl Store {
TreeBuilder::new(self.clone(), base_tree_id)
}

pub fn gc(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
self.backend.gc()
pub fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()> {
self.backend.gc(index, keep_newer)
}
}
Loading
Loading