You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
The problem is that the callback wrapper in the walk() implementation assumes utf-8 for the root name. However, this is not required by base Git or libgit2. Other git2-rs methods appear to correctly handle non-utf-8 names by allowing paths to be accessed as byte slices.
Here's a self-contained demonstration of the issue which also shows base git behavior:
use std::{
path::{Path, PathBuf},
process::Command,
};
use anyhow::{anyhow, Context, Result};
use git2::{
ObjectType, Oid, Repository, RepositoryInitOptions, Signature, Tree, TreeBuilder,
TreeWalkResult,
};
use rand::RngCore;
fn main() -> Result<()> {
let mut rng = rand::thread_rng();
let temp_dir = create_temp_dir(&mut rng)?;
println!("Creating repo at {temp_dir:?}");
let mut options = RepositoryInitOptions::new();
// NOTE: We use a bare repo to avoid creating a working tree and
// materializing any pathological file names. Git, however, supports
// almost-arbitrary filenames internally (excluding embedded NUL and some
// other special characters, depending on the mode). libgit2 also supports
// arbitrary encoding with some exceptions:
// https://github.com/libgit2/libgit2/blob/54bfab93bffa4f98ee37e3730c8ed4061a1bc5db/src/util/fs_path.c#L1579.
options.bare(true).initial_head("main");
let repo = Repository::init_opts(&temp_dir, &options)?;
println!("Populating repo");
let spec = vec![
TreeSpec::blob(b"a.txt", b"Toplevel file. This is walkable because it's traversed _before_ the broken subtree."),
TreeSpec::blob(b"b\xc0.txt", b"Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8."),
// Note that the tree below (with an invalid utf-8 name) can be _listed_
// but not _traversed_ by the current walk implementation.
TreeSpec::tree(b"c\xc0", vec![
TreeSpec::blob(b"nested.txt", b"Not walkable due to the parent directory name being non-utf-8 bytes. This will abort the walk."),
]),
TreeSpec::blob(b"d.txt", b"Not walkable due to the broken tree above ending the walk."),
];
populate_repo(&repo, &spec)?;
println!("Repo populated");
println!();
println!("Walking via git2:");
walk_repo(&repo)?;
println!();
println!("Listing committed files using git command:");
list_repo(temp_dir.as_path())?;
Ok(())
}
fn create_temp_dir(rng: impl RngCore) -> Result<PathBuf> {
let mut rng = rng;
let toplevel = std::env::temp_dir();
let prefix = "git-repo";
for _ in 0..1000 {
let suffix = random_suffix(&mut rng);
let path = Path::join(toplevel.as_path(), format!("{prefix}-{suffix}.git"));
match std::fs::create_dir(path.as_path()) {
Ok(()) => return Ok(path),
Err(e) => match e.kind() {
std::io::ErrorKind::AlreadyExists => (),
_ => return Err(e.into()),
},
}
}
Err(anyhow!("could not allocate a unique random dir"))
}
fn random_suffix(rng: impl RngCore) -> String {
use base64::engine::general_purpose::URL_SAFE_NO_PAD;
use base64::Engine;
let mut rng = rng;
let mut buf = [0; 4];
rng.fill_bytes(&mut buf);
URL_SAFE_NO_PAD.encode(&buf)
}
fn populate_repo(repo: &Repository, spec: &[TreeSpec]) -> Result<()> {
let mut root = repo.treebuilder(None)?;
for item in spec {
add_to_tree(repo, &mut root, item)?;
}
let tree_oid = root.write()?;
let tree = repo.find_tree(tree_oid)?;
let signature = Signature::now("foo", "[email protected]")?;
let message = "Commit to hold reference";
repo.commit(Some("HEAD"), &signature, &signature, message, &tree, &[])?;
Ok(())
}
fn add_to_tree(repo: &Repository, root: &mut TreeBuilder, tree_spec: &TreeSpec) -> Result<()> {
match tree_spec {
TreeSpec::Tree { name, children } => {
let mut tree_builder = repo.treebuilder(None)?;
for child in children {
add_to_tree(repo, &mut tree_builder, child)?;
}
let oid = tree_builder.write()?;
root.insert(name, oid, 0o040000)?;
}
TreeSpec::Blob { name, contents } => {
let oid = repo.blob(contents)?;
root.insert(name, oid, 0o100644)?;
}
}
Ok(())
}
fn walk_repo(repo: &Repository) -> Result<()> {
let mut revwalk = repo.revwalk()?;
revwalk.push_head()?;
for oid in revwalk {
let oid = oid?;
describe_commit(&repo, oid).with_context(|| format!("could not inspect commit {oid}"))?;
}
Ok(())
}
fn describe_commit(repo: &Repository, oid: Oid) -> Result<()> {
let commit = repo.find_commit(oid)?;
let summary = commit
.summary()
.ok_or_else(|| anyhow!("could not open commit summary"))
.with_context(|| format!("reading oid {oid}"))?;
println!("{oid}: {summary}");
let tree = commit.tree()?;
inspect_tree(repo, &tree)?;
Ok(())
}
fn inspect_tree(repo: &Repository, tree: &Tree) -> Result<()> {
// Note that the signature below _assumes_ that `root` can always be coerced
// to `&str` (i.e., utf-8). Due to the implementation, in practice, the
// result is that subtrees containing non-utf-8 encodings are always
// silently skipped while walking. Note that basenames (`filename` below)
// are correctly handled as bytes due to `TreeEntry`'s interface.
tree.walk(git2::TreeWalkMode::PreOrder, |root, entry| {
let filename = String::from_utf8_lossy(entry.name_bytes());
let path = format!("{root}{filename}");
if let Ok(object) = entry.to_object(repo) {
match object.kind() {
Some(ObjectType::Blob) => {
let blob = object.as_blob().unwrap();
if blob.is_binary() {
println!(" {path}: <binary>")
} else {
println!(" {path}:");
let contents = String::from_utf8_lossy(blob.content());
for line in contents.lines() {
println!(" > {line}");
}
}
}
Some(ObjectType::Tree) => {
println!(" {path}: <tree>");
}
None => {
println!("{path} had no object type");
}
_ => {
println!("{path} was an unexpected object type (not a blob or tree)");
}
}
// Is it possible to return a result from the callback? It would be
// much better ergonomically if we could.
TreeWalkResult::Ok
} else {
println!("ERROR: could not get object for {path}");
TreeWalkResult::Abort
}
})?;
Ok(())
}
fn list_repo(path: &Path) -> Result<()> {
let mut child = Command::new("git")
.arg("-C")
.arg(path)
.arg("ls-tree")
.arg("-r")
.arg("main")
.spawn()?;
let result = child.wait()?;
if result.success() {
Ok(())
} else {
if let Some(code) = result.code() {
Err(anyhow!("git failed with status code {code}"))
} else {
Err(anyhow!("git failed"))
}
}
}
enum TreeSpec {
Tree {
name: &'static [u8],
children: Vec<TreeSpec>,
},
Blob {
name: &'static [u8],
contents: &'static [u8],
},
}
impl TreeSpec {
fn tree(name: &'static [u8], children: Vec<TreeSpec>) -> TreeSpec {
TreeSpec::Tree { name, children }
}
fn blob(name: &'static [u8], contents: &'static [u8]) -> TreeSpec {
TreeSpec::Blob { name, contents }
}
}
Sample invocation:
> cargo run --release
Finished release [optimized] target(s) in 0.23s
Running `target/release/bad-encoding`
Creating repo at "/var/folders/0h/1lcyrkv11ynfjq5w7bt0sl_00000gn/T/git-repo-kDBu5A.git"
Populating repo
Repo populated
Walking via git2:
6cdc2a2f7544639226efe4e872ddd2ec42fb70a0: Commit to hold reference
a.txt:
> Toplevel file. This is walkable because it's traversed _before_ the broken subtree.
b�.txt:
> Toplevel file. This is walkable as above, but it also has a filename which is not valid utf-8.
c�: <tree>
Listing committed files using git command:
100644 blob 596c677536cbdd42ed208abb4135afcb95bd967c a.txt
100644 blob 330e18bbaa623998f17ce703cdcf7158915c6bc5 "b\300.txt"
100644 blob 5fd5bb9cddb4076e8786e90670ba46e621174482 "c\300/nested.txt"
100644 blob 07bd6651bc96c9554dab80f1f15b867d4091116a d.txt
The example binary does not delete the bare git repo afterward so you can easily inspect it.
The text was updated successfully, but these errors were encountered:
The problem is that the callback wrapper in the
walk()
implementation assumes utf-8 for the root name. However, this is not required by base Git or libgit2. Other git2-rs methods appear to correctly handle non-utf-8 names by allowing paths to be accessed as byte slices.Here's a self-contained demonstration of the issue which also shows base git behavior:
Cargo.toml
src/main.rs
:Sample invocation:
The example binary does not delete the bare git repo afterward so you can easily inspect it.
The text was updated successfully, but these errors were encountered: