From e54541225142b509f5f3b7468897b2c31a7635dc Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 16 Sep 2019 12:35:03 -0700 Subject: [PATCH] Work with canonical URLs in `[patch]` This commit addresses an issue with how the resolver processes `[patch]` annotations in manifests and lock files. Previously the resolver would use the raw `Url` coming out of a manifest, but the rest of resolution, when comparing `SourceId`, uses a canonical form of a `Url` rather than the actual raw `Url`. This ended up causing discrepancies like those found in #7282. To fix the issue all `patch` intermediate storage in the resolver uses a newly-added `CanonicalUrl` type instead of a `Url`. This `CanonicalUrl` is then also used throughout the codebase, and all lookups in the resolver as switched to using `CanonicalUrl` instead of `Url`, which... Closes #7282 --- src/cargo/core/registry.rs | 31 ++++++---- src/cargo/core/resolver/resolve.rs | 6 +- src/cargo/core/source/source_id.rs | 19 ++++--- src/cargo/ops/resolve.rs | 2 +- src/cargo/sources/git/mod.rs | 2 +- src/cargo/sources/git/source.rs | 91 ++++++++---------------------- src/cargo/util/canonical_url.rs | 73 ++++++++++++++++++++++++ src/cargo/util/mod.rs | 2 + tests/testsuite/patch.rs | 65 +++++++++++++++++++++ 9 files changed, 200 insertions(+), 91 deletions(-) create mode 100644 src/cargo/util/canonical_url.rs diff --git a/src/cargo/core/registry.rs b/src/cargo/core/registry.rs index 979e54759e3..98dd68cc294 100644 --- a/src/cargo/core/registry.rs +++ b/src/cargo/core/registry.rs @@ -9,7 +9,7 @@ use crate::core::PackageSet; use crate::core::{Dependency, PackageId, Source, SourceId, SourceMap, Summary}; use crate::sources::config::SourceConfigMap; use crate::util::errors::{CargoResult, CargoResultExt}; -use crate::util::{profile, Config}; +use crate::util::{profile, CanonicalUrl, Config}; /// Source of information about a group of packages. /// @@ -75,9 +75,9 @@ pub struct PackageRegistry<'cfg> { yanked_whitelist: HashSet, source_config: SourceConfigMap<'cfg>, - patches: HashMap>, + patches: HashMap>, patches_locked: bool, - patches_available: HashMap>, + patches_available: HashMap>, } /// A map of all "locked packages" which is filled in when parsing a lock file @@ -230,6 +230,8 @@ impl<'cfg> PackageRegistry<'cfg> { /// `query` until `lock_patches` is called below, which should be called /// once all patches have been added. pub fn patch(&mut self, url: &Url, deps: &[Dependency]) -> CargoResult<()> { + let canonical = CanonicalUrl::new(url)?; + // First up we need to actually resolve each `deps` specification to // precisely one summary. We're not using the `query` method below as it // internally uses maps we're building up as part of this method @@ -284,7 +286,7 @@ impl<'cfg> PackageRegistry<'cfg> { url ) } - if summary.package_id().source_id().url() == url { + if *summary.package_id().source_id().canonical_url() == canonical { failure::bail!( "patch for `{}` in `{}` points to the same source, but \ patches must point to different sources", @@ -317,8 +319,8 @@ impl<'cfg> PackageRegistry<'cfg> { // `lock` method) and otherwise store the unlocked summaries in // `patches` to get locked in a future call to `lock_patches`. let ids = unlocked_summaries.iter().map(|s| s.package_id()).collect(); - self.patches_available.insert(url.clone(), ids); - self.patches.insert(url.clone(), unlocked_summaries); + self.patches_available.insert(canonical.clone(), ids); + self.patches.insert(canonical, unlocked_summaries); Ok(()) } @@ -340,8 +342,11 @@ impl<'cfg> PackageRegistry<'cfg> { self.patches_locked = true; } - pub fn patches(&self) -> &HashMap> { - &self.patches + pub fn patches(&self) -> Vec { + self.patches + .values() + .flat_map(|v| v.iter().cloned()) + .collect() } fn load(&mut self, source_id: SourceId, kind: Kind) -> CargoResult<()> { @@ -472,7 +477,7 @@ impl<'cfg> Registry for PackageRegistry<'cfg> { // This means that `dep.matches(..)` will always return false, when // what we really care about is the name/version match. let mut patches = Vec::::new(); - if let Some(extra) = self.patches.get(dep.source_id().url()) { + if let Some(extra) = self.patches.get(dep.source_id().canonical_url()) { patches.extend( extra .iter() @@ -605,7 +610,11 @@ impl<'cfg> Registry for PackageRegistry<'cfg> { } } -fn lock(locked: &LockedMap, patches: &HashMap>, summary: Summary) -> Summary { +fn lock( + locked: &LockedMap, + patches: &HashMap>, + summary: Summary, +) -> Summary { let pair = locked .get(&summary.source_id()) .and_then(|map| map.get(&*summary.name())) @@ -669,7 +678,7 @@ fn lock(locked: &LockedMap, patches: &HashMap>, summary: Sum // map, and we see if `id` is contained in the list of patches // for that url. If it is then this lock is still valid, // otherwise the lock is no longer valid. - match patches.get(dep.source_id().url()) { + match patches.get(dep.source_id().canonical_url()) { Some(list) => list.contains(&id), None => false, } diff --git a/src/cargo/core/resolver/resolve.rs b/src/cargo/core/resolver/resolve.rs index 9ced48f4d67..dcb7cd44361 100644 --- a/src/cargo/core/resolver/resolve.rs +++ b/src/cargo/core/resolver/resolve.rs @@ -3,8 +3,6 @@ use std::collections::{HashMap, HashSet}; use std::fmt; use std::iter::FromIterator; -use url::Url; - use crate::core::dependency::Kind; use crate::core::{Dependency, PackageId, PackageIdSpec, Summary, Target}; use crate::util::errors::CargoResult; @@ -114,8 +112,8 @@ impl Resolve { self.graph.path_to_top(pkg) } - pub fn register_used_patches(&mut self, patches: &HashMap>) { - for summary in patches.values().flat_map(|v| v) { + pub fn register_used_patches(&mut self, patches: &[Summary]) { + for summary in patches { if self.iter().any(|id| id == summary.package_id()) { continue; } diff --git a/src/cargo/core/source/source_id.rs b/src/cargo/core/source/source_id.rs index 597bc33a8ee..9755b5006fe 100644 --- a/src/cargo/core/source/source_id.rs +++ b/src/cargo/core/source/source_id.rs @@ -15,10 +15,9 @@ use url::Url; use crate::core::PackageId; use crate::ops; -use crate::sources::git; use crate::sources::DirectorySource; use crate::sources::{GitSource, PathSource, RegistrySource, CRATES_IO_INDEX}; -use crate::util::{CargoResult, Config, IntoUrl}; +use crate::util::{CanonicalUrl, CargoResult, Config, IntoUrl}; lazy_static::lazy_static! { static ref SOURCE_ID_CACHE: Mutex> = Mutex::new(HashSet::new()); @@ -34,8 +33,8 @@ pub struct SourceId { struct SourceIdInner { /// The source URL. url: Url, - /// The result of `git::canonicalize_url()` on `url` field. - canonical_url: Url, + /// The canonical version of the above url + canonical_url: CanonicalUrl, /// The source kind. kind: Kind, /// For example, the exact Git revision of the specified branch for a Git Source. @@ -80,7 +79,7 @@ impl SourceId { fn new(kind: Kind, url: Url) -> CargoResult { let source_id = SourceId::wrap(SourceIdInner { kind, - canonical_url: git::canonicalize_url(&url)?, + canonical_url: CanonicalUrl::new(&url)?, url, precise: None, name: None, @@ -216,7 +215,7 @@ impl SourceId { let url = config.get_registry_index(key)?; Ok(SourceId::wrap(SourceIdInner { kind: Kind::Registry, - canonical_url: git::canonicalize_url(&url)?, + canonical_url: CanonicalUrl::new(&url)?, url, precise: None, name: Some(key.to_string()), @@ -228,6 +227,12 @@ impl SourceId { &self.inner.url } + /// Gets the canonical URL of this source, used for internal comparison + /// purposes. + pub fn canonical_url(&self) -> &CanonicalUrl { + &self.inner.canonical_url + } + pub fn display_index(self) -> String { if self.is_default_registry() { "crates.io index".to_string() @@ -508,7 +513,7 @@ impl Hash for SourceId { fn hash(&self, into: &mut S) { self.inner.kind.hash(into); match self.inner.kind { - Kind::Git(_) => self.inner.canonical_url.as_str().hash(into), + Kind::Git(_) => self.inner.canonical_url.hash(into), _ => self.inner.url.as_str().hash(into), } } diff --git a/src/cargo/ops/resolve.rs b/src/cargo/ops/resolve.rs index d29cc31bd47..bd243afff6e 100644 --- a/src/cargo/ops/resolve.rs +++ b/src/cargo/ops/resolve.rs @@ -343,7 +343,7 @@ pub fn resolve_with_previous<'cfg>( Some(ws.config()), ws.features().require(Feature::public_dependency()).is_ok(), )?; - resolved.register_used_patches(registry.patches()); + resolved.register_used_patches(®istry.patches()); if register_patches { // It would be good if this warning was more targeted and helpful // (such as showing close candidates that failed to match). However, diff --git a/src/cargo/sources/git/mod.rs b/src/cargo/sources/git/mod.rs index 86d0094d19e..bd6d7ea7d6e 100644 --- a/src/cargo/sources/git/mod.rs +++ b/src/cargo/sources/git/mod.rs @@ -1,4 +1,4 @@ -pub use self::source::{canonicalize_url, GitSource}; +pub use self::source::GitSource; pub use self::utils::{fetch, GitCheckout, GitDatabase, GitRemote, GitRevision}; mod source; mod utils; diff --git a/src/cargo/sources/git/source.rs b/src/cargo/sources/git/source.rs index e417da9cb01..26d482f2511 100644 --- a/src/cargo/sources/git/source.rs +++ b/src/cargo/sources/git/source.rs @@ -27,7 +27,7 @@ impl<'cfg> GitSource<'cfg> { assert!(source_id.is_git(), "id is not git, id={}", source_id); let remote = GitRemote::new(source_id.url()); - let ident = ident(source_id.url())?; + let ident = ident(&source_id); let reference = match source_id.precise() { Some(s) => GitReference::Rev(s.to_string()), @@ -59,58 +59,17 @@ impl<'cfg> GitSource<'cfg> { } } -fn ident(url: &Url) -> CargoResult { - let url = canonicalize_url(url)?; - let ident = url +fn ident(id: &SourceId) -> String { + let ident = id + .canonical_url() + .raw_canonicalized_url() .path_segments() - .and_then(|mut s| s.next_back()) + .and_then(|s| s.rev().next()) .unwrap_or(""); let ident = if ident == "" { "_empty" } else { ident }; - Ok(format!("{}-{}", ident, short_hash(&url))) -} - -// Some hacks and heuristics for making equivalent URLs hash the same. -pub fn canonicalize_url(url: &Url) -> CargoResult { - let mut url = url.clone(); - - // cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`) - // are not supported. - if url.cannot_be_a_base() { - failure::bail!( - "invalid url `{}`: cannot-be-a-base-URLs are not supported", - url - ) - } - - // Strip a trailing slash. - if url.path().ends_with('/') { - url.path_segments_mut().unwrap().pop_if_empty(); - } - - // HACK: for GitHub URLs specifically, just lower-case - // everything. GitHub treats both the same, but they hash - // differently, and we're gonna be hashing them. This wants a more - // general solution, and also we're almost certainly not using the - // same case conversion rules that GitHub does. (See issue #84.) - if url.host_str() == Some("github.com") { - url.set_scheme("https").unwrap(); - let path = url.path().to_lowercase(); - url.set_path(&path); - } - - // Repos can generally be accessed with or without `.git` extension. - let needs_chopping = url.path().ends_with(".git"); - if needs_chopping { - let last = { - let last = url.path_segments().unwrap().next_back().unwrap(); - last[..last.len() - 4].to_owned() - }; - url.path_segments_mut().unwrap().pop().push(&last); - } - - Ok(url) + format!("{}-{}", ident, short_hash(id.canonical_url())) } impl<'cfg> Debug for GitSource<'cfg> { @@ -241,56 +200,54 @@ impl<'cfg> Source for GitSource<'cfg> { #[cfg(test)] mod test { use super::ident; + use crate::core::{GitReference, SourceId}; use crate::util::IntoUrl; - use url::Url; #[test] pub fn test_url_to_path_ident_with_path() { - let ident = ident(&url("https://github.com/carlhuda/cargo")).unwrap(); + let ident = ident(&src("https://github.com/carlhuda/cargo")); assert!(ident.starts_with("cargo-")); } #[test] pub fn test_url_to_path_ident_without_path() { - let ident = ident(&url("https://github.com")).unwrap(); + let ident = ident(&src("https://github.com")); assert!(ident.starts_with("_empty-")); } #[test] fn test_canonicalize_idents_by_stripping_trailing_url_slash() { - let ident1 = ident(&url("https://github.com/PistonDevelopers/piston/")).unwrap(); - let ident2 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap(); + let ident1 = ident(&src("https://github.com/PistonDevelopers/piston/")); + let ident2 = ident(&src("https://github.com/PistonDevelopers/piston")); assert_eq!(ident1, ident2); } #[test] fn test_canonicalize_idents_by_lowercasing_github_urls() { - let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap(); - let ident2 = ident(&url("https://github.com/pistondevelopers/piston")).unwrap(); + let ident1 = ident(&src("https://github.com/PistonDevelopers/piston")); + let ident2 = ident(&src("https://github.com/pistondevelopers/piston")); assert_eq!(ident1, ident2); } #[test] fn test_canonicalize_idents_by_stripping_dot_git() { - let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap(); - let ident2 = ident(&url("https://github.com/PistonDevelopers/piston.git")).unwrap(); + let ident1 = ident(&src("https://github.com/PistonDevelopers/piston")); + let ident2 = ident(&src("https://github.com/PistonDevelopers/piston.git")); assert_eq!(ident1, ident2); } #[test] fn test_canonicalize_idents_different_protocols() { - let ident1 = ident(&url("https://github.com/PistonDevelopers/piston")).unwrap(); - let ident2 = ident(&url("git://github.com/PistonDevelopers/piston")).unwrap(); + let ident1 = ident(&src("https://github.com/PistonDevelopers/piston")); + let ident2 = ident(&src("git://github.com/PistonDevelopers/piston")); assert_eq!(ident1, ident2); } - #[test] - fn test_canonicalize_cannot_be_a_base_urls() { - assert!(ident(&url("github.com:PistonDevelopers/piston")).is_err()); - assert!(ident(&url("google.com:PistonDevelopers/piston")).is_err()); - } - - fn url(s: &str) -> Url { - s.into_url().unwrap() + fn src(s: &str) -> SourceId { + SourceId::for_git( + &s.into_url().unwrap(), + GitReference::Branch("master".to_string()), + ) + .unwrap() } } diff --git a/src/cargo/util/canonical_url.rs b/src/cargo/util/canonical_url.rs new file mode 100644 index 00000000000..df5d2c7f6cc --- /dev/null +++ b/src/cargo/util/canonical_url.rs @@ -0,0 +1,73 @@ +use crate::util::errors::CargoResult; +use std::hash::{self, Hash}; +use url::Url; + +/// A newtype wrapper around `Url` which represents a "canonical" version of an +/// original URL. +/// +/// A "canonical" url is only intended for internal comparison purposes in +/// Cargo. It's to help paper over mistakes such as depending on +/// `github.com/foo/bar` vs `github.com/foo/bar.git`. This is **only** for +/// internal purposes within Cargo and provides no means to actually read the +/// underlying string value of the `Url` it contains. This is intentional, +/// because all fetching should still happen within the context of the original +/// URL. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +pub struct CanonicalUrl(Url); + +impl CanonicalUrl { + pub fn new(url: &Url) -> CargoResult { + let mut url = url.clone(); + + // cannot-be-a-base-urls (e.g., `github.com:rust-lang-nursery/rustfmt.git`) + // are not supported. + if url.cannot_be_a_base() { + failure::bail!( + "invalid url `{}`: cannot-be-a-base-URLs are not supported", + url + ) + } + + // Strip a trailing slash. + if url.path().ends_with('/') { + url.path_segments_mut().unwrap().pop_if_empty(); + } + + // For GitHub URLs specifically, just lower-case everything. GitHub + // treats both the same, but they hash differently, and we're gonna be + // hashing them. This wants a more general solution, and also we're + // almost certainly not using the same case conversion rules that GitHub + // does. (See issue #84) + if url.host_str() == Some("github.com") { + url.set_scheme("https").unwrap(); + let path = url.path().to_lowercase(); + url.set_path(&path); + } + + // Repos can generally be accessed with or without `.git` extension. + let needs_chopping = url.path().ends_with(".git"); + if needs_chopping { + let last = { + let last = url.path_segments().unwrap().next_back().unwrap(); + last[..last.len() - 4].to_owned() + }; + url.path_segments_mut().unwrap().pop().push(&last); + } + + Ok(CanonicalUrl(url)) + } + + /// Returns the raw canonicalized URL, although beware that this should + /// never be used/displayed/etc, it should only be used for internal data + /// structures and hashes and such. + pub fn raw_canonicalized_url(&self) -> &Url { + &self.0 + } +} + +// See comment in `source_id.rs` for why we explicitly use `as_str()` here. +impl Hash for CanonicalUrl { + fn hash(&self, into: &mut S) { + self.0.as_str().hash(into); + } +} diff --git a/src/cargo/util/mod.rs b/src/cargo/util/mod.rs index 50fbd8c2b1b..b35e1c12958 100644 --- a/src/cargo/util/mod.rs +++ b/src/cargo/util/mod.rs @@ -1,5 +1,6 @@ use std::time::Duration; +pub use self::canonical_url::CanonicalUrl; pub use self::cfg::{Cfg, CfgExpr}; pub use self::config::{homedir, Config, ConfigValue}; pub use self::dependency_queue::DependencyQueue; @@ -28,6 +29,7 @@ pub use self::workspace::{ print_available_tests, }; +mod canonical_url; mod cfg; pub mod command_prelude; pub mod config; diff --git a/tests/testsuite/patch.rs b/tests/testsuite/patch.rs index 8c1459c916c..763f08b3e3e 100644 --- a/tests/testsuite/patch.rs +++ b/tests/testsuite/patch.rs @@ -1338,3 +1338,68 @@ version. [..] ) .run(); } + +#[cargo_test] +fn canonicalize_a_bunch() { + let base = git::repo(&paths::root().join("base")) + .file("Cargo.toml", &basic_manifest("base", "0.1.0")) + .file("src/lib.rs", "") + .build(); + + let intermediate = git::repo(&paths::root().join("intermediate")) + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "intermediate" + version = "0.1.0" + + [dependencies] + # Note the lack of trailing slash + base = {{ git = '{}' }} + "#, + base.url(), + ), + ) + .file("src/lib.rs", "pub fn f() { base::f() }") + .build(); + + let newbase = git::repo(&paths::root().join("newbase")) + .file("Cargo.toml", &basic_manifest("base", "0.1.0")) + .file("src/lib.rs", "pub fn f() {}") + .build(); + + let p = project() + .file( + "Cargo.toml", + &format!( + r#" + [package] + name = "foo" + version = "0.0.1" + + [dependencies] + # Note the trailing slashes + base = {{ git = '{base}/' }} + intermediate = {{ git = '{intermediate}/' }} + + [patch.'{base}'] # Note the lack of trailing slash + base = {{ git = '{newbase}' }} + "#, + base = base.url(), + intermediate = intermediate.url(), + newbase = newbase.url(), + ), + ) + .file("src/lib.rs", "pub fn a() { base::f(); intermediate::f() }") + .build(); + + // Once to make sure it actually works + p.cargo("build").run(); + + // Then a few more times for good measure to ensure no weird warnings about + // `[patch]` are printed. + p.cargo("build").with_stderr("[FINISHED] [..]").run(); + p.cargo("build").with_stderr("[FINISHED] [..]").run(); +}