Skip to content

Commit

Permalink
Auto merge of #12240 - weihanglo:source-refactor, r=epage
Browse files Browse the repository at this point in the history
refactor: registry source cleanup
  • Loading branch information
bors committed Jun 8, 2023
2 parents 69ea85c + c977baa commit 9e33d31
Show file tree
Hide file tree
Showing 4 changed files with 233 additions and 240 deletions.
2 changes: 1 addition & 1 deletion crates/cargo-test-support/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1342,7 +1342,7 @@ impl Package {

/// Sets the index schema version for this package.
///
/// See `cargo::sources::registry::RegistryPackage` for more information.
/// See `cargo::sources::registry::IndexPackage` for more information.
pub fn schema_version(&mut self, version: u32) -> &mut Package {
self.v = Some(version);
self
Expand Down
11 changes: 7 additions & 4 deletions src/cargo/sources/registry/download.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@ use cargo_util::Sha256;
use crate::core::PackageId;
use crate::sources::registry::make_dep_prefix;
use crate::sources::registry::MaybeLock;
use crate::sources::registry::{
RegistryConfig, CHECKSUM_TEMPLATE, CRATE_TEMPLATE, LOWER_PREFIX_TEMPLATE, PREFIX_TEMPLATE,
VERSION_TEMPLATE,
};
use crate::sources::registry::RegistryConfig;
use crate::util::auth;
use crate::util::errors::CargoResult;
use crate::util::{Config, Filesystem};
Expand All @@ -17,6 +14,12 @@ use std::io::prelude::*;
use std::io::SeekFrom;
use std::str;

const CRATE_TEMPLATE: &str = "{crate}";
const VERSION_TEMPLATE: &str = "{version}";
const PREFIX_TEMPLATE: &str = "{prefix}";
const LOWER_PREFIX_TEMPLATE: &str = "{lowerprefix}";
const CHECKSUM_TEMPLATE: &str = "{sha256-checksum}";

pub(super) fn filename(pkg: PackageId) -> String {
format!("{}-{}.crate", pkg.name(), pkg.version())
}
Expand Down
243 changes: 217 additions & 26 deletions src/cargo/sources/registry/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,9 @@
//!
//! The crates.io index, however, is not amenable to this form of query. Instead
//! the crates.io index simply is a file where each line is a JSON blob, aka
//! [`RegistryPackage`]. To learn about the versions in each JSON blob we
//! would need to parse the JSON via [`IndexSummary::parse`], defeating the
//! purpose of trying to parse as little as possible.
//! [`IndexPackage`]. To learn about the versions in each JSON blob we would
//! need to parse the JSON via [`IndexSummary::parse`], defeating the purpose
//! of trying to parse as little as possible.
//!
//! > Note that as a small aside even *loading* the JSON from the registry is
//! > actually pretty slow. For crates.io and [`RemoteRegistry`] we don't
Expand Down Expand Up @@ -85,21 +85,34 @@
//! [`RemoteRegistry`]: super::remote::RemoteRegistry
//! [`Dependency`]: crate::core::Dependency
use crate::core::dependency::DepKind;
use crate::core::Dependency;
use crate::core::{PackageId, SourceId, Summary};
use crate::sources::registry::{LoadResponse, RegistryData, RegistryPackage, INDEX_V_MAX};
use crate::sources::registry::{LoadResponse, RegistryData};
use crate::util::interning::InternedString;
use crate::util::IntoUrl;
use crate::util::{internal, CargoResult, Config, Filesystem, OptVersionReq, ToSemver};
use anyhow::bail;
use cargo_util::{paths, registry::make_dep_path};
use log::{debug, info};
use semver::Version;
use serde::Deserialize;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::str;
use std::task::{ready, Poll};

/// The current version of [`SummariesCache`].
const CURRENT_CACHE_VERSION: u8 = 3;

/// The maximum schema version of the `v` field in the index this version of
/// cargo understands. See [`IndexPackage::v`] for the detail.
const INDEX_V_MAX: u32 = 2;

/// Manager for handling the on-disk index.
///
/// Different kinds of registries store the index differently:
Expand Down Expand Up @@ -139,7 +152,7 @@ pub struct RegistryIndex<'cfg> {
/// 1. From raw registry index --- Primarily Cargo will parse the corresponding
/// file for a crate in the upstream crates.io registry. That's just a JSON
/// blob per line which we can parse, extract the version, and then store here.
/// See [`RegistryPackage`] and [`IndexSummary::parse`].
/// See [`IndexPackage`] and [`IndexSummary::parse`].
///
/// 2. From on-disk index cache --- If Cargo has previously run, we'll have a
/// cached index of dependencies for the upstream index. This is a file that
Expand Down Expand Up @@ -179,7 +192,7 @@ enum MaybeIndexSummary {
pub struct IndexSummary {
pub summary: Summary,
pub yanked: bool,
/// Schema version, see [`RegistryPackage::v`].
/// Schema version, see [`IndexPackage::v`].
v: u32,
}

Expand All @@ -202,7 +215,7 @@ pub struct IndexSummary {
///
/// ```text
/// +---------------+----------------------+--------------------+---+
/// | cache version | index format version | index file version | 0 |
/// | cache version | index schema version | index file version | 0 |
/// +---------------+----------------------+--------------------+---+
/// ```
///
Expand All @@ -219,19 +232,19 @@ pub struct IndexSummary {
/// * _cache version_ --- Intended to ensure that there's some level of
/// future compatibility against changes to this cache format so if different
/// versions of Cargo share the same cache they don't get too confused.
/// * _index format version_ --- The version of the raw index file.
/// See [`RegistryPackage::v`] for the detail.
/// * _index schema version_ --- The schema version of the raw index file.
/// See [`IndexPackage::v`] for the detail.
/// * _index file version_ --- Tracks when a cache needs to be regenerated.
/// A cache regeneration is required whenever the index file itself updates.
/// * _semver version_ --- The version for each JSON blob. Extracted from the
/// blob for fast queries without parsing the entire blob.
/// * _JSON blob_ --- The actual metadata for each version of the package. It
/// has the same representation as [`RegistryPackage`].
/// has the same representation as [`IndexPackage`].
///
/// # Changes between each cache version
///
/// * `1`: The original version.
/// * `2`: Added the "index format version" field so that if the index format
/// * `2`: Added the "index schema version" field so that if the index schema
/// changes, different versions of cargo won't get confused reading each
/// other's caches.
/// * `3`: Bumped the version to work around an issue where multiple versions of
Expand All @@ -254,6 +267,97 @@ struct SummariesCache<'a> {
index_version: &'a str,
}

/// A single line in the index representing a single version of a package.
#[derive(Deserialize)]
pub struct IndexPackage<'a> {
/// Name of the pacakge.
name: InternedString,
/// The version of this dependency.
vers: Version,
/// All kinds of direct dependencies of the package, including dev and
/// build dependencies.
#[serde(borrow)]
deps: Vec<RegistryDependency<'a>>,
/// Set of features defined for the package, i.e., `[features]` table.
features: BTreeMap<InternedString, Vec<InternedString>>,
/// This field contains features with new, extended syntax. Specifically,
/// namespaced features (`dep:`) and weak dependencies (`pkg?/feat`).
///
/// This is separated from `features` because versions older than 1.19
/// will fail to load due to not being able to parse the new syntax, even
/// with a `Cargo.lock` file.
features2: Option<BTreeMap<InternedString, Vec<InternedString>>>,
/// Checksum for verifying the integrity of the corresponding downloaded package.
cksum: String,
/// If `true`, Cargo will skip this version when resolving.
///
/// This was added in 2014. Everything in the crates.io index has this set
/// now, so this probably doesn't need to be an option anymore.
yanked: Option<bool>,
/// Native library name this package links to.
///
/// Added early 2018 (see <https://github.com/rust-lang/cargo/pull/4978>),
/// can be `None` if published before then.
links: Option<InternedString>,
/// Required version of rust
///
/// Corresponds to `package.rust-version`.
///
/// Added in 2023 (see <https://github.com/rust-lang/crates.io/pull/6267>),
/// can be `None` if published before then or if not set in the manifest.
rust_version: Option<InternedString>,
/// The schema version for this entry.
///
/// If this is None, it defaults to version `1`. Entries with unknown
/// versions are ignored.
///
/// Version `2` schema adds the `features2` field.
///
/// This provides a method to safely introduce changes to index entries
/// and allow older versions of cargo to ignore newer entries it doesn't
/// understand. This is honored as of 1.51, so unfortunately older
/// versions will ignore it, and potentially misinterpret version 2 and
/// newer entries.
///
/// The intent is that versions older than 1.51 will work with a
/// pre-existing `Cargo.lock`, but they may not correctly process `cargo
/// update` or build a lock from scratch. In that case, cargo may
/// incorrectly select a new package that uses a new index schema. A
/// workaround is to downgrade any packages that are incompatible with the
/// `--precise` flag of `cargo update`.
v: Option<u32>,
}

/// A dependency as encoded in the [`IndexPackage`] index JSON.
#[derive(Deserialize)]
struct RegistryDependency<'a> {
/// Name of the dependency. If the dependency is renamed, the original
/// would be stored in [`RegistryDependency::package`].
name: InternedString,
/// The SemVer requirement for this dependency.
#[serde(borrow)]
req: Cow<'a, str>,
/// Set of features enabled for this dependency.
features: Vec<InternedString>,
/// Whether or not this is an optional dependency.
optional: bool,
/// Whether or not default features are enabled.
default_features: bool,
/// The target platform for this dependency.
target: Option<Cow<'a, str>>,
/// The dependency kind. "dev", "build", and "normal".
kind: Option<Cow<'a, str>>,
// The URL of the index of the registry where this dependency is from.
// `None` if it is from the same index.
registry: Option<Cow<'a, str>>,
/// The original name if the dependency is renamed.
package: Option<InternedString>,
/// Whether or not this is a public dependency. Unstable. See [RFC 1977].
///
/// [RFC 1977]: https://rust-lang.github.io/rfcs/1977-public-private-dependencies.html
public: Option<bool>,
}

impl<'cfg> RegistryIndex<'cfg> {
/// Creates an empty registry index at `path`.
pub fn new(
Expand Down Expand Up @@ -380,12 +484,7 @@ impl<'cfg> RegistryIndex<'cfg> {

// See module comment in `registry/mod.rs` for why this is structured
// the way it is.
let fs_name = name
.chars()
.flat_map(|c| c.to_lowercase())
.collect::<String>();

let path = make_dep_path(&fs_name, false);
let path = make_dep_path(&name.to_lowercase(), false);
let summaries = ready!(Summaries::parse(
root,
&cache_root,
Expand Down Expand Up @@ -697,9 +796,6 @@ impl Summaries {
}
}

/// The current version of [`SummariesCache`].
const CURRENT_CACHE_VERSION: u8 = 3;

impl<'a> SummariesCache<'a> {
/// Deserializes an on-disk cache.
fn parse(data: &'a [u8]) -> CargoResult<SummariesCache<'a>> {
Expand All @@ -712,13 +808,11 @@ impl<'a> SummariesCache<'a> {
}
let index_v_bytes = rest
.get(..4)
.ok_or_else(|| anyhow::anyhow!("cache expected 4 bytes for index version"))?;
.ok_or_else(|| anyhow::anyhow!("cache expected 4 bytes for index schema version"))?;
let index_v = u32::from_le_bytes(index_v_bytes.try_into().unwrap());
if index_v != INDEX_V_MAX {
bail!(
"index format version {} doesn't match the version I know ({})",
index_v,
INDEX_V_MAX
"index schema version {index_v} doesn't match the version I know ({INDEX_V_MAX})",
);
}
let rest = &rest[4..];
Expand Down Expand Up @@ -799,15 +893,15 @@ impl IndexSummary {
/// for a package.
///
/// The `line` provided is expected to be valid JSON. It is supposed to be
/// a [`RegistryPackage`].
/// a [`IndexPackage`].
fn parse(config: &Config, line: &[u8], source_id: SourceId) -> CargoResult<IndexSummary> {
// ****CAUTION**** Please be extremely careful with returning errors
// from this function. Entries that error are not included in the
// index cache, and can cause cargo to get confused when switching
// between different versions that understand the index differently.
// Make sure to consider the INDEX_V_MAX and CURRENT_CACHE_VERSION
// values carefully when making changes here.
let RegistryPackage {
let IndexPackage {
name,
vers,
cksum,
Expand Down Expand Up @@ -841,6 +935,70 @@ impl IndexSummary {
}
}

impl<'a> RegistryDependency<'a> {
/// Converts an encoded dependency in the registry to a cargo dependency
pub fn into_dep(self, default: SourceId) -> CargoResult<Dependency> {
let RegistryDependency {
name,
req,
mut features,
optional,
default_features,
target,
kind,
registry,
package,
public,
} = self;

let id = if let Some(registry) = &registry {
SourceId::for_registry(&registry.into_url()?)?
} else {
default
};

let mut dep = Dependency::parse(package.unwrap_or(name), Some(&req), id)?;
if package.is_some() {
dep.set_explicit_name_in_toml(name);
}
let kind = match kind.as_deref().unwrap_or("") {
"dev" => DepKind::Development,
"build" => DepKind::Build,
_ => DepKind::Normal,
};

let platform = match target {
Some(target) => Some(target.parse()?),
None => None,
};

// All dependencies are private by default
let public = public.unwrap_or(false);

// Unfortunately older versions of cargo and/or the registry ended up
// publishing lots of entries where the features array contained the
// empty feature, "", inside. This confuses the resolution process much
// later on and these features aren't actually valid, so filter them all
// out here.
features.retain(|s| !s.is_empty());

// In index, "registry" is null if it is from the same index.
// In Cargo.toml, "registry" is None if it is from the default
if !id.is_crates_io() {
dep.set_registry_id(id);
}

dep.set_optional(optional)
.set_default_features(default_features)
.set_features(features)
.set_platform(platform)
.set_kind(kind)
.set_public(public);

Ok(dep)
}
}

/// Like [`slice::split`] but is optimized by [`memchr`].
fn split(haystack: &[u8], needle: u8) -> impl Iterator<Item = &[u8]> {
struct Split<'a> {
Expand All @@ -866,3 +1024,36 @@ fn split(haystack: &[u8], needle: u8) -> impl Iterator<Item = &[u8]> {

Split { haystack, needle }
}

#[test]
fn escaped_char_in_index_json_blob() {
let _: IndexPackage<'_> = serde_json::from_str(
r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{}}"#,
)
.unwrap();
let _: IndexPackage<'_> = serde_json::from_str(
r#"{"name":"a","vers":"0.0.1","deps":[],"cksum":"bae3","features":{"test":["k","q"]},"links":"a-sys"}"#
).unwrap();

// Now we add escaped cher all the places they can go
// these are not valid, but it should error later than json parsing
let _: IndexPackage<'_> = serde_json::from_str(
r#"{
"name":"This name has a escaped cher in it \n\t\" ",
"vers":"0.0.1",
"deps":[{
"name": " \n\t\" ",
"req": " \n\t\" ",
"features": [" \n\t\" "],
"optional": true,
"default_features": true,
"target": " \n\t\" ",
"kind": " \n\t\" ",
"registry": " \n\t\" "
}],
"cksum":"bae3",
"features":{"test \n\t\" ":["k \n\t\" ","q \n\t\" "]},
"links":" \n\t\" "}"#,
)
.unwrap();
}
Loading

0 comments on commit 9e33d31

Please sign in to comment.