Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend repitition test to span across two files #349

Merged
merged 13 commits into from
Oct 7, 2021
Merged
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion examples/builder/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async fn main() -> Result<()> {
.build()
.client()?;

let response = client.check("http://example.org").await?;
let response = client.check("https://example.org").await?;
dbg!(&response);
assert!(response.status().is_success());
Ok(())
Expand Down
18 changes: 11 additions & 7 deletions examples/client_pool/client_pool.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri};
use lychee_lib::{ClientBuilder, Input, Request, Result, Uri};
use std::convert::TryFrom;
use tokio::sync::mpsc;

Expand All @@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4;
async fn main() -> Result<()> {
// These channels are used to send requests and receive responses to and
// from the lychee client pool
let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS);
let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS);

// Add as many requests as you like
Expand All @@ -29,13 +29,17 @@ async fn main() -> Result<()> {
// Create a default lychee client
let client = ClientBuilder::default().client()?;

// Create a pool with four lychee clients
let clients = vec![client; CONCURRENT_REQUESTS];
let mut clients = ClientPool::new(send_resp, recv_req, clients);

// Handle requests in a client pool
tokio::spawn(async move {
clients.listen().await;
while let Some(req) = recv_req.recv().await {
// Client::check() may fail only because Request::try_from() may fail
// here request is already Request, so it never fails
let resp = client.check(req).await.unwrap();
send_resp
.send(resp)
.await
.expect("Cannot send response to channel");
}
});

// Finally, listen to incoming responses from lychee
Expand Down
4 changes: 0 additions & 4 deletions fixtures/TEST_REPETITION.txt

This file was deleted.

5 changes: 5 additions & 0 deletions fixtures/TEST_REPETITION_1.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
These links are all the same and match those of TEST_REPETITION_2.txt.
All links in both files should be counted as one and checked once only.
https://example.org/
https://example.org/
https://example.org
5 changes: 5 additions & 0 deletions fixtures/TEST_REPETITION_2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
These links are all the same and match those of TEST_REPETITION_1.txt.
All links in both files should be counted as one and checked once only.
https://example.org/
https://example.org/
https://example.org
16 changes: 11 additions & 5 deletions lychee-bin/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr};
use anyhow::{anyhow, Context, Result};
use headers::HeaderMapExt;
use indicatif::{ProgressBar, ProgressStyle};
use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response};
use lychee_lib::{ClientBuilder, Collector, Input, Request, Response};
use openssl_sys as _; // required for vendored-openssl feature
use regex::RegexSet;
use ring as _; // required for apple silicon
Expand Down Expand Up @@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {
Some(bar)
};

let (send_req, recv_req) = mpsc::channel(max_concurrency);
let (send_req, mut recv_req) = mpsc::channel(max_concurrency);
let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency);

let mut stats = ResponseStats::new();
Expand All @@ -245,9 +245,15 @@ async fn run(cfg: &Config, inputs: Vec<Input>) -> Result<i32> {

// Start receiving requests
tokio::spawn(async move {
let clients = vec![client; max_concurrency];
let mut clients = ClientPool::new(send_resp, recv_req, clients);
clients.listen().await;
while let Some(req) = recv_req.recv().await {
// `Client::check()` may fail only because `Request::try_from()` may
// fail. Here `req` is already a valid `Request`, so it never fails.
let resp = client.check(req).await.unwrap();
send_resp
.send(resp)
.await
.expect("Cannot send response to channel");
}
});

while let Some(response) = recv_resp.recv().await {
Expand Down
2 changes: 1 addition & 1 deletion lychee-bin/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ mod test {
stats.add(Response(
Input::Stdin,
ResponseBody {
uri: website("http://example.org/ok"),
uri: website("https://example.org/ok"),
status: Status::Ok(StatusCode::OK),
},
));
Expand Down
16 changes: 13 additions & 3 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,11 +216,21 @@ mod cli {
}

#[test]
fn test_repetition() {
#[ignore]
// Test that two identical requests don't get executed twice.
// Note: This currently fails, because we currently don't cache responses. We
// used to, but there were issues.
// See https://github.com/lycheeverse/lychee/pull/349.
// We're planning to add back caching support at a later point in time,
// which is why we keep the test around.
fn test_caching() {
let mut cmd = main_command();
MichaIng marked this conversation as resolved.
Show resolved Hide resolved
let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt");
// Repetitions across multiple files shall all be checked and counted only once.
let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");
let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt");

cmd.arg(&test_schemes_path)
cmd.arg(&test_schemes_path_1)
.arg(&test_schemes_path_2)
.env_clear()
.assert()
.success()
Expand Down
1 change: 0 additions & 1 deletion lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ version = "0.7.2"

[dependencies]
check-if-email-exists = "0.8.25"
deadpool = "0.7.0"
fast_chemail = "0.9.6"
glob = "0.3.0"
html5ever = "0.25.1"
Expand Down
27 changes: 25 additions & 2 deletions lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ use crate::{
const DEFAULT_MAX_REDIRECTS: usize = 5;
const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION"));

/// Handles incoming requests and returns responses. Usually you would not
/// initialize a `Client` yourself, but use the `ClientBuilder` because it
/// provides sane defaults for all configuration options.
#[derive(Debug, Clone)]
pub struct Client {
/// Underlying reqwest client instance that handles the HTTP requests.
Expand Down Expand Up @@ -123,7 +126,13 @@ impl ClientBuilder {
}

/// The build method instantiates the client.
#[allow(clippy::missing_errors_doc)]
///
/// # Errors
///
/// Returns an `Err` if:
/// - The user agent cannot be parsed
/// - The request client cannot be created
/// - The Github client cannot be created
pub fn client(&self) -> Result<Client> {
let mut headers = self.custom_headers.clone();
headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?);
Expand Down Expand Up @@ -169,6 +178,13 @@ impl ClientBuilder {
}

impl Client {
/// Check a single request
///
/// # Errors
///
/// This returns an `Err` if
/// - The request cannot be parsed
/// - An HTTP website with an invalid URI format gets checked
pub async fn check<T, E>(&self, request: T) -> Result<Response>
where
Request: TryFrom<T, Error = E>,
Expand All @@ -185,7 +201,10 @@ impl Client {
match self.check_website(&uri).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
let mut https_uri = uri.clone();
https_uri.url.set_scheme("https").unwrap();
https_uri
.url
.set_scheme("https")
.map_err(|_| ErrorKind::InvalidURI(uri.clone()))?;
if self.check_website(&https_uri).await.is_success() {
Status::Error(Box::new(ErrorKind::InsecureURL(https_uri)))
} else {
Expand All @@ -200,10 +219,12 @@ impl Client {
}

/// Check if the given URI is filtered by the client
#[must_use]
pub fn filtered(&self, uri: &Uri) -> bool {
self.filter.is_excluded(uri)
}

/// Check a website URI
pub async fn check_website(&self, uri: &Uri) -> Status {
let mut retries: i64 = 3;
let mut wait: u64 = 1;
Expand Down Expand Up @@ -256,6 +277,7 @@ impl Client {
}
}

/// Check a file URI
pub async fn check_file(&self, uri: &Uri) -> Status {
if let Ok(path) = uri.url.to_file_path() {
if path.exists() {
Expand All @@ -265,6 +287,7 @@ impl Client {
ErrorKind::InvalidFilePath(uri.clone()).into()
}

/// Check a mail address
pub async fn check_mail(&self, uri: &Uri) -> Status {
let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]);
let result = &(check_email(&input).await)[0];
Expand Down
45 changes: 0 additions & 45 deletions lychee-lib/src/client_pool.rs

This file was deleted.

21 changes: 7 additions & 14 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{extract::Extractor, Base, Input, Request, Result, Uri};
use crate::{extract::Extractor, Base, Input, Request, Result};
use std::collections::HashSet;

/// Collector keeps the state of link collection
Expand All @@ -7,18 +7,20 @@ pub struct Collector {
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
cache: HashSet<Uri>,
}

impl Collector {
/// Create a new collector with an empty cache
#[must_use]
pub fn new(base: Option<Base>, skip_missing_inputs: bool, max_concurrency: usize) -> Self {
pub const fn new(
base: Option<Base>,
skip_missing_inputs: bool,
max_concurrency: usize,
) -> Self {
Collector {
base,
skip_missing_inputs,
max_concurrency,
cache: HashSet::new(),
}
}

Expand All @@ -29,7 +31,7 @@ impl Collector {
/// # Errors
///
/// Will return `Err` if links cannot be extracted from an input
pub async fn collect_links(mut self, inputs: &[Input]) -> Result<HashSet<Request>> {
pub async fn collect_links(self, inputs: &[Input]) -> Result<HashSet<Request>> {
let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency);

// extract input contents
Expand Down Expand Up @@ -71,17 +73,8 @@ impl Collector {
links.extend(new_links?);
}

// Filter out already cached links (duplicates)
links.retain(|l| !self.cache.contains(&l.uri));

self.update_cache(&links);
Ok(links)
}

/// Update internal link cache
fn update_cache(&mut self, links: &HashSet<Request>) {
self.cache.extend(links.iter().cloned().map(|l| l.uri));
}
}

#[cfg(test)]
Expand Down
4 changes: 2 additions & 2 deletions lychee-lib/src/extract.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ impl Extractor {
match resolved {
Some(path) => Url::from_file_path(&path)
.map(Some)
.map_err(|_e| ErrorKind::InvalidUrl(path)),
.map_err(|_e| ErrorKind::InvalidUrlFromPath(path)),
None => Ok(None),
}
}
Expand Down Expand Up @@ -239,7 +239,7 @@ mod test {

#[test]
fn test_extract_link_at_end_of_line() {
let input = "http://www.apache.org/licenses/LICENSE-2.0\n";
let input = "https://www.apache.org/licenses/LICENSE-2.0\n";
let link = input.trim_end();

let mut extractor = Extractor::new(None);
Expand Down
4 changes: 2 additions & 2 deletions lychee-lib/src/filter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pub use includes::Includes;
use crate::Uri;

/// Pre-defined exclusions for known false-positives
static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];
const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"];

#[inline]
#[must_use]
Expand Down Expand Up @@ -298,7 +298,7 @@ mod test {
..Filter::default()
};

assert!(filter.is_excluded(&website("http://github.com")));
assert!(filter.is_excluded(&website("https://github.com")));
assert!(filter.is_excluded(&website("http://exclude.org")));
assert!(filter.is_excluded(&mail("[email protected]")));

Expand Down
4 changes: 1 addition & 3 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
doc_comment::doctest!("../../README.md");

mod client;
mod client_pool;
/// A pool of clients, to handle concurrent checks
pub mod collector;
mod helpers;
Expand All @@ -73,8 +72,7 @@ use ring as _; // required for apple silicon

#[doc(inline)]
pub use crate::{
client::{check, ClientBuilder},
client_pool::ClientPool,
client::{check, Client, ClientBuilder},
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri},
Expand Down
Loading