From 69de82f4c4b2719f93352b539b40d3421f9a8961 Mon Sep 17 00:00:00 2001 From: MichaIng Date: Thu, 30 Sep 2021 13:51:13 +0200 Subject: [PATCH 01/10] Extend repitition test to span across two files Signed-off-by: MichaIng --- examples/builder/builder.rs | 2 +- fixtures/TEST_REPETITION.txt | 4 ---- fixtures/TEST_REPETITION_1.txt | 5 +++++ fixtures/TEST_REPETITION_2.txt | 5 +++++ lychee-bin/src/stats.rs | 2 +- lychee-bin/tests/cli.rs | 7 +++++-- lychee-lib/src/collector.rs | 9 +++------ lychee-lib/src/extract.rs | 2 +- lychee-lib/src/filter/mod.rs | 4 ++-- lychee-lib/src/types/uri.rs | 8 ++++---- 10 files changed, 27 insertions(+), 21 deletions(-) delete mode 100644 fixtures/TEST_REPETITION.txt create mode 100644 fixtures/TEST_REPETITION_1.txt create mode 100644 fixtures/TEST_REPETITION_2.txt diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs index 08f189e899..1b3f53bd93 100644 --- a/examples/builder/builder.rs +++ b/examples/builder/builder.rs @@ -40,7 +40,7 @@ async fn main() -> Result<()> { .build() .client()?; - let response = client.check("http://example.org").await?; + let response = client.check("https://example.org").await?; dbg!(&response); assert!(response.status().is_success()); Ok(()) diff --git a/fixtures/TEST_REPETITION.txt b/fixtures/TEST_REPETITION.txt deleted file mode 100644 index 0395c8e26e..0000000000 --- a/fixtures/TEST_REPETITION.txt +++ /dev/null @@ -1,4 +0,0 @@ -These links are all the same and should only be checked once: -https://example.org/ -https://example.org/ -https://example.org diff --git a/fixtures/TEST_REPETITION_1.txt b/fixtures/TEST_REPETITION_1.txt new file mode 100644 index 0000000000..14570d507c --- /dev/null +++ b/fixtures/TEST_REPETITION_1.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_2.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/fixtures/TEST_REPETITION_2.txt b/fixtures/TEST_REPETITION_2.txt new file mode 100644 index 0000000000..0b6091f1a8 --- /dev/null +++ b/fixtures/TEST_REPETITION_2.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_1.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 9ba51f91cc..579c30a21e 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -174,7 +174,7 @@ mod test { stats.add(Response( Input::Stdin, ResponseBody { - uri: website("http://example.org/ok"), + uri: website("https://example.org/ok"), status: Status::Ok(StatusCode::OK), }, )); diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index d095c50c37..6b28655b3a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -218,9 +218,12 @@ mod cli { #[test] fn test_repetition() { let mut cmd = main_command(); - let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt"); + // Repetitions across multiple files shall all be checked and counted only once. + let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); + let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt"); - cmd.arg(&test_schemes_path) + cmd.arg(&test_schemes_path_1) + .arg(&test_schemes_path_2) .env_clear() .assert() .success() diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index b5e69d96ba..8e454cf3cb 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -72,13 +72,10 @@ impl Collector { // Filter out already cached links (duplicates) links.retain(|l| !self.cache.contains(&l.uri)); - self.update_cache(&links); - Ok(links) - } - - /// Update internal link cache - fn update_cache(&mut self, links: &HashSet) { + // Add remaining new links to cache self.cache.extend(links.iter().cloned().map(|l| l.uri)); + + Ok(links) } } diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index e5753d833b..03532cb963 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -225,7 +225,7 @@ mod test { #[test] fn test_extract_link_at_end_of_line() { - let input = "http://www.apache.org/licenses/LICENSE-2.0\n"; + let input = "https://www.apache.org/licenses/LICENSE-2.0\n"; let link = input.trim_end(); assert_eq!( diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 0726aa67d0..915a908002 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -9,7 +9,7 @@ pub use includes::Includes; use crate::Uri; /// Pre-defined exclusions for known false-positives -static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; +const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; #[inline] #[must_use] @@ -286,7 +286,7 @@ mod test { ..Filter::default() }; - assert!(filter.is_excluded(&website("http://github.com"))); + assert!(filter.is_excluded(&website("https://github.com"))); assert!(filter.is_excluded(&website("http://exclude.org"))); assert!(filter.is_excluded(&mail("mail@example.org"))); diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs index 6ad126c3d4..02b017608c 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -160,12 +160,12 @@ mod test { fn test_uri_from_str() { assert!(Uri::try_from("").is_err()); assert_eq!( - Uri::try_from("http://example.org"), - Ok(website("http://example.org")) + Uri::try_from("https://example.org"), + Ok(website("https://example.org")) ); assert_eq!( - Uri::try_from("http://example.org/@test/testing"), - Ok(website("http://example.org/@test/testing")) + Uri::try_from("https://example.org/@test/testing"), + Ok(website("https://example.org/@test/testing")) ); assert_eq!( Uri::try_from("mail@example.org"), From 1ec9c59241c72272bf5eb6e605289f894eea9b6d Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 12:58:44 +0200 Subject: [PATCH 02/10] Remove cache from collector --- lychee-bin/tests/cli.rs | 3 ++- lychee-lib/src/collector.rs | 14 +++----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6b28655b3a..a3f2b42246 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -216,7 +216,8 @@ mod cli { } #[test] - fn test_repetition() { + #[ignore] + fn test_caching() { let mut cmd = main_command(); // Repetitions across multiple files shall all be checked and counted only once. let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 8e454cf3cb..d5b66a3885 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,4 @@ -use crate::{extract::extract_links, Base, Input, Request, Result, Uri}; +use crate::{extract::extract_links, Base, Input, Request, Result}; use std::collections::HashSet; /// Collector keeps the state of link collection @@ -7,18 +7,16 @@ pub struct Collector { base: Option, skip_missing_inputs: bool, max_concurrency: usize, - cache: HashSet, } impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub const fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { Collector { base, skip_missing_inputs, max_concurrency, - cache: HashSet::new(), } } @@ -29,7 +27,7 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub async fn collect_links(mut self, inputs: &[Input]) -> Result> { + pub async fn collect_links(self, inputs: &[Input]) -> Result> { let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency); // extract input contents @@ -69,12 +67,6 @@ impl Collector { links.extend(new_links?); } - // Filter out already cached links (duplicates) - links.retain(|l| !self.cache.contains(&l.uri)); - - // Add remaining new links to cache - self.cache.extend(links.iter().cloned().map(|l| l.uri)); - Ok(links) } } From f0b24170a33d640415b52b30917eaedc52da4c3c Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 12:59:23 +0200 Subject: [PATCH 03/10] Remove client pool Reqwest comes with its own request pool, so there's no need in adding another layer of indirection. This also gets rid of a lot of allocs. --- Cargo.lock | 1 - examples/client_pool/client_pool.rs | 18 +++++++----- lychee-bin/src/main.rs | 20 +++++++++---- lychee-lib/Cargo.toml | 1 - lychee-lib/src/client.rs | 7 +++++ lychee-lib/src/client_pool.rs | 45 ----------------------------- lychee-lib/src/lib.rs | 4 +-- 7 files changed, 34 insertions(+), 62 deletions(-) delete mode 100644 lychee-lib/src/client_pool.rs diff --git a/Cargo.lock b/Cargo.lock index df423647fb..a83bc10b1c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1578,7 +1578,6 @@ version = "0.7.2" dependencies = [ "cached", "check-if-email-exists", - "deadpool", "doc-comment", "fast_chemail", "glob", diff --git a/examples/client_pool/client_pool.rs b/examples/client_pool/client_pool.rs index ef4bc8d46b..064fe83691 100644 --- a/examples/client_pool/client_pool.rs +++ b/examples/client_pool/client_pool.rs @@ -1,4 +1,4 @@ -use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri}; +use lychee_lib::{ClientBuilder, Input, Request, Result, Uri}; use std::convert::TryFrom; use tokio::sync::mpsc; @@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4; async fn main() -> Result<()> { // These channels are used to send requests and receive responses to and // from the lychee client pool - let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS); + let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS); let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS); // Add as many requests as you like @@ -29,13 +29,17 @@ async fn main() -> Result<()> { // Create a default lychee client let client = ClientBuilder::default().client()?; - // Create a pool with four lychee clients - let clients = vec![client; CONCURRENT_REQUESTS]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - // Handle requests in a client pool tokio::spawn(async move { - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = client.check(req).await.unwrap(); + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); // Finally, listen to incoming responses from lychee diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index b1f2c7307c..97142d7ff7 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr}; use anyhow::{anyhow, Context, Result}; use headers::HeaderMapExt; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response}; +use lychee_lib::{Client, ClientBuilder, Collector, Input, Request, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon @@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { Some(bar) }; - let (send_req, recv_req) = mpsc::channel(max_concurrency); + let (send_req, mut recv_req) = mpsc::channel(max_concurrency); let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency); let mut stats = ResponseStats::new(); @@ -243,11 +243,21 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } }); + async fn check(client: &Client, req: Request) -> Response { + client.check(req).await.unwrap() + } + // Start receiving requests tokio::spawn(async move { - let clients = vec![client; max_concurrency]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = check(&client, req).await; + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); while let Some(response) = recv_resp.recv().await { diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 5b84ec5c8c..a08a922984 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -18,7 +18,6 @@ version = "0.7.2" [dependencies] check-if-email-exists = "0.8.24" -deadpool = "0.7.0" fast_chemail = "0.9.6" glob = "0.3.0" html5ever = "0.25.1" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 4c7d8ec856..090aef23be 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -26,6 +26,9 @@ use crate::{ const DEFAULT_MAX_REDIRECTS: usize = 5; const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION")); +/// Handles incoming requests and returns responses. Usually you would not +/// initialize a `Client` yourself, but use the `ClientBuilder` because it +/// provides sane defaults for all configuration options. #[derive(Debug, Clone)] pub struct Client { /// Underlying reqwest client instance that handles the HTTP requests. @@ -169,6 +172,7 @@ impl ClientBuilder { } impl Client { + /// Check a single request pub async fn check(&self, request: T) -> Result where Request: TryFrom, @@ -204,6 +208,7 @@ impl Client { self.filter.is_excluded(uri) } + /// Check a website URI pub async fn check_website(&self, uri: &Uri) -> Status { let mut retries: i64 = 3; let mut wait: u64 = 1; @@ -256,6 +261,7 @@ impl Client { } } + /// Check a file URI pub async fn check_file(&self, uri: &Uri) -> Status { if let Ok(path) = uri.url.to_file_path() { if path.exists() { @@ -265,6 +271,7 @@ impl Client { ErrorKind::InvalidFilePath(uri.clone()).into() } + /// Check a mail address pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; diff --git a/lychee-lib/src/client_pool.rs b/lychee-lib/src/client_pool.rs deleted file mode 100644 index 92b4d5615c..0000000000 --- a/lychee-lib/src/client_pool.rs +++ /dev/null @@ -1,45 +0,0 @@ -use client::Client; -use deadpool::unmanaged::Pool; -use tokio::sync::mpsc; - -use crate::{client, types}; - -#[allow(missing_debug_implementations)] -/// Manages a channel for incoming requests -/// and a pool of lychee clients to handle them -pub struct ClientPool { - tx: mpsc::Sender, - rx: mpsc::Receiver, - pool: deadpool::unmanaged::Pool, -} - -impl ClientPool { - #[must_use] - /// Creates a new client pool - pub fn new( - tx: mpsc::Sender, - rx: mpsc::Receiver, - clients: Vec, - ) -> Self { - let pool = Pool::from(clients); - ClientPool { tx, rx, pool } - } - - #[allow(clippy::missing_panics_doc)] - /// Start listening for incoming requests and send each of them - /// asynchronously to a client from the pool - pub async fn listen(&mut self) { - while let Some(req) = self.rx.recv().await { - let client = self.pool.get().await; - let tx = self.tx.clone(); - tokio::spawn(async move { - // Client::check() may fail only because Request::try_from() may fail - // here request is already Request, so it never fails - let resp = client.check(req).await.unwrap(); - tx.send(resp) - .await - .expect("Cannot send response to channel"); - }); - } - } -} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 58c0a6abc6..e4d1de057a 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -47,7 +47,6 @@ doc_comment::doctest!("../../README.md"); mod client; -mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; mod helpers; @@ -73,8 +72,7 @@ use ring as _; // required for apple silicon #[doc(inline)] pub use crate::{ - client::{check, ClientBuilder}, - client_pool::ClientPool, + client::{check, Client, ClientBuilder}, collector::Collector, filter::{Excludes, Filter, Includes}, types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri}, From d92a2630bddeae539e02eb0b43e8ff71e021e34a Mon Sep 17 00:00:00 2001 From: MichaIng Date: Thu, 30 Sep 2021 13:51:13 +0200 Subject: [PATCH 04/10] Extend repitition test to span across two files Signed-off-by: MichaIng --- examples/builder/builder.rs | 2 +- fixtures/TEST_REPETITION.txt | 4 ---- fixtures/TEST_REPETITION_1.txt | 5 +++++ fixtures/TEST_REPETITION_2.txt | 5 +++++ lychee-bin/src/stats.rs | 2 +- lychee-bin/tests/cli.rs | 7 +++++-- lychee-lib/src/collector.rs | 9 +++------ lychee-lib/src/extract.rs | 2 +- lychee-lib/src/filter/mod.rs | 4 ++-- lychee-lib/src/types/uri.rs | 8 ++++---- 10 files changed, 27 insertions(+), 21 deletions(-) delete mode 100644 fixtures/TEST_REPETITION.txt create mode 100644 fixtures/TEST_REPETITION_1.txt create mode 100644 fixtures/TEST_REPETITION_2.txt diff --git a/examples/builder/builder.rs b/examples/builder/builder.rs index 08f189e899..1b3f53bd93 100644 --- a/examples/builder/builder.rs +++ b/examples/builder/builder.rs @@ -40,7 +40,7 @@ async fn main() -> Result<()> { .build() .client()?; - let response = client.check("http://example.org").await?; + let response = client.check("https://example.org").await?; dbg!(&response); assert!(response.status().is_success()); Ok(()) diff --git a/fixtures/TEST_REPETITION.txt b/fixtures/TEST_REPETITION.txt deleted file mode 100644 index 0395c8e26e..0000000000 --- a/fixtures/TEST_REPETITION.txt +++ /dev/null @@ -1,4 +0,0 @@ -These links are all the same and should only be checked once: -https://example.org/ -https://example.org/ -https://example.org diff --git a/fixtures/TEST_REPETITION_1.txt b/fixtures/TEST_REPETITION_1.txt new file mode 100644 index 0000000000..14570d507c --- /dev/null +++ b/fixtures/TEST_REPETITION_1.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_2.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/fixtures/TEST_REPETITION_2.txt b/fixtures/TEST_REPETITION_2.txt new file mode 100644 index 0000000000..0b6091f1a8 --- /dev/null +++ b/fixtures/TEST_REPETITION_2.txt @@ -0,0 +1,5 @@ +These links are all the same and match those of TEST_REPETITION_1.txt. +All links in both files should be counted as one and checked once only. +https://example.org/ +https://example.org/ +https://example.org diff --git a/lychee-bin/src/stats.rs b/lychee-bin/src/stats.rs index 9ba51f91cc..579c30a21e 100644 --- a/lychee-bin/src/stats.rs +++ b/lychee-bin/src/stats.rs @@ -174,7 +174,7 @@ mod test { stats.add(Response( Input::Stdin, ResponseBody { - uri: website("http://example.org/ok"), + uri: website("https://example.org/ok"), status: Status::Ok(StatusCode::OK), }, )); diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index d095c50c37..6b28655b3a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -218,9 +218,12 @@ mod cli { #[test] fn test_repetition() { let mut cmd = main_command(); - let test_schemes_path = fixtures_path().join("TEST_REPETITION.txt"); + // Repetitions across multiple files shall all be checked and counted only once. + let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); + let test_schemes_path_2 = fixtures_path().join("TEST_REPETITION_2.txt"); - cmd.arg(&test_schemes_path) + cmd.arg(&test_schemes_path_1) + .arg(&test_schemes_path_2) .env_clear() .assert() .success() diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 871e869557..bc15de7cb9 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -74,13 +74,10 @@ impl Collector { // Filter out already cached links (duplicates) links.retain(|l| !self.cache.contains(&l.uri)); - self.update_cache(&links); - Ok(links) - } - - /// Update internal link cache - fn update_cache(&mut self, links: &HashSet) { + // Add remaining new links to cache self.cache.extend(links.iter().cloned().map(|l| l.uri)); + + Ok(links) } } diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index f8ade534c9..13f04607f6 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -239,7 +239,7 @@ mod test { #[test] fn test_extract_link_at_end_of_line() { - let input = "http://www.apache.org/licenses/LICENSE-2.0\n"; + let input = "https://www.apache.org/licenses/LICENSE-2.0\n"; let link = input.trim_end(); let mut extractor = Extractor::new(None); diff --git a/lychee-lib/src/filter/mod.rs b/lychee-lib/src/filter/mod.rs index 07e0913a2d..0a7f8c39da 100644 --- a/lychee-lib/src/filter/mod.rs +++ b/lychee-lib/src/filter/mod.rs @@ -9,7 +9,7 @@ pub use includes::Includes; use crate::Uri; /// Pre-defined exclusions for known false-positives -static FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; +const FALSE_POSITIVE_PAT: &[&str] = &[r"http://www.w3.org/1999/xhtml"]; #[inline] #[must_use] @@ -298,7 +298,7 @@ mod test { ..Filter::default() }; - assert!(filter.is_excluded(&website("http://github.com"))); + assert!(filter.is_excluded(&website("https://github.com"))); assert!(filter.is_excluded(&website("http://exclude.org"))); assert!(filter.is_excluded(&mail("mail@example.org"))); diff --git a/lychee-lib/src/types/uri.rs b/lychee-lib/src/types/uri.rs index 6ad126c3d4..02b017608c 100644 --- a/lychee-lib/src/types/uri.rs +++ b/lychee-lib/src/types/uri.rs @@ -160,12 +160,12 @@ mod test { fn test_uri_from_str() { assert!(Uri::try_from("").is_err()); assert_eq!( - Uri::try_from("http://example.org"), - Ok(website("http://example.org")) + Uri::try_from("https://example.org"), + Ok(website("https://example.org")) ); assert_eq!( - Uri::try_from("http://example.org/@test/testing"), - Ok(website("http://example.org/@test/testing")) + Uri::try_from("https://example.org/@test/testing"), + Ok(website("https://example.org/@test/testing")) ); assert_eq!( Uri::try_from("mail@example.org"), From 1813a48e1b3a92f5da5b2ee69bd2353f088295df Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 12:58:44 +0200 Subject: [PATCH 05/10] Remove cache from collector --- lychee-bin/tests/cli.rs | 3 ++- lychee-lib/src/collector.rs | 14 +++----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 6b28655b3a..a3f2b42246 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -216,7 +216,8 @@ mod cli { } #[test] - fn test_repetition() { + #[ignore] + fn test_caching() { let mut cmd = main_command(); // Repetitions across multiple files shall all be checked and counted only once. let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index bc15de7cb9..98543302c7 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,4 +1,4 @@ -use crate::{extract::Extractor, Base, Input, Request, Result, Uri}; +use crate::{extract::Extractor, Base, Input, Request, Result}; use std::collections::HashSet; /// Collector keeps the state of link collection @@ -7,18 +7,16 @@ pub struct Collector { base: Option, skip_missing_inputs: bool, max_concurrency: usize, - cache: HashSet, } impl Collector { /// Create a new collector with an empty cache #[must_use] - pub fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub const fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { Collector { base, skip_missing_inputs, max_concurrency, - cache: HashSet::new(), } } @@ -29,7 +27,7 @@ impl Collector { /// # Errors /// /// Will return `Err` if links cannot be extracted from an input - pub async fn collect_links(mut self, inputs: &[Input]) -> Result> { + pub async fn collect_links(self, inputs: &[Input]) -> Result> { let (contents_tx, mut contents_rx) = tokio::sync::mpsc::channel(self.max_concurrency); // extract input contents @@ -71,12 +69,6 @@ impl Collector { links.extend(new_links?); } - // Filter out already cached links (duplicates) - links.retain(|l| !self.cache.contains(&l.uri)); - - // Add remaining new links to cache - self.cache.extend(links.iter().cloned().map(|l| l.uri)); - Ok(links) } } From e0675050db5d1aef4dad651a5431c0a1c266aa66 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 12:59:23 +0200 Subject: [PATCH 06/10] Remove client pool Reqwest comes with its own request pool, so there's no need in adding another layer of indirection. This also gets rid of a lot of allocs. --- Cargo.lock | 1 - examples/client_pool/client_pool.rs | 18 +++++++----- lychee-bin/src/main.rs | 20 +++++++++---- lychee-lib/Cargo.toml | 1 - lychee-lib/src/client.rs | 7 +++++ lychee-lib/src/client_pool.rs | 45 ----------------------------- lychee-lib/src/lib.rs | 4 +-- 7 files changed, 34 insertions(+), 62 deletions(-) delete mode 100644 lychee-lib/src/client_pool.rs diff --git a/Cargo.lock b/Cargo.lock index b9a1f5199e..dfa6268180 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1579,7 +1579,6 @@ version = "0.7.2" dependencies = [ "cached", "check-if-email-exists", - "deadpool", "doc-comment", "fast_chemail", "glob", diff --git a/examples/client_pool/client_pool.rs b/examples/client_pool/client_pool.rs index ef4bc8d46b..064fe83691 100644 --- a/examples/client_pool/client_pool.rs +++ b/examples/client_pool/client_pool.rs @@ -1,4 +1,4 @@ -use lychee_lib::{ClientBuilder, ClientPool, Input, Request, Result, Uri}; +use lychee_lib::{ClientBuilder, Input, Request, Result, Uri}; use std::convert::TryFrom; use tokio::sync::mpsc; @@ -9,7 +9,7 @@ const CONCURRENT_REQUESTS: usize = 4; async fn main() -> Result<()> { // These channels are used to send requests and receive responses to and // from the lychee client pool - let (send_req, recv_req) = mpsc::channel(CONCURRENT_REQUESTS); + let (send_req, mut recv_req) = mpsc::channel(CONCURRENT_REQUESTS); let (send_resp, mut recv_resp) = mpsc::channel(CONCURRENT_REQUESTS); // Add as many requests as you like @@ -29,13 +29,17 @@ async fn main() -> Result<()> { // Create a default lychee client let client = ClientBuilder::default().client()?; - // Create a pool with four lychee clients - let clients = vec![client; CONCURRENT_REQUESTS]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - // Handle requests in a client pool tokio::spawn(async move { - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = client.check(req).await.unwrap(); + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); // Finally, listen to incoming responses from lychee diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index b1f2c7307c..97142d7ff7 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr}; use anyhow::{anyhow, Context, Result}; use headers::HeaderMapExt; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{ClientBuilder, ClientPool, Collector, Input, Request, Response}; +use lychee_lib::{Client, ClientBuilder, Collector, Input, Request, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon @@ -228,7 +228,7 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { Some(bar) }; - let (send_req, recv_req) = mpsc::channel(max_concurrency); + let (send_req, mut recv_req) = mpsc::channel(max_concurrency); let (send_resp, mut recv_resp) = mpsc::channel(max_concurrency); let mut stats = ResponseStats::new(); @@ -243,11 +243,21 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } }); + async fn check(client: &Client, req: Request) -> Response { + client.check(req).await.unwrap() + } + // Start receiving requests tokio::spawn(async move { - let clients = vec![client; max_concurrency]; - let mut clients = ClientPool::new(send_resp, recv_req, clients); - clients.listen().await; + while let Some(req) = recv_req.recv().await { + // Client::check() may fail only because Request::try_from() may fail + // here request is already Request, so it never fails + let resp = check(&client, req).await; + send_resp + .send(resp) + .await + .expect("Cannot send response to channel"); + } }); while let Some(response) = recv_resp.recv().await { diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 1ae84abeb0..67b5043534 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -18,7 +18,6 @@ version = "0.7.2" [dependencies] check-if-email-exists = "0.8.25" -deadpool = "0.7.0" fast_chemail = "0.9.6" glob = "0.3.0" html5ever = "0.25.1" diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 4c7d8ec856..090aef23be 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -26,6 +26,9 @@ use crate::{ const DEFAULT_MAX_REDIRECTS: usize = 5; const DEFAULT_USER_AGENT: &str = concat!("lychee/", env!("CARGO_PKG_VERSION")); +/// Handles incoming requests and returns responses. Usually you would not +/// initialize a `Client` yourself, but use the `ClientBuilder` because it +/// provides sane defaults for all configuration options. #[derive(Debug, Clone)] pub struct Client { /// Underlying reqwest client instance that handles the HTTP requests. @@ -169,6 +172,7 @@ impl ClientBuilder { } impl Client { + /// Check a single request pub async fn check(&self, request: T) -> Result where Request: TryFrom, @@ -204,6 +208,7 @@ impl Client { self.filter.is_excluded(uri) } + /// Check a website URI pub async fn check_website(&self, uri: &Uri) -> Status { let mut retries: i64 = 3; let mut wait: u64 = 1; @@ -256,6 +261,7 @@ impl Client { } } + /// Check a file URI pub async fn check_file(&self, uri: &Uri) -> Status { if let Ok(path) = uri.url.to_file_path() { if path.exists() { @@ -265,6 +271,7 @@ impl Client { ErrorKind::InvalidFilePath(uri.clone()).into() } + /// Check a mail address pub async fn check_mail(&self, uri: &Uri) -> Status { let input = CheckEmailInput::new(vec![uri.as_str().to_owned()]); let result = &(check_email(&input).await)[0]; diff --git a/lychee-lib/src/client_pool.rs b/lychee-lib/src/client_pool.rs deleted file mode 100644 index 92b4d5615c..0000000000 --- a/lychee-lib/src/client_pool.rs +++ /dev/null @@ -1,45 +0,0 @@ -use client::Client; -use deadpool::unmanaged::Pool; -use tokio::sync::mpsc; - -use crate::{client, types}; - -#[allow(missing_debug_implementations)] -/// Manages a channel for incoming requests -/// and a pool of lychee clients to handle them -pub struct ClientPool { - tx: mpsc::Sender, - rx: mpsc::Receiver, - pool: deadpool::unmanaged::Pool, -} - -impl ClientPool { - #[must_use] - /// Creates a new client pool - pub fn new( - tx: mpsc::Sender, - rx: mpsc::Receiver, - clients: Vec, - ) -> Self { - let pool = Pool::from(clients); - ClientPool { tx, rx, pool } - } - - #[allow(clippy::missing_panics_doc)] - /// Start listening for incoming requests and send each of them - /// asynchronously to a client from the pool - pub async fn listen(&mut self) { - while let Some(req) = self.rx.recv().await { - let client = self.pool.get().await; - let tx = self.tx.clone(); - tokio::spawn(async move { - // Client::check() may fail only because Request::try_from() may fail - // here request is already Request, so it never fails - let resp = client.check(req).await.unwrap(); - tx.send(resp) - .await - .expect("Cannot send response to channel"); - }); - } - } -} diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 58c0a6abc6..e4d1de057a 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -47,7 +47,6 @@ doc_comment::doctest!("../../README.md"); mod client; -mod client_pool; /// A pool of clients, to handle concurrent checks pub mod collector; mod helpers; @@ -73,8 +72,7 @@ use ring as _; // required for apple silicon #[doc(inline)] pub use crate::{ - client::{check, ClientBuilder}, - client_pool::ClientPool, + client::{check, Client, ClientBuilder}, collector::Collector, filter::{Excludes, Filter, Includes}, types::{Base, ErrorKind, Input, Request, Response, ResponseBody, Result, Status, Uri}, From 9d920d11d64044ab314ebd587a45d3ec6244120b Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 13:15:04 +0200 Subject: [PATCH 07/10] Add comment about ignored caching test --- lychee-bin/tests/cli.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index a3f2b42246..b4d27c4f5a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -217,6 +217,12 @@ mod cli { #[test] #[ignore] + // Test that two identical requests don't get executed twice. + // Note: This currently fails, because we currently don't cache responses. We + // used to, but there were issues. + // See https://github.com/lycheeverse/lychee/pull/349. + // We're planning to add back caching support at a later point in time, + // which is why we keep the test around. fn test_caching() { let mut cmd = main_command(); // Repetitions across multiple files shall all be checked and counted only once. From 1bc7db3e43b07804a7602b6e491975784c764227 Mon Sep 17 00:00:00 2001 From: MichaIng Date: Thu, 7 Oct 2021 13:20:25 +0200 Subject: [PATCH 08/10] Satisfy Rust fmt lint Signed-off-by: MichaIng --- lychee-bin/tests/cli.rs | 2 +- lychee-lib/src/collector.rs | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index b4d27c4f5a..d02d7cc4f3 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -219,7 +219,7 @@ mod cli { #[ignore] // Test that two identical requests don't get executed twice. // Note: This currently fails, because we currently don't cache responses. We - // used to, but there were issues. + // used to, but there were issues. // See https://github.com/lycheeverse/lychee/pull/349. // We're planning to add back caching support at a later point in time, // which is why we keep the test around. diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 98543302c7..2089e304f7 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -12,7 +12,11 @@ pub struct Collector { impl Collector { /// Create a new collector with an empty cache #[must_use] - pub const fn new(base: Option, skip_missing_inputs: bool, max_concurrency: usize) -> Self { + pub const fn new( + base: Option, + skip_missing_inputs: bool, + max_concurrency: usize, + ) -> Self { Collector { base, skip_missing_inputs, From 9ee30cc01198a39a75f24996d4ee4e80332c3fd5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 14:09:24 +0200 Subject: [PATCH 09/10] Improve error handling and documentation --- lychee-bin/src/main.rs | 12 ++++-------- lychee-lib/src/client.rs | 20 ++++++++++++++++++-- lychee-lib/src/extract.rs | 2 +- lychee-lib/src/types/error.rs | 14 ++++++++++---- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 97142d7ff7..8c49c5810c 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -69,7 +69,7 @@ use std::{collections::HashSet, fs, str::FromStr}; use anyhow::{anyhow, Context, Result}; use headers::HeaderMapExt; use indicatif::{ProgressBar, ProgressStyle}; -use lychee_lib::{Client, ClientBuilder, Collector, Input, Request, Response}; +use lychee_lib::{ClientBuilder, Collector, Input, Request, Response}; use openssl_sys as _; // required for vendored-openssl feature use regex::RegexSet; use ring as _; // required for apple silicon @@ -243,16 +243,12 @@ async fn run(cfg: &Config, inputs: Vec) -> Result { } }); - async fn check(client: &Client, req: Request) -> Response { - client.check(req).await.unwrap() - } - // Start receiving requests tokio::spawn(async move { while let Some(req) = recv_req.recv().await { - // Client::check() may fail only because Request::try_from() may fail - // here request is already Request, so it never fails - let resp = check(&client, req).await; + // `Client::check()` may fail only because `Request::try_from()` may + // fail. Here `req` is already a valid `Request`, so it never fails. + let resp = client.check(req).await.unwrap(); send_resp .send(resp) .await diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 090aef23be..8b13d8d0ce 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -126,7 +126,13 @@ impl ClientBuilder { } /// The build method instantiates the client. - #[allow(clippy::missing_errors_doc)] + /// + /// # Errors + /// + /// Returns an `Err` if: + /// - The user agent cannot be parsed + /// - The request client cannot be created + /// - The Github client cannot be created pub fn client(&self) -> Result { let mut headers = self.custom_headers.clone(); headers.insert(header::USER_AGENT, HeaderValue::from_str(&self.user_agent)?); @@ -173,6 +179,12 @@ impl ClientBuilder { impl Client { /// Check a single request + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The request cannot be parsed + /// - An HTTP website with an invalid URI format gets checked pub async fn check(&self, request: T) -> Result where Request: TryFrom, @@ -189,7 +201,10 @@ impl Client { match self.check_website(&uri).await { Status::Ok(code) if self.require_https && uri.scheme() == "http" => { let mut https_uri = uri.clone(); - https_uri.url.set_scheme("https").unwrap(); + https_uri + .url + .set_scheme("https") + .map_err(|_| ErrorKind::InvalidURI(uri.clone()))?; if self.check_website(&https_uri).await.is_success() { Status::Error(Box::new(ErrorKind::InsecureURL(https_uri))) } else { @@ -204,6 +219,7 @@ impl Client { } /// Check if the given URI is filtered by the client + #[must_use] pub fn filtered(&self, uri: &Uri) -> bool { self.filter.is_excluded(uri) } diff --git a/lychee-lib/src/extract.rs b/lychee-lib/src/extract.rs index 13f04607f6..1e4e92cfc5 100644 --- a/lychee-lib/src/extract.rs +++ b/lychee-lib/src/extract.rs @@ -154,7 +154,7 @@ impl Extractor { match resolved { Some(path) => Url::from_file_path(&path) .map(Some) - .map_err(|_e| ErrorKind::InvalidUrl(path)), + .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)), None => Ok(None), } } diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 4a761416d3..08391cbad9 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -25,7 +25,7 @@ pub enum ErrorKind { /// The given URI cannot be converted to a file path InvalidFilePath(Uri), /// The given path cannot be converted to a URI - InvalidUrl(PathBuf), + InvalidUrlFromPath(PathBuf), /// The given mail address is unreachable UnreachableEmailAddress(Uri), /// The given header could not be parsed. @@ -40,8 +40,10 @@ pub enum ErrorKind { InvalidGlobPattern(glob::PatternError), /// The Github API could not be called because of a missing Github token. MissingGitHubToken, - /// The website is available in HTTPS protocol, but HTTP scheme is used. + /// The website is available through HTTPS, but HTTP scheme is used. InsecureURL(Uri), + /// Invalid URI + InvalidURI(Uri), } impl PartialEq for ErrorKind { @@ -76,7 +78,8 @@ impl Hash for ErrorKind { Self::HubcapsError(e) => e.to_string().hash(state), Self::FileNotFound(e) => e.to_string_lossy().hash(state), Self::UrlParseError(s, e) => (s, e.type_id()).hash(state), - Self::InvalidUrl(p) => p.hash(state), + Self::InvalidURI(u) => u.hash(state), + Self::InvalidUrlFromPath(p) => p.hash(state), Self::Utf8Error(e) => e.to_string().hash(state), Self::InvalidFilePath(u) | Self::UnreachableEmailAddress(u) | Self::InsecureURL(u) => { u.hash(state); @@ -113,7 +116,10 @@ impl Display for ErrorKind { write!(f, "Cannot parse {} as website url ({})", s, url_err) } Self::InvalidFilePath(u) => write!(f, "Invalid file URI: {}", u), - Self::InvalidUrl(p) => write!(f, "Invalid path: {}", p.display()), + Self::InvalidURI(u) => write!(f, "Invalid URI: {}", u), + Self::InvalidUrlFromPath(p) => { + write!(f, "Invalid path to URL conversion: {}", p.display()) + } Self::UnreachableEmailAddress(uri) => write!(f, "Unreachable mail address: {}", uri), Self::InvalidHeader(e) => e.fmt(f), Self::InvalidGlobPattern(e) => e.fmt(f), From 00a0558f31a7b5b4e1fffad85c1b26fa80ae20f6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 7 Oct 2021 14:19:01 +0200 Subject: [PATCH 10/10] Add back test for request caching in single file --- lychee-bin/tests/cli.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index d02d7cc4f3..52cd66ee96 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -215,6 +215,20 @@ mod cli { .stdout(contains("Excluded.........1")); } + #[test] + fn test_caching_single_file() { + let mut cmd = main_command(); + // Repetitions in one file shall all be checked and counted only once. + let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt"); + + cmd.arg(&test_schemes_path_1) + .env_clear() + .assert() + .success() + .stdout(contains("Total............1")) + .stdout(contains("Successful.......1")); + } + #[test] #[ignore] // Test that two identical requests don't get executed twice. @@ -223,7 +237,7 @@ mod cli { // See https://github.com/lycheeverse/lychee/pull/349. // We're planning to add back caching support at a later point in time, // which is why we keep the test around. - fn test_caching() { + fn test_caching_across_files() { let mut cmd = main_command(); // Repetitions across multiple files shall all be checked and counted only once. let test_schemes_path_1 = fixtures_path().join("TEST_REPETITION_1.txt");