Add web scrapper request/response types and use them in the "track web page resources" action.

* Cargo.toml / Cargo.lock: add `reqwest` 0.11.18 and enable `reqwest/json`, `reqwest/rustls` and `url/serde`.
* resources/: new serde types `WebScrapperResourcesRequest` (serialized, camelCase, `None` fields skipped)
  and `WebScrapperResourcesResponse` / `WebScrapperResourceBundle` / `WebScrapperResource` (deserialized).
* Action handler: POST the request to `<web_scrapper_url>api/resources`, fail on non-success HTTP status,
  and return the URLs of external scripts and styles.
* tools/api: rename the `dataType` query parameter to `namespace` and add sample requests.

NOTE(review): the locked `reqwest` dependency list contains `native-tls`, `hyper-tls` and the bare `rustls`
crate, but neither `hyper-rustls` nor `tokio-rustls`, so `reqwest/rustls` appears to only pull in the crate
instead of switching the TLS backend. `reqwest/rustls-tls` is probably what was intended; confirm and
regenerate Cargo.lock.
NOTE(review): the integer width `usize` used for `timeout`, `delay` and `size` is an assumption; confirm.
NOTE(review): blob ids on the `index` lines were not recomputed after the edits to the Rust sources.

diff --git a/Cargo.lock b/Cargo.lock
index 06ce19d..0d24b8f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1428,6 +1428,17 @@ dependencies = [
  "itoa",
 ]
 
+[[package]]
+name = "http-body"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1"
+dependencies = [
+ "bytes",
+ "http",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "httparse"
 version = "1.8.0"
@@ -1446,6 +1457,43 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 
+[[package]]
+name = "hyper"
+version = "0.14.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab302d72a6f11a3b910431ff93aae7e773078c769f0a3ef15fb9ec692ed147d4"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "futures-util",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower-service",
+ "tracing",
+ "want",
+]
+
+[[package]]
+name = "hyper-tls"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
+dependencies = [
+ "bytes",
+ "hyper",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+]
+
 [[package]]
 name = "idna"
 version = "0.3.0"
@@ -1525,6 +1573,12 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "ipnet"
+version = "2.7.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12b6ee2129af8d4fb011108c73d99a1b83a85977f23b82460c0ae2e25bb4b57f"
+
 [[package]]
 name = "is-terminal"
 version = "0.4.7"
@@ -2358,6 +2412,44 @@ version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
 
+[[package]]
+name = "reqwest"
+version = "0.11.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55"
+dependencies = [
+ "base64 0.21.2",
+ "bytes",
+ "encoding_rs",
+ "futures-core",
+ "futures-util",
+ "h2",
+ "http",
+ "http-body",
+ "hyper",
+ "hyper-tls",
+ "ipnet",
+ "js-sys",
+ "log",
+ "mime",
+ "native-tls",
+ "once_cell",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustls 0.21.1",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "tokio",
+ "tokio-native-tls",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+ "winreg",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -2459,6 +2551,7 @@ version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e"
 dependencies = [
+ "log",
  "ring",
  "rustls-webpki",
  "sct",
@@ -2576,6 +2669,7 @@ dependencies = [
  "mailchecker",
  "openssl",
  "rand_core",
+ "reqwest",
  "serde",
  "serde_bytes",
  "serde_json",
@@ -3310,6 +3404,16 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-rustls"
 version = "0.23.4"
@@ -3346,6 +3450,12 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "tower-service"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
+
 [[package]]
 name = "tracing"
 version = "0.1.37"
@@ -3409,6 +3519,12 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "try-lock"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed"
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -3534,6 +3650,16 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
+[[package]]
+name = "want"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0"
+dependencies = [
+ "log",
+ "try-lock",
+]
+
 [[package]]
 name = "wasi"
 version = "0.11.0+wasi-snapshot-preview1"
@@ -3565,6 +3691,18 @@ dependencies = [
  "wasm-bindgen-shared",
 ]
 
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "wasm-bindgen",
+ "web-sys",
+]
+
 [[package]]
 name = "wasm-bindgen-macro"
 version = "0.2.86"
@@ -3875,6 +4013,15 @@ version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a"
 
+[[package]]
+name = "winreg"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "x509-parser"
 version = "0.13.2"
diff --git a/Cargo.toml b/Cargo.toml
index 8488c86..98416aa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,6 +31,7 @@ log = "0.4.18"
 mailchecker = "5.0.9"
 openssl = "0.10.54"
 rand_core = "0.6.4"
+reqwest = "0.11.18"
 serde = "1.0.164"
 serde_bytes = "0.11.9"
 serde_json = "1.0.96"
@@ -59,6 +60,8 @@ default = [
     "insta/json",
     "insta/redactions",
     "rand_core/std",
+    "reqwest/json",
+    "reqwest/rustls",
     "serde/derive",
     "serde_json/arbitrary_precision",
     "sqlx/json",
@@ -67,6 +70,7 @@ default = [
     "sqlx/sqlite",
     "time/formatting",
     "time/macros",
+    "url/serde",
     "webauthn-rs/danger-allow-state-serialisation"
 ]
 
diff --git a/src/utils/web_scrapping/resources.rs b/src/utils/web_scrapping/resources.rs
index de97b25..aabca9f 100644
--- a/src/utils/web_scrapping/resources.rs
+++ b/src/utils/web_scrapping/resources.rs
@@ -1,6 +1,13 @@
 mod web_page_resource;
 mod web_page_resources_tracker;
+mod web_scrapper_resources_request;
+mod web_scrapper_resources_response;
 
 pub use self::{
-    web_page_resource::WebPageResource, web_page_resources_tracker::WebPageResourcesTracker,
+    web_page_resource::WebPageResource,
+    web_page_resources_tracker::WebPageResourcesTracker,
+    web_scrapper_resources_request::WebScrapperResourcesRequest,
+    web_scrapper_resources_response::{
+        WebScrapperResource, WebScrapperResourceBundle, WebScrapperResourcesResponse,
+    },
 };
diff --git a/src/utils/web_scrapping/resources/web_scrapper_resources_request.rs b/src/utils/web_scrapping/resources/web_scrapper_resources_request.rs
new file mode 100644
index 0000000..aff327b
--- /dev/null
+++ b/src/utils/web_scrapping/resources/web_scrapper_resources_request.rs
@@ -0,0 +1,85 @@
+use serde::Serialize;
+use url::Url;
+
+/// Represents request to scrap web page resources.
+#[derive(Serialize, Debug, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub struct WebScrapperResourcesRequest<'a> {
+    /// URL of the web page to scrap resources for.
+    pub url: &'a Url,
+
+    /// Number of milliseconds to wait until page enters "idle" state.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeout: Option<usize>,
+
+    /// Number of milliseconds to wait after page enters "idle" state.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub delay: Option<usize>,
+
+    /// Optional CSS selector to wait for before extracting resources.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub wait_selector: Option<&'a str>,
+}
+
+impl<'a> WebScrapperResourcesRequest<'a> {
+    /// Creates request with only the URL of the web page to scrap resources for, the rest of the
+    /// parameters are omitted.
+    pub fn with_default_parameters(url: &'a Url) -> Self {
+        Self {
+            url,
+            timeout: None,
+            delay: None,
+            wait_selector: None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::WebScrapperResourcesRequest;
+    use insta::assert_json_snapshot;
+    use url::Url;
+
+    #[test]
+    fn serialization() -> anyhow::Result<()> {
+        assert_json_snapshot!(WebScrapperResourcesRequest {
+            url: &Url::parse("http://localhost:1234/my/app?q=2")?,
+            timeout: Some(100),
+            delay: Some(200),
+            wait_selector: Some("body")
+        }, @r###"
+        {
+          "url": "http://localhost:1234/my/app?q=2",
+          "timeout": 100,
+          "delay": 200,
+          "waitSelector": "body"
+        }
+        "###);
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialization_with_default_parameters() -> anyhow::Result<()> {
+        assert_json_snapshot!(WebScrapperResourcesRequest::with_default_parameters(&Url::parse("http://localhost:1234/my/app?q=2")?), @r###"
+        {
+          "url": "http://localhost:1234/my/app?q=2"
+        }
+        "###);
+
+        Ok(())
+    }
+
+    #[test]
+    fn with_default_parameters() -> anyhow::Result<()> {
+        let url = Url::parse("http://localhost:1234/my/app?q=2")?;
+        let request = WebScrapperResourcesRequest::with_default_parameters(&url);
+
+        assert_eq!(request.url, &url);
+        assert!(request.wait_selector.is_none());
+        assert!(request.delay.is_none());
+        assert!(request.timeout.is_none());
+
+        Ok(())
+    }
+}
diff --git a/src/utils/web_scrapping/resources/web_scrapper_resources_response.rs b/src/utils/web_scrapping/resources/web_scrapper_resources_response.rs
new file mode 100644
index 0000000..7fd58cc
--- /dev/null
+++ b/src/utils/web_scrapping/resources/web_scrapper_resources_response.rs
@@ -0,0 +1,147 @@
+use serde::Deserialize;
+use time::OffsetDateTime;
+use url::Url;
+
+/// Represents response with scrapped resources.
+#[derive(Deserialize, Debug, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub struct WebScrapperResourcesResponse {
+    /// Timestamp indicating when resources were fetched.
+    #[serde(with = "time::serde::timestamp")]
+    pub timestamp: OffsetDateTime,
+    /// List of JavaScript resources.
+    pub scripts: WebScrapperResourceBundle,
+    /// List of CSS resources.
+    pub styles: WebScrapperResourceBundle,
+}
+
+/// Represents both external and inline resources of a particular type.
+#[derive(Deserialize, Debug, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub struct WebScrapperResourceBundle {
+    /// List of external resources.
+    pub external: Vec<WebScrapperResource>,
+    /// List of inline resources.
+    pub inline: Vec<WebScrapperResource>,
+}
+
+/// Describes either external or inline resource.
+#[derive(Deserialize, Debug, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub struct WebScrapperResource {
+    /// URL the resource is loaded from.
+    pub url: Option<Url>,
+    /// SHA256 digest of the external resource content, if available.
+    pub digest: Option<String>,
+    /// Size of the inline resource content, if available, in bytes.
+    pub size: Option<usize>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{WebScrapperResource, WebScrapperResourceBundle, WebScrapperResourcesResponse};
+    use time::OffsetDateTime;
+    use url::Url;
+
+    #[test]
+    fn deserialization() -> anyhow::Result<()> {
+        assert_eq!(
+            serde_json::from_str::<WebScrapperResourcesResponse>(
+                r###"
+{
+  "timestamp": 946720800,
+  "scripts": {
+    "external": [{ "url": "https://secutils.dev/script.js", "digest": "some-digest", "size": 123 }],
+    "inline": [{ "digest": "another-digest", "size": 321 }]
+  },
+  "styles": {
+    "external": [{ "url": "https://secutils.dev/style.css", "digest": "some-css-digest", "size": 456 }],
+    "inline": [{ "digest": "another-css-digest", "size": 654 }]
+  }
+}
+          "###
+            )?,
+            WebScrapperResourcesResponse {
+                // January 1, 2000 10:00:00 UTC
+                timestamp: OffsetDateTime::from_unix_timestamp(946720800)?,
+                scripts: WebScrapperResourceBundle {
+                    external: vec![WebScrapperResource {
+                        url: Some(Url::parse("https://secutils.dev/script.js")?),
+                        digest: Some("some-digest".to_string()),
+                        size: Some(123),
+                    }],
+                    inline: vec![WebScrapperResource {
+                        url: None,
+                        digest: Some("another-digest".to_string()),
+                        size: Some(321),
+                    }]
+                },
+                styles: WebScrapperResourceBundle {
+                    external: vec![WebScrapperResource {
+                        url: Some(Url::parse("https://secutils.dev/style.css")?),
+                        digest: Some("some-css-digest".to_string()),
+                        size: Some(456),
+                    }],
+                    inline: vec![WebScrapperResource {
+                        url: None,
+                        digest: Some("another-css-digest".to_string()),
+                        size: Some(654),
+                    }]
+                },
+            }
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn deserialization_without_optional_values() -> anyhow::Result<()> {
+        assert_eq!(
+            serde_json::from_str::<WebScrapperResourcesResponse>(
+                r###"
+{
+  "timestamp": 946720800,
+  "scripts": {
+    "external": [{ "url": "https://secutils.dev/script.js" }],
+    "inline": [{ "digest": "another-digest" }]
+  },
+  "styles": {
+    "external": [{ "url": "https://secutils.dev/style.css" }],
+    "inline": [{ "digest": "another-css-digest" }]
+  }
+}
+          "###
+            )?,
+            WebScrapperResourcesResponse {
+                // January 1, 2000 10:00:00 UTC
+                timestamp: OffsetDateTime::from_unix_timestamp(946720800)?,
+                scripts: WebScrapperResourceBundle {
+                    external: vec![WebScrapperResource {
+                        url: Some(Url::parse("https://secutils.dev/script.js")?),
+                        digest: None,
+                        size: None,
+                    }],
+                    inline: vec![WebScrapperResource {
+                        url: None,
+                        digest: Some("another-digest".to_string()),
+                        size: None,
+                    }]
+                },
+                styles: WebScrapperResourceBundle {
+                    external: vec![WebScrapperResource {
+                        url: Some(Url::parse("https://secutils.dev/style.css")?),
+                        digest: None,
+                        size: None,
+                    }],
+                    inline: vec![WebScrapperResource {
+                        url: None,
+                        digest: Some("another-css-digest".to_string()),
+                        size: None,
+                    }]
+                },
+            }
+        );
+
+        Ok(())
+    }
+}
diff --git a/src/utils/web_scrapping/utils_web_scrapping_action_handler.rs b/src/utils/web_scrapping/utils_web_scrapping_action_handler.rs
index a21ea0a..c91f87d 100644
--- a/src/utils/web_scrapping/utils_web_scrapping_action_handler.rs
+++ b/src/utils/web_scrapping/utils_web_scrapping_action_handler.rs
@@ -1,7 +1,11 @@
 use crate::{
     api::Api,
     users::{PublicUserDataNamespace, User},
-    utils::{UtilsWebScrappingAction, UtilsWebScrappingActionResult, WebPageResourcesTracker},
+    utils::{
+        web_scrapping::resources::{WebScrapperResourcesRequest, WebScrapperResourcesResponse},
+        UtilsWebScrappingAction, UtilsWebScrappingActionResult, WebPageResource,
+        WebPageResourcesTracker,
+    },
 };
 use anyhow::anyhow;
 use std::collections::BTreeMap;
@@ -30,9 +34,31 @@ impl UtilsWebScrappingActionHandler {
                 )
             })?;
 
+        let web_scrapper_response = reqwest::Client::new()
+            .post(format!(
+                "{}api/resources",
+                api.config.components.web_scrapper_url.as_str()
+            ))
+            .json(&WebScrapperResourcesRequest::with_default_parameters(
+                &tracker.web_page_url,
+            ))
+            .send()
+            .await?
+            .error_for_status()?
+            .json::<WebScrapperResourcesResponse>()
+            .await?;
+
+        // TODO: Return all resources, not just external ones.
         Ok(UtilsWebScrappingActionResult::TrackWebPageResources {
             tracker_name: tracker.name,
-            resources: vec![],
+            resources: web_scrapper_response
+                .scripts
+                .external
+                .into_iter()
+                .chain(web_scrapper_response.styles.external)
+                .filter_map(|resource| resource.url)
+                .map(|src| WebPageResource { url: src })
+                .collect(),
         })
     }
 }
diff --git a/tools/api/user/get_data.http b/tools/api/user/get_data.http
index de56ec2..b81e8a9 100644
--- a/tools/api/user/get_data.http
+++ b/tools/api/user/get_data.http
@@ -1,4 +1,9 @@
 ### Get data
-GET {{host}}/api/user/data?dataType=userSettings
+GET {{host}}/api/user/data?namespace=userSettings
+Accept: application/json
+Authorization: {{api-credentials}}
+
+### Get utils data
+GET {{host}}/api/user/data?namespace=webPageResourcesTrackers
 Accept: application/json
 Authorization: {{api-credentials}}
diff --git a/tools/api/user/set_data.http b/tools/api/user/set_data.http
index e83c5b0..c941c9e 100644
--- a/tools/api/user/set_data.http
+++ b/tools/api/user/set_data.http
@@ -1,5 +1,5 @@
 ### Update data
-POST {{host}}/api/user/data?dataType=userSettings
+POST {{host}}/api/user/data?namespace=userSettings
 Accept: application/json
 Content-Type: application/json
 Authorization: {{api-credentials}}
@@ -9,7 +9,7 @@ Authorization: {{api-credentials}}
 }
 
 ### Remove data
-POST {{host}}/api/user/data?dataType=userSettings
+POST {{host}}/api/user/data?namespace=userSettings
 Accept: application/json
 Content-Type: application/json
 Authorization: {{api-credentials}}
@@ -17,3 +17,13 @@ Authorization: {{api-credentials}}
 {
   "dataValue": "{ \"certificates.doNotShowSelfSignedWarning\": null }"
 }
+
+### Save utils data
+POST {{host}}/api/user/data?namespace=webPageResourcesTrackers
+Accept: application/json
+Content-Type: application/json
+Authorization: {{api-credentials}}
+
+{
+  "dataValue":"{\"test\":{\"n\":\"test\",\"u\":\"https://secutils.dev\"}}"
+}
diff --git a/tools/api/utils/web_scrapping_resources.http b/tools/api/utils/web_scrapping_resources.http
new file mode 100644
index 0000000..cbd64b5
--- /dev/null
+++ b/tools/api/utils/web_scrapping_resources.http
@@ -0,0 +1,17 @@
+### Track web page resources
+POST {{host}}/api/utils/action
+Authorization: {{api-credentials}}
+Accept: application/json
+Content-Type: application/json
+
+{
+  "action": {
+    "type": "webScrapping",
+    "value": {
+      "type": "trackWebPageResources",
+      "value": { "trackerName": "test" }
+    }
+  }
+}
+
+###