Skip to content

Commit

Permalink
[Feature] Integrate Web Scrapper "Track resources" action with actual…
Browse files Browse the repository at this point in the history
… Web Scrapper backend.
  • Loading branch information
azasypkin committed Jun 14, 2023
1 parent e6f2f69 commit 66ce4ba
Show file tree
Hide file tree
Showing 9 changed files with 453 additions and 6 deletions.
147 changes: 147 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ log = "0.4.18"
mailchecker = "5.0.9"
openssl = "0.10.54"
rand_core = "0.6.4"
reqwest = "0.11.18"
serde = "1.0.164"
serde_bytes = "0.11.9"
serde_json = "1.0.96"
Expand Down Expand Up @@ -59,6 +60,8 @@ default = [
"insta/json",
"insta/redactions",
"rand_core/std",
"reqwest/json",
"reqwest/rustls",
"serde/derive",
"serde_json/arbitrary_precision",
"sqlx/json",
Expand All @@ -67,6 +70,7 @@ default = [
"sqlx/sqlite",
"time/formatting",
"time/macros",
"url/serde",
"webauthn-rs/danger-allow-state-serialisation"
]

Expand Down
9 changes: 8 additions & 1 deletion src/utils/web_scrapping/resources.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
// Submodules backing web-page resource tracking and the Web Scrapper
// service request/response types.
mod web_page_resource;
mod web_page_resources_tracker;
mod web_scrapper_resources_request;
mod web_scrapper_resources_response;

// Re-export the public API of this module. Note: the diff view contained both
// the pre-change combined re-export line and its replacement, which would be a
// duplicate-import compile error (E0252); only the post-change form is kept.
pub use self::{
    web_page_resource::WebPageResource,
    web_page_resources_tracker::WebPageResourcesTracker,
    web_scrapper_resources_request::WebScrapperResourcesRequest,
    web_scrapper_resources_response::{
        WebScrapperResource, WebScrapperResourceBundle, WebScrapperResourcesResponse,
    },
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use serde::Serialize;
use url::Url;

/// Represents request to scrap web page resources.
/// Represents a request to the Web Scrapper service to extract resources from a web page.
/// Serialized as camelCase JSON; field declaration order determines JSON key order, which
/// the inline snapshot tests below pin — do not reorder fields.
#[derive(Serialize, Debug, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct WebScrapperResourcesRequest<'a> {
    /// URL of the web page to scrap resources for.
    pub url: &'a Url,

    /// Number of milliseconds to wait until page enters "idle" state.
    /// Omitted from the serialized payload when `None` (service default applies).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeout: Option<usize>,

    /// Number of milliseconds to wait after page enters "idle" state.
    /// Omitted from the serialized payload when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub delay: Option<usize>,

    /// Optional CSS selector to wait for before extracting resources.
    /// Omitted from the serialized payload when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wait_selector: Option<&'a str>,
}

impl<'a> WebScrapperResourcesRequest<'a> {
/// Creates request with only the URL of the web page to scrap resources for, the rest of the
/// parameters are omitted.
pub fn with_default_parameters(url: &'a Url) -> Self {
Self {
url,
timeout: None,
delay: None,
wait_selector: None,
}
}
}

#[cfg(test)]
mod tests {
    use super::WebScrapperResourcesRequest;
    use insta::assert_json_snapshot;
    use url::Url;

    // Constructor should populate only the URL and leave every optional field unset.
    #[test]
    fn with_default_parameters() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;

        let request = WebScrapperResourcesRequest::with_default_parameters(&url);
        assert_eq!(request.url, &url);
        assert!(request.timeout.is_none());
        assert!(request.delay.is_none());
        assert!(request.wait_selector.is_none());

        Ok(())
    }

    // A fully populated request serializes with camelCase keys and all fields present.
    #[test]
    fn serialization() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;
        let request = WebScrapperResourcesRequest {
            url: &url,
            timeout: Some(100),
            delay: Some(200),
            wait_selector: Some("body"),
        };

        assert_json_snapshot!(request, @r###"
        {
          "url": "http://localhost:1234/my/app?q=2",
          "timeout": 100,
          "delay": 200,
          "waitSelector": "body"
        }
        "###);

        Ok(())
    }

    // Unset optional fields are skipped entirely, leaving a URL-only payload.
    #[test]
    fn serialization_with_default_parameters() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;
        let request = WebScrapperResourcesRequest::with_default_parameters(&url);

        assert_json_snapshot!(request, @r###"
        {
          "url": "http://localhost:1234/my/app?q=2"
        }
        "###);

        Ok(())
    }
}
Loading

0 comments on commit 66ce4ba

Please sign in to comment.