Skip to content

Commit

Permalink
[Feature] Integrate Web Scrapper "Track resources" action with actual…
Browse files Browse the repository at this point in the history
… Web Scrapper backend.
  • Loading branch information
azasypkin committed Jun 14, 2023
1 parent e6f2f69 commit 66ce4ba
Show file tree
Hide file tree
Showing 9 changed files with 453 additions and 6 deletions.
147 changes: 147 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ log = "0.4.18"
mailchecker = "5.0.9"
openssl = "0.10.54"
rand_core = "0.6.4"
reqwest = "0.11.18"
serde = "1.0.164"
serde_bytes = "0.11.9"
serde_json = "1.0.96"
Expand Down Expand Up @@ -59,6 +60,8 @@ default = [
"insta/json",
"insta/redactions",
"rand_core/std",
"reqwest/json",
"reqwest/rustls",
"serde/derive",
"serde_json/arbitrary_precision",
"sqlx/json",
Expand All @@ -67,6 +70,7 @@ default = [
"sqlx/sqlite",
"time/formatting",
"time/macros",
"url/serde",
"webauthn-rs/danger-allow-state-serialisation"
]

Expand Down
9 changes: 8 additions & 1 deletion src/utils/web_scrapping/resources.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
// Submodules backing web-page resource tracking and the Web Scrapper
// service request/response types.
mod web_page_resource;
mod web_page_resources_tracker;
mod web_scrapper_resources_request;
mod web_scrapper_resources_response;

// Re-export the public API of this module. Note: the diff view contained both
// the pre-change combined re-export line and its replacement, which would be a
// duplicate-import compile error (E0252); only the post-change form is kept.
pub use self::{
    web_page_resource::WebPageResource,
    web_page_resources_tracker::WebPageResourcesTracker,
    web_scrapper_resources_request::WebScrapperResourcesRequest,
    web_scrapper_resources_response::{
        WebScrapperResource, WebScrapperResourceBundle, WebScrapperResourcesResponse,
    },
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use serde::Serialize;
use url::Url;

/// Represents request to scrap web page resources.
/// Represents a request to the Web Scrapper service to extract resources from a web page.
/// Serialized as camelCase JSON; field declaration order determines JSON key order, which
/// the inline snapshot tests below pin — do not reorder fields.
#[derive(Serialize, Debug, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct WebScrapperResourcesRequest<'a> {
    /// URL of the web page to scrap resources for.
    pub url: &'a Url,

    /// Number of milliseconds to wait until page enters "idle" state.
    /// Omitted from the serialized payload when `None` (service default applies).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeout: Option<usize>,

    /// Number of milliseconds to wait after page enters "idle" state.
    /// Omitted from the serialized payload when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub delay: Option<usize>,

    /// Optional CSS selector to wait for before extracting resources.
    /// Omitted from the serialized payload when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub wait_selector: Option<&'a str>,
}

impl<'a> WebScrapperResourcesRequest<'a> {
/// Creates request with only the URL of the web page to scrap resources for, the rest of the
/// parameters are omitted.
pub fn with_default_parameters(url: &'a Url) -> Self {
Self {
url,
timeout: None,
delay: None,
wait_selector: None,
}
}
}

#[cfg(test)]
mod tests {
    use super::WebScrapperResourcesRequest;
    use insta::assert_json_snapshot;
    use url::Url;

    // Constructor should populate only the URL and leave every optional field unset.
    #[test]
    fn with_default_parameters() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;

        let request = WebScrapperResourcesRequest::with_default_parameters(&url);
        assert_eq!(request.url, &url);
        assert!(request.timeout.is_none());
        assert!(request.delay.is_none());
        assert!(request.wait_selector.is_none());

        Ok(())
    }

    // A fully populated request serializes with camelCase keys and all fields present.
    #[test]
    fn serialization() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;
        let request = WebScrapperResourcesRequest {
            url: &url,
            timeout: Some(100),
            delay: Some(200),
            wait_selector: Some("body"),
        };

        assert_json_snapshot!(request, @r###"
        {
          "url": "http://localhost:1234/my/app?q=2",
          "timeout": 100,
          "delay": 200,
          "waitSelector": "body"
        }
        "###);

        Ok(())
    }

    // Unset optional fields are skipped entirely, leaving a URL-only payload.
    #[test]
    fn serialization_with_default_parameters() -> anyhow::Result<()> {
        let url = Url::parse("http://localhost:1234/my/app?q=2")?;
        let request = WebScrapperResourcesRequest::with_default_parameters(&url);

        assert_json_snapshot!(request, @r###"
        {
          "url": "http://localhost:1234/my/app?q=2"
        }
        "###);

        Ok(())
    }
}
Loading

0 comments on commit 66ce4ba

Please sign in to comment.