Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract macro #2

Merged
merged 10 commits into from
Jul 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
[workspace]
members = [".", "reqwest-scraper-macros"]

[package]
name = "reqwest-scraper"
version = "0.2.1"
version = "0.3.0"
edition = "2021"
description = "Web scraping integration with reqwest"
license-file = "LICENSE"
repository = "https://github.com/holmofy/reqwest-scraper"

[lib]
name = "reqwest_scraper"
path = "src/lib.rs"

[dependencies]
async-trait = "0.1"
itertools = "0.13"
Expand All @@ -15,13 +22,15 @@ reqwest = { version = "0.12" }
scraper = { version = "0.19", optional = true }
serde = { version = "1.0", optional = true }
serde_json = { version = "1.0", optional = true }
reqwest-scraper-macros = { version = "0.3.0", path = "./reqwest-scraper-macros", optional = true }
thiserror = "1.0"

[features]
default = ["jsonpath", "css_selector", "xpath"]
default = ["jsonpath", "css_selector", "xpath", "macros"]
jsonpath = ["jsonpath_lib", "serde", "serde_json", "reqwest/json"]
xpath = ["libxml"]
css_selector = ["scraper"]
macros = ["reqwest-scraper-macros"]

[dev-dependencies]
anyhow = { version = "1.0", features = ["backtrace"] }
Expand Down
65 changes: 61 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ Extends [reqwest](https://github.com/seanmonstar/reqwest) to support multiple we
* [x] Use [JsonPath](#jsonpath) to select fields in json response
* [x] Select elements in HTML response using [CSS selector](#css-selector)
* [x] Evaluate the value in HTML response using [xpath expression](#xpath)
* [ ] Derive macro extract
* [x] [Derive macro extract](#macros)

### Start Guide

* add dependency
```toml
reqwest = { version = "0.12", features = ["json"] }
reqwest-scraper="0.2.1"
reqwest-scraper="0.3.0"
```
* use ScraperResponse
```rust
Expand Down Expand Up @@ -61,6 +61,7 @@ pub async fn request() -> Result<()> {

* `Html::select(selector: &str) -> Result<Selectable>`
* `Selectable::iter() -> impl Iterator<SelectItem>`
* `Selectable::first() -> Option<SelectItem>`
* `SelectItem::name() -> &str`
* `SelectItem::id() -> Option<&str>`
* `SelectItem::has_class(class: &str, case_sensitive: CaseSensitivity) -> bool`
Expand Down Expand Up @@ -117,8 +118,8 @@ async fn request() -> Result<()> {
* `Node::children() -> Vec<Node>`
* `Node::findnodes(relative_xpath: &str) -> Result<Vec<Node>>`
* `Node::findvalues(relative_xpath: &str) -> Result<Vec<String>>`
* `Node::findnode(relative_xpath: &str) -> Result<Node>`
* `Node::findvalue(relative_xpath: &str) -> Result<String>`
* `Node::findnode(relative_xpath: &str) -> Result<Option<Node>>`
* `Node::findvalue(relative_xpath: &str) -> Result<Option<String>>`

[**example**](./examples/xpath.rs):

Expand Down Expand Up @@ -163,6 +164,62 @@ async fn request() -> Result<()> {
}
```

<h3 id="macros">Derive macro extract</h3>

**use `FromCssSelector` & `selector` to extract html element into struct**
```rust
// define struct and derive the FromCssSelector trait
#[derive(Debug, FromCssSelector)]
#[selector(path = "#user-repositories-list > ul > li")]
struct Repo {
#[selector(path = "a[itemprop~='name']", default = "<unname>", text)]
name: String,

#[selector(path = "span[itemprop~='programmingLanguage']", text)]
program_lang: Option<String>,

#[selector(path = "div.topics-row-container>a", text)]
topics: Vec<String>,
}

// request
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.css_selector()
.await?;

// Use the generated `from_html` method to extract data into the struct
let items = Repo::from_html(html)?;
items.iter().for_each(|item| println!("{:?}", item));
```

**use `FromXPath` & `xpath` to extract html element into struct**
```rust
// define struct and derive the FromXPath trait
#[derive(Debug, FromXPath)]
#[xpath(path = "//div[@id='user-repositories-list']/ul/li")]
struct Repo {
#[xpath(path = ".//a[contains(@itemprop,'name')]/text()", default = "<unname>")]
name: String,

#[xpath(path = ".//span[contains(@itemprop,'programmingLanguage')]/text()")]
program_lang: Option<String>,

#[xpath(path = ".//div[contains(@class,'topics-row-container')]/a/text()")]
topics: Vec<String>,
}

let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.xpath()
.await?;

// Use the generated `from_xhtml` method to extract data into the struct
let items = Repo::from_xhtml(html)?;
items.iter().for_each(|item| println!("{:?}", item));
```


## Related Projects

* [reqwest](https://github.com/seanmonstar/reqwest)
Expand Down
66 changes: 58 additions & 8 deletions examples/html.rs
Original file line number Diff line number Diff line change
@@ -1,30 +1,80 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use reqwest_scraper::{FromCssSelector, ScraperResponse};

#[tokio::main]
async fn main() {
request().await.expect("request error");
}

#[derive(Debug, FromCssSelector)]
#[selector(path = "#user-repositories-list > ul > li")]
struct Repo {
#[selector(path = "a[itemprop~='name']", default = "<unname>", text)]
name: String,

#[selector(path = "span[itemprop~='programmingLanguage']", text)]
program_lang: Option<String>,

#[selector(path = "div.topics-row-container>a", text)]
topics: Vec<String>,
}

async fn request() -> Result<()> {
let html = reqwest::get("https://github.com/holmofy")
.await?
.css_selector()
.await?;

// 1. Simple extract
assert_eq!(
html.select(".p-name")?.iter().nth(0).unwrap().text().trim(),
html.select(".p-name")?
.first()
.map(|e| e.text())
.unwrap_or("xxx".into()),
"holmofy"
);

let select_result = html.select(".vcard-details > li.vcard-detail")?;
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.css_selector()
.await?;

// 2. Select List Element
println!("\n2. Select List Element");
let select_result = html.select("#user-repositories-list > ul > li")?;

for item in select_result.iter() {
let name = item
.select("a[itemprop~='name']")?
.first()
.map(|e| e.text())
.unwrap_or("<unname>".into());

let program_lang = item
.select("span[itemprop~='programmingLanguage']")?
.first()
.map(|e| e.text());

let topics = item
.select("div.topics-row-container>a")?
.iter()
.map(|e| e.text())
.collect::<Vec<_>>();

for detail_item in select_result.iter() {
println!(
"{}",
detail_item.attr("aria-label").unwrap_or_else(|| "".into())
)
let item = Repo {
name,
program_lang,
topics,
};

println!("{:?}", item);
}

// 3. Extract By Derived Macros
println!("\n3. Extract By Derived Macros");

let items = Repo::from_html(html)?;
items.iter().for_each(|item| println!("{:?}", item));

Ok(())
}
26 changes: 26 additions & 0 deletions examples/json.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,29 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Owner {
login: String,
id: i64,
node_id: String,
avatar_url: String,
gravatar_id: String,
url: String,
html_url: String,
followers_url: String,
following_url: String,
gists_url: String,
starred_url: String,
subscriptions_url: String,
organizations_url: String,
repos_url: String,
events_url: String,
received_events_url: String,
#[serde(alias = "type")]
_type: String,
site_admin: bool,
}

#[tokio::main]
async fn main() {
Expand All @@ -19,10 +43,12 @@ pub async fn request() -> Result<()> {
let total_count_str = json.select_as_str("$.total_count")?;
let total_count_int: i32 = json.select_one("$.total_count")?;
let names: Vec<String> = json.select("$.items[*].full_name")?;
let owners: Vec<Owner> = json.select("$.items[*].owner")?;

println!("{}", total_count_str);
println!("{}", total_count_int);
println!("{}", names.join("\t"));
owners.iter().for_each(|o| println!("{:#?}", o));

Ok(())
}
59 changes: 42 additions & 17 deletions examples/xpath.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
use anyhow::Result;
use reqwest_scraper::ScraperResponse;
use reqwest_scraper::{FromXPath, ScraperResponse};

#[tokio::main]
async fn main() {
request().await.expect("request error");
}

#[derive(Debug, FromXPath)]
#[xpath(path = "//div[@id='user-repositories-list']/ul/li")]
struct Repo {
#[xpath(path = ".//a[contains(@itemprop,'name')]/text()", default = "<unname>")]
name: String,

#[xpath(path = ".//span[contains(@itemprop,'programmingLanguage')]/text()")]
program_lang: Option<String>,

#[xpath(path = ".//div[contains(@class,'topics-row-container')]/a/text()")]
topics: Vec<String>,
}

async fn request() -> Result<()> {
let html = reqwest::get("https://github.com/holmofy")
.await?
Expand All @@ -18,7 +31,6 @@ async fn request() -> Result<()> {
.as_node()
.unwrap()
.text();
println!("{}", name);
assert_eq!(name.trim(), "holmofy");

// iterate elements
Expand All @@ -31,27 +43,40 @@ async fn request() -> Result<()> {
for item in select_result.into_iter() {
let attr = item.attr("aria-label").unwrap_or_else(|| "".into());
println!("{}", attr);
println!("{}", item.text());
println!("{}", item.text().trim());
}

// attribute extract
let select_result = html
.select("//ul[contains(@class,'vcard-details')]/li[contains(@class,'vcard-detail')]/@aria-label")?
.as_strs();
let html = reqwest::get("https://github.com/holmofy?tab=repositories")
.await?
.xpath()
.await?;

println!("{}", select_result.len());
select_result.into_iter().for_each(|s| println!("{}", s));
// 2. Select List Element
println!("\n2. Select List Element");
let select_result = html.select("//div[@id='user-repositories-list']/ul/li")?;

//
let select_result = html
.select("//ul[contains(@class,'vcard-details')]/li[contains(@class,'vcard-detail')]/@aria-label")?
.as_nodes();
for item in select_result.as_nodes() {
let name = item.findvalue(".//a[contains(@itemprop,'name')]/text()")?.unwrap_or("".into());

println!("{}", select_result.len());
let program_lang =
item.findvalue(".//span[contains(@itemprop,'programmingLanguage')]/text()")?;

let topics = item.findvalues(".//div[contains(@class,'topics-row-container')]/a/text()")?;

let item = Repo {
name,
program_lang: program_lang,
topics,
};

println!("{:?}", item);
}

// 3. Extract By Derived Macros
println!("\n3. Extract By Derived Macros");

select_result
.into_iter()
.for_each(|n| println!("{}", n.name()));
let items = Repo::from_xhtml(html)?;
items.iter().for_each(|item| println!("{:?}", item));

Ok(())
}
21 changes: 21 additions & 0 deletions reqwest-scraper-macros/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
[package]
name = "reqwest-scraper-macros"
version = "0.3.0"
edition = "2021"
description = "Web scraping integration with reqwest"
license-file = "LICENSE"
repository = "https://github.com/holmofy/reqwest-scraper"

[lib]
name = "reqwest_scraper_macros"
proc-macro = true

[dependencies]
proc-macro2 = "1.0"
quote = "1.0"
syn = "2.0"
darling = "0.20.10"
scraper = { version = "0.19", default-features = false }

[dev-dependencies]
reqwest-scraper = { version = "0.3.0", path = "../" }
Loading
Loading