From fddd90ed14791e16f0f459d2bee9c1eca5d90d5e Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Thu, 24 Oct 2024 15:09:08 +0200 Subject: [PATCH] Bump html5ever to its current stable version and adjust our usage accordingly --- Cargo.lock | 8 +- scraper/Cargo.toml | 2 +- scraper/src/html/mod.rs | 11 ++- scraper/src/html/tree_sink.rs | 138 ++++++++++++++++++++++------------ scraper/src/lib.rs | 2 +- 5 files changed, 103 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6be2e7a..e8e241d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -147,9 +147,9 @@ checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" [[package]] name = "html5ever" -version = "0.27.0" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +checksum = "2e15626aaf9c351bc696217cbe29cb9b5e86c43f8a46b5e2f5c6c5cf7cb904ce" dependencies = [ "log", "mac", @@ -205,9 +205,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +checksum = "82c88c6129bd24319e62a0359cb6b958fa7e8be6e19bb1663bc396b90883aca5" dependencies = [ "log", "phf", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index f7fdc7e1..e7fabfd6 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -16,7 +16,7 @@ readme = "README.md" ahash = "0.8.0" cssparser = "0.34.0" ego-tree = "0.9.0" -html5ever = "0.27.0" +html5ever = "0.29.0" indexmap = { version = "2.6.0", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" diff --git a/scraper/src/html/mod.rs b/scraper/src/html/mod.rs index 62056836..39ad74cf 100644 --- a/scraper/src/html/mod.rs +++ b/scraper/src/html/mod.rs @@ -16,6 +16,8 @@ use tendril::TendrilSink; use 
crate::selector::Selector; use crate::{ElementRef, Node}; +pub use tree_sink::HtmlTreeSink; + /// An HTML tree. /// /// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the @@ -67,22 +69,23 @@ impl Html { /// # fn main() { /// # let document = ""; /// use html5ever::driver::{self, ParseOpts}; - /// use scraper::Html; + /// use scraper::{Html, HtmlTreeSink}; /// use tendril::TendrilSink; /// - /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default()); + /// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default()); /// let html = parser.one(document); /// # } /// ``` pub fn parse_document(document: &str) -> Self { - let parser = driver::parse_document(Self::new_document(), Default::default()); + let parser = + driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default()); parser.one(document) } /// Parses a string of HTML as a fragment. pub fn parse_fragment(fragment: &str) -> Self { let parser = driver::parse_fragment( - Self::new_fragment(), + HtmlTreeSink::new(Self::new_fragment()), Default::default(), QualName::new(None, ns!(html), local_name!("body")), Vec::new(), diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index af253765..f9e18720 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -5,34 +5,47 @@ use ego_tree::NodeId; use html5ever::tendril::StrTendril; use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::Attribute; -use html5ever::{ExpandedName, QualName}; +use html5ever::QualName; use std::borrow::Cow; +use std::cell::{Ref, RefCell}; + +/// Wraps `Html` instances as sinks to drive parsing +#[derive(Debug)] +pub struct HtmlTreeSink(RefCell<Html>); + +impl HtmlTreeSink { + /// Wrap a `Html` instance as a sink to drive parsing + pub fn new(html: Html) -> Self { + Self(RefCell::new(html)) + } +} /// Note: does not support the `