From eb84dd27c61a1b3a4a52a53cc0404203eac729e8 Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Sun, 23 Jun 2024 18:31:29 +0200 Subject: [PATCH] feat(integrations,transformers): add transformer for converting html to markdown --- Cargo.lock | 62 +++++++++++++++ swiftide/Cargo.toml | 3 +- .../scraping/html_to_markdown_transformer.rs | 75 +++++++++++++++++++ swiftide/src/integrations/scraping/loader.rs | 5 +- swiftide/src/integrations/scraping/mod.rs | 1 + 5 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 swiftide/src/integrations/scraping/html_to_markdown_transformer.rs diff --git a/Cargo.lock b/Cargo.lock index 93552f82..0fc20fcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1539,6 +1539,30 @@ dependencies = [ "winapi", ] +[[package]] +name = "htmd" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b4e09e225e498bceb1e16a22df7c6610e8df987683adb1809708fa62ae7703" +dependencies = [ + "html5ever", + "markup5ever_rcdom", +] + +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 2.0.67", +] + [[package]] name = "http" version = "0.2.12" @@ -2069,6 +2093,32 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.2", + "phf_codegen 0.11.2", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"edaa21ab3701bfee5099ade5f7e1f84553fd19228cf332f13cd6e964bf59be18" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -3617,6 +3667,7 @@ dependencies = [ "derive_builder", "fastembed", "futures-util", + "htmd", "ignore", "indoc", "itertools 0.13.0", @@ -4739,6 +4790,17 @@ dependencies = [ "rustix", ] +[[package]] +name = "xml5ever" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bbb26405d8e919bc1547a5aa9abc95cbfa438f04844f5fdd9dc7596b748bf69" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "zerocopy" version = "0.7.34" diff --git a/swiftide/Cargo.toml b/swiftide/Cargo.toml index 54c0966d..b032c005 100644 --- a/swiftide/Cargo.toml +++ b/swiftide/Cargo.toml @@ -47,6 +47,7 @@ tree-sitter-typescript = { version = "0.21.1", optional = true } tree-sitter-javascript = { version = "0.21.3", optional = true } fastembed = { version = "3.6.1", optional = true } spider = { version = "1.98", optional = true } +htmd = { version = "0.1.3", optional = true } [features] default = [] @@ -63,7 +64,7 @@ tree-sitter = [ ] openai = ["dep:async-openai"] fastembed = ["dep:fastembed"] -scraping = ["dep:spider"] +scraping = ["dep:spider", "dep:htmd"] [dev-dependencies] test-log = "0.2.16" diff --git a/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs b/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs new file mode 100644 index 00000000..17eeb324 --- /dev/null +++ b/swiftide/src/integrations/scraping/html_to_markdown_transformer.rs @@ -0,0 +1,75 @@ +use anyhow::Result; +use async_trait::async_trait; +use derive_builder::Builder; +use htmd::HtmlToMarkdown; + +use crate::{ingestion::IngestionNode, Transformer}; + +#[derive(Builder)] +#[builder(pattern = "owned")] +/// Transforms HTML content into markdown. +/// +/// Useful for converting scraping results into markdown. 
+pub struct HtmlToMarkdownTransformer { + /// The `HtmlToMarkdown` instance used to convert HTML to markdown. + /// + /// Sets a sane default, but can be customized. + htmd: HtmlToMarkdown, + #[builder(default)] + concurrency: Option<usize>, +} + +impl Default for HtmlToMarkdownTransformer { + fn default() -> Self { + Self { + htmd: HtmlToMarkdown::builder() + .skip_tags(vec!["script", "style"]) + .build(), + concurrency: None, + } + } +} + +impl HtmlToMarkdownTransformer { + #[allow(dead_code)] + pub fn builder() -> HtmlToMarkdownTransformerBuilder { + HtmlToMarkdownTransformerBuilder::default() + } +} + +impl std::fmt::Debug for HtmlToMarkdownTransformer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("HtmlToMarkdownTransformer").finish() + } +} + +#[async_trait] +impl Transformer for HtmlToMarkdownTransformer { + /// Converts the HTML content in the `IngestionNode` to markdown. + /// + /// Will Err the node if the conversion fails. + async fn transform_node(&self, node: IngestionNode) -> Result<IngestionNode> { + let chunk = self.htmd.convert(&node.chunk); + Ok(IngestionNode { + chunk: chunk?, + ..node + }) + } + + fn concurrency(&self) -> Option<usize> { + self.concurrency + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[tokio::test] + async fn test_html_to_markdown() { + let node = IngestionNode::new("<h1>Hello, World!</h1>"); + let transformer = HtmlToMarkdownTransformer::default(); + let transformed = transformer.transform_node(node).await.unwrap(); + assert_eq!(transformed.chunk, "# Hello, World!"); + } +} diff --git a/swiftide/src/integrations/scraping/loader.rs b/swiftide/src/integrations/scraping/loader.rs index e5b7df56..6b837981 100644 --- a/swiftide/src/integrations/scraping/loader.rs +++ b/swiftide/src/integrations/scraping/loader.rs @@ -1,8 +1,4 @@ -use std::{borrow::BorrowMut, cell::RefCell}; - -use anyhow::{Context as _, Result}; use derive_builder::Builder; -use futures_util::stream; use spider::website::Website; use tokio::sync::RwLock; @@ -23,6 +19,7 @@ pub struct ScrapingLoader { impl ScrapingLoader { // Constructs a scrapingloader from a `spider::Website` configuration + #![allow(dead_code)] pub fn from_spider(spider_website: Website) -> Self { Self { spider_website: RwLock::new(spider_website), diff --git a/swiftide/src/integrations/scraping/mod.rs b/swiftide/src/integrations/scraping/mod.rs index 22b93ad9..c0c23ab3 100644 --- a/swiftide/src/integrations/scraping/mod.rs +++ b/swiftide/src/integrations/scraping/mod.rs @@ -1 +1,2 @@ +mod html_to_markdown_transformer; mod loader;