Skip to content

Commit

Permalink
feat(integrations,transformers): add transformer for converting html …
Browse files Browse the repository at this point in the history
…to markdown
  • Loading branch information
timonv committed Jun 23, 2024
1 parent 6d37051 commit eb84dd2
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 5 deletions.
62 changes: 62 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion swiftide/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ tree-sitter-typescript = { version = "0.21.1", optional = true }
tree-sitter-javascript = { version = "0.21.3", optional = true }
fastembed = { version = "3.6.1", optional = true }
spider = { version = "1.98", optional = true }
htmd = { version = "0.1.3", optional = true }

[features]
default = []
Expand All @@ -63,7 +64,7 @@ tree-sitter = [
]
openai = ["dep:async-openai"]
fastembed = ["dep:fastembed"]
scraping = ["dep:spider"]
scraping = ["dep:spider", "dep:htmd"]

[dev-dependencies]
test-log = "0.2.16"
Expand Down
75 changes: 75 additions & 0 deletions swiftide/src/integrations/scraping/html_to_markdown_transformer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
use anyhow::Result;
use async_trait::async_trait;
use derive_builder::Builder;
use htmd::HtmlToMarkdown;

use crate::{ingestion::IngestionNode, Transformer};

/// Transforms HTML content into markdown.
///
/// Useful for converting scraping results into markdown.
///
/// Construct it with [`HtmlToMarkdownTransformer::default`] or through the generated
/// `HtmlToMarkdownTransformerBuilder`; every builder field is optional.
#[derive(Builder)]
#[builder(pattern = "owned")]
pub struct HtmlToMarkdownTransformer {
    /// The `HtmlToMarkdown` instance used to convert HTML to markdown.
    ///
    /// Defaults to a converter that skips `script` and `style` tags, but can be customized.
    // Without an explicit builder default, `build()` would fail whenever `htmd` is not set,
    // contradicting the documented "sane default".
    #[builder(default = "HtmlToMarkdown::builder().skip_tags(vec![\"script\", \"style\"]).build()")]
    htmd: HtmlToMarkdown,
    /// Optional concurrency setting, handed back unchanged by `Transformer::concurrency`.
    #[builder(default)]
    concurrency: Option<usize>,
}

impl Default for HtmlToMarkdownTransformer {
    /// Builds a transformer whose converter is configured to skip `script` and `style` tags
    /// and that has no explicit concurrency setting.
    fn default() -> Self {
        let converter = HtmlToMarkdown::builder()
            .skip_tags(vec!["script", "style"])
            .build();

        Self {
            concurrency: None,
            htmd: converter,
        }
    }
}

impl HtmlToMarkdownTransformer {
#[allow(dead_code)]
pub fn builder() -> HtmlToMarkdownTransformerBuilder {
HtmlToMarkdownTransformerBuilder::default()
}
}

impl std::fmt::Debug for HtmlToMarkdownTransformer {
    /// Formats the transformer for diagnostics.
    ///
    /// Prints the `concurrency` setting and marks the output as non-exhaustive, since the
    /// `htmd` converter is intentionally left out.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // NOTE(review): `htmd` is omitted presumably because `HtmlToMarkdown` does not
        // implement `Debug` — confirm against the htmd crate before adding it here.
        f.debug_struct("HtmlToMarkdownTransformer")
            .field("concurrency", &self.concurrency)
            .finish_non_exhaustive()
    }
}

#[async_trait]
impl Transformer for HtmlToMarkdownTransformer {
    /// Replaces the node's `chunk` with its markdown rendering.
    ///
    /// All other fields of the `IngestionNode` are passed through untouched.
    ///
    /// # Errors
    ///
    /// Returns the converter's error if the HTML in `chunk` cannot be converted.
    async fn transform_node(&self, node: IngestionNode) -> Result<IngestionNode> {
        let mut node = node;
        // Convert first; the node is only modified once the conversion has succeeded.
        let markdown = self.htmd.convert(&node.chunk)?;
        node.chunk = markdown;
        Ok(node)
    }

    /// Reports the configured concurrency, if any.
    fn concurrency(&self) -> Option<usize> {
        self.concurrency
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// An `<h1>` element should be rendered as a markdown `#` heading.
    #[tokio::test]
    async fn test_html_to_markdown() {
        let html = "<h1>Hello, World!</h1>";
        let expected = "# Hello, World!";

        let result = HtmlToMarkdownTransformer::default()
            .transform_node(IngestionNode::new(html))
            .await
            .unwrap();

        assert_eq!(result.chunk, expected);
    }
}
5 changes: 1 addition & 4 deletions swiftide/src/integrations/scraping/loader.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
use std::{borrow::BorrowMut, cell::RefCell};

use anyhow::{Context as _, Result};
use derive_builder::Builder;
use futures_util::stream;
use spider::website::Website;
use tokio::sync::RwLock;

Expand All @@ -23,6 +19,7 @@ pub struct ScrapingLoader {

impl ScrapingLoader {
// Constructs a `ScrapingLoader` from a `spider::Website` configuration
#![allow(dead_code)]
pub fn from_spider(spider_website: Website) -> Self {
Self {
spider_website: RwLock::new(spider_website),
Expand Down
1 change: 1 addition & 0 deletions swiftide/src/integrations/scraping/mod.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
mod html_to_markdown_transformer;
mod loader;

0 comments on commit eb84dd2

Please sign in to comment.