Add html parsers (#4874)

# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
langchain-ai · May 18, 2023 · 0dc304c · 0dc304c
1 parent 8e41143
commit 0dc304c
Show file tree

Hide file tree

Showing 7 changed files with 96 additions and 16 deletions.
diff --git a/langchain/document_loaders/parsers/__init__.py b/langchain/document_loaders/parsers/__init__.py
@@ -1,3 +1,4 @@
+from langchain.document_loaders.parsers.html import BS4HTMLParser
 from langchain.document_loaders.parsers.pdf import (
     PDFMinerParser,
     PDFPlumberParser,
@@ -7,9 +8,10 @@
 )
 
 __all__ = [
-    "PyPDFParser",
+    "BS4HTMLParser",
     "PDFMinerParser",
+    "PDFPlumberParser",
     "PyMuPDFParser",
     "PyPDFium2Parser",
-    "PDFPlumberParser",
+    "PyPDFParser",
 ]
diff --git a/langchain/document_loaders/parsers/html/__init__.py b/langchain/document_loaders/parsers/html/__init__.py
@@ -0,0 +1,3 @@
+from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser
+
+__all__ = ["BS4HTMLParser"]
diff --git a/langchain/document_loaders/parsers/html/bs4.py b/langchain/document_loaders/parsers/html/bs4.py
@@ -0,0 +1,53 @@
+"""Parser that uses bs4 to parse HTML files, enriching metadata with page title."""
+
+import logging
+from typing import Any, Dict, Iterator, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import Blob
+
+logger = logging.getLogger(__name__)
+
+
+class BS4HTMLParser(BaseBlobParser):
+    """Parser that uses beautiful soup to parse HTML files.
+
+    Each blob yields a single document whose content is the text extracted
+    from the HTML and whose metadata holds the blob source and page title.
+    """
+
+    def __init__(
+        self,
+        *,
+        features: str = "lxml",
+        get_text_separator: str = "",
+        **kwargs: Any,
+    ) -> None:
+        """Initialize a bs4 based HTML parser.
+
+        Args:
+            features: Passed to ``BeautifulSoup`` as its ``features`` argument.
+            get_text_separator: Separator passed to ``soup.get_text``.
+            **kwargs: Extra keyword arguments forwarded to ``BeautifulSoup``.
+
+        Raises:
+            ValueError: If the ``bs4`` package cannot be imported.
+        """
+        try:
+            import bs4  # noqa:F401
+        except ImportError as exc:
+            # Keep ValueError for backward compatibility, but chain the cause
+            # so the underlying import failure stays visible in tracebacks.
+            raise ValueError(
+                "beautifulsoup4 package not found, please install it with "
+                "`pip install beautifulsoup4`"
+            ) from exc
+
+        self.bs_kwargs = {"features": features, **kwargs}
+        self.get_text_separator = get_text_separator
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parse an HTML blob into a single document.
+
+        Args:
+            blob: Blob whose bytes are fed to ``BeautifulSoup``.
+
+        Yields:
+            One ``Document`` with the extracted text as ``page_content`` and
+            ``source`` / ``title`` entries in its metadata. ``title`` is an
+            empty string when the page has no ``<title>`` tag.
+        """
+        from bs4 import BeautifulSoup
+
+        with blob.as_bytes_io() as f:
+            soup = BeautifulSoup(f, **self.bs_kwargs)
+
+        text = soup.get_text(self.get_text_separator)
+
+        title_tag = soup.title
+        if title_tag is None:
+            title = ""
+        elif title_tag.string is not None:
+            title = str(title_tag.string)
+        else:
+            # `.string` is None for an empty <title> or one with several
+            # children; str(None) would store the literal text "None".
+            title = title_tag.get_text()
+
+        metadata: Dict[str, Union[str, None]] = {
+            "source": blob.source,
+            "title": title,
+        }
+        yield Document(page_content=text, metadata=metadata)
diff --git a/tests/integration_tests/document_loaders/parsers/test_public_api.py b/tests/integration_tests/document_loaders/parsers/test_public_api.py
diff --git a/tests/unit_tests/document_loaders/parsers/test_html_parsers.py b/tests/unit_tests/document_loaders/parsers/test_html_parsers.py
@@ -0,0 +1,28 @@
+"""Tests for the HTML parsers."""
+from pathlib import Path
+
+import pytest
+
+from langchain.document_loaders.blob_loaders import Blob
+from langchain.document_loaders.parsers.html import BS4HTMLParser
+
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"
+
+
+@pytest.mark.requires("bs4", "lxml")
+def test_bs_html_loader() -> None:
+    """Check that BS4HTMLParser yields one document with text, title and source."""
+    html_path = EXAMPLES / "example.html"
+    source_blob = Blob.from_path(html_path)
+    html_parser = BS4HTMLParser(get_text_separator="|")
+    docs = list(html_parser.lazy_parse(source_blob))
+    assert isinstance(docs, list)
+    assert len(docs) == 1
+
+    # Exactly one document is expected, so inspect it directly.
+    doc = docs[0]
+    assert doc.metadata["title"] == "Chew dad's slippers"
+    assert doc.metadata["source"] == str(html_path)
+    # The extracted text is expected to begin with a newline, then the separator.
+    assert doc.page_content[:2] == "\n|"
diff --git a/tests/unit_tests/document_loaders/parsers/test_public_api.py b/tests/unit_tests/document_loaders/parsers/test_public_api.py
@@ -4,6 +4,7 @@
 def test_parsers_public_api_correct() -> None:
     """Test public API of parsers for breaking changes."""
     assert set(__all__) == {
+        "BS4HTMLParser",
         "PyPDFParser",
         "PDFMinerParser",
         "PyMuPDFParser",

diff --git a/...ion_tests/document_loaders/test_bshtml.py → ...nit_tests/document_loaders/test_bshtml.py b/...ion_tests/document_loaders/test_bshtml.py → ...nit_tests/document_loaders/test_bshtml.py
@@ -5,10 +5,14 @@
 
 from langchain.document_loaders.html_bs import BSHTMLLoader
 
+HERE = Path(__file__).parent
+EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"
 
+
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader() -> None:
     """Test unstructured loader."""
-    file_path = Path(__file__).parent.parent / "examples/example.html"
+    file_path = EXAMPLES / "example.html"
     loader = BSHTMLLoader(str(file_path), get_text_separator="|")
     docs = loader.load()
 
@@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
     bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
     reason="default encoding is utf8",
 )
+@pytest.mark.requires("bs4", "lxml")
 def test_bs_html_loader_non_utf8() -> None:
     """Test providing encoding to BSHTMLLoader."""
-    file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
+    file_path = EXAMPLES / "example-utf8.html"
 
     with pytest.raises(UnicodeDecodeError):
         BSHTMLLoader(str(file_path)).load()