Skip to content

Commit

Permalink
Add html parsers (#4874)
Browse files Browse the repository at this point in the history
# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
  • Loading branch information
eyurtsev authored May 18, 2023
1 parent 8e41143 commit 0dc304c
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 16 deletions.
6 changes: 4 additions & 2 deletions langchain/document_loaders/parsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from langchain.document_loaders.parsers.html import BS4HTMLParser
from langchain.document_loaders.parsers.pdf import (
PDFMinerParser,
PDFPlumberParser,
Expand All @@ -7,9 +8,10 @@
)

__all__ = [
"PyPDFParser",
"BS4HTMLParser",
"PDFMinerParser",
"PDFPlumberParser",
"PyMuPDFParser",
"PyPDFium2Parser",
"PDFPlumberParser",
"PyPDFParser",
]
3 changes: 3 additions & 0 deletions langchain/document_loaders/parsers/html/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser

__all__ = ["BS4HTMLParser"]
53 changes: 53 additions & 0 deletions langchain/document_loaders/parsers/html/bs4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""

import logging
from typing import Any, Dict, Iterator, Union

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is turned into a single ``Document`` whose ``page_content`` is
    the text extracted by ``BeautifulSoup.get_text`` and whose metadata holds
    the blob's ``source`` and the page ``title``.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Forwarded to ``BeautifulSoup`` as its ``features``
                keyword argument.
            get_text_separator: Separator handed to ``soup.get_text`` when
                extracting the page text.
            **kwargs: Additional keyword arguments forwarded unchanged to
                ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            # Keep raising ValueError for backward compatibility, but chain
            # the original ImportError so the root cause stays visible.
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from err

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: The blob whose bytes are parsed as HTML.

        Yields:
            Exactly one ``Document`` containing the extracted text, with
            ``source`` and ``title`` metadata. ``title`` is an empty string
            when the page has no ``<title>`` tag or the tag is empty.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        title_tag = soup.title
        if title_tag is None:
            title = ""
        else:
            title_string = title_tag.string
            if title_string is not None:
                title = str(title_string)
            else:
                # ``.string`` is None for an empty <title> or one with several
                # children; str(None) would store the literal "None" as the
                # title, so fall back to the tag's text content instead.
                title = title_tag.get_text()

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)

This file was deleted.

28 changes: 28 additions & 0 deletions tests/unit_tests/document_loaders/parsers/test_html_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Tests for the HTML parsers."""
from pathlib import Path

import pytest

from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.html import BS4HTMLParser

HERE = Path(__file__).parent
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples"


@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Parse the example HTML file with BS4HTMLParser and check the output."""
    html_path = EXAMPLES / "example.html"
    html_blob = Blob.from_path(html_path)
    html_parser = BS4HTMLParser(get_text_separator="|")

    # Materialize the lazy iterator so the number of documents can be checked.
    parsed = list(html_parser.lazy_parse(html_blob))
    assert isinstance(parsed, list)
    assert len(parsed) == 1

    document = parsed[0]
    doc_metadata = document.metadata
    doc_text = document.page_content

    assert doc_metadata["title"] == "Chew dad's slippers"
    assert doc_metadata["source"] == str(html_path)
    # The configured separator must show up right after the leading newline.
    assert doc_text[:2] == "\n|"
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
def test_parsers_public_api_correct() -> None:
"""Test public API of parsers for breaking changes."""
assert set(__all__) == {
"BS4HTMLParser",
"PyPDFParser",
"PDFMinerParser",
"PyMuPDFParser",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@

from langchain.document_loaders.html_bs import BSHTMLLoader

HERE = Path(__file__).parent
EXAMPLES = HERE.parent.parent / "integration_tests" / "examples"


@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
"""Test unstructured loader."""
file_path = Path(__file__).parent.parent / "examples/example.html"
file_path = EXAMPLES / "example.html"
loader = BSHTMLLoader(str(file_path), get_text_separator="|")
docs = loader.load()

Expand All @@ -26,9 +30,10 @@ def test_bs_html_loader() -> None:
bool(sys.flags.utf8_mode) or not sys.platform.startswith("win"),
reason="default encoding is utf8",
)
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader_non_utf8() -> None:
"""Test providing encoding to BSHTMLLoader."""
file_path = Path(__file__).parent.parent / "examples/example-utf8.html"
file_path = EXAMPLES / "example-utf8.html"

with pytest.raises(UnicodeDecodeError):
BSHTMLLoader(str(file_path)).load()
Expand Down

0 comments on commit 0dc304c

Please sign in to comment.