-
Notifications
You must be signed in to change notification settings - Fork 15.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
# Add bs4 html parser

* Some minor refactors
* Extract the bs4 html parsing code from the bs html loader
* Move some tests from integration tests to unit tests
- Loading branch information
Showing
7 changed files
with
96 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser | ||
|
||
__all__ = ["BS4HTMLParser"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""Loader that uses bs4 to load HTML files, enriching metadata with page title.""" | ||
|
||
import logging | ||
from typing import Any, Dict, Iterator, Union | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseBlobParser | ||
from langchain.document_loaders.blob_loaders import Blob | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class BS4HTMLParser(BaseBlobParser):
    """Parser that uses beautiful soup to parse HTML files.

    Each blob is parsed into a single ``Document`` whose ``page_content`` is
    the text extracted from the HTML and whose metadata carries the blob's
    ``source`` and the page ``title``.
    """

    def __init__(
        self,
        *,
        features: str = "lxml",
        get_text_separator: str = "",
        **kwargs: Any,
    ) -> None:
        """Initialize a bs4 based HTML parser.

        Args:
            features: Value passed as ``features`` to ``BeautifulSoup``.
            get_text_separator: Separator passed to ``soup.get_text`` when
                extracting the page text.
            **kwargs: Additional keyword arguments forwarded to
                ``BeautifulSoup``.

        Raises:
            ValueError: If the ``bs4`` package cannot be imported.
        """
        # Fail early, at construction time, rather than on the first parse.
        try:
            import bs4  # noqa:F401
        except ImportError as err:
            # ValueError is kept (rather than ImportError) so existing callers
            # that catch it keep working; the original error is chained.
            raise ValueError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            ) from err

        self.bs_kwargs = {"features": features, **kwargs}
        self.get_text_separator = get_text_separator

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load HTML document into document objects.

        Args:
            blob: Blob holding the HTML content to parse.

        Yields:
            A single ``Document`` with the extracted text as ``page_content``
            and ``source`` / ``title`` entries in its metadata. ``title`` is
            an empty string when the page has no usable title.
        """
        from bs4 import BeautifulSoup

        with blob.as_bytes_io() as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)

        text = soup.get_text(self.get_text_separator)

        # ``.string`` is None for an empty <title> (or one with several
        # children); guard against it so the title never becomes "None".
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            title = str(title_tag.string)
        else:
            title = ""

        metadata: Dict[str, Union[str, None]] = {
            "source": blob.source,
            "title": title,
        }
        yield Document(page_content=text, metadata=metadata)
12 changes: 0 additions & 12 deletions
12
tests/integration_tests/document_loaders/parsers/test_public_api.py
This file was deleted.
Oops, something went wrong.
28 changes: 28 additions & 0 deletions
28
tests/unit_tests/document_loaders/parsers/test_html_parsers.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
"""Tests for the HTML parsers.""" | ||
from pathlib import Path | ||
|
||
import pytest | ||
|
||
from langchain.document_loaders.blob_loaders import Blob | ||
from langchain.document_loaders.parsers.html import BS4HTMLParser | ||
|
||
HERE = Path(__file__).parent | ||
EXAMPLES = HERE.parent.parent.parent / "integration_tests" / "examples" | ||
|
||
|
||
@pytest.mark.requires("bs4", "lxml")
def test_bs_html_loader() -> None:
    """Check that BS4HTMLParser turns the example HTML file into one document."""
    html_path = EXAMPLES / "example.html"
    html_parser = BS4HTMLParser(get_text_separator="|")
    parsed = list(html_parser.lazy_parse(Blob.from_path(html_path)))

    # Exactly one document is expected per blob.
    assert isinstance(parsed, list)
    assert len(parsed) == 1

    document = parsed[0]

    # Metadata carries the page title and the path the blob was loaded from.
    assert document.metadata["title"] == "Chew dad's slippers"
    assert document.metadata["source"] == str(html_path)
    # The configured separator shows up in the extracted text.
    assert document.page_content[:2] == "\n|"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters