From c17403acdd9b3565d9fe270bb216d345494e3fe1 Mon Sep 17 00:00:00 2001
From: Andrew Zhou <44193474+adrwz@users.noreply.github.com>
Date: Sun, 29 Oct 2023 16:26:53 -0700
Subject: [PATCH] More comprehensive readthedocs document loader (#12382)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## **Description:**

When building our own readthedocs.io scraper, we noticed a couple of interesting things:

1. Text with many nested tags would come out unclean, full of spurious newlines. For example, in [Langchain's documentation](https://api.python.langchain.com/en/latest/document_loaders/langchain.document_loaders.readthedocs.ReadTheDocsLoader.html#langchain.document_loaders.readthedocs.ReadTheDocsLoader), a single rendered line is represented by a deeply nested HTML structure, and the naive `soup.get_text()` call currently being made emits a newline for each nested element. The document loader therefore produced a messy, newline-separated blob of text, and this was true in a lot of cases.

   Additionally, content from iframes, code from `<script>` tags, CSS from `<style>` tags, etc. is captured whenever those elements are descendants of the selected container (which happens more often than you'd think). For example, [this page](https://pydeck.gl/gallery/contour_layer.html#) scrapes 1.5 million characters of such junk content.

   Therefore, I wrote a recursive `_get_clean_text(element)` helper that (1) skips all irrelevant elements and (2) only adds newlines when necessary.

2. Index pages (like [this one](https://api.python.langchain.com/en/latest/api_reference.html)) would be loaded, chunked, and eventually embedded. This is bad not only because the user embeds irrelevant information, but also because index pages are very likely to show up in retrieved content, making retrieval less effective (in our tests). Therefore, I added a float parameter `exclude_links_ratio`, defaulted to `1.0` (which preserves the current behavior of excluding nothing, although I'd petition to default this to `0.5`), that skips any page whose ratio of link text to total text exceeds the threshold. Setting it to `0.5` excludes pages where links make up more than 50% of the text; through manual testing, this seems to be the best threshold.
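The sketch below shows how the new knob and the new `lazy_load()` fit together; the `rtdocs/` path is a hypothetical directory of scraped HTML, not part of this patch:

```python
from langchain.document_loaders import ReadTheDocsLoader

# "rtdocs/" is a placeholder for a directory of scraped readthedocs.io HTML
# files (e.g. the output of `wget -r` against a docs site).
loader = ReadTheDocsLoader(
    "rtdocs/",
    exclude_links_ratio=0.5,  # skip pages where links are >50% of the text
)

# lazy_load() yields one Document per HTML file instead of building the
# whole list in memory; load() is now just list(lazy_load()).
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```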
## Other Information:

- **Issue:** n/a
- **Dependencies:** n/a
- **Tag maintainer:** n/a
- **Twitter handle:** @andrewthezhou

---------

Co-authored-by: Andrew Zhou
Co-authored-by: Bagatur
---
 .../langchain/document_loaders/readthedocs.py | 147 ++++++++++++++++--
 .../readthedocs/index_page/test.html          |  10 ++
 .../nested_html_structure/test.html           |   5 +
 .../document_loaders/test_readthedoc.py       |  14 ++
 4 files changed, 161 insertions(+), 15 deletions(-)
 create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html
 create mode 100644 libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html

diff --git a/libs/langchain/langchain/document_loaders/readthedocs.py b/libs/langchain/langchain/document_loaders/readthedocs.py
index 6aa3ddfd91590..2eed19931f10c 100644
--- a/libs/langchain/langchain/document_loaders/readthedocs.py
+++ b/libs/langchain/langchain/document_loaders/readthedocs.py
@@ -1,9 +1,15 @@
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Any, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Sequence, Tuple, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 
+if TYPE_CHECKING:
+    from bs4 import NavigableString
+    from bs4.element import Comment, Tag
+
 
 class ReadTheDocsLoader(BaseLoader):
     """Load `ReadTheDocs` documentation directory."""
@@ -15,7 +21,8 @@ def __init__(
         errors: Optional[str] = None,
         custom_html_tag: Optional[Tuple[str, dict]] = None,
         patterns: Sequence[str] = ("*.htm", "*.html"),
-        **kwargs: Optional[Any]
+        exclude_links_ratio: float = 1.0,
+        **kwargs: Optional[Any],
     ):
         """
         Initialize ReadTheDocsLoader
@@ -36,6 +43,9 @@ def __init__(
             custom_html_tag: Optional custom html tag to retrieve the content from
                 files.
             patterns: The file patterns to load, passed to `glob.rglob`.
+            exclude_links_ratio: The ratio of links:content to exclude pages from.
+                This is to reduce the frequency at which index pages make their
+                way into retrieved results. Recommended: 0.5
             kwargs: named arguments passed to `bs4.BeautifulSoup`.
""" try: @@ -48,7 +58,9 @@ def __init__( try: _ = BeautifulSoup( - "Parser builder library test.", **kwargs + "Parser builder library test.", + "html.parser", + **kwargs, ) except Exception as e: raise ValueError("Parsing kwargs do not appear valid") from e @@ -59,24 +71,26 @@ def __init__( self.custom_html_tag = custom_html_tag self.patterns = patterns self.bs_kwargs = kwargs + self.exclude_links_ratio = exclude_links_ratio - def load(self) -> List[Document]: - """Load documents.""" - docs = [] + def lazy_load(self) -> Iterator[Document]: + """A lazy loader for Documents.""" for file_pattern in self.patterns: for p in self.file_path.rglob(file_pattern): if p.is_dir(): continue with open(p, encoding=self.encoding, errors=self.errors) as f: text = self._clean_data(f.read()) - metadata = {"source": str(p)} - docs.append(Document(page_content=text, metadata=metadata)) - return docs + yield Document(page_content=text, metadata={"source": str(p)}) + + def load(self) -> List[Document]: + """Load documents.""" + return list(self.lazy_load()) def _clean_data(self, data: str) -> str: from bs4 import BeautifulSoup - soup = BeautifulSoup(data, **self.bs_kwargs) + soup = BeautifulSoup(data, "html.parser", **self.bs_kwargs) # default tags html_tags = [ @@ -87,18 +101,121 @@ def _clean_data(self, data: str) -> str: if self.custom_html_tag is not None: html_tags.append(self.custom_html_tag) - text = None + element = None # reversed order. check the custom one first for tag, attrs in html_tags[::-1]: - text = soup.find(tag, attrs) + element = soup.find(tag, attrs) # if found, break - if text is not None: + if element is not None: break - if text is not None: - text = text.get_text() + if element is not None and _get_link_ratio(element) <= self.exclude_links_ratio: + text = _get_clean_text(element) else: text = "" # trim empty lines return "\n".join([t for t in text.split("\n") if t]) + + +def _get_clean_text(element: Tag) -> str: + """Returns cleaned text with newlines preserved and irrelevant elements removed.""" + elements_to_skip = [ + "script", + "noscript", + "canvas", + "meta", + "svg", + "map", + "area", + "audio", + "source", + "track", + "video", + "embed", + "object", + "param", + "picture", + "iframe", + "frame", + "frameset", + "noframes", + "applet", + "form", + "button", + "select", + "base", + "style", + "img", + ] + + newline_elements = [ + "p", + "div", + "ul", + "ol", + "li", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "pre", + "table", + "tr", + ] + + text = _process_element(element, elements_to_skip, newline_elements) + return text.strip() + + +def _get_link_ratio(section: Tag) -> float: + links = section.find_all("a") + total_text = "".join(str(s) for s in section.stripped_strings) + if len(total_text) == 0: + return 0 + + link_text = "".join( + str(string.string.strip()) + for link in links + for string in link.strings + if string + ) + return len(link_text) / len(total_text) + + +def _process_element( + element: Union[Tag, NavigableString, Comment], + elements_to_skip: List[str], + newline_elements: List[str], +) -> str: + """ + Traverse through HTML tree recursively to preserve newline and skip + unwanted (code/binary) elements + """ + from bs4 import NavigableString + from bs4.element import Comment, Tag + + tag_name = getattr(element, "name", None) + if isinstance(element, Comment) or tag_name in elements_to_skip: + return "" + elif isinstance(element, NavigableString): + return element + elif tag_name == "br": + return "\n" + elif tag_name in newline_elements: + return ( + 
"".join( + _process_element(child, elements_to_skip, newline_elements) + for child in element.children + if isinstance(child, (Tag, NavigableString, Comment)) + ) + + "\n" + ) + else: + return "".join( + _process_element(child, elements_to_skip, newline_elements) + for child in element.children + if isinstance(child, (Tag, NavigableString, Comment)) + ) diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html new file mode 100644 index 0000000000000..29aaaa6e6e934 --- /dev/null +++ b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/index_page/test.html @@ -0,0 +1,10 @@ + +
+  <main id="main-content">
+    Websites:
+    <a href="https://langchain.com">Langchain</a>
+    <a href="https://docs.langchain.com">Langchain Docs</a>
+    <a href="https://api.python.langchain.com/en/latest/api_reference.html">
+      Langchain API Reference</a>
+  </main>
+</html>
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html
new file mode 100644
index 0000000000000..89c864c2316f7
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_docs/readthedocs/nested_html_structure/test.html
@@ -0,0 +1,5 @@
+<html>
+  <main id="main-content">
+    Hello <span><em>World</em>!</span>
+  </main>
+</html>
diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
index 9bcaae2fef7d3..087bbf3480b4a 100644
--- a/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
+++ b/libs/langchain/tests/unit_tests/document_loaders/test_readthedoc.py
@@ -31,6 +31,20 @@ def test_custom() -> None:
     assert len(documents[0].page_content) != 0
 
 
+@pytest.mark.requires("bs4")
+def test_nested_html_structure() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "nested_html_structure")
+    documents = loader.load()
+    assert documents[0].page_content == "Hello World!"
+
+
+@pytest.mark.requires("bs4")
+def test_index_page() -> None:
+    loader = ReadTheDocsLoader(PARENT_DIR / "index_page", exclude_links_ratio=0.5)
+    documents = loader.load()
+    assert len(documents[0].page_content) == 0
+
+
 @pytest.mark.requires("bs4")
 def test_empty() -> None:
     loader = ReadTheDocsLoader(