forked from langchain-ai/langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add unit tests for document_transformers/beautiful_soup_transformer.py (
langchain-ai#12520) - **Description:** * Add unit tests for document_transformers/beautiful_soup_transformer.py * Basic functionality is tested (extract tags, remove tags, drop lines) * add a FIXME comment about the order of tags that is not preserved (and a passing test, but with the expected tags now out-of-order) - **Issue:** None - **Dependencies:** None - **Tag maintainer:** @rlancemartin - **Twitter handle:** `peter_v` Please make sure your PR is passing linting and testing before submitting. => OK: I ran `make format`, `make test` (passing after install of beautifulsoup4) and `make lint`.
- Loading branch information
1 parent
e12aef9
commit 56c1074
Showing
1 changed file
with
88 additions
and
0 deletions.
There are no files selected for viewing
88 changes: 88 additions & 0 deletions
88
libs/langchain/tests/unit_tests/document_transformers/test_beautiful_soup_transformer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Unit tests for beautiful soup document transformer.""" | ||
import pytest | ||
|
||
from langchain.document_transformers import BeautifulSoupTransformer | ||
from langchain.schema.document import Document | ||
|
||
|
||
@pytest.mark.requires("bs4") | ||
def test_transform_empty_html() -> None: | ||
bs_transformer = BeautifulSoupTransformer() | ||
empty_html = "<html></html>" | ||
documents = [Document(page_content=empty_html)] | ||
docs_transformed = bs_transformer.transform_documents(documents) | ||
assert docs_transformed[0].page_content == "" | ||
|
||
|
||
@pytest.mark.requires("bs4") | ||
def test_extract_paragraph() -> None: | ||
bs_transformer = BeautifulSoupTransformer() | ||
paragraphs_html = "<html><p>First paragraph.</p><p>Second paragraph.</p><html>" | ||
documents = [Document(page_content=paragraphs_html)] | ||
docs_transformed = bs_transformer.transform_documents(documents) | ||
assert docs_transformed[0].page_content == "First paragraph. Second paragraph." | ||
|
||
|
||
@pytest.mark.requires("bs4") | ||
def test_remove_style() -> None: | ||
bs_transformer = BeautifulSoupTransformer() | ||
with_style_html = ( | ||
"<html><style>my_funky_style</style><p>First paragraph.</p></html>" | ||
) | ||
documents = [Document(page_content=with_style_html)] | ||
docs_transformed = bs_transformer.transform_documents(documents) | ||
assert docs_transformed[0].page_content == "First paragraph." | ||
|
||
|
||
@pytest.mark.requires("bs4") | ||
def test_remove_unwanted_lines() -> None: | ||
bs_transformer = BeautifulSoupTransformer() | ||
with_lines_html = "<html>\n\n<p>First \n\n paragraph.</p>\n</html>\n\n" | ||
documents = [Document(page_content=with_lines_html)] | ||
docs_transformed = bs_transformer.transform_documents(documents) | ||
assert docs_transformed[0].page_content == "First paragraph." | ||
|
||
|
||
# FIXME: This test proves that the order of the tags is NOT preserved. | ||
# Documenting the current behavior here, but this should be fixed. | ||
@pytest.mark.requires("bs4") | ||
def test_transform_keeps_order() -> None: | ||
bs_transformer = BeautifulSoupTransformer() | ||
multiple_tags_html = ( | ||
"<h1>First heading.</h1>" | ||
"<p>First paragraph.</p>" | ||
"<h1>Second heading.</h1>" | ||
"<p>Second paragraph.</p>" | ||
) | ||
documents = [Document(page_content=multiple_tags_html)] | ||
|
||
# order of "p" and "h1" in the "tags_to_extract" parameter is important here: | ||
# it will first extract all "p" tags, then all "h1" tags, breaking the order | ||
# of the HTML. | ||
docs_transformed_p_then_h1 = bs_transformer.transform_documents( | ||
documents, tags_to_extract=["p", "h1"] | ||
) | ||
assert ( | ||
docs_transformed_p_then_h1[0].page_content | ||
== "First paragraph. Second paragraph. First heading. Second heading." | ||
) | ||
|
||
# Recreating `documents` because transform_documents() modifies it. | ||
documents = [Document(page_content=multiple_tags_html)] | ||
|
||
# changing the order of "h1" and "p" in "tags_to_extract" flips the order of | ||
# the extracted tags: | ||
docs_transformed_h1_then_p = bs_transformer.transform_documents( | ||
documents, tags_to_extract=["h1", "p"] | ||
) | ||
assert ( | ||
docs_transformed_h1_then_p[0].page_content | ||
== "First heading. Second heading. First paragraph. Second paragraph." | ||
) | ||
|
||
# The correct result should be: | ||
# | ||
# "First heading. First paragraph. Second heading. Second paragraph." | ||
# | ||
# That is the order in the original HTML, that should be preserved to preserve | ||
# the semantic "meaning" of the text. |