From a1858a259a6e12caa6608ede7d3648288fff7bf9 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 7 Nov 2024 15:02:22 +0100 Subject: [PATCH 1/3] Add recursion limit --- CHANGELOG.md | 12 ++++++++++- .../html/test_html_to_ontology_parsing.py | 20 ++++++++++++++++++- ...t_html_to_unstructured_and_back_parsing.py | 2 +- unstructured/__version__.py | 2 +- unstructured/documents/ontology.py | 8 ++++++-- .../partition/html/transformations.py | 12 +++++++---- 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 683413210d..46171d28c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.5-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fixes parsing in the HTML v2 parser** A maximum recursion limit is now set, and the value is correctly extracted from ontology elements + + ## 0.16.4 ### Enhancements @@ -9,7 +19,7 @@ ### Features -* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. +* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. 
### Fixes diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 8a69722b23..5630020fa4 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -1,6 +1,6 @@ from bs4 import BeautifulSoup -from unstructured.documents.ontology import OntologyElement +from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html from unstructured.partition.html.transformations import parse_html_to_ontology @@ -605,3 +605,21 @@ def test_text_in_form_field_value(): form_field_value = page.children[0] assert form_field_value.text == "" assert form_field_value.to_text() == "Random Input Value" + + +def test_to_text_when_form_field(): + ontology = Page( + children=[ + Form( + tag="input", + additional_attributes={"value": "Random Input Value"}, + children=[ + FormFieldValue( + tag="input", + additional_attributes={"value": "Random Input Value"}, + ) + ], + ) + ] + ) + assert ontology.to_text(add_children=True) == "Random Input Value" diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index f763196517..a3edcfc024 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -274,7 +274,7 @@ def test_forms(): assert expected_html == parsed_html expected_elements = _page_elements + [ Text( - text="Option 1 (Checked)", + text="2 Option 1 (Checked)", element_id="2", detection_origin="vlm_partitioner", metadata=ElementMetadata( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 602ae7b7f7..d282588e44 100644 --- a/unstructured/__version__.py +++ 
b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.4" # pragma: no cover +__version__ = "0.16.5-dev0" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index ef9cc52e85..9ddd389c93 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -90,7 +90,10 @@ def to_html(self, add_children=True) -> str: return result_html def to_text(self, add_children=True) -> str: - return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings) + if self.children and add_children: + children_text = " ".join(child.to_text().strip() for child in self.children) + return children_text + return self.text.strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( @@ -472,7 +475,8 @@ class FormFieldValue(OntologyElement): allowed_tags: List[str] = Field(["input"], frozen=True) def to_text(self, add_children=True) -> str: - return super().to_text() + self.additional_attributes.get("value", "") + text = super().to_text() + self.additional_attributes.get("value", "") + return text.strip() class Checkbox(OntologyElement): diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 51cd5dd414..a1dbfa6660 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -36,6 +36,8 @@ UncategorizedText, ) +RECURSION_LIMIT = 50 + def ontology_to_unstructured_elements( ontology_element: OntologyElement, @@ -68,7 +70,7 @@ def ontology_to_unstructured_elements( list[Element]: A list of unstructured Element objects. 
""" elements_to_return = [] - if ontology_element.elementType == ElementTypeEnum.layout: + if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT: if page_number is None and isinstance(ontology_element, Page): page_number = ontology_element.page_number @@ -354,7 +356,7 @@ def remove_empty_tags(soup): return str(soup) -def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: +def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None: """ Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive. First tries to recognize a class from Unstructured Ontology, then if class is matched tries @@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: Args: soup (Tag): The BeautifulSoup Tag object to be converted. + recursion_depth (int): Current depth of the recursion, used to stop parsing nested tags at RECURSION_LIMIT. Returns: OntologyElement: The converted OntologyElement object. 
@@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: and any(isinstance(content, Tag) for content in soup.contents) or ontology_class().elementType == ElementTypeEnum.layout ) + should_unwrap_html = has_children and recursion_depth < RECURSION_LIMIT - if has_children: + if should_unwrap_html: text = "" children = [ ( - parse_html_to_ontology_element(child) + parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1) if isinstance(child, Tag) else Paragraph(text=str(child).strip()) ) From 7bcc6c0f8dc218cd5cb19173504fbeb57f249057 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 7 Nov 2024 15:14:06 +0100 Subject: [PATCH 2/3] Add unit test for recursion --- .../html/test_html_to_ontology_parsing.py | 33 ++++++++++++++++++- unstructured/documents/ontology.py | 2 +- .../partition/html/transformations.py | 2 +- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 5630020fa4..f42506c51e 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -2,7 +2,7 @@ from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html -from unstructured.partition.html.transformations import parse_html_to_ontology +from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology def _wrap_with_body(html: str) -> str: @@ -623,3 +623,34 @@ def test_to_text_when_form_field(): ] ) assert ontology.to_text(add_children=True) == "Random Input Value" + + +def test_recursion_limit_is_limiting_parsing(): + # language=HTML + broken_html = "some text" + for i in range(100): + broken_html = f"

{broken_html}

" + broken_html = _wrap_with_body(broken_html) + ontology = parse_html_to_ontology(broken_html) + + iterator = 1 + last_child = ontology.children[0] + while last_child.children: + last_child = last_child.children[0] + iterator += 1 + assert last_child.text.startswith('

') + assert iterator == RECURSION_LIMIT + + +def test_get_text_when_recursion_limit_activated(): + broken_html = "some text" + for i in range(100): + broken_html = f"

{broken_html}

" + broken_html = _wrap_with_body(broken_html) + ontology = parse_html_to_ontology(broken_html) + + last_child = ontology.children[0] + while last_child.children: + last_child = last_child.children[0] + + assert last_child.to_text() == "some text" diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index 9ddd389c93..262d3fe148 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -93,7 +93,7 @@ def to_text(self, add_children=True) -> str: if self.children and add_children: children_text = " ".join(child.to_text().strip() for child in self.children) return children_text - return self.text.strip() + return BeautifulSoup(self.to_html()).get_text().strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index a1dbfa6660..6054eba670 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -387,7 +387,7 @@ def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> Ontol and any(isinstance(content, Tag) for content in soup.contents) or ontology_class().elementType == ElementTypeEnum.layout ) - should_unwrap_html = has_children and recursion_depth < RECURSION_LIMIT + should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT if should_unwrap_html: text = "" From 4a9be2cc5fdc2674f05c0d1dabc1627be495b5e9 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Thu, 7 Nov 2024 15:33:44 +0100 Subject: [PATCH 3/3] Change default and remove unused code --- unstructured/documents/ontology.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index 262d3fe148..aca80599e9 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -42,7 +42,7 @@ class 
ElementTypeEnum(str, Enum): class OntologyElement(BaseModel): - text: Optional[str] = Field(None, description="Text content of the element") + text: Optional[str] = Field("", description="Text content of the element") css_class_name: Optional[str] = Field( default_factory=lambda: "", description="CSS class associated with the element" ) @@ -453,15 +453,6 @@ class Form(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["form"], frozen=True) - def to_text(self, add_children=True) -> str: - texts = [self.text] if self.text else [] - - if add_children: - for child in self.children: - texts.append(child.to_text(add_children=True)) - - return " ".join(filter(None, texts)).strip() - class FormField(OntologyElement): description: str = Field("A property value of a form", frozen=True)