diff --git a/CHANGELOG.md b/CHANGELOG.md index 683413210d..46171d28c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.16.5-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fixes parsing in the HTML v2 parser.** A maximum recursion limit is now set, and the value is correctly extracted from the ontology element. + + ## 0.16.4 ### Enhancements @@ -9,7 +19,7 @@ ### Features -* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. +* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively. ### Fixes diff --git a/test_unstructured/partition/html/test_html_to_ontology_parsing.py b/test_unstructured/partition/html/test_html_to_ontology_parsing.py index 8a69722b23..f42506c51e 100644 --- a/test_unstructured/partition/html/test_html_to_ontology_parsing.py +++ b/test_unstructured/partition/html/test_html_to_ontology_parsing.py @@ -1,8 +1,8 @@ from bs4 import BeautifulSoup -from unstructured.documents.ontology import OntologyElement +from unstructured.documents.ontology import Form, FormFieldValue, OntologyElement, Page from unstructured.partition.html.html_utils import indent_html -from unstructured.partition.html.transformations import parse_html_to_ontology +from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology def _wrap_with_body(html: str) -> str: @@ -605,3 +605,52 @@ def test_text_in_form_field_value(): form_field_value = page.children[0] assert form_field_value.text == "" assert form_field_value.to_text() == "Random Input Value" + + +def test_to_text_when_form_field(): + ontology = Page( + children=[ + Form( + tag="input", + additional_attributes={"value": "Random Input Value"}, + 
children=[ + FormFieldValue( + tag="input", + additional_attributes={"value": "Random Input Value"}, + ) + ], + ) + ] + ) + assert ontology.to_text(add_children=True) == "Random Input Value" + + +def test_recursion_limit_is_limiting_parsing(): + # language=HTML + broken_html = "some text" + for i in range(100): + broken_html = f"

{broken_html}

" + broken_html = _wrap_with_body(broken_html) + ontology = parse_html_to_ontology(broken_html) + + iterator = 1 + last_child = ontology.children[0] + while last_child.children: + last_child = last_child.children[0] + iterator += 1 + assert last_child.text.startswith('

') + assert iterator == RECURSION_LIMIT + + +def test_get_text_when_recursion_limit_activated(): + broken_html = "some text" + for i in range(100): + broken_html = f"

{broken_html}

" + broken_html = _wrap_with_body(broken_html) + ontology = parse_html_to_ontology(broken_html) + + last_child = ontology.children[0] + while last_child.children: + last_child = last_child.children[0] + + assert last_child.to_text() == "some text" diff --git a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py index f763196517..a3edcfc024 100644 --- a/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py +++ b/test_unstructured/partition/html/test_html_to_unstructured_and_back_parsing.py @@ -274,7 +274,7 @@ def test_forms(): assert expected_html == parsed_html expected_elements = _page_elements + [ Text( - text="Option 1 (Checked)", + text="2 Option 1 (Checked)", element_id="2", detection_origin="vlm_partitioner", metadata=ElementMetadata( diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 602ae7b7f7..d282588e44 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.4" # pragma: no cover +__version__ = "0.16.5-dev0" # pragma: no cover diff --git a/unstructured/documents/ontology.py b/unstructured/documents/ontology.py index ef9cc52e85..aca80599e9 100644 --- a/unstructured/documents/ontology.py +++ b/unstructured/documents/ontology.py @@ -42,7 +42,7 @@ class ElementTypeEnum(str, Enum): class OntologyElement(BaseModel): - text: Optional[str] = Field(None, description="Text content of the element") + text: Optional[str] = Field("", description="Text content of the element") css_class_name: Optional[str] = Field( default_factory=lambda: "", description="CSS class associated with the element" ) @@ -90,7 +90,10 @@ def to_html(self, add_children=True) -> str: return result_html def to_text(self, add_children=True) -> str: - return " ".join(BeautifulSoup(self.to_html(add_children), "html.parser").stripped_strings) + if self.children and add_children: + 
children_text = " ".join(child.to_text().strip() for child in self.children) + return children_text + return BeautifulSoup(self.to_html()).get_text().strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( @@ -450,15 +453,6 @@ class Form(OntologyElement): elementType: ElementTypeEnum = Field(ElementTypeEnum.form, frozen=True) allowed_tags: List[str] = Field(["form"], frozen=True) - def to_text(self, add_children=True) -> str: - texts = [self.text] if self.text else [] - - if add_children: - for child in self.children: - texts.append(child.to_text(add_children=True)) - - return " ".join(filter(None, texts)).strip() - class FormField(OntologyElement): description: str = Field("A property value of a form", frozen=True) @@ -472,7 +466,8 @@ class FormFieldValue(OntologyElement): allowed_tags: List[str] = Field(["input"], frozen=True) def to_text(self, add_children=True) -> str: - return super().to_text() + self.additional_attributes.get("value", "") + text = super().to_text() + self.additional_attributes.get("value", "") + return text.strip() class Checkbox(OntologyElement): diff --git a/unstructured/partition/html/transformations.py b/unstructured/partition/html/transformations.py index 51cd5dd414..6054eba670 100644 --- a/unstructured/partition/html/transformations.py +++ b/unstructured/partition/html/transformations.py @@ -36,6 +36,8 @@ UncategorizedText, ) +RECURSION_LIMIT = 50 + def ontology_to_unstructured_elements( ontology_element: OntologyElement, @@ -68,7 +70,7 @@ def ontology_to_unstructured_elements( list[Element]: A list of unstructured Element objects. 
""" elements_to_return = [] - if ontology_element.elementType == ElementTypeEnum.layout: + if ontology_element.elementType == ElementTypeEnum.layout and depth <= RECURSION_LIMIT: if page_number is None and isinstance(ontology_element, Page): page_number = ontology_element.page_number @@ -354,7 +356,7 @@ def remove_empty_tags(soup): return str(soup) -def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: +def parse_html_to_ontology_element(soup: Tag, recursion_depth: int = 1) -> OntologyElement | None: """ Converts a BeautifulSoup Tag object into an OntologyElement object. This function is recursive. First tries to recognize a class from Unstructured Ontology, then if class is matched tries @@ -364,6 +366,7 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: Args: soup (Tag): The BeautifulSoup Tag object to be converted. + recursion_depth (int): Flag to control limit of recursion depth. Returns: OntologyElement: The converted OntologyElement object. @@ -384,12 +387,13 @@ def parse_html_to_ontology_element(soup: Tag) -> OntologyElement | None: and any(isinstance(content, Tag) for content in soup.contents) or ontology_class().elementType == ElementTypeEnum.layout ) + should_unwrap_html = has_children and recursion_depth <= RECURSION_LIMIT - if has_children: + if should_unwrap_html: text = "" children = [ ( - parse_html_to_ontology_element(child) + parse_html_to_ontology_element(child, recursion_depth=recursion_depth + 1) if isinstance(child, Tag) else Paragraph(text=str(child).strip()) )