-
Notifications
You must be signed in to change notification settings - Fork 818
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add text as html to orig elements chunks (#3779)
This simplest solution doesn't drop HTML from metadata when merging Elements from HTML input. We still need to address how to handle nested elements, and if we want to have `LayoutElements` in the metadata of Composite Elements, a unit test showing the current behavior. Note: metadata still contains `orig_elements` which has all the metadata.
- Loading branch information
Showing
4 changed files
with
97 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
from functools import partial | ||
|
||
import pytest | ||
|
||
from unstructured.chunking.basic import chunk_elements | ||
from unstructured.chunking.title import chunk_by_title | ||
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title | ||
|
||
|
||
@pytest.fixture(params=[chunk_elements, partial(chunk_by_title, combine_text_under_n_chars=0)]) | ||
def chunking_fn(request): | ||
return request.param | ||
|
||
|
||
def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn): | ||
metadata_1 = '<h1 class="Title" id="1">Header </h1>' | ||
metadata_2 = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>' | ||
metadata_3 = ( | ||
'<form class="Form" id="3"> ' | ||
'<label class="FormField" for="company-name" id="4">Form field name </label>' | ||
'<input class="FormFieldValue" id="5" value="Example value" />' | ||
"</form>" | ||
) | ||
combined_metadata = " ".join([metadata_1, metadata_2, metadata_3]) | ||
|
||
elements = [ | ||
Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), | ||
Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=metadata_2)), | ||
Text( | ||
text="Form field name Example value", metadata=ElementMetadata(text_as_html=metadata_3) | ||
), | ||
] | ||
chunks = chunking_fn(elements) | ||
assert len(chunks) == 1 | ||
assert chunks[0].metadata.text_as_html == combined_metadata | ||
|
||
|
||
def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn): | ||
""" | ||
Ground truth | ||
<Document> | ||
<Page> | ||
<Section> | ||
<p>First</p> | ||
<p>Second</p> | ||
</Section> | ||
</Page> | ||
</Document> | ||
Elements: Document, Page, Section, Paragraph, Paragraph | ||
Chunk 1: Document, Page, Section, Paragraph | ||
Chunk 2: | ||
Paragraph | ||
""" | ||
|
||
metadata_1 = '<div class="Section" id="1" />' | ||
metadata_2 = '<p class="Paragraph" id="2">First </p>' | ||
metadata_3 = '<p class="Paragraph" id="3">Second </p>' | ||
|
||
elements = [ | ||
Text(text="", metadata=ElementMetadata(text_as_html=metadata_1)), | ||
NarrativeText( | ||
text="First", metadata=ElementMetadata(text_as_html=metadata_2, parent_id="1") | ||
), | ||
NarrativeText( | ||
text="Second", metadata=ElementMetadata(text_as_html=metadata_3, parent_id="1") | ||
), | ||
] | ||
chunks = chunking_fn(elements, max_characters=6) | ||
assert len(chunks) == 2 | ||
assert chunks[0].text == "First" | ||
assert chunks[1].text == "Second" | ||
|
||
assert chunks[0].metadata.text_as_html == metadata_1 + " " + metadata_2 | ||
assert chunks[1].metadata.text_as_html == metadata_3 | ||
|
||
|
||
def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): | ||
"""Mimic behaviour of elements with non-html metadata""" | ||
metadata_1 = '<h1 class="Title" id="1">Header </h1>' | ||
elements = [ | ||
Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), | ||
] | ||
chunks = chunking_fn(elements, max_characters=3) | ||
assert len(chunks) == 2 | ||
|
||
assert chunks[0].text == "Hea" | ||
assert chunks[1].text == "der" | ||
assert chunks[0].metadata.text_as_html == '<h1 class="Title" id="1">Header </h1>' | ||
assert chunks[1].metadata.text_as_html == '<h1 class="Title" id="1">Header </h1>' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters