Skip to content

Commit

Permalink
Add text as html to orig elements chunks (#3779)
Browse files Browse the repository at this point in the history
This simple solution keeps the HTML in the metadata when merging
Elements from HTML input. We still need to address how to handle nested
elements and whether we want to have `LayoutElements` in the metadata of
Composite Elements; a unit test is included showing the current behavior.
Note: metadata still contains `orig_elements` which has all the
metadata.
  • Loading branch information
plutasnyy authored Nov 20, 2024
1 parent e1babf0 commit 85ecdab
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
### Features

### Fixes
- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output

## 0.16.5

Expand Down
90 changes: 90 additions & 0 deletions test_unstructured/chunking/test_html_output.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from functools import partial

import pytest

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title


@pytest.fixture(
    params=[
        chunk_elements,
        partial(chunk_by_title, combine_text_under_n_chars=0),
    ]
)
def chunking_fn(request):
    """Provide each chunking function under test, one per parametrized run.

    The parameters are `chunk_elements` and `chunk_by_title` with
    `combine_text_under_n_chars` bound to 0.
    """
    chunker = request.param
    return chunker


def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn):
    """Elements merged into one chunk expose their `text_as_html` joined by single spaces."""
    title_html = '<h1 class="Title" id="1">Header </h1>'
    date_html = '<time class="CalendarDate" id="2">Date: October 30, 2023 </time>'
    form_html = (
        '<form class="Form" id="3"> '
        '<label class="FormField" for="company-name" id="4">Form field name </label>'
        '<input class="FormFieldValue" id="5" value="Example value" />'
        "</form>"
    )

    # -- (element class, text, html) for each input element, in document order --
    element_specs = [
        (Title, "Header", title_html),
        (Text, "Date: October 30, 2023", date_html),
        (Text, "Form field name Example value", form_html),
    ]
    elements = [
        element_cls(text=text, metadata=ElementMetadata(text_as_html=html))
        for element_cls, text, html in element_specs
    ]

    chunks = chunking_fn(elements)

    assert len(chunks) == 1
    expected_html = f"{title_html} {date_html} {form_html}"
    assert chunks[0].metadata.text_as_html == expected_html


def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn):
    """Show how `text_as_html` is combined when nested elements land in different chunks.

    Ground truth
    <Document>
        <Page>
            <Section>
                <p>First</p>
                <p>Second</p>
            </Section>
        </Page>
    </Document>

    Elements: Document, Page, Section, Paragraph, Paragraph
    Chunk 1: Document, Page, Section, Paragraph
    Chunk 2: Paragraph
    """
    section_html = '<div class="Section" id="1" />'
    first_html = '<p class="Paragraph" id="2">First </p>'
    second_html = '<p class="Paragraph" id="3">Second </p>'

    # -- the section element carries HTML but no text of its own --
    section = Text(text="", metadata=ElementMetadata(text_as_html=section_html))
    first_paragraph = NarrativeText(
        text="First", metadata=ElementMetadata(text_as_html=first_html, parent_id="1")
    )
    second_paragraph = NarrativeText(
        text="Second", metadata=ElementMetadata(text_as_html=second_html, parent_id="1")
    )

    chunks = chunking_fn([section, first_paragraph, second_paragraph], max_characters=6)

    assert len(chunks) == 2
    leading_chunk, trailing_chunk = chunks
    assert leading_chunk.text == "First"
    assert trailing_chunk.text == "Second"

    assert leading_chunk.metadata.text_as_html == f"{section_html} {first_html}"
    assert trailing_chunk.metadata.text_as_html == second_html


def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn):
    """Every piece of a text-split element keeps the element's complete `text_as_html`.

    This mimics the behaviour of elements with non-HTML metadata.
    """
    header_html = '<h1 class="Title" id="1">Header </h1>'
    title = Title(text="Header", metadata=ElementMetadata(text_as_html=header_html))

    chunks = chunking_fn([title], max_characters=3)

    assert len(chunks) == 2
    assert chunks[0].text == "Hea"
    assert chunks[1].text == "der"
    # -- the HTML is not split along with the text; each chunk gets the whole value --
    for chunk in chunks:
        assert chunk.metadata.text_as_html == header_html
2 changes: 2 additions & 0 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,8 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:
# -- Python 3.7+ maintains dict insertion order --
ordered_unique_keys = {key: None for val_list in values for key in val_list}
yield field_name, list(ordered_unique_keys.keys())
elif strategy is CS.STRING_CONCATENATE:
yield field_name, " ".join(val.strip() for val in values)
elif strategy is CS.DROP:
continue
else: # pragma: no cover
Expand Down
5 changes: 4 additions & 1 deletion unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum):
FIRST = "first"
"""Use the first value encountered, omit if not present in any elements."""

STRING_CONCATENATE = "string_concatenate"
"""Combine the values of this field across elements. Only suitable for fields of `str` type."""

LIST_CONCATENATE = "LIST_CONCATENATE"
"""Concatenate the list values across elements. Only suitable for fields of `List` type."""

Expand Down Expand Up @@ -507,7 +510,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"sent_to": cls.FIRST,
"signature": cls.FIRST,
"subject": cls.FIRST,
"text_as_html": cls.FIRST, # -- only occurs in Table --
"text_as_html": cls.STRING_CONCATENATE,
"table_as_cells": cls.FIRST, # -- only occurs in Table --
"url": cls.FIRST,
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
Expand Down

0 comments on commit 85ecdab

Please sign in to comment.