From c03cc912ad03515e9dc5d838c8ad66c5e165906f Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 15:11:42 +0100 Subject: [PATCH 1/8] Add concatenate strategy for metadata consolidation --- .../chunking/test_vlm_output_chunking.py | 103 ++++++++++++++++++ unstructured/chunking/base.py | 2 + unstructured/documents/elements.py | 5 +- 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 test_unstructured/chunking/test_vlm_output_chunking.py diff --git a/test_unstructured/chunking/test_vlm_output_chunking.py b/test_unstructured/chunking/test_vlm_output_chunking.py new file mode 100644 index 0000000000..5c9e3619b4 --- /dev/null +++ b/test_unstructured/chunking/test_vlm_output_chunking.py @@ -0,0 +1,103 @@ +from pathlib import Path + +import pytest + +from unstructured.chunking.title import chunk_by_title +from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title +from unstructured.staging.base import elements_from_json + + +@pytest.mark.parametrize( + "json_file_path", + [ + "unstructured_json_output/example.json", + "unstructured_json_output/example_full_doc.json", + "unstructured_json_output/example_with_inline_fields.json", + ], +) +def test_chunking_output(json_file_path): + json_file_path = Path(__file__).parent / json_file_path + expected_json_elements = elements_from_json(str(json_file_path)) + chunks = chunk_by_title(expected_json_elements, combine_text_under_n_chars=0) + + assert False + + +def test_combining_html_metadata(): + meta_1 = '

Header

' + meta_2 = '' + meta_3 = '
' + combined_metadata = meta_1 + meta_2 + meta_3 + + elements = [ + Title(text="Header", metadata=ElementMetadata(text_as_html=meta_1)), + Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=meta_2)), + Text(text="From field name Example value", metadata=ElementMetadata(text_as_html=meta_3)), + ] + chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + assert len(chunks) == 1 + assert chunks[0].metadata.text_as_html == combined_metadata + + +def test_combining_html_metadata_with_parent(): + """ + Ground truth + + +
+

First

+

Second

+
+
+
+ Elements: Document, Page, Section, Paragraph, Paragraph + Chunk 1: Document, Page, Section, Paragraph + + Chunk 2: + Current: Paragraph + Okay? or: Document, Page, Section, Paragraph + """ + + meta_1 = '
' + meta_2 = '

First

' + meta_3 = '

Second

' + + elements = [ + Text(text="", metadata=ElementMetadata(text_as_html=meta_1)), + NarrativeText(text="First", metadata=ElementMetadata(text_as_html=meta_2, parent_id="1")), + NarrativeText(text="Second", metadata=ElementMetadata(text_as_html=meta_3, parent_id="1")), + ] + chunks = chunk_by_title(elements, max_characters=6, combine_text_under_n_chars=0) + assert len(chunks) == 2 + assert chunks[0].text == "First" + assert chunks[1].text == "Second" + + assert chunks[0].metadata.text_as_html == meta_1 + meta_2 + assert chunks[1].metadata.text_as_html == meta_3 + + +def test_splitting_html_metadata(): + meta_1 = '

Header

' + elements = [ + Title(text="Header", metadata=ElementMetadata(text_as_html=meta_1)), + ] + chunks = chunk_by_title(elements, combine_text_under_n_chars=0, max_characters=3) + assert len(chunks) == 2 + + assert chunks[0].text == "Hea" + assert chunks[1].text == "der" + assert chunks[0].metadata.text_as_html == '

Header

' + assert chunks[1].metadata.text_as_html == '

Header

' + + +def test_splitting_text(): + elements = [ + Title(text="Header", metadata=ElementMetadata(text_as_html="Header")), + ] + chunks = chunk_by_title(elements, combine_text_under_n_chars=0, max_characters=3) + assert len(chunks) == 2 + + assert chunks[0].text == "Hea" + assert chunks[1].text == "der" + assert chunks[0].metadata.text_as_html == "Header" + assert chunks[1].metadata.text_as_html == "Header" diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 90057d11d3..78ad52e08d 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -774,6 +774,8 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]: # -- Python 3.7+ maintains dict insertion order -- ordered_unique_keys = {key: None for val_list in values for key in val_list} yield field_name, list(ordered_unique_keys.keys()) + elif strategy is CS.CONCATENATE: + yield field_name, "".join(values) elif strategy is CS.DROP: continue else: # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index d6f4c3fc3f..1219f5308f 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -458,6 +458,9 @@ class ConsolidationStrategy(enum.Enum): FIRST = "first" """Use the first value encountered, omit if not present in any elements.""" + CONCATENATE = "concatenate" + """Combine the values of this field across elements. Only suitable for string type fields.""" + LIST_CONCATENATE = "LIST_CONCATENATE" """Concatenate the list values across elements. 
Only suitable for fields of `List` type.""" @@ -507,7 +510,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "sent_to": cls.FIRST, "signature": cls.FIRST, "subject": cls.FIRST, - "text_as_html": cls.FIRST, # -- only occurs in Table -- + "text_as_html": cls.CONCATENATE, "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, "key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues -- From 45fdf4c3f5df28bab0d4424c404783fd27e7eb2f Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 15:15:12 +0100 Subject: [PATCH 2/8] Update changelog --- CHANGELOG.md | 9 +++++++++ unstructured/__version__.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1584267192..f431d31065 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.16.6-dev0 + +### Enhancements + +### Features + +### Fixes +- **ElementMetadata consolidation** Now `text_as_html` metadata is combined across all elements in CompositeElement when chunking HTML output + ## 0.16.5 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f4096753b3..a03340d1af 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.5" # pragma: no cover +__version__ = "0.16.6-dev0" # pragma: no cover From 725a356d52594087032665732151727c3923f04a Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 16:19:05 +0100 Subject: [PATCH 3/8] Add unit tests --- .../chunking/test_vlm_output_chunking.py | 56 ++++++------------- unstructured/chunking/base.py | 2 +- unstructured/documents/elements.py | 6 +- 3 files changed, 22 insertions(+), 42 deletions(-) diff --git a/test_unstructured/chunking/test_vlm_output_chunking.py b/test_unstructured/chunking/test_vlm_output_chunking.py index 5c9e3619b4..99c5c4c53e 100644 --- a/test_unstructured/chunking/test_vlm_output_chunking.py +++ 
b/test_unstructured/chunking/test_vlm_output_chunking.py @@ -1,32 +1,25 @@ -from pathlib import Path +from functools import partial import pytest +from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ElementMetadata, NarrativeText, Text, Title -from unstructured.staging.base import elements_from_json -@pytest.mark.parametrize( - "json_file_path", - [ - "unstructured_json_output/example.json", - "unstructured_json_output/example_full_doc.json", - "unstructured_json_output/example_with_inline_fields.json", - ], -) -def test_chunking_output(json_file_path): - json_file_path = Path(__file__).parent / json_file_path - expected_json_elements = elements_from_json(str(json_file_path)) - chunks = chunk_by_title(expected_json_elements, combine_text_under_n_chars=0) +@pytest.fixture(params=[chunk_elements, partial(chunk_by_title, combine_text_under_n_chars=0)]) +def chunking_fn(request): + return request.param - assert False - -def test_combining_html_metadata(): +def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn): meta_1 = '

Header

' meta_2 = '' - meta_3 = '
' + meta_3 = ( + '
' + "
" + ) combined_metadata = meta_1 + meta_2 + meta_3 elements = [ @@ -34,12 +27,12 @@ def test_combining_html_metadata(): Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=meta_2)), Text(text="From field name Example value", metadata=ElementMetadata(text_as_html=meta_3)), ] - chunks = chunk_by_title(elements, combine_text_under_n_chars=0) + chunks = chunking_fn(elements) assert len(chunks) == 1 assert chunks[0].metadata.text_as_html == combined_metadata -def test_combining_html_metadata_with_parent(): +def test_combining_html_metadata_with_nested_relationship_between_elements(chunking_fn): """ Ground truth @@ -54,8 +47,7 @@ def test_combining_html_metadata_with_parent(): Chunk 1: Document, Page, Section, Paragraph Chunk 2: - Current: Paragraph - Okay? or: Document, Page, Section, Paragraph + Paragraph """ meta_1 = '
' @@ -67,7 +59,7 @@ def test_combining_html_metadata_with_parent(): NarrativeText(text="First", metadata=ElementMetadata(text_as_html=meta_2, parent_id="1")), NarrativeText(text="Second", metadata=ElementMetadata(text_as_html=meta_3, parent_id="1")), ] - chunks = chunk_by_title(elements, max_characters=6, combine_text_under_n_chars=0) + chunks = chunking_fn(elements, max_characters=6) assert len(chunks) == 2 assert chunks[0].text == "First" assert chunks[1].text == "Second" @@ -76,28 +68,16 @@ def test_combining_html_metadata_with_parent(): assert chunks[1].metadata.text_as_html == meta_3 -def test_splitting_html_metadata(): +def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): + """Mimic behaviour of elements with non-html metadata""" meta_1 = '

Header

' elements = [ Title(text="Header", metadata=ElementMetadata(text_as_html=meta_1)), ] - chunks = chunk_by_title(elements, combine_text_under_n_chars=0, max_characters=3) + chunks = chunking_fn(elements, max_characters=3) assert len(chunks) == 2 assert chunks[0].text == "Hea" assert chunks[1].text == "der" assert chunks[0].metadata.text_as_html == '

Header

' assert chunks[1].metadata.text_as_html == '

Header

' - - -def test_splitting_text(): - elements = [ - Title(text="Header", metadata=ElementMetadata(text_as_html="Header")), - ] - chunks = chunk_by_title(elements, combine_text_under_n_chars=0, max_characters=3) - assert len(chunks) == 2 - - assert chunks[0].text == "Hea" - assert chunks[1].text == "der" - assert chunks[0].metadata.text_as_html == "Header" - assert chunks[1].metadata.text_as_html == "Header" diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 78ad52e08d..e3d43adf83 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -774,7 +774,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]: # -- Python 3.7+ maintains dict insertion order -- ordered_unique_keys = {key: None for val_list in values for key in val_list} yield field_name, list(ordered_unique_keys.keys()) - elif strategy is CS.CONCATENATE: + elif strategy is CS.STRING_CONCATENATE: yield field_name, "".join(values) elif strategy is CS.DROP: continue diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 1219f5308f..a9636b5d6e 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -458,8 +458,8 @@ class ConsolidationStrategy(enum.Enum): FIRST = "first" """Use the first value encountered, omit if not present in any elements.""" - CONCATENATE = "concatenate" - """Combine the values of this field across elements. Only suitable for string type fields.""" + STRING_CONCATENATE = "string_concatenate" + """Combine the values of this field across elements. Only suitable for fields of `str` type.""" LIST_CONCATENATE = "LIST_CONCATENATE" """Concatenate the list values across elements. 
Only suitable for fields of `List` type.""" @@ -510,7 +510,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "sent_to": cls.FIRST, "signature": cls.FIRST, "subject": cls.FIRST, - "text_as_html": cls.CONCATENATE, + "text_as_html": cls.STRING_CONCATENATE, "table_as_cells": cls.FIRST, # -- only occurs in Table -- "url": cls.FIRST, "key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues -- From 03fe724b514c16d3534bed104c2dde3939f5e7bd Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 16:25:02 +0100 Subject: [PATCH 4/8] Fix naming --- .../chunking/test_vlm_output_chunking.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/test_unstructured/chunking/test_vlm_output_chunking.py b/test_unstructured/chunking/test_vlm_output_chunking.py index 99c5c4c53e..9869a68c32 100644 --- a/test_unstructured/chunking/test_vlm_output_chunking.py +++ b/test_unstructured/chunking/test_vlm_output_chunking.py @@ -13,19 +13,21 @@ def chunking_fn(request): def test_combining_html_metadata_when_multiple_elements_in_composite_element(chunking_fn): - meta_1 = '

Header

' - meta_2 = '' - meta_3 = ( + metadata_1 = '

Header

' + metadata_2 = '' + metadata_3 = ( '
' "
" ) - combined_metadata = meta_1 + meta_2 + meta_3 + combined_metadata = metadata_1 + metadata_2 + metadata_3 elements = [ - Title(text="Header", metadata=ElementMetadata(text_as_html=meta_1)), - Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=meta_2)), - Text(text="From field name Example value", metadata=ElementMetadata(text_as_html=meta_3)), + Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), + Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=metadata_2)), + Text( + text="From field name Example value", metadata=ElementMetadata(text_as_html=metadata_3) + ), ] chunks = chunking_fn(elements) assert len(chunks) == 1 @@ -50,29 +52,33 @@ def test_combining_html_metadata_with_nested_relationship_between_elements(chunk Paragraph """ - meta_1 = '
' - meta_2 = '

First

' - meta_3 = '

Second

' + metadata_1 = '
' + metadata_2 = '

First

' + metadata_3 = '

Second

' elements = [ - Text(text="", metadata=ElementMetadata(text_as_html=meta_1)), - NarrativeText(text="First", metadata=ElementMetadata(text_as_html=meta_2, parent_id="1")), - NarrativeText(text="Second", metadata=ElementMetadata(text_as_html=meta_3, parent_id="1")), + Text(text="", metadata=ElementMetadata(text_as_html=metadata_1)), + NarrativeText( + text="First", metadata=ElementMetadata(text_as_html=metadata_2, parent_id="1") + ), + NarrativeText( + text="Second", metadata=ElementMetadata(text_as_html=metadata_3, parent_id="1") + ), ] chunks = chunking_fn(elements, max_characters=6) assert len(chunks) == 2 assert chunks[0].text == "First" assert chunks[1].text == "Second" - assert chunks[0].metadata.text_as_html == meta_1 + meta_2 - assert chunks[1].metadata.text_as_html == meta_3 + assert chunks[0].metadata.text_as_html == metadata_1 + metadata_2 + assert chunks[1].metadata.text_as_html == metadata_3 def test_html_metadata_exist_in_both_element_when_text_is_split(chunking_fn): """Mimic behaviour of elements with non-html metadata""" - meta_1 = '

Header

' + metadata_1 = '

Header

' elements = [ - Title(text="Header", metadata=ElementMetadata(text_as_html=meta_1)), + Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), ] chunks = chunking_fn(elements, max_characters=3) assert len(chunks) == 2 From cd057ca042d2223a6a9932a7d60c3fc0dc5018ce Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 16:32:28 +0100 Subject: [PATCH 5/8] Fix formatting --- test_unstructured/chunking/test_vlm_output_chunking.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_unstructured/chunking/test_vlm_output_chunking.py b/test_unstructured/chunking/test_vlm_output_chunking.py index 9869a68c32..2c01d9361d 100644 --- a/test_unstructured/chunking/test_vlm_output_chunking.py +++ b/test_unstructured/chunking/test_vlm_output_chunking.py @@ -16,8 +16,9 @@ def test_combining_html_metadata_when_multiple_elements_in_composite_element(chu metadata_1 = '

Header

' metadata_2 = '' metadata_3 = ( - '
' + ' ' + '' + '' "
" ) combined_metadata = metadata_1 + metadata_2 + metadata_3 From 6332003f911104bd2679c997c3fb2dd1d24b32ba Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Mon, 11 Nov 2024 16:39:50 +0100 Subject: [PATCH 6/8] Update test --- .../{test_vlm_output_chunking.py => test_html_output.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename test_unstructured/chunking/{test_vlm_output_chunking.py => test_html_output.py} (96%) diff --git a/test_unstructured/chunking/test_vlm_output_chunking.py b/test_unstructured/chunking/test_html_output.py similarity index 96% rename from test_unstructured/chunking/test_vlm_output_chunking.py rename to test_unstructured/chunking/test_html_output.py index 2c01d9361d..26dacf3ef0 100644 --- a/test_unstructured/chunking/test_vlm_output_chunking.py +++ b/test_unstructured/chunking/test_html_output.py @@ -17,7 +17,7 @@ def test_combining_html_metadata_when_multiple_elements_in_composite_element(chu metadata_2 = '' metadata_3 = ( '
' - '' + '' '' "
" ) @@ -27,7 +27,7 @@ def test_combining_html_metadata_when_multiple_elements_in_composite_element(chu Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), Text(text="Date: October 30, 2023", metadata=ElementMetadata(text_as_html=metadata_2)), Text( - text="From field name Example value", metadata=ElementMetadata(text_as_html=metadata_3) + text="Form field name Example value", metadata=ElementMetadata(text_as_html=metadata_3) ), ] chunks = chunking_fn(elements) From 3ec8714b0a0317fb95f01dd40d5f4f770bf3218b Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 20 Nov 2024 13:46:18 +0100 Subject: [PATCH 7/8] Fixes after CR --- test_unstructured/chunking/test_html_output.py | 4 ++-- unstructured/chunking/base.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured/chunking/test_html_output.py b/test_unstructured/chunking/test_html_output.py index 26dacf3ef0..d1a174e2c3 100644 --- a/test_unstructured/chunking/test_html_output.py +++ b/test_unstructured/chunking/test_html_output.py @@ -21,7 +21,7 @@ def test_combining_html_metadata_when_multiple_elements_in_composite_element(chu '' "" ) - combined_metadata = metadata_1 + metadata_2 + metadata_3 + combined_metadata = " ".join([metadata_1, metadata_2, metadata_3]) elements = [ Title(text="Header", metadata=ElementMetadata(text_as_html=metadata_1)), @@ -71,7 +71,7 @@ def test_combining_html_metadata_with_nested_relationship_between_elements(chunk assert chunks[0].text == "First" assert chunks[1].text == "Second" - assert chunks[0].metadata.text_as_html == metadata_1 + metadata_2 + assert chunks[0].metadata.text_as_html == metadata_1 + " " + metadata_2 assert chunks[1].metadata.text_as_html == metadata_3 diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index e3d43adf83..b91c3982ea 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -775,7 +775,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]: 
ordered_unique_keys = {key: None for val_list in values for key in val_list} yield field_name, list(ordered_unique_keys.keys()) elif strategy is CS.STRING_CONCATENATE: - yield field_name, "".join(values) + yield field_name, " ".join(val.strip() for val in values) elif strategy is CS.DROP: continue else: # pragma: no cover From 59198731dc315211aa3c3b0ee3dd9c0a3a6237e6 Mon Sep 17 00:00:00 2001 From: Kamil Plucinski Date: Wed, 20 Nov 2024 13:47:09 +0100 Subject: [PATCH 8/8] Remove spaces from mimicking html --- test_unstructured/chunking/test_html_output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_unstructured/chunking/test_html_output.py b/test_unstructured/chunking/test_html_output.py index d1a174e2c3..6e3e92d946 100644 --- a/test_unstructured/chunking/test_html_output.py +++ b/test_unstructured/chunking/test_html_output.py @@ -41,8 +41,8 @@ def test_combining_html_metadata_with_nested_relationship_between_elements(chunk
-

First

-

Second

+

First

+

Second