From 9ece0b5ad2cc2ecbb24e57644c06c0b735719776 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Tue, 17 Dec 2024 17:16:42 -0800 Subject: [PATCH] fix: improve false-positive Title elements on Chinese text (#3836) **Summary** Improve element-type mapping for Chinese text. Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. Fixes #3084 --------- Co-authored-by: scanny Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> --- CHANGELOG.md | 3 +- .../metrics/test_element_type.py | 10 +- test_unstructured/partition/test_auto.py | 2 +- test_unstructured/partition/test_doc.py | 2 +- test_unstructured/partition/test_docx.py | 16 +- test_unstructured/partition/test_odt.py | 5 +- .../box/handbook-1p.docx.json | 12 +- .../dropbox/handbook-1p.docx.json | 12 +- .../google-drive/fake.docx.json | 36 +- .../google-drive/nested/fake.docx.json | 16 +- .../recalibrating-risk-report.pdf.json | 1494 ++++++++--------- .../google-drive/test-drive-doc.docx.json | 70 +- .../handbook-1p.docx.json | 6 +- unstructured/__version__.py | 2 +- unstructured/partition/docx.py | 31 +- 15 files changed, 856 insertions(+), 861 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d13d859802..aa832741fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev3 +## 0.16.12-dev4 ### Enhancements @@ -10,6 +10,7 @@ - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. +- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. ## 0.16.11 diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index d1faba6a20..9a44a08f05 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in "handbook-1p.docx", { ("Header", None): 1, - ("Title", 0): 1, - ("Title", 1): 1, - ("Title", 2): 1, + ("UncategorizedText", 0): 6, ("ListItem", 3): 3, - ("NarrativeText", 4): 7, + ("NarrativeText", 0): 7, ("Footer", None): 1, }, - (0.43, 0.07, 0.65), + (0.78, 0.72, 0.81), ), ( "handbook-1p.docx", { ("Header", None): 1, - ("Title", 0): 6, + ("UncategorizedText", 0): 6, ("NarrativeText", 0): 7, ("PageBreak", None): 1, ("Footer", None): 1, diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 9443058176..74187aa3b0 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1286,7 +1286,7 @@ def expected_docx_elements(): Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 7c8c4d3ef5..e2698a3f71 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 1330b4a79a..34a27cfde3 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), @@ -1210,7 +1210,7 @@ def str_repr(e: Element) -> str: opts_args["file_path"] = example_doc_path("page-breaks.docx") opts = DocxPartitionerOptions(**opts_args) expected = [ - # NOTE(scanny) - -- page 1 -- + # -- page 1 -- NarrativeText( "First page, tab here:\t" "followed by line-break here:\n" @@ -1220,28 +1220,28 @@ def str_repr(e: Element) -> str: "and hard page-break here>>" ), PageBreak(""), - # NOTE(scanny) - -- page 2 -- + # -- page 2 -- NarrativeText( "<> <>"), NarrativeText("<>"), PageBreak(""), - # NOTE(scanny) - -- page 4 -- + # -- page 4 -- PageBreak(""), - # NOTE(scanny) - -- page 5 -- + # -- page 5 -- NarrativeText("<> ' ), PageBreak(""), - # NOTE(scanny) - -- page 6 -- - Title("< Iterator[Eleme ) ) - # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and - # do not contribute to the document-element stream. + # -- blank paragraphs are commonly used for spacing between paragraphs and do not + # -- contribute to the document-element stream if not text.strip(): return metadata = self._paragraph_metadata(paragraph) - # NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a - # bullet-character if present. + # -- a list-item gets some special treatment, mutating the text to remove a + # -- bullet-character if present if self._is_list_item(paragraph): clean_text = clean_bullets(text).strip() if clean_text: @@ -431,19 +430,19 @@ def _classify_paragraph_to_element(self, paragraph: Paragraph) -> Iterator[Eleme ) return - # NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible + # -- determine element-type from an explicit Word paragraph-style if possible -- TextSubCls = self._style_based_element_type(paragraph) if TextSubCls: yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) return - # NOTE(scanny) - try to recognize the element type by parsing its text + # -- try to recognize the element type by parsing its text -- TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) if TextSubCls: yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) return - # NOTE(scanny) - if all that fails we give it the default `Text` element-type + # -- if all that fails we give it the default `Text` element-type -- yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) def _convert_table_to_html(self, table: DocxTable) -> str: @@ -576,20 +575,20 @@ def iter_paragraph_items(paragraph: Paragraph) -> Iterator[Paragraph | RenderedP page_break = paragraph.rendered_page_breaks[0] - # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break + # -- preceding-fragment is None when first paragraph content is a page-break -- preceding_paragraph_fragment = page_break.preceding_paragraph_fragment if preceding_paragraph_fragment: yield preceding_paragraph_fragment yield page_break - # NOTE(scanny) - following-fragment is None when page-break is last paragraph content. - # This is probably quite rare (Word moves these to the start of the next paragraph) but - # easier to check for it than prove it can't happen. + # -- following-fragment is None when page-break is last paragraph content. This is + # -- probably quite rare (Word moves these to the start of the next paragraph) but + # -- easier to check for it than prove it can't happen. following_paragraph_fragment = page_break.following_paragraph_fragment - # NOTE(scanny) - the paragraph fragment following a page-break can itself contain - # another page-break. This would also be quite rare, but it can happen so we just - # recurse into the second fragment the same way we handled the original paragraph. + # -- the paragraph fragment following a page-break can itself contain another + # -- page-break; this would also be quite rare, but it can happen so we just recurse + # -- into the second fragment the same way we handled the original paragraph if following_paragraph_fragment: yield from iter_paragraph_items(following_paragraph_fragment) @@ -901,8 +900,6 @@ def _parse_paragraph_text_for_element_type(self, paragraph: Paragraph) -> Type[T return EmailAddress if is_possible_narrative_text(text): return NarrativeText - if is_possible_title(text): - return Title return None