From de31df51a92cd7cbf687e684218c9a754f42c635 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 21 Oct 2022 14:43:10 -0400 Subject: [PATCH] feat: Adds a helper function to convert ISD dicts to elements (#39) * updated category name for ListItem * added brick to convert isd to elements * bump version * added isd_to_elements to documentation --- CHANGELOG.md | 3 ++- docs/source/bricks.rst | 21 +++++++++++++++++++ .../staging/test_base_staging.py | 19 ++++++++++++++++- unstructured/__version__.py | 2 +- unstructured/documents/elements.py | 6 ++++-- unstructured/staging/base.py | 19 ++++++++++++++++- 6 files changed, 64 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c602828e..de0ced263a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.2.1-dev9 +## 0.2.1 +* Added brick to convert an ISD dictionary to a list of elements * Update `PDFDocument` to use the `from_file` method * Added staging brick for CSV format for ISD (Initial Structured Data) format. * Added staging brick for separating text into attention window size chunks for `transformers`. diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index a1b38cf97a..3caa193e9e 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -338,6 +338,27 @@ Examples: isd = convert_to_isd(elements) +``isd_to_elements`` +------------------- + +Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements. + +Examples: + +.. 
code:: python + + from unstructured.staging.base import isd_to_elements + + isd = [ + {"text": "My Title", "type": "Title"}, + {"text": "My Narrative", "type": "NarrativeText"} + ] + + # elements will look like: + # [ Title(text="My Title"), NarrativeText(text="My Narrative")] + elements = isd_to_elements(isd) + + ``convert_to_isd_csv`` ---------------------- diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py index f84cf4e859..0802c00fa8 100644 --- a/test_unstructured/staging/test_base_staging.py +++ b/test_unstructured/staging/test_base_staging.py @@ -4,7 +4,7 @@ import unstructured.staging.base as base -from unstructured.documents.elements import Title, NarrativeText +from unstructured.documents.elements import Title, NarrativeText, ListItem @pytest.fixture @@ -23,6 +23,23 @@ def test_convert_to_isd(): assert isd[1]["type"] == "NarrativeText" +def test_isd_to_elements(): + isd = [ + {"text": "Blurb1", "type": "NarrativeText"}, + {"text": "Blurb2", "type": "Title"}, + {"text": "Blurb3", "type": "ListItem"}, + {"text": "Blurb4", "type": "BulletedText"}, + ] + + elements = base.isd_to_elements(isd) + assert elements == [ + NarrativeText(text="Blurb1"), + Title(text="Blurb2"), + ListItem(text="Blurb3"), + ListItem(text="Blurb4"), + ] + + def test_convert_to_isd_csv(output_csv_file): elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a31549ba7e..9aa9703881 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.2.1-dev9" # pragma: no cover +__version__ = "0.2.1" # pragma: no cover diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 379d3f703e..4afdf693f4 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -47,9 +47,11 @@ class NarrativeText(Text): class ListItem(Text): - """BulletedText is a 
NarrativeText element that is part of a bulleted list.""" + """ListItem is a NarrativeText element that is part of a list.""" - category = "BulletedText" + category = "ListItem" + + pass class Title(Text): diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 408472eee6..a3d453665c 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -2,7 +2,7 @@ import csv from typing import Dict, List -from unstructured.documents.elements import Text +from unstructured.documents.elements import Text, NarrativeText, Title, ListItem def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]: @@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]: return isd +def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]: + """Converts Initial Structured Data (ISD), a list of dicts, to a list of Text elements.""" + elements: List[Text] = list() + + for item in isd: + if item["type"] == "NarrativeText": + elements.append(NarrativeText(text=item["text"])) + elif item["type"] == "Title": + elements.append(Title(text=item["text"])) + # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used + # to be called BulletedText in an earlier version + elif item["type"] in ["ListItem", "BulletedText"]: + elements.append(ListItem(text=item["text"])) + + return elements + + def convert_to_isd_csv(elements: List[Text]) -> str: """ Returns the representation of document elements as an Initial Structured Document (ISD)