feat: Adds a helper function to convert ISD dicts to elements (#39)

* Updated category name for ListItem * Added brick to convert ISD to elements * Bumped version * Added isd_to_elements to documentation
Unstructured-IO · Oct 21, 2022 · de31df5
1 parent 2871941
commit de31df5
Show file tree

Hide file tree

Showing 6 changed files with 64 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,6 @@
-## 0.2.1-dev9
+## 0.2.1
 
+* Added brick to convert an ISD dictionary to a list of elements
 * Update `PDFDocument` to use the `from_file` method
 * Added staging brick for CSV format for ISD (Initial Structured Data) format.
 * Added staging brick for separating text into attention window size chunks for `transformers`.

diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -338,6 +338,27 @@ Examples:
   isd = convert_to_isd(elements)
 
 
+``isd_to_elements``
+-------------------
+
+Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.staging.base import isd_to_elements
+
+  isd = [
+    {"text": "My Title", "type": "Title"},
+    {"text": "My Narrative", "type": "NarrativeText"}
+  ]
+
+  # elements will look like:
+  # [Title(text="My Title"), NarrativeText(text="My Narrative")]
+  elements = isd_to_elements(isd)
+
+
 ``convert_to_isd_csv``
 ----------------------
 

diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py
@@ -4,7 +4,7 @@
 
 import unstructured.staging.base as base
 
-from unstructured.documents.elements import Title, NarrativeText
+from unstructured.documents.elements import Title, NarrativeText, ListItem
 
 
 @pytest.fixture
@@ -23,6 +23,23 @@ def test_convert_to_isd():
     assert isd[1]["type"] == "NarrativeText"
 
 
+def test_isd_to_elements():
+    """Each supported ISD type, including legacy "BulletedText", yields its element."""
+    isd_entries = [
+        {"text": "Blurb1", "type": "NarrativeText"},
+        {"text": "Blurb2", "type": "Title"},
+        {"text": "Blurb3", "type": "ListItem"},
+        {"text": "Blurb4", "type": "BulletedText"},
+    ]
+    expected = [
+        NarrativeText(text="Blurb1"),
+        Title(text="Blurb2"),
+        ListItem(text="Blurb3"),
+        ListItem(text="Blurb4"),
+    ]
+    assert base.isd_to_elements(isd_entries) == expected
+
+
 def test_convert_to_isd_csv(output_csv_file):
 
     elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.1-dev9"  # pragma: no cover
+__version__ = "0.2.1"  # pragma: no cover
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -47,9 +47,11 @@ class NarrativeText(Text):
 
 
 class ListItem(Text):
-    """BulletedText is a NarrativeText element that is part of a bulleted list."""
+    """ListItem is a Text element that is part of a list (formerly named BulletedText)."""
 
-    category = "BulletedText"
+    category = "ListItem"
+
+    pass
 
 
 class Title(Text):

diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -2,7 +2,7 @@
 import csv
 from typing import Dict, List
 
-from unstructured.documents.elements import Text
+from unstructured.documents.elements import Text, NarrativeText, Title, ListItem
 
 
 def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
@@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
     return isd
 
 
+def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
+    """Rebuilds Text elements from Initial Structured Data (ISD); unknown types are skipped."""
+    # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
+    # to be called BulletedText in an earlier version
+    element_classes = {
+        "NarrativeText": NarrativeText,
+        "Title": Title,
+        "ListItem": ListItem,
+        "BulletedText": ListItem,
+    }
+    converted: List[Text] = []
+    for entry in isd:
+        if entry["type"] in element_classes:
+            converted.append(element_classes[entry["type"]](text=entry["text"]))
+    return converted
+
+
 def convert_to_isd_csv(elements: List[Text]) -> str:
     """
     Returns the representation of document elements as an Initial Structured Document (ISD)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
1		-__version__ = "0.2.1-dev9" # pragma: no cover
	1	+__version__ = "0.2.1" # pragma: no cover