Skip to content

Commit

Permalink
feat: Adds a helper function to convert ISD dicts to elements (#39)
Browse files Browse the repository at this point in the history
* updated category name for ListItem

* added brick to convert isd to elements

* bump version

* added isd_to_elements to documentation
  • Loading branch information
MthwRobinson authored Oct 21, 2022
1 parent 2871941 commit de31df5
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
## 0.2.1-dev9
## 0.2.1

* Added brick to convert an ISD dictionary to a list of elements
* Update `PDFDocument` to use the `from_file` method
* Added staging brick for CSV output of the ISD (Initial Structured Data) format.
* Added staging brick for separating text into attention window size chunks for `transformers`.
Expand Down
21 changes: 21 additions & 0 deletions docs/source/bricks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,27 @@ Examples:
isd = convert_to_isd(elements)
``isd_to_elements``
-------------------

Converts outputs from initial structured data (ISD) format back to a list of ``Text`` elements.

Examples:

.. code:: python

  from unstructured.staging.base import isd_to_elements

  isd = [
    {"text": "My Title", "type": "Title"},
    {"text": "My Narrative", "type": "NarrativeText"}
  ]

  # elements will look like:
  # [ Title(text="My Title"), NarrativeText(text="My Narrative")]
  elements = isd_to_elements(isd)

``convert_to_isd_csv``
----------------------

Expand Down
19 changes: 18 additions & 1 deletion test_unstructured/staging/test_base_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import unstructured.staging.base as base

from unstructured.documents.elements import Title, NarrativeText
from unstructured.documents.elements import Title, NarrativeText, ListItem


@pytest.fixture
Expand All @@ -23,6 +23,23 @@ def test_convert_to_isd():
assert isd[1]["type"] == "NarrativeText"


def test_isd_to_elements():
    """Check that each ISD type is rebuilt as the matching element class.

    The "BulletedText" entry exercises the legacy type name, which is
    expected to come back as a ListItem.
    """
    text_and_type = [
        ("Blurb1", "NarrativeText"),
        ("Blurb2", "Title"),
        ("Blurb3", "ListItem"),
        ("Blurb4", "BulletedText"),
    ]
    isd = [{"text": text, "type": kind} for text, kind in text_and_type]

    expected = [
        NarrativeText(text="Blurb1"),
        Title(text="Blurb2"),
        ListItem(text="Blurb3"),
        ListItem(text="Blurb4"),
    ]

    assert base.isd_to_elements(isd) == expected


def test_convert_to_isd_csv(output_csv_file):

elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.1-dev9" # pragma: no cover
__version__ = "0.2.1" # pragma: no cover
6 changes: 4 additions & 2 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,11 @@ class NarrativeText(Text):


class ListItem(Text):
    """ListItem is a NarrativeText element that is part of a list."""

    # Category label for this element type; it was "BulletedText" before the rename.
    category = "ListItem"


class Title(Text):
Expand Down
19 changes: 18 additions & 1 deletion unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import csv
from typing import Dict, List

from unstructured.documents.elements import Text
from unstructured.documents.elements import Text, NarrativeText, Title, ListItem


def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
Expand All @@ -14,6 +14,23 @@ def convert_to_isd(elements: List[Text]) -> List[Dict[str, str]]:
return isd


def isd_to_elements(isd: List[Dict[str, str]]) -> List[Text]:
    """Rebuild Text elements from a list of Initial Structured Data (ISD) entries.

    Each entry's "type" selects the element class and its "text" becomes the
    element's text. Entries whose type is not recognized are skipped.
    """
    # NOTE(robinson) - "BulletedText" is in there for backward compatibility. ListItem used
    # to be called BulletedText in an earlier version
    element_classes = {
        "NarrativeText": NarrativeText,
        "Title": Title,
        "ListItem": ListItem,
        "BulletedText": ListItem,
    }

    elements: List[Text] = []
    for entry in isd:
        element_class = element_classes.get(entry["type"])
        if element_class is not None:
            elements.append(element_class(text=entry["text"]))

    return elements


def convert_to_isd_csv(elements: List[Text]) -> str:
"""
Returns the representation of document elements as an Initial Structured Document (ISD)
Expand Down

0 comments on commit de31df5

Please sign in to comment.