Skip to content

Commit

Permalink
fix: resolves UnicodeDecodeError in partition_email for emails wi…
Browse files Browse the repository at this point in the history
…th attachments (#158)

* split emails by \n=

* added test for equivalence between html and plain text

* changelog and bump version

* add check for content disposition
  • Loading branch information
MthwRobinson authored Jan 17, 2023
1 parent 7ed5f71 commit 9c3c14e
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 14 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## 0.4.2-dev0
## 0.4.2

* Added `partition_image` to process documents in an image format.
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`


## 0.4.1
Expand Down
12 changes: 12 additions & 0 deletions test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,18 @@ def test_partition_email_header():
assert elements == HEADER_EXPECTED_OUTPUT


def test_extract_email_text_matches_html():
    """Partitioning the text/plain and text/html parts of the same email
    should produce the same number of elements, and each pair of elements
    should compare equal."""
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
    elements_from_text = partition_email(filename=filename, content_source="text/plain")
    elements_from_html = partition_email(filename=filename, content_source="text/html")

    assert len(elements_from_text) == len(elements_from_html)
    # NOTE(robinson) - checking each individually is necessary because the text/html returns
    # HTMLTitle, HTMLNarrativeText, etc
    # Compare each plain-text element against its HTML counterpart; comparing
    # the plain-text list with itself would pass regardless of the HTML output.
    for text_element, html_element in zip(elements_from_text, elements_from_html):
        assert text_element == html_element


def test_extract_attachment_info():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml")
with open(filename, "r") as f:
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.2-dev0" # pragma: no cover
__version__ = "0.4.2" # pragma: no cover
29 changes: 17 additions & 12 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,30 +181,35 @@ def partition_email(
else:
raise ValueError("Only one of filename, file, or text can be specified.")

content_map: Dict[str, str] = {
part.get_content_type(): part.get_payload() for part in msg.walk()
}
content_map: Dict[str, str] = {}
for part in msg.walk():
# NOTE(robinson) - content disposition is None for the content of the email itself.
# Other dispositions include "attachment" for attachments
if part.get_content_disposition() is not None:
continue
content_type = part.get_content_type()
content_map[content_type] = part.get_payload()

content = content_map.get(content_source, "")
if not content:
raise ValueError(f"{content_source} content not found in email")

# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
# looks like the following, resulting in extraneous "=" characters in the output if
# you don't clean it up
# <ul> =
# <li>Item 1</li>=
# <li>Item 2<li>=
# </ul>
list_content = split_by_paragraph(content)

if content_source == "text/html":
# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
# looks like the following, resulting in extraneous "=" characters in the output if
# you don't clean it up
# <ul> =
# <li>Item 1</li>=
# <li>Item 2<li>=
# </ul>
list_content = content.split("=\n")
content = "".join(list_content)
elements = partition_html(text=content)
for element in elements:
if isinstance(element, Text):
element.apply(replace_mime_encodings)
elif content_source == "text/plain":
list_content = split_by_paragraph(content)
elements = partition_text(text=content)

for idx, element in enumerate(elements):
Expand Down

0 comments on commit 9c3c14e

Please sign in to comment.