Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix parse errors for huge and empty nodes #102

Merged
merged 4 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions enex2notion/enex_parser_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def iter_process_xml_elements(
recover=True,
strip_cdata=False,
resolve_entities=False,
huge_tree=True,
)

try:
Expand Down
3 changes: 3 additions & 0 deletions enex2notion/note_parser/note.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ def _parse_note_dom(note: EvernoteNote) -> Optional[Tag]:
logger.error(f"Failed to extract DOM from note '{note.title}'")
return None

if len(note_dom.contents) == 0:
return None

return _filter_yinxiang_markdown(note_dom)


Expand Down
67 changes: 67 additions & 0 deletions tests/test_enex_parser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import base64
import datetime
import logging
from pathlib import Path
Expand Down Expand Up @@ -547,6 +548,72 @@ def test_iter_notes_single_with_resource(fs):
assert notes[0].resource_by_md5("000") is None


def test_iter_notes_single_with_huge_resource(fs, caplog):
    """Check that a note with a 10 MB base64 resource is counted and parsed.

    The ENEX file is assembled on the fake filesystem in three parts: the XML
    head, the base64-encoded payload, and the XML tail. The test then asserts
    that no log output was captured, that exactly one note is counted, and
    that the parsed resource carries the original bytes.
    """
    test_enex_head = b"""<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
<en-export export-date="20211218T085932Z" application="Evernote" version="10.25.6">
<note>
<title>test1</title>
<created>20211118T085332Z</created>
<updated>20211118T085920Z</updated>
<note-attributes>
</note-attributes>
<content>test</content>
<resource>
<data encoding="base64">
"""
    test_enex_tail = b"""
</data>
<mime>image/gif</mime>
<resource-attributes>
<file-name>smallest.gif</file-name>
</resource-attributes>
</resource>
</note>
</en-export>
"""
    # Only the head is written here; the payload and the tail are appended
    # below. The returned fake-file object is not needed afterwards.
    fs.create_file("test.enex", contents=test_enex_head)

    # 10 MB
    big_binary = b"\x00" * 10 * 1024 * 1024
    # md5 value the parsed resource is expected to report for big_binary.
    big_binary_hash = "f1c9645dbc14efddc7d8a322685f26eb"

    with Path("test.enex").open("ab+") as f:
        f.write(base64.b64encode(big_binary))
        f.write(test_enex_tail)

    with caplog.at_level(logging.WARNING, logger="enex2notion"):
        notes_count = count_notes(Path("test.enex"))

    notes = list(iter_notes(Path("test.enex")))

    expected_resource = EvernoteResource(
        data_bin=big_binary,
        size=len(big_binary),
        md5=big_binary_hash,
        mime="image/gif",
        file_name="smallest.gif",
    )

    # Any captured log text would mean the parser complained about the input.
    assert caplog.text == ""
    assert notes_count == 1
    assert notes == [
        EvernoteNote(
            title="test1",
            created=datetime.datetime(2021, 11, 18, 8, 53, 32, tzinfo=tzutc()),
            updated=datetime.datetime(2021, 11, 18, 8, 59, 20, tzinfo=tzutc()),
            content="test",
            tags=[],
            author="",
            url="",
            is_webclip=False,
            resources=[expected_resource],
        ),
    ]
    assert notes[0].resource_by_md5(big_binary_hash) == expected_resource
    assert notes[0].resource_by_md5("000") is None


def test_iter_notes_single_with_noext_resource(fs):
test_enex = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE en-export SYSTEM "http://xml.evernote.com/pub/evernote-export4.dtd">
Expand Down
16 changes: 16 additions & 0 deletions tests/test_note_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,22 @@ def test_linebreaks_inside_root(parse_html):
]


def test_empty_note(parse_rules):
    """A note whose `<en-note>` root has no children parses to an empty list."""
    # created and updated use the same instant, so build it once.
    timestamp = datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc())

    empty_note = EvernoteNote(
        title="test1",
        created=timestamp,
        updated=timestamp,
        content="<en-note></en-note>",
        tags=[],
        author="",
        url="",
        is_webclip=False,
        resources=[],
    )

    parsed = parse_note(empty_note, parse_rules)

    assert parsed == []


def test_yinxiang_markdown(parse_rules):
test_note = EvernoteNote(
title="test1",
Expand Down