From 84542988f1d3188d7c0c027e039f7bb303ee2ec0 Mon Sep 17 00:00:00 2001 From: Diego Zamboni Date: Fri, 27 Oct 2023 12:02:32 +0200 Subject: [PATCH 1/4] Fix parse errors for huge and empty nodes - Enabled the "huge_tree" option in the XML parser to prevent the "xmlSAX2Characters: huge text node" error. - Fixed a "list index out of range" error that happened on some notes with title but no content. Fixes #101. --- enex2notion/enex_parser_xml.py | 1 + enex2notion/note_parser/note.py | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/enex2notion/enex_parser_xml.py b/enex2notion/enex_parser_xml.py index 9d1ec1a..aa69cff 100644 --- a/enex2notion/enex_parser_xml.py +++ b/enex2notion/enex_parser_xml.py @@ -27,6 +27,7 @@ def iter_process_xml_elements( recover=True, strip_cdata=False, resolve_entities=False, + huge_tree=True, ) try: diff --git a/enex2notion/note_parser/note.py b/enex2notion/note_parser/note.py index 6eac9df..8d09090 100644 --- a/enex2notion/note_parser/note.py +++ b/enex2notion/note_parser/note.py @@ -48,13 +48,14 @@ def _parse_note_dom(note: EvernoteNote) -> Optional[Tag]: def _filter_yinxiang_markdown(note_dom: Tag) -> Tag: - last_block = note_dom.contents[-1] + if len(note_dom.contents) > 0: + last_block = note_dom.contents[-1] - if not isinstance(last_block, Tag): - return note_dom + if not isinstance(last_block, Tag): + return note_dom - if "display:none" in last_block.attrs.get("style", ""): - last_block.extract() + if "display:none" in last_block.attrs.get("style", ""): + last_block.extract() return note_dom From 9da78e86c540bf6cbf387f86423067b0ee953224 Mon Sep 17 00:00:00 2001 From: vzhd1701 Date: Fri, 27 Oct 2023 16:49:45 +0500 Subject: [PATCH 2/4] refactor: make empty dom check explicit --- enex2notion/note_parser/note.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/enex2notion/note_parser/note.py b/enex2notion/note_parser/note.py index 8d09090..7fe198a 100644 --- a/enex2notion/note_parser/note.py +++ b/enex2notion/note_parser/note.py @@ -44,18 +44,20 @@ def _parse_note_dom(note: EvernoteNote) -> Optional[Tag]: logger.error(f"Failed to extract DOM from note '{note.title}'") return None + if len(note_dom.contents) == 0: + return None + return _filter_yinxiang_markdown(note_dom) def _filter_yinxiang_markdown(note_dom: Tag) -> Tag: - if len(note_dom.contents) > 0: - last_block = note_dom.contents[-1] + last_block = note_dom.contents[-1] - if not isinstance(last_block, Tag): - return note_dom + if not isinstance(last_block, Tag): + return note_dom - if "display:none" in last_block.attrs.get("style", ""): - last_block.extract() + if "display:none" in last_block.attrs.get("style", ""): + last_block.extract() return note_dom From 7a6fbe4ed3188ea0eb8d19906eb148262d591e42 Mon Sep 17 00:00:00 2001 From: vzhd1701 Date: Fri, 27 Oct 2023 16:52:31 +0500 Subject: [PATCH 3/4] test: add big resource note test --- tests/test_enex_parser.py | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/test_enex_parser.py b/tests/test_enex_parser.py index beada82..7abcedc 100644 --- a/tests/test_enex_parser.py +++ b/tests/test_enex_parser.py @@ -1,3 +1,4 @@ +import base64 import datetime import logging from pathlib import Path @@ -547,6 +548,72 @@ def test_iter_notes_single_with_resource(fs): assert notes[0].resource_by_md5("000") is None +def test_iter_notes_single_with_huge_resource(fs, caplog): + test_enex_head = b""" + + + + test1 + 20211118T085332Z + 20211118T085920Z + + + test + + + """ + test_enex_tail = b""" + + image/gif + + smallest.gif + + + + + """ + test_enex_file = fs.create_file("test.enex", contents=test_enex_head) + + # 10 MB + big_binary = b"\x00" * 10 * 1024 * 1024 + big_binary_hash = "f1c9645dbc14efddc7d8a322685f26eb" + + with Path("test.enex").open("ab+") as f: + f.write(base64.b64encode(big_binary)) + f.write(test_enex_tail) + + with caplog.at_level(logging.WARNING, logger="enex2notion"): + notes_count = count_notes(Path("test.enex")) + + notes = list(iter_notes(Path("test.enex"))) + + expected_resource = EvernoteResource( + data_bin=big_binary, + size=len(big_binary), + md5=big_binary_hash, + mime="image/gif", + file_name="smallest.gif", + ) + + assert caplog.text == "" + assert notes_count == 1 + assert notes == [ + EvernoteNote( + title="test1", + created=datetime.datetime(2021, 11, 18, 8, 53, 32, tzinfo=tzutc()), + updated=datetime.datetime(2021, 11, 18, 8, 59, 20, tzinfo=tzutc()), + content="test", + tags=[], + author="", + url="", + is_webclip=False, + resources=[expected_resource], + ), + ] + assert notes[0].resource_by_md5(big_binary_hash) == expected_resource + assert notes[0].resource_by_md5("000") is None + + def test_iter_notes_single_with_noext_resource(fs): test_enex = """ From 3a8fd42dfc4e38ceb87835c829a9bfcf408559dd Mon Sep 17 00:00:00 2001 From: vzhd1701 Date: Fri, 27 Oct 2023 16:58:02 +0500 Subject: [PATCH 4/4] test: add empty note dom test --- tests/test_note_parser.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_note_parser.py b/tests/test_note_parser.py index ef7021d..c3b9d75 100644 --- a/tests/test_note_parser.py +++ b/tests/test_note_parser.py @@ -654,6 +654,22 @@ def test_linebreaks_inside_root(parse_html): ] +def test_empty_note(parse_rules): + test_note = EvernoteNote( + title="test1", + created=datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc()), + updated=datetime(2021, 11, 18, 0, 0, 0, tzinfo=tzutc()), + content="", + tags=[], + author="", + url="", + is_webclip=False, + resources=[], + ) + + assert parse_note(test_note, parse_rules) == [] + + def test_yinxiang_markdown(parse_rules): test_note = EvernoteNote( title="test1",