From b0fabf61eb2640d8c42a580afdb980b43f46917c Mon Sep 17 00:00:00 2001 From: Dan Koller <57103678+dan-koller@users.noreply.github.com> Date: Mon, 19 Aug 2024 10:50:17 +0200 Subject: [PATCH] feat: Added methods to filter out redundant messages --- mapy/utils.py | 81 ++++++++++++++++++++++++-------------- tests/test_utils.py | 95 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 30 deletions(-) diff --git a/mapy/utils.py b/mapy/utils.py index 1da8055..0cddc14 100644 --- a/mapy/utils.py +++ b/mapy/utils.py @@ -423,67 +423,68 @@ def extract_message_data(mail_data: str) -> tuple: attachments.append(attachment_info) elif content_type in ['text/plain', 'text/html']: - message_info = process_message_part(part, email_date) + message_info = process_message_part(part, email_date, content_type) if message_info: messages.append(message_info) else: - message_info = process_message_part(msg, email_date) + content_type = msg.get_content_type() + message_info = process_message_part(msg, email_date, content_type) if message_info: messages.append(message_info) - return messages, attachments - - -def process_attachment(part: Message) -> Optional[dict]: - """ - Process an email part as an attachment and return attachment information. 
- - :param part: The email part representing the attachment - - :return: A dictionary containing the attachment's filename, base64 encoded data, and length - """ - filename = part.get_filename() - if not filename: - return None - - attachment_data = part.get_payload(decode=True) + messages = filter_duplicate_messages(messages) - encoded_data = base64.b64encode(attachment_data).decode('utf-8') - return { - 'filename': filename, - 'data': encoded_data, - 'length': len(attachment_data) - } + return messages, attachments -def process_message_part(part: Message, email_date: str) -> Optional[dict]: +def process_message_part(part: Message, email_date: str, content_type: str) -> Optional[dict]: """ Process an email part and return the message content. :param part: The email part to process :param email_date: The date of the email + :param content_type: The content type of the email part (e.g., 'text/plain', 'text/html') - :return: A dictionary containing the date and clean message content + :return: A dictionary containing the date, message content, and content type """ payload = part.get_payload(decode=True) charset = part.get_content_charset() or 'utf-8' try: decoded_payload = payload.decode(charset, errors='replace') - clean_text = extract_text_from_html(decoded_payload) + clean_text = extract_text_from_html(decoded_payload) if content_type == 'text/html' else decoded_payload return { 'date': email_date, - 'content': clean_text + 'content': clean_text.strip(), + 'content_type': content_type } except Exception as e: return { 'date': email_date, - 'content': f"Error decoding message: {e}" + 'content': f"Error decoding message: {e}", + 'content_type': content_type } +def filter_duplicate_messages(messages: list) -> list: + """ + Filter out messages that share a date, preferring the text/plain version when one exists. 
+ + :param messages: A list of message dictionaries + + :return: A filtered list of message dictionaries + """ + filtered_messages = {} + for message in messages: + date = message['date'] + if date not in filtered_messages or message['content_type'] == 'text/plain': + filtered_messages[date] = message + + return list(filtered_messages.values()) + + def extract_text_from_html(html_content: str) -> str: """ Extract and clean text from HTML content. @@ -494,3 +495,25 @@ def extract_text_from_html(html_content: str) -> str: """ soup = BeautifulSoup(html_content, 'html.parser') return soup.get_text() + + +def process_attachment(part: Message) -> Optional[dict]: + """ + Process an email part as an attachment and return attachment information. + + :param part: The email part representing the attachment + + :return: A dictionary containing the attachment's filename, base64 encoded data, and length + """ + filename = part.get_filename() + if not filename: + return None + + attachment_data = part.get_payload(decode=True) + + encoded_data = base64.b64encode(attachment_data).decode('utf-8') + return { + 'filename': filename, + 'data': encoded_data, + 'length': len(attachment_data) + } diff --git a/tests/test_utils.py b/tests/test_utils.py index 5f113df..9d0c71a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -274,7 +274,8 @@ def test_process_attachment(): def test_process_message_part(): email_date = "Fri, 23 Jul 2024 10:21:35 -0700" - message_info = process_message_part(message_part, email_date) + content_type = 'text/plain' + message_info = process_message_part(message_part, email_date, content_type) assert message_info['date'] == email_date assert message_info['content'] == 'This is a test message.' 
@@ -282,3 +283,95 @@ def test_process_message_part(): def test_extract_text_from_html(): extracted_text = extract_text_from_html(html_content) assert extracted_text == '\n\nTest Email\n\nThis is a test email.\n\n\n' + + +def test_filter_duplicate_messages(): + messages = [ + { + 'date': "2024-08-17T10:21:35Z", + 'content': 'This is a plain text message.', + 'content_type': 'text/plain' + }, + { + 'date': "2024-08-17T10:21:35Z", + 'content': '
This is a formatted HTML message.
', + 'content_type': 'text/html' + }, + { + 'date': "2024-08-17T11:00:00Z", + 'content': 'Another plain text message.', + 'content_type': 'text/plain' + }, + { + 'date': "2024-08-17T12:00:00Z", + 'content': 'Another HTML message.
', + 'content_type': 'text/html' + } + ] + + filtered_messages = filter_duplicate_messages(messages) + + assert len(filtered_messages) == 3 + + # Check that the first message is the text version for the duplicate date + assert filtered_messages[0]['content'] == 'This is a plain text message.' + assert filtered_messages[0]['content_type'] == 'text/plain' + + # Check that the second message is the unique plain text message + assert filtered_messages[1]['content'] == 'Another plain text message.' + assert filtered_messages[1]['content_type'] == 'text/plain' + + # Check that the third message is the HTML message for the unique date + assert filtered_messages[2]['content'] == 'Another HTML message.
' + assert filtered_messages[2]['content_type'] == 'text/html' + + +def test_filter_duplicate_messages_no_duplicates(): + messages = [ + { + 'date': "2024-08-17T10:21:35Z", + 'content': 'This is a plain text message.', + 'content_type': 'text/plain' + }, + { + 'date': "2024-08-17T11:00:00Z", + 'content': 'Another plain text message.', + 'content_type': 'text/plain' + } + ] + + filtered_messages = filter_duplicate_messages(messages) + + assert len(filtered_messages) == 2 + + # Check that the first message is kept + assert filtered_messages[0]['content'] == 'This is a plain text message.' + assert filtered_messages[0]['content_type'] == 'text/plain' + + # Check that the second message is kept + assert filtered_messages[1]['content'] == 'Another plain text message.' + assert filtered_messages[1]['content_type'] == 'text/plain' + + +def test_filter_duplicate_messages_text_preferred(): + messages = [ + { + 'date': "2024-08-17T10:21:35Z", + 'content': 'This is a cleaned HTML message.
', + 'content_type': 'text/html' + }, + { + 'date': "2024-08-17T10:21:35Z", + 'content': 'This is a plain text message.', + 'content_type': 'text/plain' + } + ] + + filtered_messages = filter_duplicate_messages(messages) + + assert len(filtered_messages) == 1 + + # When handling duplicates, the plain text message should be preferred + # as it gets formatted better than the HTML message + assert filtered_messages[0]['content'] == 'This is a plain text message.' + assert filtered_messages[0]['content_type'] == 'text/plain'