-
Notifications
You must be signed in to change notification settings - Fork 6
/
eml_extractor.py
113 lines (99 loc) · 3.7 KB
/
eml_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
from argparse import ArgumentParser, ArgumentTypeError
from email import message_from_file, policy
from pathlib import Path
from typing import List
def extract_attachments(file: Path, destination: Path) -> None:
    """Extract every non-inline attachment of an EML file into a folder.

    Attachments are written to ``destination/<sanitized subject>``. When the
    message has no usable Subject header, the EML file's stem is used as the
    folder name instead. If a target file already exists, the user is asked
    on stdin whether to overwrite it; anything other than ``y``/``Y`` skips
    that attachment.

    Args:
        file: Path of the ``.eml`` file to read.
        destination: Directory under which the per-message folder is created.
    """
    # Function-scope import: the module-level import list only provides
    # ``message_from_file``.
    from email import message_from_binary_file

    print(f'PROCESSING FILE "{file}"')
    # Parse from bytes so the email package applies each part's declared
    # charset; decoding the whole file with the locale encoding fails on
    # 8-bit messages.
    with file.open('rb') as f:
        email_message = message_from_binary_file(f, policy=policy.default)

    # ignore inline attachments
    attachments = [
        item
        for item in email_message.iter_attachments()  # type: ignore
        if item.is_attachment()
    ]
    if not attachments:
        print('>> No attachments found.')
        return

    email_subject = email_message.get('Subject')
    folder_name = sanitize_foldername(str(email_subject)) if email_subject else ''
    # A missing/blank subject, or one that is only a relative path component,
    # must not decide where files land.
    if folder_name.strip() in ('', '.', '..'):
        folder_name = file.stem
    basepath = destination / folder_name

    for index, attachment in enumerate(attachments, start=1):
        raw_name = attachment.get_filename()
        print(f'>> Attachment found: {raw_name}')

        # The filename comes from the (untrusted) message: keep only its last
        # path component so it cannot escape ``basepath``.
        filename = Path(str(raw_name or '').replace('\\', '/')).name
        if filename in ('', '.', '..'):
            filename = f'attachment_{index}'

        payload = attachment.get_payload(decode=True)
        if payload is None:
            # Multipart attachments (e.g. a forwarded message) have no single
            # decodable payload; skip before the target file gets truncated.
            print('>> Skipping... (attachment has no decodable payload)')
            continue

        filepath = basepath / filename
        if filepath.exists():
            # The default answer is "no": only an explicit y/Y overwrites.
            overwrite = input(f'>> The file "{filename}" already exists! Overwrite it (y/N)? ')
            if overwrite.strip().upper() != 'Y':
                print('>> Skipping...')
                continue

        basepath.mkdir(parents=True, exist_ok=True)
        save_attachment(filepath, payload)
def sanitize_foldername(name: str) -> str:
    """Return *name* with every character in the illegal set replaced by ``_``.

    The illegal set is the character class in the pattern below: path
    separators, brackets, braces, quotes and assorted punctuation.
    """
    forbidden = re.compile(r'[/\\|\[\]\{\}:<>+=;,?!*"~#$%&@\']')
    return forbidden.sub('_', name)
def save_attachment(file: Path, payload: bytes) -> None:
    """Write *payload* to *file* in binary mode, replacing existing content.

    The progress message is printed only after the file was opened
    successfully.
    """
    with file.open(mode='wb') as target:
        print(f'>> Saving attachment to "{file}"')
        target.write(payload)
def get_eml_files_from(path: Path, recursively: bool = False) -> List[Path]:
    """List the ``*.eml`` files found in *path*.

    With ``recursively=True`` sub-directories are searched as well; otherwise
    only direct children of *path* are considered.
    """
    search = path.rglob if recursively else path.glob
    return list(search('*.eml'))
def check_file(arg_value: str) -> Path:
    """Argparse ``type`` callback: convert *arg_value* to a ``Path``.

    Raises:
        ArgumentTypeError: If the path is not an existing regular file or its
            suffix is not exactly ``.eml``.
    """
    candidate = Path(arg_value)
    if not candidate.is_file() or candidate.suffix != '.eml':
        raise ArgumentTypeError(f'"{candidate}" is not a valid EML file.')
    return candidate
def check_path(arg_value: str) -> Path:
    """Argparse ``type`` callback: convert *arg_value* to a directory ``Path``.

    Raises:
        ArgumentTypeError: If the path is not an existing directory.
    """
    directory = Path(arg_value)
    if not directory.is_dir():
        raise ArgumentTypeError(f'"{directory}" is not a valid directory.')
    return directory
def get_argument_parser():
    """Build the command-line parser for the extractor.

    Options: ``-s/--source`` and ``-f/--files`` (mutually exclusive),
    ``-r/--recursive`` and ``-d/--destination``. The arguments are registered
    in the same order as they appear in the ``--help`` output.
    """
    cli = ArgumentParser(
        description='Extracts attachments from .eml files',
        usage='%(prog)s [OPTIONS]',
    )

    # force the use of --source or --files, not both
    exclusive_inputs = cli.add_mutually_exclusive_group()

    exclusive_inputs.add_argument(
        '-s', '--source',
        metavar='PATH',
        type=check_path,
        default=Path.cwd(),
        help='the directory containing the .eml files to extract attachments (default: current working directory)',
    )
    cli.add_argument(
        '-r', '--recursive',
        action='store_true',
        help='allow recursive search for .eml files under SOURCE directory',
    )
    exclusive_inputs.add_argument(
        '-f', '--files',
        metavar='FILE',
        type=check_file,
        nargs='+',
        help='specify a .eml file or a list of .eml files to extract attachments',
    )
    cli.add_argument(
        '-d', '--destination',
        metavar='PATH',
        type=check_path,
        default=Path.cwd(),
        help='the directory to extract attachments to (default: current working directory)',
    )
    return cli
def parse_arguments():
    """Parse the process's command-line arguments with the extractor's parser."""
    return get_argument_parser().parse_args()
def main():
    """Run the command-line tool: collect EML files and extract attachments.

    Files given with ``--files`` take precedence; otherwise the ``--source``
    directory is scanned (recursively when ``--recursive`` is set).
    """
    args = parse_arguments()
    # ``args.files`` is None when --files was not given, so fall back to
    # scanning the source directory.
    eml_files = args.files or get_eml_files_from(args.source, args.recursive)
    if not eml_files:
        # Plain literal: the message has no placeholders, so no f-string.
        print('No EML files found!')
    for file in eml_files:
        extract_attachments(file, destination=args.destination)
    print('Done.')
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()