forked from jonabbey/eudora2unix
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHeader.py
526 lines (449 loc) · 20.2 KB
/
Header.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
#!/usr/bin/env python
"""
Classes for handling Eudora Mailbox header parsing and clean up.
"""
import os
import re
import time
import string
import EudoraLog
# Configuration.
# Add a 'X-Eudora2Unix: <ISO 8601 date> converted' header at the end
# of the emitted headers (see sub emit_headers), 0=no, 1=yes.
# This can come in handy later to differentiate between 'new' KMail
# messages and those inherited from the conversion.
emit_X_Eudora2Unix_Header = 1
# End of configuration.
def iso_8601_zulu():
"""
Return a string with the date and time in ISO 8601 form for
'timezone' Zulu, ie. UTC +00:00, or zero-meridian and formerly
known as 'GMT', or 'Greenwich Mean Time'.
See http://www.cl.cam.ac.uk/~mgk25/iso-time.html for more info on
on the ISO International Standard Date and Time Notation (ISO 8601).
Example: for February 28, 2002 14:42:42 UTC+01:00, this string:
2002-02-28T13:42:42Z
will be returned and shows the date and time in 'timezone' Zulu.
Note the one (1) hour difference in this particular example (because
it is in UTC +01:00), the 'T' separator between date and time and the
'Z' (Zulu) designator for the UTC +00:00 'timezone'.
CVS, a popular source code and document versioning program, also uses
Zulu time."""
(gm_year, gm_month, gm_day,
gm_hour, gm_minute, gm_second,
gm_weekday, gm_julian_day, gm_dstf) = time.gmtime()
iso_8601_zulu_datetime = "%04d-%02d-%02dT%02d:%02d:%02dZ" % \
(gm_year+1900, gm_month+1, gm_day,
gm_hour, gm_minute, gm_second)
return iso_8601_zulu_datetime
date_pat = r'\s*\S+?\s*(\S{3})\s+(\S{3})\s+(\d{1,2})'
time_pat = r'\s*(\d{2}:\d{2}:\d{2})\s+(\d{4})\s*([+-]\d{4}){0,1}'
re_from_date_time = re.compile( date_pat + time_pat )
re_message_start = re.compile( r'^From' + date_pat + time_pat )
re_timeout_protection = re.compile( r'^X-(NortonAV|Symantec)-TimeoutProtection' )
re_between_angles = re.compile( '.*<(.*?)>.*' )
re_before_parenth = re.compile( '(.*)\(.*?\)' )
re_after_parenth = re.compile( '\(.*?\)(.*)' )
def strip_linesep( line ):
"""Regular expressions proved too slow, and rstrip doesn't take
an argument in my distribution (?)"""
while len( line ) > 0 and ( line[-1] == '\n' or line[-1] == '\r' ):
line = line[0:-1]
return line
# SW
class Replies:
"""
Eudora seems to rely on a message with In-Reply-To corresponding
to the current message Message-ID.
Pine indicates that a message has been replied to by X-Status: A,
This reads through whole mailbox, makes dictionary of message ID's
found in In-Reply-To headers.
Note: Won't work if reply is in a different mailbox, or otherwise lost.
"""
def __init__( self, file ):
self.replies = {}
inheaders = False
# see below for why I don't use a for line in file loop here.
while True:
line = file.readline()
if not line:
break
if line.find( 'Find ', 0, 5 ) and re_message_start.match( line ):
inheaders = True
else:
if inheaders:
line = strip_linesep( line )
if len( line ) == 0:
inheaders = False
else:
if line.find( 'In-Reply-To:' ) == 0:
line = line.replace(
'In-Reply-To:', '', 1 )
line = line.strip()
self.replies[line] = True
file.seek( 0 )
def message_was_answered( self, message_id ):
if len( message_id ) > 0:
message_id = message_id.strip()
try:
self.replies[message_id]
return True
except KeyError:
return False
return False
# SW
class TOC_Info:
"""Looks for a file ending in ".toc.txt" to find values of
Status and X_PRIORITY headers. See EudoraTOC.py.
The parsed toc file is text file that represents the contents of
the binary Eudora '.toc' file corresponding to the mailbox.
The Status info indicates whether the Eudora message was read or
not.
The toc file keeps track of the message by a binary offset to
the beginning of the message in the mailbox file
"""
def __init__( self, mbx_name ):
self.info = {}
offset = 0
status = ''
toc_file_name = mbx_name + ".toc.txt"
try:
TOCFILE = open( toc_file_name )
# the theory here is that an 'offset:' line in the
# parsed toc file will always precede the other lines
# for that message
for line in TOCFILE:
if line.find( 'offset:' ) == 0:
offset = line.replace( 'offset:', '' ).strip()
self.info[offset] = {}
elif line.find( 'status:' ) == 0:
status = line.replace( 'status:', '' ).strip()
self.info[offset]['status'] = status
elif line.find( 'priority:' ) == 0:
pri = line.replace( 'priority:', '' ).strip()
self.info[offset]['priority'] = pri
TOCFILE.close()
except IOError, ( errno, strerror ):
self.info = None
if EudoraLog.verbose >= 0:
print( "Couldn't read parsed .toc file '"
+ toc_file_name + "': " + strerror )
def info_exists_for_msg_at( self, offset ):
try:
self.info[offset]
return True
except Exception:
return False
def status_of_msg_at( self, offset ):
try:
return self.info[offset]['status']
except KeyError:
return None
def priority_of_msg_at( self, offset ):
try:
return self.info[offset]['priority']
except KeyError:
return None
weekdays = ( 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat' )
def fix_date( date ):
d = date.split( ' ' ) # 5 date items
if len( d ) < 5:
return date # don't know what to do with this
if d[0][-1] != ',' and d[0] in weekdays:
d[0] += ',' # comma must come after weekday
return string.join( d, ' ' )
# SW
class Header:
"""
A list of mailbox headers.
Implements case-insensitive searching (RFC 561 explicitly states
case of headers is undefined); keeps a mixed-case version for output.
A dictionary doesn't work for this because several header
lines can share the same key, e.g. 'Received:'.
Three fields used:
lowercase-id, id, value
The first is for searching by id, the second is to make id
output like the input.
"""
ok_to_dup = ( 'received:', 'x400-received:', 'delivered-to:', 'x-mailer:',
'return-path:', 'sender:', 'mime-version:', 'precedence:',
'x-uidl:', 'content-transfer-encoding:', )
def __init__( self ):
self.data = []
self.index = 0
self.cleaned = False
def __iter__( self ):
return self
def __str__( self ):
return "\n".join([name + " " + value for (lcname, name, value) in self.data])
def add( self, id, value ):
"""Will also accept un-parsed line"""
if not id or len( id ) == 0:
return
value = self.stripOffID( id, value )
self.data.append( [ id.lower(), id, value ] )
def getValue( self, id ):
lcid = id.lower()
for h in self.data:
if h[0] == lcid:
return h[2]
return None
def stripOffID( self, id, line ):
idlen = len( id )
if idlen == 0:
return
lcid = id.lower()
if line and len( line ) >= idlen:
linefront = line[ 0:idlen ].lower()
if linefront == lcid:
line = line[ idlen: ].strip()
return line
def setValue( self, id, value ):
idlen = len( id )
if idlen == 0:
return
if self.getValue( id ):
lcid = id.lower()
for h in self.data:
if h[0] == lcid:
h[2] = self.stripOffID( id, value )
break
else:
self.add( id, value )
def removeValue( self, id ):
newlist = []
lcid = id.lower()
for h in self.data:
if h[0] != lcid:
newlist.append(h)
self.data = newlist
def replaceValue( self, id, value ):
self.removeValue(id)
self.add(id, value)
def appendToLast( self, additional ):
"""Facilitates header field "folding" (RFC 2822 3.2.3)"""
# strangely, a raw tab after os.linesep doesn't get printed...
if len( self.data ) == 0:
return
additional = strip_linesep( additional )
self.data[-1][2] += os.linesep + '\t' + additional
def emit( self, filehandle, exceptions = None ):
for h in self.data:
if not exceptions or h[0] not in exceptions:
filehandle.write( h[1] )
if h[0] != 'from ':
filehandle.write( ' ' )
filehandle.write( h[2] + os.linesep )
def next( self ):
if self.index < len( self.data ):
val = ( self.data[self.index][1],
self.data[self.index][2] )
self.index = self.index + 1
return val
else:
raise StopIteration
def add_line(self, line):
"""Parses a header line into a (key, value) tuple, trimming
whitespace off ends. Introductory 'From ' header not treated."""
colon = line.find( ':' )
space = line.find( ' ' )
# If starts with something with no whitespace then a colon,
# that's the From ID, and we won't handle it here
if colon == -1 or ( space != -1 and space <= colon ):
return
id = line[ 0: colon + 1 ]
value = line[ colon + 1:]
if value:
value = value.strip()
# Comment out most repeated headers, except a few that
# make sense to be repeated
if ( re_timeout_protection.match( line ) ):
return
if( id and not id.lower() in Header.ok_to_dup and self.getValue( id ) ):
#EudoraLog.log.warn( "extra '" + id +
# "' header encountered - commented out ")
id = '>' + id
self.add( id, value )
def clean(self, toc, msg_offset, replies):
"""
Processes headers from a Eudora mbx message. Data is
taken from the self Header object, the TOC_Info object
passed as 'toc', and the Replies object passed as
'replies'.
The msg_offset parameter contains the position of the
first character in the mbx file whose headers we are
processing.
The email for the 'From ' (not 'From: ') header is extracted from
the first encountered 'From: ' or else 'Sender: ' or else
'Return-Path: ', or else, it is set to '[email protected]'.
This replaces the '???@???' in the 'From ' header.
If a 'Date: ' is header is not present, a date is extracted from
header 'From ' and reformatted and a properly rewritten
'Date: ' header is appended.
The flag variable $emit_X_Eudora2Unix_Header is used to determine
if a 'X-Eudora2Unix: ' should be added. It looks like:
X-Eudora2Unix: 2002-02-28T13:42:42Z converted
where the date and time (for timezone UTC +00:00, formerly known
as 'GMT') is in ISO 8601 format (in the so called extended format,
to be precize). It shows the time and date of conversion of this
specific message in Zulu (zero-meridian) ISO 8601 date and time.
See http://www.cl.cam.ac.uk/~mgk25/iso-time.html for more info on
on the ISO International Standard Date and Time Notation.
Uses the toc information with msg_offset to pull out Status and
Priority info, creates or alters appropriate headers with that info.
"""
if self.cleaned:
print "Hey, already cleaned!"
return
global re_between_angles, re_before_parenth, re_after_parenth
hdr_line0 = self.getValue( 'From ' ) # still has line end
# Handle "Date: " and "From: " fields specially.
# Keep the first encountered non-empty value in $hdr_date and
# $hdr_from, thus ignoring subsequent Date:'s and From:'s, that
# are, however, still added to @hdr_line in order to keep the
# original (albeit malformed) message as intact as possible.
# Use "Sender: " and "Return-Path: " as a backup for "From: ".
# Try to extract a "Date:" unless we already have one.
# Only keep them in @hdr_line, if they are not empty.
# In other words, pop them away again, if there are empty.
new_date = ""
hdr_date = self.getValue( 'Date:' )
if not hdr_date:
# Extract a date from Eudora's "From ???@??? " (with checks).
""" r'\s*\S+?\s*(\S{3})\s+(\S{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})\s+(\d{4})' """
date = re_from_date_time.match( hdr_line0 )
if not date:
msg = "Bad date in From '" + hdr_line0 + "'"
EudoraLog.log.log( msg )
EudoraLog.log.error( msg )
else:
new_date = 'Date: ' + date.group(1) + ' ' \
+ date.group(3) + ' ' + date.group(2) + ' ' \
+ date.group(5) + ' ' + date.group(4)
if date.group(6):
new_date += ' ' + date.group(6)
# This was 'warn', but it's by far the most common issue, and
# it's not abnormal for Eudora.
EudoraLog.log.log( 'No Date field, added [' + new_date + ']')
self.add( 'Date:', new_date )
hdr_date = self.getValue( 'Date:' )
fixed_date = fix_date( hdr_date )
if fixed_date != hdr_date:
self.setValue( 'Date:', fixed_date )
# Pine is picky about the contents of the Status header. It will
# ignore the whole thing if it doesn't understand the contents.
# Basically, just keep R (read) and O (downloaded)
hdr_status = self.getValue( 'Status:' )
if hdr_status:
new_status = ''
if hdr_status.find( 'R' ) != -1:
new_status += 'R'
if hdr_status.find( 'O' ) != -1:
new_status += 'O'
self.setValue( 'Status:', new_status )
# Set Pine's X-Status to A for answered if find another message
# in this mailbox that is a reply to this message
hdr_message_id = self.getValue( 'Message-ID:' )
if hdr_message_id and replies.message_was_answered( hdr_message_id ):
hdr_x_status = self.getValue( 'X-Status:' )
if hdr_x_status:
hdr_x_status += 'A'
else:
hdr_x_status = 'A'
self.setValue( 'X-Status:', hdr_x_status )
# Try to extract a "From: " unless we already have one.
# Only keep them in @hdr_line, if they are not empty.
# In other words, pop them away again, if there are empty.
# Determine sender's address from "From:" and fall back on
# "Sender:" and "Return-Path:", respectively.
new_from = self.getValue( 'From:' )
commented_from = self.getValue( '>From:' )
if not new_from:
new_from = commented_from
if hdr_line0.find( '???@???' ) > -1:
if not new_from:
if not new_from:
new_from = self.getValue( 'Send:' )
if not new_from:
new_from = self.getValue( 'Return-Path:' )
if not new_from:
new_from = '[email protected]'
msg = 'No From field, used [' + new_from + ']'
EudoraLog.log.log( msg )
EudoraLog.log.error( msg )
else:
EudoraLog.log.log( 'Had From field, used [' + new_from + ']')
# Extract an e-mail address from $new_from with a _greedy_
# match, if it matches on <...>, i.e. use the question mark (?)
# in (.*?). This ensures that after a "<", the first ">" will
# be matched.
# Especially Return-Path's can have multiple e-mail addresses.
# If there are parentheses, the e-mail address is usually
# outside it, so simply remove it.
email_address = ''
between_angles = re_between_angles.search( new_from, re.IGNORECASE )
if between_angles:
email_address = between_angles.group( 1 )
else:
before_parenth = re_before_parenth.search( new_from )
if before_parenth:
email_address = before_parenth.group( 1 )
else:
after_parenth = re_after_parenth.search( new_from )
if after_parenth:
email_address = after_parenth.group( 1 )
else:
email_address = new_from
email_address = email_address.strip()
# EudoraLog.log.log( 'e-mail address extracted <' + email_address + '>')
hdr_line0 = hdr_line0.replace( r'???@???', email_address, 1 )
self.replaceValue( 'From ', hdr_line0 )
# Add a 'X-Eudora2Unix: ' header (if $emit_X_Eudora2Unix_Header is true)
# This header is like (example: February 28, 2002 14:42:42 UTC+01:00):
# X-Eudora2Unix: 2002-02-28T14:42:42+01:00 converted
# and showed the date and time of conversion of this specific message.
# This header is like (example: February 28, 2002 14:42:42 UTC+01:00):
# X-Eudora2Unix: 2002-02-28T13:42:42Z converted
# and showes the date and time of conversion of this specific message in
# Zulu time (just like CVS). Note the one (1) hour difference in this
# particular example.
if emit_X_Eudora2Unix_Header == 1:
# Get Zulu time with gmtime(time) and 'correct' month day (0..11)
# and year (has 1900 substracted from it).
# See also sub iso_8601_zulu(), provided for pleasure but not used
# here, as again, a function call proved to perform worse by a factor
# of 2 to 3 as compared to using the code directly.
self.add( 'X-Eudora2Unix:', iso_8601_zulu() + ' converted' )
# Pull status and priority info out of the '.toc' file
if toc:
# SW: Careful here: the backtick integer-to-string translation
# will put an 'L' at the end of the number, but we need the
# decimal version for a textual lookup in the dictionary.
offset_str = "%d" % ( msg_offset, )
if toc.info_exists_for_msg_at( offset_str ):
status_in_toc = toc.status_of_msg_at( offset_str )
# See RFC 2076 for a discussion of Status header
if status_in_toc:
hdr_status = self.getValue( 'Status:' )
if hdr_status:
hdr_status = status_in_toc + hdr_status
else:
hdr_status = status_in_toc
self.setValue( 'Status:', hdr_status )
priority = int( toc.priority_of_msg_at( offset_str ) )
if priority:
# Kmail responds to this header
self.setValue( 'X-Priority:', "%d" % ( priority, ) )
if priority > 0 and priority < 3:
# Pine responds to message 'flagged'
# in X-Status (puts * in first column)
hpri = self.getValue( 'X-Status:' )
if hpri:
hpri += 'F'
else:
hpri = 'F'
self.setValue( 'X-Status:', hpri )
else:
pass
# EudoraLog.log.warn( "No toc entry for message at offset " + offset_str)
self.cleaned = True