-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtask_translation.py
146 lines (136 loc) · 4.11 KB
/
task_translation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import re
import datefinder # yyyy-mm-dd
from datetime import date as libdate
# Set pytesseract's `tesseract_cmd` to a hard-coded binary path under /app/.apt.
# NOTE(review): no `import pytesseract` is visible in this file; unless the name
# is brought into scope elsewhere, this line raises NameError at import time --
# confirm and add the import if it is missing.
pytesseract.pytesseract.tesseract_cmd = '/app/.apt/usr/bin/tesseract'
# Lookup table: lowercase month token -> zero-padded month number (as text).
# Both the 3-letter and the 4-letter spellings of June/July are present.
mon_dict = dict(
    jan='01', feb='02', mar='03', apr='04',
    may='05', jun='06', jul='07', june='06', july='07', aug='08',
    sep='09', oct='10', nov='11', dec='12',
)
# check if date is valid
def valid(matched_date):
    """Return True when ``matched_date`` names a real, plausible calendar date.

    Args:
        matched_date: A three-item sequence ``[year, month, day]``. Each item
            may be anything ``int()`` accepts (typically digit strings).

    Returns:
        True if the three values form an existing calendar date (month length
        and leap years are honoured) whose year lies between 1900 and the
        current year inclusive; False otherwise, including when a component
        cannot be converted to an integer.

    Raises:
        ValueError / TypeError: if ``matched_date`` cannot be unpacked into
            exactly three items (same as before this rewrite).
    """
    year, month, day = matched_date
    try:
        # Building a real date rejects impossible combinations such as
        # 31 February, which a plain "1 <= day <= 31" range check lets through.
        candidate = libdate(int(year), int(month), int(day))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric input, out-of-range month/day, or absurdly large numbers.
        return False
    return 1900 <= candidate.year <= libdate.today().year
# regex based extraction
def date_forms(strn):
    """Return the first valid date found in ``strn`` as ``'yyyy-mm-dd'``.

    The text is lower-cased and scanned with a prioritised list of patterns.
    Patterns containing a month name (``mon dd yyyy``, ``dd mon yy``,
    ``mon dd, yyyy`` ...) are tried first and decoded directly; purely numeric
    patterns (``dd?dd?yyyy`` and friends) are tried afterwards and handed to
    ``datefinder.find_dates`` for interpretation. Every candidate is checked
    with ``valid`` and the first one that passes is returned.

    Args:
        strn: Text to search.

    Returns:
        The date as a ``'yyyy-mm-dd'`` string, or None when no pattern yields
        a date accepted by ``valid``.

    Notes:
        * Two-digit years are expanded by prefixing ``'20'``.
        * Within a pattern, matches are tried left to right; an invalid
          candidate never stops the search.
    """
    strn = strn.lower()

    # Building blocks for the month-name patterns. Named groups replace the
    # former length/offset slicing, which mis-read several formats.
    mon = '(?P<mon>jan|feb|mar|apr|may|jun|jul|june|july|aug|sep|oct|nov|dec)'
    # A separator is any single non-digit character on the same line; letting
    # digits act as separators made "jan 2019" parse as "jan 2, 2019".
    sep = r'[^\d\n]'
    # A one-digit day must not be the tail of a longer number ("12 jan").
    day1 = r'(?<!\d)(?P<day>\d)'
    day2 = r'(?P<day>\d\d)'
    # A two-digit year must not be the head of a longer number ("2019").
    year2 = r'(?P<year>\d\d)(?!\d)'
    year4 = r'(?P<year>\d\d\d\d)'

    # Same formats, in the same priority order, as the original pattern list.
    month_name_patterns = [
        r'\b' + mon + sep + day2 + sep + year2,        # mon dd yy
        r'\b' + mon + sep + day2 + sep + year4,        # mon dd yyyy
        r'\b' + mon + sep + day1 + sep + year4,        # mon d yyyy
        r'\b' + mon + sep + day1 + sep + year2,        # mon d yy
        day2 + sep + r'\b' + mon + sep + year4,        # dd mon yyyy
        day2 + sep + r'\b' + mon + sep + year2,        # dd mon yy
        day1 + sep + r'\b' + mon + sep + year4,        # d mon yyyy
        day1 + sep + r'\b' + mon + sep + year2,        # d mon yy
        r'\b' + mon + sep + day2 + sep + sep + year4,  # mon dd, yyyy
    ]
    for pattern in month_name_patterns:
        for found in re.finditer(pattern, strn):
            month = mon_dict.get(found.group('mon'))
            if month is None:
                continue
            day = found.group('day').zfill(2)
            year = found.group('year')
            if len(year) == 2:
                year = '20' + year
            if valid([year, month, day]):
                return year + '-' + month + '-' + day

    # All-numeric formats are ambiguous (day/month order), so the matched
    # text is passed to datefinder and its interpretation is validated.
    numeric_patterns = [
        r'\d\d.\d\d.\d\d\d\d',
        r'\d\d.\d\d.\d\d',
        r'\d\d.\d.\d\d\d\d',
        r'\d\d.\d.\d\d',
        r'\d.\d\d.\d\d\d\d',
        r'\d.\d.\d\d\d\d',
        r'\d.\d\d.\d\d',
        r'\d.\d.\d\d',
    ]
    for pattern in numeric_patterns:
        for candidate in re.findall(pattern, strn):
            for parsed in datefinder.find_dates(candidate):
                # str(datetime) is 'yyyy-mm-dd hh:mm:ss'; keep the date part.
                parts = str(parsed).split(' ')[0].split('-')
                if valid(parts):
                    return '-'.join(parts)

    # return None if no valid match found
    return None