-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_missing_meanings.py
101 lines (88 loc) · 3.34 KB
/
fix_missing_meanings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Python 3 because this is not for lambda (don't be fooled by
__future__.print_function import below, that works in 3 too).
"""
from __future__ import print_function
from __future__ import unicode_literals
from itertools import chain
import boto3
from encoding import normalize
from scrapers import scrape_meaning
# For Python3 compat.
try:
xrange
except NameError:
xrange = range
dynamodb = boto3.resource('dynamodb')
dict_table = dynamodb.Table('dictionary')
entries_table = dynamodb.Table('entries')
def main():
    """Find entries missing a meaning and re-scrape them.

    Paginates through the full 'entries' table with scan(), batch-gets
    the corresponding 'dictionary' items in chunks of 100, and collects
    every entry whose dictionary item is absent or has an empty
    'sources' attribute.  Collected entries are flushed through
    process_entries() in batches of more than 25, with one final flush
    at the end.
    """
    entries_to_check = set()
    LastEvaluatedKey = None
    print('Starting...')
    while True:
        scan_kwargs = {}
        if LastEvaluatedKey:
            # Resume the scan where the previous page left off.
            scan_kwargs['ExclusiveStartKey'] = LastEvaluatedKey
        response = entries_table.scan(
            **scan_kwargs
        )
        print('Scanned entries.')
        items = response['Items']
        if not items:
            break
        LastEvaluatedKey = response.get('LastEvaluatedKey')
        # We can only request items by 100 chunks in batch_get_item.
        for chunk_start in xrange(0, len(items), 100):
            item_chunk = items[chunk_start:chunk_start+100]
            entries_in_entries = set(item['entry'] for item in item_chunk)
            # The dictionary table is looked up by ('norm', 'entry'), so
            # each key item needs the normalized form added in place.
            add_norm_fields(item_chunk)
            request_items = {
                'dictionary': {
                    'Keys': item_chunk,
                    'AttributesToGet': ['entry', 'sources'],
                }
            }
            entries_in_dictionary = set()
            while request_items:
                response = dynamodb.batch_get_item(RequestItems=request_items)
                print('Scanned dictionary.')
                # Re-request any keys DynamoDB could not serve this round.
                request_items = response['UnprocessedKeys']
                for item in response['Responses']['dictionary']:
                    # A dictionary item with empty 'sources' still counts
                    # as missing a meaning.
                    if not item['sources']:
                        continue
                    entries_in_dictionary.add(item['entry'])
            entries_to_check = entries_to_check.union(
                entries_in_entries.difference(entries_in_dictionary)
            )
            if len(entries_to_check) > 25:
                process_entries(entries_to_check)
            print('Going for next dictionary scan.')
        # We've reached the end of the table.
        if not LastEvaluatedKey:
            break
    # Flush the final batch (25 or fewer entries) not yet processed.
    process_entries(entries_to_check)
def add_norm_fields(items):
    """Attach a 'norm' field — the normalized form of 'entry' — to every
    item in *items*, mutating the dicts in place."""
    for record in items:
        record.update(norm=normalize(record['entry']))
def process_entries(entries_to_check):
    """Scrape a meaning for each entry in *entries_to_check* and persist it.

    Successfully scraped meanings are written to the dictionary table;
    any related entries (compounds and idioms) found in a meaning are
    queued into the entries table for later processing.  Entries whose
    scrape yields no sources are logged and skipped.  The set is cleared
    once all entries have been handled.
    """
    with dict_table.batch_writer(overwrite_by_pkeys=['norm', 'entry']) as dict_b:
        with entries_table.batch_writer(overwrite_by_pkeys=['entry']) as entry_b:
            for entry in entries_to_check:
                print(u'Processing {}'.format(entry))
                meaning = scrape_meaning(entry)
                if not meaning['sources']:
                    print(u"ERROR: Couldn't find meaning for {}".format(entry))
                    continue
                dict_b.put_item(Item=meaning)
                related = meaning['related_entries']
                for linked in chain(related['compound_entries'],
                                    related['idioms']):
                    if not linked:
                        continue
                    entry_b.put_item(Item={'entry': linked})
    entries_to_check.clear()
# Script entry point: run the backfill when executed directly.
if __name__ == '__main__':
    main()