-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstring_adjustments.py
117 lines (88 loc) · 3.6 KB
/
string_adjustments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
'''
Functions to adjust the title, location, and other fields for front/back end
Maverick Reynolds
UCF Crimes
'''
import re
import editdistance
import json
from selenium_scrape import selenium_scrape
# Maximum edit distance at which a location-key token and an address token
# are still treated as the same word (used by replace_address)
TYPO_TOLERANCE = 1
def case_title_format(case_title: str) -> str:
    """Normalize the spelling and spacing of a raw case title.

    Sometimes spacing isn't quite correct, or words are spelled differently.

    Args:
        case_title: Case title text as received.

    Returns:
        The title with the word PETIT respelled as PETTY, every comma
        followed by exactly one space (and preceded by none), and runs of
        spaces collapsed to a single space.
    """
    # Whole-word match only, so words such as PETITION are left untouched
    case_title = re.sub(r'\bPETIT\b', 'PETTY', case_title)
    # Ensure commas have a single space following and none preceding
    case_title = re.sub(r' ?, ?', ', ', case_title)
    # Collapse runs of two or more spaces into one
    case_title = re.sub(r' {2,}', ' ', case_title)
    return case_title
def get_emojis(formatted_case_title: str) -> str:
    """Build the emoji suffix for a case title (because emojis are fun).

    emojis.json maps regular-expression patterns to emoji strings. Each
    pattern is searched for in the lower-cased title, and the emoji of every
    pattern that is found is appended in file order.

    Args:
        formatted_case_title: Case title to search.

    Returns:
        The matched emojis, each preceded by a single space, or an empty
        string when no pattern matches.
    """
    with open('emojis.json', 'r', encoding="utf-8") as emoji_file:
        pattern_to_emoji: dict = json.load(emoji_file)

    lowered_title = formatted_case_title.lower()
    suffix = ''
    for pattern, emoji in pattern_to_emoji.items():
        if re.search(pattern, lowered_title) is None:
            continue
        # Must use += and not .join() to preserve unicode
        suffix += f' {emoji}'
    return suffix
def gen_title(str_upper: str) -> str:
    """Title-case a string, honouring the overrides in title_exceptions.json.

    The input is split on whitespace. A token that is a key of
    title_exceptions.json is replaced by the mapped value; every other token
    goes through str.title(). Tokens are re-joined with single spaces.

    Args:
        str_upper: Text to convert; tokens are looked up exactly as written.

    Returns:
        The converted text with surrounding whitespace stripped.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open('title_exceptions.json', encoding='utf-8') as f:
        exceptions: dict = json.load(f)

    words = [exceptions.get(token, token.title()) for token in str_upper.split()]
    return ' '.join(words).strip()
def expand_address(address: str) -> str:
    """Take a raw address and prime it for comparison.

    DO NOT USE DIRECTLY, THIS IS A HELPER FUNCTION FOR REPLACE_ADDRESS().

    Periods are removed, stand-alone compass letters (W, E, N, S) are spelled
    out, the stand-alone word BLD becomes BLVD, and street intersections
    written with "and" or "/" are rewritten with "&". Matching is
    case-insensitive; replacements are upper case.

    Args:
        address: Raw address text.

    Returns:
        The expanded address with surrounding whitespace stripped.
    """
    # Pad with spaces so the ' X ' direction patterns also match at the ends
    address = ' ' + address + ' '
    address = address.replace('.', '')  # Remove periods

    # Raw strings keep regex escapes away from Python's string-escape parsing
    subs = {
        r' W ': ' WEST ',
        r' E ': ' EAST ',
        r' N ': ' NORTH ',
        r' S ': ' SOUTH ',
        r'\bBLD\b': 'BLVD',  # Whole word only, so BLDG is not turned into BLVDG
        r' ?(?: and |/) ?': ' & ',  # Change intersection of streets to &
    }
    for pattern, replacement in subs.items():
        address = re.sub(pattern, replacement, address, flags=re.IGNORECASE)

    return address.strip()
def replace_address(addr: str, try_selenium: bool = False) -> str:
    """Map a raw address onto a known location name.

    The address is expanded with expand_address() and compared against every
    key of locations.json. A key matches when each of its tokens is within
    TYPO_TOLERANCE edits of some token of the address, so the comparison is
    robust against varying word positions and typos. When both the address
    and the key start with a digit, their first tokens must be identical.

    Args:
        addr: Raw address text.
        try_selenium: When true, use selenium_scrape() as a backup if no key
            of locations.json matches.

    Returns:
        The locations.json value of the first matching key; otherwise a
        truthy selenium_scrape() result when enabled; otherwise gen_title()
        applied to the expanded address as a last resort.
    """
    expanded_addr = expand_address(addr)
    txt_tokens = expanded_addr.split()

    # A blank address has nothing to compare or search for; indexing its
    # first token (or taking min() of no distances) would raise.
    if not txt_tokens:
        return gen_title(expanded_addr)

    # Start by looking through the known locations
    with open('locations.json', encoding='utf-8') as f:
        locs: dict[str, str] = json.load(f)

    addr_starts_with_digit = re.match(r'\d', txt_tokens[0]) is not None

    for key, location in locs.items():
        key_tokens = key.split()

        # A key without tokens cannot identify a location (and would
        # otherwise vacuously match every address)
        if not key_tokens:
            continue

        # If both start numerically, the leading tokens must match exactly
        if (
            addr_starts_with_digit
            and re.match(r'\d', key_tokens[0])
            and txt_tokens[0] != key_tokens[0]
        ):
            continue

        # Every key token needs some address token within the typo tolerance
        if all(
            any(
                editdistance.eval(key_token, txt_token) <= TYPO_TOLERANCE
                for txt_token in txt_tokens
            )
            for key_token in key_tokens
        ):
            return location

    # Try Selenium if enabled
    if try_selenium:
        if (selenium_result := selenium_scrape(expanded_addr)):
            return selenium_result

    # Otherwise return titled version of expanded address
    return gen_title(expanded_addr)