-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathutils.py
70 lines (56 loc) · 1.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Dataset names grouped under the "dialog" category.
DIALOG_DATASETS = [
    'wizard_of_wikipedia',
    'empathetic_dialogues',
    'blended_skill_talk',
    'wizard_of_internet',
]
# Dataset names grouped under the "classification" category.
CLASSIFICATION_DATASETS = [
    'piqa',
    'hellaswag',
    'ai2_arc',
    'winogrande',
    'math_qa',
    'pubmed_qa',
    'copa',
]
def normalize_reply(text: str, version=2) -> str:
    """
    Standardize the capitalization and punctuation spacing of the input text.

    Version 1: Fix sentence start casing, and punctuation.
    Version 2: Add trailing period, if missing.

    The text is lower-cased, punctuation is temporarily detached into its own
    space-separated tokens, the first token, the pronoun "i" and every token
    that follows a sentence terminator (``?``, ``.``, ``!``) are capitalized,
    and the punctuation is then re-attached to the preceding word.

    :param text: the raw reply to normalize.
    :param version: normalization level; any value greater than 1 also appends
        a trailing period when the result is non-empty and does not already
        end in one of ``! . ? ) " '``.
    :return: the normalized reply; empty or all-space input yields ``''``.
    """
    # (attached form, detached form) pairs, read as (spaced, bare) below.
    switch_list = [
        (' .', '.'), (' ,', ','), (' ?', '?'), (' !', '!'), (" ' ", "'"),
    ]

    new_text = text.lower()

    # Add spaces so that words and punctuation can be separated, regardless
    # of whether the input already had a space before the punctuation mark.
    for spaced, bare in switch_list:
        new_text = new_text.replace(bare, spaced)

    # Splitting on a single space and discarding the empty pieces collapses
    # every run of spaces (including the ones introduced just above), so no
    # empty token can be mistaken for a sentence boundary.
    tokens = [token for token in new_text.split(' ') if token]

    # Capitalize the first token, the pronoun "i" and whatever follows a
    # sentence terminator. Contractions such as "i'm" need no special case:
    # the apostrophe was detached above, so they appear as the token "i".
    capitalize_next = True
    for index, token in enumerate(tokens):
        if capitalize_next or token == 'i':
            # Upper-case only the first character; same result as uppercase().
            tokens[index] = token[:1].upper() + token[1:]
        capitalize_next = token in ('?', '.', '!')

    # Pad with spaces so that punctuation at either edge still matches the
    # space-delimited patterns when it is glued back onto its neighbours.
    new_text = ' ' + ' '.join(tokens) + ' '
    for spaced, bare in switch_list:
        new_text = new_text.replace(spaced, bare)

    # get rid of surrounding whitespace
    new_text = new_text.strip()

    if version > 1 and new_text and new_text[-1] not in '!.?)"\'':
        new_text += '.'
    return new_text
def uppercase(string: str) -> str:
    """
    Return the string with its first character upper-cased.

    The remaining characters are left exactly as they are, and an empty
    string comes back as an empty string.
    """
    # Slicing (rather than indexing) is safe on an empty string, so no
    # explicit length check is required.
    first, remainder = string[:1], string[1:]
    return first.upper() + remainder