-
Notifications
You must be signed in to change notification settings - Fork 15
/
text.py
57 lines (52 loc) · 1.56 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Taken from the Helmut project.
https://github.com/okfn/helmut/blob/master/helmut/text.py
"""
from unicodedata import normalize as ucnorm, category
def normalize(text):
    """ Simplify a piece of text to generate a more canonical
    representation.

    The text is lowercased and decomposed (NFKD). Combining marks
    (such as the dots of an umlaut) and symbols (such as currency
    signs) are dropped. Control/format characters, separators
    (newlines, non-breaking spaces etc.) and hyphens are turned into
    spaces. Runs of spaces are collapsed into a single space, leading
    and trailing spaces are stripped and the result is recomposed
    (NFKC).

    :param text: the unicode text to simplify.
    :returns: the normalized text.
    """
    decomposed = ucnorm('NFKD', text.lower())
    filtered = []
    for char in decomposed:
        # Only the major class (first letter) of the category matters.
        major = category(char)[0]
        if major in ('M', 'S'):
            # marks (umlaut dots, accents) and symbols (currency, math)
            continue
        if major in ('C', 'Z'):
            # control/format characters, newlines, non-breaking spaces
            filtered.append(u' ')
        else:
            filtered.append(char)
    text = u''.join(filtered)
    # Hyphens become spaces *before* whitespace is collapsed, so that
    # input like "a - b" cannot leave a run of spaces behind.
    text = text.replace(u'-', u' ')
    # Every whitespace character has been mapped to a plain space above,
    # so split/join both collapses runs of spaces and strips the ends.
    text = u' '.join(text.split())
    return ucnorm('NFKC', text)
def url_slug(text):
    """ Turn a piece of text into a slug: the text is normalized, then
    spaces are replaced by hyphens and dots by underscores.
    """
    return normalize(text).replace(' ', '-').replace('.', '_')
def tokenize(text, splits='COPZ'):
    """ Split a piece of text into tokens, yielding them one by one.

    A character ends the current token when the major class (first
    letter) of its Unicode category is listed in ``splits``. The
    splitting characters themselves are discarded and empty tokens are
    never yielded.

    :param text: the text to split; it is coerced to the interpreter's
        text type first.
    :param splits: the major Unicode category classes to split on. The
        default covers other/control (C), punctuation (P) and
        separators (Z); the 'O' matches no category class.
    :returns: a generator of unicode tokens.
    """
    try:
        text_type = unicode  # Python 2
    except NameError:
        # Python 3 has no ``unicode`` builtin; ``str`` is the text type.
        text_type = str
    token = []
    for char in text_type(text):
        if category(char)[0] in splits:
            if token:
                yield u''.join(token)
                token = []
        else:
            token.append(char)
    # Flush the last token when the text does not end on a split char.
    if token:
        yield u''.join(token)