-
Notifications
You must be signed in to change notification settings - Fork 0
/
stemmer.py
82 lines (68 loc) · 2.27 KB
/
stemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# This stemmer is based on the ideas behind the stemmer for the Croatian language originally
# designed in the Science Department of the Faculty of Philosophy, University of Zagreb.
import re
import sys
def _read_lines(path):
    '''
    Returns the whitespace-stripped lines of the UTF-8 text file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the lines have been read, instead of being left for the garbage collector.
    '''
    with open(path, encoding='utf8') as handle:
        return [line.strip() for line in handle]


# The three resource files are opened by bare file name, i.e. relative to the
# current working directory, and are all read as UTF-8.

# Words that process_row() drops instead of stemming (one word per line).
STOP_WORDS = set(_read_lines('stop_words.txt'))

# One compiled pattern per line of rules.txt. A line holds two regex fragments
# separated by a single space: group 1 matches the part of the word that is
# kept, group 2 the ending that is cut off.
PREDEFINED_RULES = [
    re.compile(r'^(' + osnova + ')(' + nastavak + r')$')
    for osnova, nastavak in (line.split(' ') for line in _read_lines('rules.txt'))
]

# One entry per line of transformations.txt, split on tabs. transform()
# unpacks each entry as (ending to look for, replacement ending).
PREDEFINED_TRANSFORMATIONS = [
    line.split('\t') for line in _read_lines('transformations.txt')
]
# Croatian letters with diacritics are folded to plain ASCII letters, because
# many people do not type them in informal comments/reviews.
CROATIAN_TO_ENGLISH_SYMBOLS = {
    'š': 's',
    'ć': 'c',
    'č': 'c',
    'ž': 'z',
    'đ': 'd',
}
def process_row(row):
    '''
    Processes the row by stemming its words.

    The row is split into runs of word characters. Every token is lowercased,
    has its Croatian diacritics replaced, and is then either dropped (if the
    resulting form is in STOP_WORDS) or passed through transform() and
    convert_to_root().

    Returns the stems as one string in which every stem, including the last
    one, is followed by a single space; an input without any kept word gives
    the empty string.
    '''
    stems = []
    for token in re.findall(r'\w+', row, re.UNICODE):
        # Lowercase first: the symbol table lists only lowercase letters, so
        # folding before lowercasing would leave e.g. "Š" behind as "š" while
        # "š" itself becomes "s", producing two different stems for one word.
        token = remove_croatian_symbols(token.lower())
        if token in STOP_WORDS:
            continue
        stems.append(convert_to_root(transform(token)))
    # The trailing space after the last stem is kept so that the output format
    # callers already receive does not change.
    return ''.join(stem + ' ' for stem in stems)
def amplify_R(niz):
    '''
    Marks the letter r that acts as a vowel by turning it into an uppercase R.

    An "r" is replaced when the character before it is the start of the string
    or anything other than a, e, i, o, u, and the character after it is the end
    of the string or anything other than a, e, i, o, u. Matches are found left
    to right without overlapping. Returns the resulting string.
    '''
    syllabic_r = re.compile(r'(^|[^aeiou])r($|[^aeiou])')
    return syllabic_r.sub(r'\1R\2', niz)
def has_vowel(word):
    '''
    Tells whether the word contains a vowel.

    The word is first passed through amplify_R(), so besides a, e, i, o and u
    an r that was marked as R there also counts. Returns True or False.
    '''
    return re.search(r'[aeiouR]', amplify_R(word)) is not None
def transform(word):
    '''
    Rewrites the ending of the word (similar to lemmatization) using the
    predefined transformations.

    The entries of PREDEFINED_TRANSFORMATIONS are tried in order; for the first
    one whose ending the word ends with, that ending is swapped for the entry's
    replacement and the result is returned. A word that matches no entry is
    returned unchanged.
    '''
    for ending, replacement in PREDEFINED_TRANSFORMATIONS:
        if not word.endswith(ending):
            continue
        head = word[:-len(ending)]
        return head + replacement
    return word
def convert_to_root(word):
    '''
    Stems the word with the predefined rules.

    The patterns in PREDEFINED_RULES are tried in order. The first pattern that
    matches the word and whose first group is longer than one character and
    contains a vowel (as decided by has_vowel()) determines the result: that
    first group is returned. If no pattern qualifies, the word is returned
    unchanged.
    '''
    for pattern in PREDEFINED_RULES:
        hit = pattern.match(word)
        if hit is None:
            continue
        stem = hit.group(1)
        if len(stem) > 1 and has_vowel(stem):
            return stem
    return word
def remove_croatian_symbols(token):
    '''
    Replaces Croatian letters with diacritics by their plain ASCII letters.

    Every key of CROATIAN_TO_ENGLISH_SYMBOLS found in the token is replaced by
    its mapped value. The uppercase form of each key is replaced by the
    uppercase form of the value as well, so "Š" becomes "S" instead of being
    left untouched; all other characters keep their case. Returns the
    resulting string.
    '''
    for symbol, plain in CROATIAN_TO_ENGLISH_SYMBOLS.items():
        token = token.replace(symbol, plain)
        # The table holds lowercase letters only; cover the capitals here so
        # that mixed-case input is folded completely.
        token = token.replace(symbol.upper(), plain.upper())
    return token