-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword.py
277 lines (226 loc) · 8.22 KB
/
word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# NEWMAN: Natural English With Mutating Abridged Nouns
#
# Copyright 2010 Chris Eberle <[email protected]>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class NonWordException(Exception):
    """
    Raised for input that is not a word at all (an empty string or a
    bare punctuation token such as ',' or '?').
    """

    def __init__(self, value):
        # Forward the value to Exception.__init__ so that exc.args and
        # repr() are populated (the original left the base class
        # uninitialized, giving empty args).
        super(NonWordException, self).__init__(value)
        self.parameter = value

    def __str__(self):
        return str(self.parameter)
class UnknownWordException(Exception):
    """
    Raised when a word is encountered which is not in WordNet.
    """

    def __init__(self, value):
        # Forward the value to Exception.__init__ so that exc.args and
        # repr() are populated (the original left the base class
        # uninitialized, giving empty args).
        super(UnknownWordException, self).__init__(value)
        self.parameter = value

    def __str__(self):
        return str(self.parameter)
class BaseWord(object):
    """
    Represents a base word: a root word that the grammar supports
    directly, together with optional WordNet synsets and regex aliases.
    """

    def __init__(self, word, production=None, synsets=None, aliases=None):
        """
        Create a new base word.

        word       -- the root word; normalized to stripped lowercase
        production -- optional grammar production associated with the word
        synsets    -- a single synset or a list/tuple of synsets
                      (None entries in a list are skipped)
        aliases    -- a single alias or a list/tuple of aliases; each is
                      anchored ('^...$') and compiled as a regex
        """
        self.word = word.strip().lower()
        self.production = production
        self.synsets = []
        self.aliases = []
        if synsets is not None:
            if isinstance(synsets, (list, tuple)):
                # Keep only real synsets; callers may pass None entries.
                self.synsets.extend(s for s in synsets if s is not None)
            else:
                self.synsets.append(synsets)
        if aliases is not None:
            if isinstance(aliases, (list, tuple)):
                for alias in aliases:
                    self.add_alias(alias)
            else:
                self.add_alias(aliases)

    def add_alias(self, alias):
        """
        Add a simple alias, anchored so it must match the whole word.

        Returns True if the alias was added, False for None/empty input.
        """
        if alias is None:
            return False
        alias = alias.strip().lower()
        if not alias:
            return False
        return self.add_alias_explicit('^' + alias + '$')

    def add_alias_explicit(self, alias):
        """
        Add an alias which is assumed to be a well-formed regex.

        Returns True if the alias was added, False for None/empty input.
        Raises re.error if the pattern is not a valid regex.
        """
        import re
        if alias is None:
            return False
        alias = alias.strip().lower()
        if not alias:
            return False
        # re.compile never returns None -- an invalid pattern raises
        # re.error instead -- so the original's "is None" check was dead.
        self.aliases.append(re.compile(alias))
        return True

    def get_valid_synsets(self, synset):
        """
        Given a particular synset, return the subset of my own synsets
        that are ancestors (hypernyms) of it, or None if there are none.

        To be valid means being an ancestor: if I hold the synset for
        "asia" and am given the synset for "china", "asia" appears on
        china's hypernym paths and is therefore valid.
        """
        if not self.synsets or synset is None:
            return None
        # The hypernym paths depend only on the argument, so compute
        # them once instead of once per candidate (as the original did).
        paths = synset.hypernym_paths()
        valid_sets = [
            vsynset for vsynset in self.synsets
            # Don't compare across parts of speech (e.g. verbs to nouns).
            if vsynset.pos == synset.pos
            # Walk the hypernym paths; I must be your ancestor.
            and any(p == vsynset for path in paths for p in path)
        ]
        return valid_sets if valid_sets else None

    def is_alias(self, word):
        """
        Determine whether the given word is this word itself or matches
        any of its alias regexes.
        """
        word = word.strip().lower()
        if word == self.word:
            return True
        return any(preg.match(word) for preg in self.aliases)
class Word(object):
    """
    Represents a single word: its original input form, its normalized
    form, its possible WordNet definitions, and its reduction to the
    closest BaseWord in the vocabulary.
    """

    def __init__(self, word, vocabulary, onlyDefine=False):
        """
        Process the word against a vocabulary of BaseWord objects.

        word       -- the raw input string
        vocabulary -- iterable of BaseWord objects to reduce against
        onlyDefine -- if True, only collect definitions; skip reduction

        Raises NonWordException for empty/punctuation input and
        UnknownWordException when WordNet cannot place the word.
        """
        self._process(word, vocabulary, onlyDefine)

    def _process(self, word, vocabulary, onlyDefine):
        """
        Reduce a word to something we can more easily process later.
        """
        from nltk.corpus import wordnet as wn
        # Normalize the word
        self._original = word
        self._normalized = word.strip().lower()
        self._reduced = None
        # Initialize these BEFORE any early return: the original only
        # set them on the slow path, so definitions() raised
        # AttributeError for alias-resolved words.
        self._pos_forms = []
        self._definitions = {}
        if len(self._normalized) == 0 or self._normalized in [',', '"', "'", '?', '!', '.']:
            raise NonWordException(word)
        # Is it a simple word (i.e. just a root word or an alias)?
        if not onlyDefine:
            for baseword in vocabulary:
                if baseword.is_alias(self._normalized):
                    self._reduced = baseword.word
                    return
        # Damn, looks like we have some work to do
        for pos in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV):
            # Find a possible base form for the given form, with the given
            # part of speech, by checking WordNet's list of exceptional
            # forms, and by recursively stripping affixes for this part of
            # speech until a form in WordNet is found.
            form = wn.morphy(self._normalized, pos)
            if form is None:
                continue
            for synset in wn.synsets(form, pos):
                # NOTE(review): this uses the pre-3.0 NLTK attribute API
                # (synset.lemmas, lemma.name, synset.definition); modern
                # NLTK exposes these as methods -- confirm against the
                # installed nltk version.
                matching = []
                for lemma in synset.lemmas:
                    name = lemma.name.replace(' ', '_').lower()
                    if name == self._normalized:
                        matching.append(lemma)
                        self._definitions[lemma.key] = lemma.synset.definition
                if matching:
                    self._pos_forms.append((synset, matching[0].key))
        # Sometimes we just want the definitions
        if onlyDefine:
            return
        # The part-of-speech tagger has failed
        if len(self._pos_forms) == 0:
            raise UnknownWordException(word)
        # OK, now we have a list of the possible synsets for this word;
        # find the vocabulary word that is closest to any of them.
        candidates = []
        for baseword in vocabulary:
            if not baseword.synsets:
                continue
            for synset, key in self._pos_forms:
                bsynsets = baseword.get_valid_synsets(synset)
                if bsynsets is None:
                    continue
                # OK, we're related; measure our distance to each other.
                for bsynset in bsynsets:
                    score = wn.path_similarity(synset, bsynset)
                    # path_similarity can return None when no path
                    # connects the synsets; such pairs are unrankable
                    # (and None broke the original's sort), so skip them.
                    if score is not None:
                        candidates.append((score, baseword))
        if len(candidates) == 0:
            raise UnknownWordException(word)
        # Pick the highest score with an explicit key: the original's
        # tuple sort compared BaseWord objects on tied scores, which
        # raises TypeError on Python 3.
        self._reduced = max(candidates, key=lambda c: c[0])[1].word

    def __str__(self):
        """
        Get a nice string representation for this word object.
        %s formatting also handles _reduced being None (the onlyDefine
        case), where the original's string concatenation raised.
        """
        return 'Word( "%s" -> "%s" -> "%s" )' % (
            self._original, self._normalized, self._reduced)

    def original(self):
        """
        Get the original input string (i.e. "Dude ")
        """
        return self._original

    def normalized(self):
        """
        Get the normalized version of the word (i.e. "dude")
        """
        return self._normalized

    def reduced(self):
        """
        Get the reduced form of the word (i.e. "male")
        """
        return self._reduced

    def definitions(self):
        """
        Get a dict of possible definitions for this word, keyed by
        lemma key.
        """
        return self._definitions