-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathword.py
63 lines (55 loc) · 2.31 KB
/
word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
__author__ = 'Simone Mainardi, [email protected]'
class Word(object):
@staticmethod
def deletes(word, edit_distance):
""" Recursively create all the possible deletes at a distance * less than or equal to * `edit_distance`
from the `word` provided. For example, all the possible delete edits of the word `ciao`, at an
edit distance less than or equal to two are:
-> cia, cao, cio, iao (distance 1)
-> ci, ia, ca, ca, ao, co, ci, io, co, ia, ao, io (distance 2)
Duplicates are suppressed.
"""
dels = set()
if len(word) <= 1 or edit_distance == 0:
return dels
for i in range(len(word)):
delete = word[:i] + word[i+1:] # remove the i-th character from the word
dels.update([delete])
dels.update(Word.deletes(delete, edit_distance - 1))
return dels
@staticmethod
def shorter_words_within_distance(word, bag, distance):
res = set()
for shorter in Word.deletes(word, distance):
if not shorter in bag: # no match
continue
bag.remove(shorter) # no need to visit `shorter` again
res.update([shorter])
res.update(Word.shorter_words_within_distance(shorter, bag, distance))
return res
@staticmethod
def damerau_levenshtein_distance(word1, word2):
"""
A genetic algorithm to compute the Damerau-Levenshtein distance between two words
"""
d = {}
lenstr1 = len(word1)
lenstr2 = len(word2)
for i in xrange(-1, lenstr1 + 1):
d[(i, -1)] = i + 1
for j in xrange(-1, lenstr2 + 1):
d[(-1, j)] = j + 1
for i in xrange(lenstr1):
for j in xrange(lenstr2):
if word1[i] == word2[j]:
cost = 0
else:
cost = 1
d[(i, j)] = min(
d[(i - 1, j)] + 1, # deletion
d[(i, j - 1)] + 1, # insertion
d[(i - 1, j - 1)] + cost, # substitution
)
if i and j and word1[i] == word2[j - 1] and word1[i - 1] == word2[j]:
d[(i, j)] = min(d[(i, j)], d[i - 2, j - 2] + cost) # transposition
return d[lenstr1 - 1, lenstr2 - 1]