Skip to content

Commit

Permalink
Leverage distance computing algorithm in compare API
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Mar 2, 2018
1 parent dac98aa commit 0e5794c
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 40 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# 2.5
* 使用空间距离近的词汇优化编辑距离计算

# 2.3
* 计算相似度时增加平滑策略

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
兼容py2和py3,当前稳定版本 v2.3**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**
兼容py2和py3,当前稳定版本 [v2.x](https://github.com/huyingxi/Synonyms/releases)**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**

```
npm install node-synonyms
Expand Down
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=2.3
synonyms>=2.5
18 changes: 1 addition & 17 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
import numpy
import unittest

compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
Expand All @@ -52,35 +52,20 @@ def tearDown(self):

def test_pairs(self):
print("test_pairs")
print("*"* 30)
print(compare_("轿车", "汽车", True))
print("*"* 30)
print(compare_("宝石", "宝物", True))
print("*"* 30)
print(compare_("旅游", "游历", True))
print("*"* 30)
print(compare_("男孩子", "小伙子", True))
print("*"* 30)
print(compare_("海岸", "海滨", True))
print("*"* 30)
print(compare_("庇护所", "精神病院", True))
print("*"* 30)
print(compare_("魔术师", "巫师", True))
print("*"* 30)
print(compare_("中午", "正午", True))
print("*"* 30)
print(compare_("火炉", "炉灶", True))
print("*"* 30)
print(compare_("食物", "水果", True))
print("*"* 30)
print(compare_("鸡", "公鸡", True))
print("*"* 30)
print(compare_("鸟", "鹤", True))
print("*"* 30)
print(compare_("工具", "器械", True))
print("*"* 30)
print(compare_("兄弟", "和尚", True))
print("*"* 30)
print(compare_("起重机", "器械", True))

def test_similarity(self):
Expand Down Expand Up @@ -110,7 +95,6 @@ def test_similarity(self):
sen2 = "巴赫"
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s" % (sen1, sen2), r)


def test_nearby(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='synonyms',
version='2.3',
version='2.5',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
60 changes: 40 additions & 20 deletions synonyms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _get_wv(sentence):
'''
global _vectors
vectors = []
for y in sentence.split():
for y in sentence:
y_ = any2unicode(y).strip()
if y_ not in _stopwords:
syns = nearby(y_)[0]
Expand Down Expand Up @@ -214,13 +214,35 @@ def _levenshtein_distance(sentence1, sentence2):
new_distances[-1])))
distances = new_distances
levenshtein = distances[-1]
dis = float((maxlen - levenshtein)/maxlen)
d = float((maxlen - levenshtein)/maxlen)
# smoothing
s = (sigmoid(dis * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
s = (sigmoid(d * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s))
return s

_smooth = lambda x, y, z: (x * y) + z
def _nearby_levenshtein_distance(s1, s2):
    '''
    Edit-distance based similarity between two tokenized sentences,
    where every word of the shorter sentence is first expanded with its
    nearby (synonym) words before matching.

    s1, s2 : iterables of word tokens (e.g. lists produced by jieba.cut).
    Returns a float score; 0.0 when the expanded word set is empty
    (e.g. the shorter sentence has no words).
    '''
    s1_len = len(s1)
    s2_len = len(s2)
    maxlen = max(s1_len, s2_len)
    # "first" is the shorter sentence, "second" the longer one
    # (ties keep s2 as first, matching the original ordering).
    first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
    # Expand the shorter sentence: every word plus its nearby words.
    ft = set()
    for x in first:
        ft.add(x)
        neighbors, _ = nearby(x)
        ft.update(neighbors)
    if not ft:
        return 0.0  # invalid length for first string
    # For each word of the longer sentence, keep only its best
    # edit-distance score against the expanded word set.
    scores = [max(_levenshtein_distance(x, y) for y in ft)
              for x in second]
    # NOTE(review): assumes _levenshtein_distance returns a smoothed
    # similarity in [0, 1] — confirm against its definition.
    return np.sum(scores) / maxlen

# combine similarity scores
_similarity_smooth = lambda x, y, z: (x * y) + z

def _similarity_distance(s1, s2):
'''
Expand All @@ -230,25 +252,21 @@ def _similarity_distance(s1, s2):
b = _sim_molecule(_get_wv(s2))
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g = 1 / (np.linalg.norm(a - b) + 1)
u = _levenshtein_distance(s1, s2)

u = _nearby_levenshtein_distance(s1, s2)
# print("g: %s, u: %s" % (g, u))
if u > 0.8:
r = _smooth(g, 0.05, u)
r = _similarity_smooth(g, 1, u)
elif u > 0.7:
r = _smooth(g, 0.1, u)
r = _similarity_smooth(g, 1.5, u)
elif u > 0.6:
r = _smooth(g, 0.2, u)
elif u > 0.5:
r = _smooth(g, 1, u)
elif u > 0.4:
r = _smooth(g, 4, u)
r = _similarity_smooth(g, 2, u)
else:
r = _smooth(g, 10, u)
r = _similarity_smooth(g, 4, u)

r = min(r, 1.0)
return float("%.3f" % r)


def compare(s1, s2, seg=True):
'''
compare similarity
Expand All @@ -257,12 +275,15 @@ def compare(s1, s2, seg=True):
seg : True : The original sentences need jieba.cut
False : The original sentences have been cut.
'''
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
if seg:
s1 = ' '.join(jieba.cut(s1))
s2 = ' '.join(jieba.cut(s2))
return _similarity_distance(s1, s2)
s1 = [x for x in jieba.cut(s1)]
s2 = [x for x in jieba.cut(s2)]
else:
s1 = s1.split()
s2 = s2.split()
assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."

return _similarity_distance(s1, s2)

def display(word):
print("'%s'近义词:" % word)
Expand All @@ -273,7 +294,6 @@ def display(word):
for k, v in enumerate(o[0]):
print(" %d. %s:%s" % (k + 1, v, o[1][k]))


def main():
display("人脸")
display("NOT_EXIST")
Expand Down

0 comments on commit 0e5794c

Please sign in to comment.