From 0e5794cfff656875b8270680f65098b07cfdf598 Mon Sep 17 00:00:00 2001
From: Hai Liang Wang
Date: Fri, 2 Mar 2018 11:07:45 +0800
Subject: [PATCH] Leverage distance computing algorithm in compare API

---
 CHANGELOG.md         |  3 +++
 README.md            |  2 +-
 Requirements.txt     |  2 +-
 demo.py              | 18 +------------
 setup.py             |  2 +-
 synonyms/__init__.py | 60 +++++++++++++++++++++++++++++---------------
 6 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7a36266..e581f97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,6 @@
+# 2.5
+* Optimize edit-distance computation with words that are close in the vector space
+
 # 2.3
 * Add a smoothing strategy to similarity computation
 
diff --git a/README.md b/README.md
index 31378b9..1a8988c 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
 ```
 pip install -U synonyms
 ```
-Compatible with py2 and py3; current stable release: v2.3. **Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
+Compatible with py2 and py3; current stable release: [v2.x](https://github.com/huyingxi/Synonyms/releases). **Node.js users can use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
 
 ```
 npm install node-synonyms
diff --git a/Requirements.txt b/Requirements.txt
index dd85757..47bdce6 100644
--- a/Requirements.txt
+++ b/Requirements.txt
@@ -1 +1 @@
-synonyms>=2.3
\ No newline at end of file
+synonyms>=2.5
\ No newline at end of file
diff --git a/demo.py b/demo.py
index 9e2d94c..907747f 100755
--- a/demo.py
+++ b/demo.py
@@ -36,7 +36,7 @@
 import numpy
 import unittest
 
-compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
+compare_ = lambda x,y,z: "*"* 30 + "\n%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
 
 # run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
 class Test(unittest.TestCase):
@@ -52,35 +52,20 @@ def tearDown(self):
 
     def test_pairs(self):
         print("test_pairs")
-        print("*"* 30)
         print(compare_("轿车", "汽车", True))
-        print("*"* 30)
         print(compare_("宝石", "宝物", True))
-        print("*"* 30)
         print(compare_("旅游", "游历", True))
-        print("*"* 30)
         print(compare_("男孩子", "小伙子", True))
-        print("*"* 30)
         print(compare_("海岸", "海滨", True))
-        print("*"* 30)
         print(compare_("庇护所", "精神病院", True))
-        print("*"* 30)
         print(compare_("魔术师", "巫师", True))
-        print("*"* 30)
         print(compare_("中午", "正午", True))
-        print("*"* 30)
         print(compare_("火炉", "炉灶", True))
-        print("*"* 30)
         print(compare_("食物", "水果", True))
-        print("*"* 30)
         print(compare_("鸡", "公鸡", True))
-        print("*"* 30)
         print(compare_("鸟", "鹤", True))
-        print("*"* 30)
         print(compare_("工具", "器械", True))
-        print("*"* 30)
         print(compare_("兄弟", "和尚", True))
-        print("*"* 30)
         print(compare_("起重机", "器械", True))
 
     def test_similarity(self):
@@ -110,7 +95,6 @@ def test_similarity(self):
         sen2 = "巴赫"
         r = synonyms.compare(sen1, sen2, seg=True)
         print("%s vs %s" % (sen1, sen2), r)
-
     def test_nearby(self):
         synonyms.display("人脸")  # synonyms.display calls synonyms.nearby
 
diff --git a/setup.py b/setup.py
index 611f7ce..a6485ad 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 
 setup(
     name='synonyms',
-    version='2.3',
+    version='2.5',
     description='Chinese Synonyms for Natural Language Processing and Understanding',
     long_description=LONGDOC,
     author='Hai Liang Wang, Hu Ying Xi',
diff --git a/synonyms/__init__.py b/synonyms/__init__.py
index b1b4008..31e9c30 100755
--- a/synonyms/__init__.py
+++ b/synonyms/__init__.py
@@ -149,7 +149,7 @@ def _get_wv(sentence):
     '''
     global _vectors
     vectors = []
-    for y in sentence.split():
+    for y in sentence:
         y_ = any2unicode(y).strip()
         if y_ not in _stopwords:
             syns = nearby(y_)[0]
@@ -214,13 +214,35 @@ def _levenshtein_distance(sentence1, sentence2):
                                               new_distances[-1])))
         distances = new_distances
     levenshtein = distances[-1]
-    dis = float((maxlen - levenshtein)/maxlen)
+    d = float((maxlen - levenshtein)/maxlen)
     # smoothing
-    s = (sigmoid(dis * 6) - 0.5) * 2
+    s = (sigmoid(d * 6) - 0.5) * 2
-    # print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
+    # print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s))
     return s
 
-_smooth = lambda x, y, z: (x * y) + z
+def _nearby_levenshtein_distance(s1, s2):
+    '''
+    edit distance, with each word expanded by its nearby words
+    '''
+    s1_len = len(s1)
+    s2_len = len(s2)
+    maxlen = max(s1_len, s2_len)
+    first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
+    ft = set()  # all related words of the first sentence
+    for x in first:
+        ft.add(x)
+        n, _ = nearby(x)
+        for o in n:
+            ft.add(o)
+    scores = []
+    if len(ft) == 0: return 0.0  # first sentence has no usable words
+    for x in second:
+        scores.append(max([_levenshtein_distance(x, y) for y in ft]))
+    s = np.sum(scores) / maxlen
+    return s
+
+# combine similarity scores
+_similarity_smooth = lambda x, y, z: (x * y) + z
 
 def _similarity_distance(s1, s2):
     '''
@@ -230,25 +252,21 @@ def _similarity_distance(s1, s2):
     b = _sim_molecule(_get_wv(s2))
     # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
     g = 1 / (np.linalg.norm(a - b) + 1)
-    u = _levenshtein_distance(s1, s2)
+
+    u = _nearby_levenshtein_distance(s1, s2)
     # print("g: %s, u: %s" % (g, u))
     if u > 0.8:
-        r = _smooth(g, 0.05, u)
+        r = _similarity_smooth(g, 1, u)
     elif u > 0.7:
-        r = _smooth(g, 0.1, u)
+        r = _similarity_smooth(g, 1.5, u)
     elif u > 0.6:
-        r = _smooth(g, 0.2, u)
-    elif u > 0.5:
-        r = _smooth(g, 1, u)
-    elif u > 0.4:
-        r = _smooth(g, 4, u)
+        r = _similarity_smooth(g, 2, u)
     else:
-        r = _smooth(g, 10, u)
+        r = _similarity_smooth(g, 4, u)
     r = min(r, 1.0)
     return float("%.3f" % r)
-
 def compare(s1, s2, seg=True):
     '''
     compare similarity
     s1 : sentence1
     s2 : sentence2
@@ -255,11 +273,14 @@ def compare(s1, s2, seg=True):
     seg : True : The original sentences need jieba.cut
           False : The original sentences have been cut.
     '''
-    assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
     if seg:
-        s1 = ' '.join(jieba.cut(s1))
-        s2 = ' '.join(jieba.cut(s2))
-    return _similarity_distance(s1, s2)
+        s1 = [x for x in jieba.cut(s1)]
+        s2 = [x for x in jieba.cut(s2)]
+    else:
+        s1 = s1.split()
+        s2 = s2.split()
+    assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
+    return _similarity_distance(s1, s2)
 
 def display(word):
     print("'%s'近义词:" % word)
@@ -273,7 +294,6 @@ def display(word):
     for k, v in enumerate(o[0]):
         print("  %d. %s:%s" % (k + 1, v, o[1][k]))
 
-
 def main():
     display("人脸")
     display("NOT_EXIST")
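
Note for reviewers: below is a self-contained sketch of the idea behind the new `_nearby_levenshtein_distance`. It mirrors the patch's structure (expand one sentence's words with their nearby words, take the best match per word of the other sentence, normalize by the longer length), but `TOY_NEARBY`, `levenshtein_ratio`, and `nearby_levenshtein` are hypothetical stand-ins written for this note, not the patched code; the library looks words up via `synonyms.nearby()` and additionally smooths the per-word ratio with a sigmoid, and its public entry point stays `synonyms.compare(s1, s2, seg=True)`.

```
# -*- coding: utf-8 -*-
# Standalone sketch of the synonym-expanded edit distance added above.
# TOY_NEARBY, levenshtein_ratio and nearby_levenshtein are illustrative
# stand-ins, not the library's implementation.

def levenshtein_ratio(w1, w2):
    '''Edit distance between two words, scaled to [0, 1]; 1.0 = identical.'''
    m, n = len(w1), len(w2)
    if max(m, n) == 0:
        return 0.0
    row = list(range(n + 1))
    for i in range(1, m + 1):
        prev, row[0] = row[0], i
        for j in range(1, n + 1):
            cur = min(row[j] + 1,                       # deletion
                      row[j - 1] + 1,                   # insertion
                      prev + (w1[i - 1] != w2[j - 1]))  # substitution
            prev, row[j] = row[j], cur
    return float(max(m, n) - row[n]) / max(m, n)

# toy synonym table standing in for synonyms.nearby()
TOY_NEARBY = {u"旅游": [u"游历", u"观光"], u"游历": [u"旅游", u"游览"]}

def nearby_levenshtein(s1, s2, nearby=lambda w: TOY_NEARBY.get(w, [])):
    '''Expand the shorter tokenized sentence with nearby words, then score
    each word of the longer one against its best match in that set.'''
    maxlen = max(len(s1), len(s2))
    first, second = (s2, s1) if len(s1) == maxlen else (s1, s2)
    candidates = set(first)
    for w in first:
        candidates.update(nearby(w))
    if not candidates:
        return 0.0
    scores = [max(levenshtein_ratio(w, c) for c in candidates)
              for w in second]
    return sum(scores) / maxlen

if __name__ == '__main__':
    print(levenshtein_ratio(u"旅游", u"游历"))       # 0.0 without synonyms
    print(nearby_levenshtein([u"旅游"], [u"游历"]))  # 1.0 via the toy table
```

Taking the max over the expanded candidate set is what lifts pairs like 旅游/游历: their plain character-level ratio is 0.0, but once the synonym lookup links them, one candidate matches exactly and contributes a full 1.0/maxlen to the sum, which `_similarity_distance` then blends with the word-vector distance through `_similarity_smooth`.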