Skip to content

Commit

Permalink
Closed #43 smoothing scores in compare API
Browse files Browse the repository at this point in the history
  • Loading branch information
hailiang-wang committed Mar 1, 2018
1 parent 3e872fe commit dac98aa
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 15 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# 2.3
* 计算相似度时增加平滑策略

# v1.6
* use ```jieba``` instead of ```thulac``` as tokenizer.
* refine console log for Jupyter notebook.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
兼容py2和py3,当前稳定版本 v2.2**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**
兼容py2和py3,当前稳定版本 v2.3**同时,Node.js 用户可以使用 [node-synonyms](https://www.npmjs.com/package/node-synonyms)了。**

```
npm install node-synonyms
Expand Down Expand Up @@ -103,10 +103,10 @@ data is built based on [wikidata-corpus](https://github.com/Samurais/wikidata-co
## Valuation

### 同义词词林
《同义词词林》是梅家驹等人于1983年编纂而成,现在使用广泛的是哈工大社会计算与信息检索研究中心维护的《同义词词林扩展版》,它精细的将中文词汇划分成大类和小类,梳理了词汇间的关系,同义词词林扩展版包含词语77,343条,其中32,470被以开放数据形式共享
《同义词词林》是梅家驹等人于1983年编纂而成,现在使用广泛的是哈工大社会计算与信息检索研究中心维护的《同义词词林扩展版》,它精细地将中文词汇划分成大类和小类,梳理了词汇间的关系,同义词词林扩展版包含词语7万余条,其中3万余条被以开放数据形式共享

### 知网, HowNet
HowNet,也被称为知网,它并不只是一个语义字典,而是一个知识系统,词汇之间的关系是其一个基本使用场景。知网包含词语8,265条
HowNet,也被称为知网,它并不只是一个语义字典,而是一个知识系统,词汇之间的关系是其一个基本使用场景。知网包含词语8千余条

国际上对词语相似度算法的评价标准普遍采用 Miller&Charles 发布的英语词对集的人工判定值。该词对集由十对高度相关、十对中度相关、十对低度相关共 30 个英语词对组成,然后让38个受试者对这30对进行语义相关度判断,最后取他们的平均值作为人工判定标准。然后不同近义词工具也对这些词汇进行相似度评分,与人工判定标准做比较,比如使用皮尔森相关系数。在中文领域,使用这个词表的翻译版进行中文近义词比较也是常用的办法。

Expand All @@ -115,7 +115,7 @@ Synonyms的词表容量是125,792,下面选择一些在同义词词林、知

![](./assets/5.png)

注:同义词林及知网数据、分数来源, https://github.com/yaleimeng/Final_word_Similarity
注:同义词林及知网数据、分数来源, https://github.com/yaleimeng/Final_word_Similarity;Synonyms也在不断优化中,新的分数可能和上图不一致。

## Benchmark

Expand Down
2 changes: 1 addition & 1 deletion Requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
synonyms>=2.0
synonyms>=2.3
43 changes: 41 additions & 2 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@
import numpy
import unittest

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample

def compare_(x, y, z):
    '''
    Format the similarity of x and y as "x vs y: score".
    z is forwarded to synonyms.compare as its seg argument.
    '''
    score = synonyms.compare(x, y, seg=z)
    return "%s vs %s: %f" % (x, y, score)

# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
'''
Expand All @@ -50,6 +50,39 @@ def setUp(self):
def tearDown(self):
pass

def test_pairs(self):
    '''
    Print the compare_ result for a fixed list of word pairs,
    each preceded by a separator line.
    '''
    word_pairs = (
        ("轿车", "汽车"),
        ("宝石", "宝物"),
        ("旅游", "游历"),
        ("男孩子", "小伙子"),
        ("海岸", "海滨"),
        ("庇护所", "精神病院"),
        ("魔术师", "巫师"),
        ("中午", "正午"),
        ("火炉", "炉灶"),
        ("食物", "水果"),
        ("鸡", "公鸡"),
        ("鸟", "鹤"),
        ("工具", "器械"),
        ("兄弟", "和尚"),
        ("起重机", "器械"),
    )
    separator = "*" * 30
    print("test_pairs")
    for left, right in word_pairs:
        print(separator)
        print(compare_(left, right, True))

def test_similarity(self):
'''
Generate sentence similarity
Expand All @@ -73,6 +106,12 @@ def test_similarity(self):
print("发生历史性变革 vs 发生历史性变革:", r)
# assert r > 0, "the similarity should be bigger than zero"

sen1 = "骨折"
sen2 = "巴赫"
r = synonyms.compare(sen1, sen2, seg=True)
print("%s vs %s" % (sen1, sen2), r)


def test_nearby(self):
synonyms.display("人脸") # synonyms.display calls synonyms.nearby

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='synonyms',
version='2.2',
version='2.3',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
Expand Down
34 changes: 27 additions & 7 deletions synonyms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from synonyms.word2vec import KeyedVectors
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
from synonyms.utils import sigmoid
import jieba.posseg as _tokenizer
import jieba

Expand All @@ -58,7 +59,6 @@
_vectors = None
_stopwords = set()


'''
nearby
'''
Expand Down Expand Up @@ -195,10 +195,13 @@ def _levenshtein_distance(sentence1, sentence2):
Based on:
http://rosettacode.org/wiki/Levenshtein_distance#Python
'''
first = sentence1.split()
second = sentence2.split()
if len(first) > len(second):
first = any2utf8(sentence1).decode('utf-8', 'ignore')
second = any2utf8(sentence2).decode('utf-8', 'ignore')
sentence1_len, sentence2_len = len(first), len(second)
maxlen = max(sentence1_len, sentence2_len)
if sentence1_len > sentence2_len:
first, second = second, first

distances = range(len(first) + 1)
for index2, char2 in enumerate(second):
new_distances = [index2 + 1]
Expand All @@ -211,8 +214,13 @@ def _levenshtein_distance(sentence1, sentence2):
new_distances[-1])))
distances = new_distances
levenshtein = distances[-1]
return 2 ** (-1 * levenshtein)
dis = float((maxlen - levenshtein)/maxlen)
# smoothing
s = (sigmoid(dis * 6) - 0.5) * 2
# print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
return s

def _smooth(x, y, z):
    '''
    Return x scaled by y and offset by z, i.e. (x * y) + z.
    '''
    scaled = x * y
    return scaled + z

def _similarity_distance(s1, s2):
'''
Expand All @@ -223,9 +231,21 @@ def _similarity_distance(s1, s2):
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g = 1 / (np.linalg.norm(a - b) + 1)
u = _levenshtein_distance(s1, s2)
r = g * 5 + u * 0.8
r = min(r, 1.0)
# print("g: %s, u: %s" % (g, u))
if u > 0.8:
r = _smooth(g, 0.05, u)
elif u > 0.7:
r = _smooth(g, 0.1, u)
elif u > 0.6:
r = _smooth(g, 0.2, u)
elif u > 0.5:
r = _smooth(g, 1, u)
elif u > 0.4:
r = _smooth(g, 4, u)
else:
r = _smooth(g, 10, u)

r = min(r, 1.0)
return float("%.3f" % r)


Expand Down
2 changes: 2 additions & 0 deletions synonyms/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,8 @@ def any2unicode(text, encoding='utf8', errors='strict'):

to_unicode = any2unicode

def sigmoid(x):
    """
    Logistic sigmoid: 1 / (1 + exp(-x)).

    `x` is passed to `np.exp`, so it may be a scalar or an array; the
    result has the corresponding NumPy type.
    """
    # For large negative x, exp(-x) overflows to inf. The quotient is
    # still the correct limit (0.0), so silence only the overflow
    # warning instead of letting it leak to callers.
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))

def call_on_class_only(*args, **kwargs):
"""Raise exception when load methods are called on instance"""
Expand Down

0 comments on commit dac98aa

Please sign in to comment.