forked from liuhuanyong/SentenceSimilarity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sim_hownet.py
66 lines (56 loc) · 2.33 KB
/
sim_hownet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python3
# coding: utf-8
# File: sim_hownet.py
# Author: lhy<[email protected],https://huangyong.github.io>
# Date: 18-4-27
import jieba.posseg as pseg
class SimHownet:
    """Sentence similarity built on a HowNet-style semantic dictionary.

    The dictionary maps each word to the comma-separated parts of its
    definition (DEF). Word similarity is the best Jaccard overlap between
    any pair of DEF parts; sentence similarity averages, for every word of
    one sentence, its best match in the other sentence.
    """

    def __init__(self, semantic_path='model/hownet.dat'):
        """Load the semantic dictionary.

        Args:
            semantic_path: Path of the dictionary file. Defaults to
                'model/hownet.dat', resolved against the current working
                directory.
        """
        self.semantic_path = semantic_path
        self.semantic_dict = self.load_semanticwords()

    def load_semanticwords(self):
        """Load the semantic dictionary from ``self.semantic_path``.

        Every line is split on spaces, tabs and '>' (empty fields are
        dropped). The first field is the word and the third field is its
        definition, which is further split on ','. When a word occurs on
        several lines, the last occurrence wins.

        Returns:
            dict mapping each word to the list of its definition parts.
        """
        semantic_dict = {}
        # Explicit encoding so decoding does not depend on the platform
        # locale (assumes the data file is UTF-8, like this source file);
        # the context manager closes the handle deterministically.
        with open(self.semantic_path, encoding='utf-8') as handle:
            for line in handle:
                normalized = line.strip().replace(' ', '>').replace('\t', '>')
                fields = [field for field in normalized.split('>') if field != '']
                # Blank or truncated lines carry no definition; skip them
                # instead of failing with IndexError.
                if len(fields) < 3:
                    continue
                semantic_dict[fields[0]] = fields[2].split(',')
        return semantic_dict

    def calculate_semantic(self, DEF1, DEF2):
        """Return the Jaccard similarity of the element sets of two DEFs.

        Args:
            DEF1: Iterable of hashable items (a string is treated as a
                collection of its characters).
            DEF2: Same as ``DEF1``.

        Returns:
            ``len(intersection) / len(union)`` as a float, or 0.0 when both
            inputs are empty (there is nothing to compare).
        """
        first, second = set(DEF1), set(DEF2)
        union = first | second
        if not union:
            return 0.0
        return len(first & second) / len(union)

    def compute_similarity(self, word1, word2):
        """Return the similarity of two words.

        Every definition part of ``word1`` is compared with every definition
        part of ``word2`` and the best score is kept.

        NOTE(review): the dictionary stores definition parts as plain
        strings, so ``calculate_semantic`` compares their character sets.
        That matches the original behaviour; confirm it is the intended
        granularity.

        Returns:
            The highest pairwise score, or 0.0 when either word is missing
            from the dictionary.
        """
        defs1 = self.semantic_dict.get(word1, [])
        defs2 = self.semantic_dict.get(word2, [])
        return max(
            (self.calculate_semantic(def1, def2) for def1 in defs1 for def2 in defs2),
            default=0.0,
        )

    def _content_words(self, text):
        """Segment ``text`` and drop tokens whose POS flag starts with u/x/w.

        Presumably these flags mark particles, non-words and punctuation in
        jieba's tag set; verify against the jieba documentation.
        """
        return [
            token.word
            for token in pseg.cut(text)
            if not token.flag.startswith(('u', 'x', 'w'))
        ]

    def distance(self, text1, text2):
        """Return the similarity of two sentences.

        Despite the name, a larger value means the sentences are more
        similar. For each direction the best match of every word in the
        other sentence is averaged, and the larger of the two averages is
        returned.

        Args:
            text1: First sentence.
            text2: Second sentence.

        Returns:
            The similarity as a float; 0.0 when either sentence has no
            words left after filtering.
        """
        words1 = self._content_words(text1)
        words2 = self._content_words(text2)
        # max() over an empty sequence and division by zero both blow up,
        # so sentences without usable words are scored as dissimilar.
        if not words1 or not words2:
            return 0.0
        # Word similarity is symmetric, so score each pair once and read the
        # table by rows (text1 -> text2) and by columns (text2 -> text1).
        table = [
            [self.compute_similarity(word1, word2) for word2 in words2]
            for word1 in words1
        ]
        score_1_to_2 = sum(max(row) for row in table) / len(words1)
        score_2_to_1 = sum(max(column) for column in zip(*table)) / len(words2)
        return max(score_1_to_2, score_2_to_1)
def test():
    """Smoke test: print the similarity of two sample sentences.

    Builds a ``SimHownet`` with its default dictionary path, so the data
    file must be reachable from the current working directory.

    Another sample pair that can be substituted:
    '南昌是江西的省会' / '北京乃中国之首都'.
    """
    simer = SimHownet()
    text1 = '周杰伦是一个歌手'
    text2 = '刘若英是个演员'
    sim = simer.distance(text1, text2)
    print(sim)
# Run the demo only when executed as a script, so importing this module
# does not load the dictionary and print a score as a side effect.
if __name__ == '__main__':
    test()