test_2.py
# Extract who said what from the text: return (person name, statement) pairs.
import os
import sys
import re

import jieba
import jieba.posseg as psg
from stanfordcorenlp import StanfordCoreNLP
from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

# global nlp
# nlp = StanfordCoreNLP(r'D:/stanford/stanford-corenlp-full-2018-10-05', lang='zh')
def get_say(filename):
    '''Load the lexicon of "say"-type verbs: one or more comma-separated words per line.'''
    res = []
    with open(filename, "r", encoding="utf-8") as f_r:
        for line in f_r:
            line = line.strip("\n")   # drop the trailing newline
            lines = line.split(",")
            res.extend([data for data in lines if data])
    return list(set(res))
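# Hedged usage sketch (the file name and contents below are made up for illustration; the loop above only
# implies that the lexicon file holds comma-separated "say" verbs, one group per line):
#
#     with open("say_example.txt", "w", encoding="utf-8") as f:
#         f.write("说,表示,认为,说道\n")
#     print(get_say("say_example.txt"))   # -> ['说', '表示', '认为', '说道'] (set order may differ)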
def raplace_line_feed(sentence):
    '''Replace full-width (ideographic) spaces with ordinary spaces.'''
    return sentence.replace("\u3000", " ")
def more_space_to_one(sentence):
    '''Collapse consecutive whitespace tokens into a single one.'''
    sen = jieba.lcut(sentence)
    new_data = []
    spaces = (" ", "\u3000")   # half-width and full-width space
    for data in sen:
        if new_data:
            if new_data[-1] not in spaces:
                new_data.append(data)
            elif data not in spaces:
                new_data.append(data)
        else:
            new_data.append(data)
    return "".join(new_data)
def get_name(netags, words):
    '''Collect person names: words whose LTP named-entity tag ends in "Nh" (person).'''
    # LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # # word segmentation
    # segmentor = Segmentor()  # initialize
    # segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # words = segmentor.segment(line)  # segment
    # # POS tagging
    # postagger = Postagger()  # initialize
    # postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    # postags = postagger.postag(words)
    # # postags = postagger.postag(['中国', '进出口', '银行', '与', '中国银行', '加强', '合作', '。'])
    res = []
    # # named-entity recognition
    # recognizer = NamedEntityRecognizer()  # instantiate
    # recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    # netags = recognizer.recognize(words, postags)
    print(list(netags))
    for i, data in enumerate(list(netags)):
        if data[2:] == "Nh":
            res.append(words[i])
    return list(set(res))
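# Hedged example of the tag layout this function expects (actual values depend on the installed LTP
# models): LTP NE tags combine a position prefix (S-/B-/I-/E-, or O for non-entities) with an entity
# type, and "Nh" marks a person, so data[2:] == "Nh" matches person tokens regardless of prefix.
#
#     words  = ['刘春玲', '说', '这里', '没有', '东西']
#     netags = ['S-Nh', 'O', 'O', 'O', 'O']   # typical recognizer output for this segmentation
#     print(get_name(netags, words))          # -> ['刘春玲']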
def get_one_name(new_sentence):
    '''Extract person-name entities from the sentence with jieba POS tagging.'''
    name_list = []
    # new_sentence = raplace_line_feed(new_sentence)
    # new_sentence = more_space_to_one(new_sentence)
    ner_data = psg.lcut(new_sentence)
    i = 0
    for w, tag in ner_data:
        if tag == "nr":   # "nr" is jieba's person-name tag
            name_list.append(w)
        # elif tag == "n":
        #     if i != 0 and list(ner_data[i - 1])[-1] == "nr":
        #         name_list[-1] += w
        #     else:
        #         name_list.append(w)
        i += 1
    return list(set(name_list))
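# Hedged example (jieba's dictionary decides the tags, so results can vary between versions): for a
# sentence such as "刘春玲说这里没有东西", 刘春玲 is usually tagged "nr", so
#
#     print(get_one_name("刘春玲说这里没有东西"))   # -> ['刘春玲'] with a typical jieba dictionary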
def get_dependency_word(sentence):
    '''Run Stanford CoreNLP dependency parsing and map word indices back to the words themselves.'''
    # global nlp
    parse_res = []
    nlp = StanfordCoreNLP(r'D:/stanford/stanford-corenlp-full-2018-10-05', lang='zh')
    word = nlp.word_tokenize(sentence)
    # print(nlp.parse(sentence))
    res = nlp.dependency_parse(sentence)
    new_data = []
    new_data.append("ROOT")   # index 0 in the parse output refers to the virtual ROOT node
    # print(new_data)
    new_data += word
    # print(new_data)
    for i, w in enumerate(res):
        # print([w[0], new_data[int(w[1])], new_data[int(w[2])]])
        parse_res.append([w[0], new_data[int(w[1])], new_data[int(w[2])]])
    nlp.close()
    return parse_res
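# Hedged sketch of the return shape: nlp.dependency_parse() yields (relation, head_index, dependent_index)
# triples whose indices are 1-based positions in the tokenized sentence, with 0 reserved for ROOT, which
# is why "ROOT" is prepended above. For a sentence like "刘春玲说这里没有东西" the resolved triples would
# look roughly like
#
#     [['ROOT', 'ROOT', '说'], ['nsubj', '说', '刘春玲'], ...]   # [relation, head word, dependent word]
#
# The exact relation labels depend on the CoreNLP Chinese models.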
def judge_which_say(say_words, sentence):
    '''Return the "say" verbs from the lexicon that occur in the sentence.'''
    res_words = []
    for word in say_words:
        if word in jieba.lcut(sentence):
            res_words.append(word)
    return list(set(res_words))
# def judge_who_say_what(have_say, name_list, sentence):
#     '''Decide who said what.'''
#     if len(have_say) ==
def get_say_sentence(sentence, say):
    '''Return the shortest fragments of the sentence that contain the "say" word.'''
    senlist = []
    new_sen = sentence.split("\n")   # split on real newlines
    # print(new_sen)
    # res = new_sen
    # c = 0
    # for i, sen in enumerate(new_sen):
    #     if sen.count('"') % 2 != 0:
    #         c += 1
    #         if c % 2 == 0:
    #             res[i - 1] += sen
    #             res[i] = res[i - 1]
    #     else:
    #         res.append(sen)
    # senlist = []
    # print(new_sen)
    # for sen in new_sen:
    #     if say in sen:
    #         senlist.append(sen + "。")
    for sen in new_sen:
        if say in sen:
            senlist.append(sen)
    return senlist
def second_say_words(sentence, say, per):
    '''Extract what person "per" said via the "say" verb, using regular expressions.'''
    # print(sentence)
    # print(say)
    # print(per)
    per = per.replace("\\", "")   # drop backslashes so the name can be embedded in a regex
    # Quoted statement followed by the person and the "say" verb: “...”,<per>...<say>
    res2 = re.findall(r'(?:“|")([\s\S]*?)(?:”|")(?:,|,)?' + r"(?:[^,.。,)]*?)" + per + r"(?:[^,.。,)]*?)" + say, sentence)
    # Person, then the "say" verb, then a comma/colon and the statement up to the next 。
    res1 = re.findall(per + r"(?:[^,.。))]*?)" + say + r"(?:[^,.。]*?)" + r"(?:,|:|:|,)([\s\S]*?)。", sentence)
    if res2 != []:
        return res2[0]
    if res1 != []:
        return res1[0]
    return ""
    # print("hello")
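# Hedged examples of the two patterns (the sample sentences are made up for illustration):
#
#     # person + "say" verb + colon + statement, matched by res1:
#     second_say_words("刘春玲说:这里没有东西。", "说", "刘春玲")    # -> "这里没有东西"
#     # quoted statement + person + "say" verb, matched by res2:
#     second_say_words("“这里没有东西”,刘春玲说。", "说", "刘春玲")   # -> "这里没有东西"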
def judge_parse(parse_list, say, sentence):
    '''Use the dependency parse to decide who is the subject of the "say" verb and what was said.'''
    says = ""
    idex = 0
    name = ""
    for i, data in enumerate(parse_list):
        if data[0] == "nsubj" and data[1] == say:
            if get_one_name(data[-1]) == [data[-1]]:
                # idex = i
                name = data[-1]
        if data[0] == "punct":
            idex = i
        if i > idex and data[0] != "punct" and idex != 0:
            says += data[-1]
        else:
            says = second_say_words(sentence, say, name)
    return says, name
# A person's speech comes in three shapes: 1. the simplest, where the name is immediately followed by a
# "say" verb and the statement follows the verb; 2. the name is followed by a "say" verb and the quoted
# statement precedes the name; 3. any other format.
def get_some_idea(sentence, name_list):
    '''Use the lexicon of "say" verbs to collect what each person said.'''
    global say_words
    # print(say_words)
    # new_sentence = raplace_line_feed(sentence)
    # new_sentence = more_space_to_one(new_sentence)
    # name_list = get_name(new_sentence)
    have_say = judge_which_say(say_words, sentence)
    # print(have_say)
    say_words_list = {}
    if have_say != [] and name_list != []:
        # for name in name_list:
        #     jieba.lcut()
        for say in have_say:
            sen_part = get_say_sentence(sentence, say)
            # print("=====", sen_part)
            for onesen in sen_part:
                # parse = get_dependency_word(onesen)  ######
                # print(parse)
                for name in name_list:
                    # for say in have_say:
                    #     says, name = judge_parse(parse, say, onesen)
                    ls = second_say_words(onesen, say, name)
                    if ls:
                        if name not in say_words_list:
                            say_words_list[name] = []
                        say_words_list[name].append((say, ls))
                    # say_words, name = judge_parse(parse, say, onesen)  ########
                    # if name:
                    #     if name not in say_words_list:
                    #         say_words_list[name] = []
                    #     say_words_list[name].append((say, say_words))
    else:
        return {}   # keep the return type consistent with the non-empty case
    return say_words_list
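# Hedged example of the return shape, assuming say_words contains "说" and jieba keeps 说 as its own token:
#
#     get_some_idea("“这里没有东西”,刘春玲说。", ["刘春玲"])
#     # -> {'刘春玲': [('说', '这里没有东西')]}
#
# i.e. a dict mapping each detected speaker to (say-verb, statement) pairs, or an empty dict when no
# "say" verb or no name is found.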
def get_all_name(r_filename, w_file):
    '''Read the corpus line by line, find person names with LTP, and write "name<TAB>statements" pairs.'''
    # global nlp
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # word segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # words = segmentor.segment(line)  # segment
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    # postags = postagger.postag(words)
    # postags = postagger.postag(['中国', '进出口', '银行', '与', '中国银行', '加强', '合作', '。'])
    # res = []
    # named-entity recognition
    recognizer = NamedEntityRecognizer()  # instantiate
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))
    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        line = line.strip("\n")
        line = raplace_line_feed(line)
        line = more_space_to_one(line)
        print(line)
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        name_list = get_name(netags, words)
        if name_list != []:
            print(name_list)
            sen = get_some_idea(line, name_list)
            print(sen)
            if sen:
                for key in sen:
                    # print(sen[key])
                    sens = "\t".join(list(set([data[1] for data in sen[key]])))
                    f_w.write(key + "\t" + sens + "\n")
    # nlp.close()
    segmentor.release()   # free the LTP models
    postagger.release()
    recognizer.release()
    f_r.close()
    f_w.close()
if __name__ == "__main__":
    # get_all_name("lclnew.txt", "name.txt")
    global say_words   # say_words is read as a module-level global by get_some_idea
    say_words = get_say("say.txt")
    # print(get_dependency_word("刘春玲慷慨激昂的说道,这里没有东西"))
    # res = get_one_name("刘春玲说了什么话")
    get_all_name("lclnew.txt", "sentences.txt")
    # print(s)