-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathknp2json.py
executable file
·136 lines (119 loc) · 4.42 KB
/
knp2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import re
import json
def execute(knp_tab_output):
# phrase = {type, relation, relationType, basics, morphemes, features}
# basic = {type, relation, relationType, phrase, features [, case, caseAnalysis]}
# morpheme = {type, phrase, morpheme, features}
results = []
lines = knp_tab_output.split('\n')
for l in lines:
if len(l) < 1:
continue
if l[0] == "#":
phrases = []
basics = []
morphemes = []
continue
if l == "EOS":
results.append({"phrases": phrases, "basics": basics, "morphemes": morphemes})
continue
d = {}
f = re.split('>?<?', l)
if l[0] == '*':
d['features'] = f[1:-1] # 最初は係り受けの情報、最後は空
d['relation'] = int(f[0][2:-2])
d['relationType'] = f[0][-2]
d['basics'] = []
d['morphemes'] = []
phrases.append(d)
elif l[0] == '+':
d['phrase'] = len(phrases) - 1
d.update(_analyze_basic(f[:-1]))
basics.append(d)
phrases[-1]['basics'].append(len(basics)-1)
else:
d['phrase'] = len(phrases) - 1
d.update(_analyze_morpheme(f[:-1]))
morphemes.append(d)
phrases[-1]['morphemes'].append(len(morphemes)-1)
return results
def _analyze_basic(basic_info):
d = {'relation': int(basic_info[0][2:-2]), 'relationType': basic_info[0][-2]}
basic_info = basic_info[1:]
removed = []
for f in basic_info:
splited = f.split(':')
if len(splited) > 1:
removed.append(f)
if splited[0] == "解析格":
d['case'] = splited[1]
if splited[0] == "格解析結果":
d['caseAnalysis'] = _analyze_case_analysis(splited[-1])
for f in removed:
basic_info.remove(f)
d['features'] = basic_info
return d
def _analyze_case_analysis(case_analysis_string):
elements = case_analysis_string.split(';')
d = {}
for element in elements:
splited = element.split('/')
e = {'flag': splited[1],
'expression': splited[2],
'#basics': _change_int(splited[3]),
'sentenceId': _change_int(splited[5])}
d[splited[0]] = e
return d
def _change_int(int_string):
if int_string.isdigit():
return int(int_string)
else:
return int_string
def _analyze_morpheme(morpheme_info):
s = morpheme_info[0].split(' ')
d = {'input': s[0], 'pronunciation': s[1], 'original': s[2], 'pos': s[3], 'posId': s[4], 'subPos': s[5],
'subPosId': s[6], 'inflectionType': s[7], 'inflectionTypeId': s[8], 'inflection': s[9], 'inflectionId': s[10],
'others': s[11]}
morpheme_info = morpheme_info[1:]
removed = []
for f in morpheme_info:
splited = f.split(':')
if len(splited) > 1:
removed.append(f)
if splited[0] == 'Wikipediaエントリ':
d['wikipedia'] = splited[1]
else:
d[splited[0]] = splited[1:]
for f in removed:
morpheme_info.remove(f)
d['features'] = morpheme_info
return d
def show_analyzed_knp_info(analyzed_knp_info):
phrases = analyzed_knp_info["phrases"]
basics = analyzed_knp_info["basics"]
morphemes = analyzed_knp_info["morphemes"]
print("### 文節 ###")
attr = ['relation', 'relationType', 'basics', 'morphemes', 'features', 'type']
for i in range(len(phrases)):
print(str(i) + ': ' + _convert_dictionary_to_string(phrases[i], attr))
print("### 形態素 ###")
attr = ['phrase', 'input', 'pos', 'subPos', 'wikipedia', 'features', 'type']
for i in range(len(morphemes)):
print(str(i) + ': ' + _convert_dictionary_to_string(morphemes[i], attr))
print("### 基本句 ###")
attr = ['relation', 'relationType', 'phrase', 'case', 'caseAnalysis', 'features', 'type']
for i in range(len(basics)):
print(str(i) + ': ' + _convert_dictionary_to_string(basics[i], attr))
def _convert_dictionary_to_string(dictionary, keys):
s = '{'
for k in keys:
if k in dictionary.keys():
s += '\'' + k + '\':' + str(dictionary[k]) + ', '
s += '}'
return s
if __name__ == "__main__":
syntactic_info = execute(sys.stdin.read())
print(json.dumps(syntactic_info, indent=2, ensure_ascii=False))