#!/usr/bin/python -W ignore
#
# This script provides named entity recognition with the NER-tagger from Stanford NLP.
#
# Each line is tokenized into sentences -- a few lines are not correctly split into sentences by cc-segment-stories.
#
# http://www-nlp.stanford.edu/software/CRF-NER.shtml
# http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
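#
# Requires the pyner client (pip install ner) and the pattern package (pip install pattern),
# plus two Stanford NERServer instances listening on ports 2020 and 2021 -- see the note below.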
#
# 20130101012604.000|20130101012626.000|NER_03|PERSON/Bill Clinton|LOCATION/U.S.
# Start time|End time|Primary tag(|Category/Entity)*
#
# Written by FFS, 2014-08-09
#
# Changelog:
#
# 2014-08-13 Set #!/usr/bin/python -W ignore to turn off Unicode warnings
# 2014-08-10 Server mode via pyner -- note dict output
# 2014-08-09 Forked from PartsOfSpeech-StanfordNLP-01.py
#
# -----------------------------------------------------------------------------------------------------------------
# User input
import sys, os.path
scriptname = os.path.basename(sys.argv[0])
# Help screen -- also shown when no file is given
if len(sys.argv) < 2 or sys.argv[1] == "-h":
    print "".join([ "\n","\t","This is a production script for named entity recognition -- issue:","\n" ])
    print "".join([ "\t","\t",scriptname," $FIL.seg > $FIL.ner or" ])
    print "".join([ "\t","\t",scriptname," $FIL.seg | sponge $FIL.seg" ])
    print "".join([ "\n","\t","or use the seg-NER bash script for bulk processing." ])
    print "".join([ "\n","\t","Limit to six instances to avoid socket errors.","\n" ])
    quit()
filename = sys.argv[1]
# Libraries
import datetime, re
# Server mode (TCP/IP sockets, mixed and caseless, supported by pyner)
import ner
Mix = ner.SocketNER(host='localhost', port=2020, output_format='slashTags')
UPP = ner.SocketNER(host='localhost', port=2021, output_format='slashTags')
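# The two servers are assumed to be started beforehand, e.g. (classifier paths illustrative --
# substitute your own models):
#   java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer \
#     -loadClassifier classifiers/english.all.3class.distsim.crf.ser.gz -port 2020 &
#   java -mx1000m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer \
#     -loadClassifier classifiers/english.all.3class.caseless.distsim.crf.ser.gz -port 2021 &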
# Pattern for tokenizing
# http://www.clips.ua.ac.be/pages/pattern-en
from pattern.en import tokenize
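# tokenize() returns a list of sentence strings with punctuation split off, e.g.
#   tokenize("Bill won. Al lost.") -> ['Bill won .', 'Al lost .']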
# Counter
n = 0
# A. Get the lines from the file
with open(filename) as fp:
    for line in fp:
        # B. Split each line into fields
        field = line.split("|")
        # Pretty debug
        # print('\n'.join('{}: {}'.format(*k) for k in enumerate(field)))
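        # A caption line is assumed to look like this (illustrative):
        #   20130101012604.000|20130101012626.000|CC1|>>> BILL CLINTON VISITED THE U.S. CAPITOL
        # field[0] and field[1] are the start and end times, field[2] the primary tag, field[3] the text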
        # C. Header and footer
        if len(field[0]) != 18:
            print line,
            continue
        # D. Program credit
        if n == 0:
            credit = ["NER_03|",datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),"|Source_Program=stanford-ner 3.4, ",scriptname,"|Source_Person=Jenny Rose Finkel, FFS|Codebook=Category/Entity"]
            print "".join(credit)
            n = 1
        # E. Segment tags and other non-caption tags
        if field[2] == "SEG":
            print line,
            continue
        elif len(field[2]) != 3:
            print line,
            continue
        # F. Get the text, clean leading chevrons -- if BOM, strip non-ascii, otherwise remove individually
        try:
            text = re.sub('^[>,\ ]{0,6}','', field[3])
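            # e.g. ">>> Good evening." -> "Good evening."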
            if re.search("(\xef\xbf\xbd)", text): text = ''.join([x for x in text if ord(x) < 128])
            text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd','')
            text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
            text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
            text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
            text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
            # print text
        except IndexError:
            print line
            continue
        # G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works)
        line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
        print line,
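        # The cleaned caption line is echoed first; the NER_03 lines for its sentences follow below it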
        # H. Ensure the text is split into sentences
        # tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={})
        for sentence in tokenize(text):
            all = ""
            # I. Select the parser
            if sentence.isupper() or sentence.islower(): st = UPP
            else: st = Mix
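            # All-caps or all-lowercase captions carry no case cues, so the caseless model on port 2021 is used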
            # J. Named entity recognition with stanford-ner via pyner
            reply = st.get_entities(sentence)
            # {u'PERSON': [u'Bill Clinton'], u'LOCATION': [u'U.S.'], u'O': [u'was President of the']}
            try:
                for tup in reply.items():
                    names = ""
                    if tup[0] == "O" or not tup[0]: continue
                    for name in tup[1]:
                        names = "".join([names,"/",name])
                    all = "".join([all,"|",tup[0],names])
                if all != "": print "".join([field[0],"|",field[1],"|NER_03",all])
                # e.g. 20130101012604.000|20130101012626.000|NER_03|PERSON/Bill Clinton|LOCATION/U.S.
            except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError):
                print "".join([field[0],"|",field[1],"|NER_03","|NA"])
                continue
# K. The with statement closes the file on exit
# EOF