-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstanford.py
75 lines (50 loc) · 1.69 KB
/
stanford.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import xml.etree.ElementTree as ET
from subprocess import call
from collections import defaultdict
from subprocess import call
import os, nltk
class StanfordNLP:
    """Run the Stanford CoreNLP command-line pipeline on a text and read its XML.

    The constructor writes the text to ``buffer.txt`` inside
    ``../stanford-nlp/`` (relative to the current working directory), runs
    the ``java`` pipeline on it and parses the resulting ``buffer.txt.xml``.

    Attributes:
        data: root ``Element`` of the parsed ``buffer.txt.xml``.
    """

    def __init__(self, text):
        """Annotate *text* with CoreNLP and keep the parsed XML in ``self.data``.

        Args:
            text: the raw text to write to ``buffer.txt`` and annotate.

        Raises:
            OSError: if the CoreNLP directory cannot be entered or the
                buffer file cannot be written.
            Whatever ``ET.parse`` raises if ``buffer.txt.xml`` is missing or
            malformed.
        """
        path = "../stanford-nlp/"
        back = os.getcwd()
        os.chdir(path)
        try:
            with open("buffer.txt", "w") as buf:
                buf.write(text)
            # NOTE(review): the exit status of java is ignored, so if the
            # pipeline fails and an older buffer.txt.xml is still present,
            # that stale file is what gets parsed below.
            call("java -cp stanford-corenlp-1.3.5.jar:stanford-corenlp-1.3.5-models.jar:xom.jar:joda-time.jar:jollyday.jar -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -file buffer.txt".split(" "))
            self.data = ET.parse("buffer.txt.xml").getroot()
        finally:
            # Always restore the caller's working directory, even when
            # writing, running java or parsing fails.
            os.chdir(back)

    @staticmethod
    def _is_kept(token):
        """Tell whether a ``<token>`` element survives the sentence filter.

        A token is dropped when its first child's text equals its fifth
        child's text, or when the fifth child's text is ``":"`` or ``"SYM"``.
        Presumably child 0 is the word and child 4 the POS tag in CoreNLP's
        XML, which makes this a punctuation/symbol filter -- verify against
        the output schema.
        """
        word = token[0].text
        tag = token[4].text
        return word != tag and tag != ":" and tag != "SYM"

    def _token_rows(self):
        """Return every sentence's ``<token>`` elements, unfiltered.

        Returns:
            A list with one list of token elements per child of the first
            ``<sentences>`` element, or ``[]`` if there is no such element.
        """
        section = next(self.data.iter("sentences"), None)
        if section is None:
            return []
        return [list(sent.iter("token")) for sent in section]

    def coreferences(self):
        """Collect the coreference chains found in the parsed document.

        Every mention is turned into a tuple of its children's integer
        values, each decremented by one. In ``preprocess_to_setsim`` these
        tuples are unpacked as ``(sentence, start, end, head)``.

        Returns:
            A ``defaultdict(list)`` mapping the first mention of each chain
            to the list of the chain's remaining mentions, or ``None`` when
            the document has no ``<coreference>`` element.
        """
        def parse_mention(mention):
            return tuple([int(el.text) - 1 for el in mention])

        # The first match in document order is the outer container whose
        # children are the individual chains.
        section = next(self.data.iter("coreference"), None)
        if section is None:
            return None

        coref = defaultdict(list)
        for ref in section:
            main = parse_mention(ref[0])
            for ment in ref[1:]:
                coref[main].append(parse_mention(ment))
        return coref

    def sentences(self):
        """Return the filtered token texts of every sentence.

        Returns:
            A list of lists of strings, one inner list per sentence, holding
            the text of each token accepted by ``_is_kept``. Empty when the
            document has no ``<sentences>`` element.
        """
        sents = []
        for row in self._token_rows():
            sents.append([tok[0].text for tok in row if self._is_kept(tok)])
        return sents

    def preprocess_to_setsim(self):
        """Build one word set per sentence, enriched with coreferent terms.

        For every coreference chain, the words of its first mention are
        added to each sentence that holds another mention of the chain,
        except the sentence of the first mention itself. English stop words
        from ``nltk`` are removed from every set afterwards.

        Returns:
            A list of sets of strings, one per sentence.
        """
        # coreferences() yields None when there is no coreference section.
        coreferences = self.coreferences() or {}
        rows = self._token_rows()
        sents = [set(s) for s in self.sentences()]

        for ref in coreferences:
            _sent, _start, _end, _head = ref
            # Mention offsets count every token of the sentence, so slice
            # the unfiltered tokens first and apply the filter afterwards.
            span = rows[_sent][_start:_end]
            term = set(tok[0].text for tok in span if self._is_kept(tok))
            for bind in coreferences[ref]:
                if _sent == bind[0]:
                    continue
                sents[bind[0]].update(term)

        sw = set(nltk.corpus.stopwords.words('english'))
        return [s - sw for s in sents]