-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstanford.py
106 lines (82 loc) · 3.56 KB
/
stanford.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Interface to the Stanford Tokenizer
#
# Copyright (C) 2001-2014 NLTK Project
# Author: Steven Xu <[email protected]>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import unicode_literals, print_function
import tempfile
import os
import json
from subprocess import PIPE
from nltk import compat
from nltk.internals import find_jar, config_java, java, _java_options
from nltk.tokenize.api import TokenizerI
# Web page passed to find_jar() as its ``url`` argument when locating the
# Stanford jar (see StanfordTokenizer.__init__).
_stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml'
class StanfordTokenizer(TokenizerI):
    r"""
    Interface to the Stanford Tokenizer

    >>> from nltk.tokenize.stanford import StanfordTokenizer
    >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
    >>> StanfordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> s = "The colour of the wall is blue."
    >>> StanfordTokenizer(options={"americanize": True}).tokenize(s)
    ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
    """

    # File name of the jar that find_jar() is asked to locate.
    _JAR = 'stanford-postagger.jar'

    def __init__(self, path_to_jar=None, encoding='UTF-8', options=None, verbose=False, java_options='-mx1000m'):
        """
        Locate the Stanford jar and remember the tokenizer settings.

        :param path_to_jar: explicit jar location handed to ``find_jar``;
            when ``None`` the lookup relies on the ``STANFORD_POSTAGGER``
            environment variable.
        :param encoding: charset used to encode the input file, passed to
            the tokenizer via ``-charset`` and used to decode its output.
        :param options: mapping of tokenizer option names to values; each
            value is JSON-encoded and the pairs are joined with commas to
            form the ``-options`` argument.
        :param verbose: forwarded to ``find_jar``.
        :param java_options: options string handed to ``config_java``
            before each tokenizer run.
        """
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar,
            env_vars=('STANFORD_POSTAGGER',),
            searchpath=(), url=_stanford_url,
            verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options

        options = {} if options is None else options
        self._options_cmd = ','.join(
            '{0}={1}'.format(key, json.dumps(val))
            for key, val in options.items()
        )

    @staticmethod
    def _parse_tokenized_output(s):
        """Split the tokenizer's output into a list, one entry per line."""
        return s.splitlines()

    def tokenize(self, s):
        """
        Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.

        :param s: the text to tokenize.
        :return: list of token strings, one per line of tokenizer output.
        """
        cmd = [
            'edu.stanford.nlp.process.PTBTokenizer',
        ]
        return self._parse_tokenized_output(self._execute(cmd, s))

    def _execute(self, cmd, input_, verbose=False):
        """
        Run a Stanford Java class on ``input_`` and return its decoded stdout.

        The input is written to a temporary file whose path is appended to
        the command line. The temporary file is removed and the Java options
        are reset even when the Java call (or anything after it) raises.

        :param cmd: list with the Java class name and leading arguments; it
            is copied, so the caller's list is left unmodified.
        :param input_: text (or already-encoded bytes) to feed the tool.
        :param verbose: forwarded to ``config_java``.
        :return: the process's standard output decoded with the configured
            encoding.
        """
        encoding = self._encoding

        # Work on a copy so the caller's list is not mutated as a side effect.
        cmd = list(cmd)
        cmd.extend(['-charset', encoding])
        if self._options_cmd:
            cmd.extend(['-options', self._options_cmd])

        # Snapshot the current java options so they can be put back afterwards.
        default_options = ' '.join(_java_options)

        if isinstance(input_, compat.text_type) and encoding:
            input_ = input_.encode(encoding)

        # Configure java.
        config_java(options=self.java_options, verbose=verbose)
        try:
            # Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
            input_file = tempfile.NamedTemporaryFile(mode='wb', delete=False)
            try:
                # Write the actual sentences to the temporary input file and
                # close it before the JVM opens it for reading.
                with input_file:
                    input_file.write(input_)

                cmd.append(input_file.name)

                # Run the tokenizer and get the output; stderr is captured
                # only so it does not leak to the console.
                stdout, _stderr = java(cmd, classpath=self._stanford_jar,
                                       stdout=PIPE, stderr=PIPE)
            finally:
                # delete=False means nobody else will remove the file for us.
                os.unlink(input_file.name)
        finally:
            # Return java configurations to their default values.
            config_java(options=default_options, verbose=False)

        return stdout.decode(encoding)
def setup_module(module):
    """
    Module-level test fixture: skip the doctests in this module when a
    StanfordTokenizer cannot be constructed.

    A ``LookupError`` raised while building the tokenizer is converted into
    nose's ``SkipTest``; any other outcome lets the doctests run.
    """
    from nose import SkipTest

    skip_reason = 'doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist'
    try:
        # Only the side effect of construction matters; the instance is discarded.
        StanfordTokenizer()
    except LookupError:
        raise SkipTest(skip_reason)