This repository has been archived by the owner on Jan 24, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
ngram.py
494 lines (429 loc) · 19.3 KB
/
ngram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# Natural Language Toolkit: Language Models
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Steven Bird <[email protected]>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
import random, collections.abc
from itertools import chain
from math import log
from nltk.probability import (ConditionalProbDist, ConditionalFreqDist,
MLEProbDist, FreqDist, WittenBellProbDist)
from nltk.util import ngrams as ingrams
from nltk import compat
try:
from api import *
except ImportError:
from .api import *
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
# http://stackoverflow.com/a/33024979
return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
def discount(self):
return float(self._N)/float(self._N + self._T)
def check(self):
totProb=sum(self.prob(sample) for sample in self.samples())
assert isclose(self.discount(),totProb),\
"discount %s != totProb %s"%(self.discount(),totProb)
WittenBellProbDist.discount = discount
WittenBellProbDist.check = check
def _estimator(fdist, bins):
"""
Default estimator function using WB.
"""
# can't be an instance method of NgramModel as they
# can't be pickled either.
res=WittenBellProbDist(fdist,fdist.B()+1)
res.check()
return res
@compat.python_2_unicode_compatible
class NgramModel(ModelI):
"""
A processing interface for assigning a probability to the next word.
"""
def __init__(self, n, train, pad_left=False, pad_right=False,
estimator=None, *estimator_args, **estimator_kwargs):
"""
Creates an ngram language model to capture patterns in n consecutive
words of training text. An estimator smooths the probabilities derived
from the text and may allow generation of ngrams not seen during
training.
:param n: the order of the language model (ngram size)
:type n: C{int}
:param train: the training text
:type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string}
:param estimator: a function for generating a probability distribution---defaults to MLEProbDist
:type estimator: a function that takes a C{ConditionalFreqDist} and
returns a C{ConditionalProbDist}
:param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
:type pad_left: bool
:param pad_right: whether to pad the right of each sentence with </s>
:type pad_right: bool
:param estimator_args: Extra arguments for estimator.
These arguments are usually used to specify extra
properties for the probability distributions of individual
conditions, such as the number of bins they contain.
Note: For backward-compatibility, if no arguments are specified, the
number of bins in the underlying ConditionalFreqDist are passed to
the estimator as an argument.
:type estimator_args: (any)
:param estimator_kwargs: Extra keyword arguments for the estimator
:type estimator_kwargs: (any)
"""
# protection from cryptic behavior for calling programs
# that use the pre-2.0.2 interface
assert(isinstance(pad_left, bool))
assert(isinstance(pad_right, bool))
# make sure n is greater than zero, otherwise print it
assert (n > 0), n
# For explicitness save the check whether this is a unigram model
self.is_unigram_model = (n == 1)
# save the ngram order number
self._n = n
# save left and right padding
self._lpad = ('<s>',) * (n - 1) if pad_left else ()
# Need _rpad even for unigrams or padded entropy will give
# wrong answer because '</s>' will be treated as unseen...
self._rpad = ('</s>',) if pad_right else ()
self._padLen = len(self._lpad)+len(self._rpad)
self._N=0
delta = 1+self._padLen-n # len(sent)+delta == ngrams in sent
if estimator is None:
assert (estimator_args is ()) and (estimator_kwargs=={}),\
"estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
estimator = lambda fdist, bins: MLEProbDist(fdist)
# Given backoff, a generator isn't acceptable
if not isinstance(train,collections.abc.Sequence):
train=list(train)
self._W = len(train)
# Coerce to list of list -- note that this means to train charGrams,
# requires exploding the words ahead of time
if train is not None:
if isinstance(train[0], compat.string_types):
train = [train]
self._W=1
elif not isinstance(train[0],collections.abc.Sequence):
# if you mix strings and generators, you have only yourself
# to blame!
for i in range(len(train)):
train[i]=list(train[i])
if n == 1:
if pad_right:
sents=(chain(s,self._rpad) for s in train)
else:
sents=train
fd=FreqDist()
for s in sents:
fd.update(s)
if not estimator_args and not estimator_kwargs:
self._model = estimator(fd,fd.B())
else:
self._model = estimator(fd,fd.B(),
*estimator_args, **estimator_kwargs)
self._N=fd.N()
else:
cfd = ConditionalFreqDist()
self._ngrams = set()
for sent in train:
self._N+=len(sent)+delta
for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
self._ngrams.add(ngram)
context = tuple(ngram[:-1])
token = ngram[-1]
cfd[context][token]+=1
if not estimator_args and not estimator_kwargs:
self._model = ConditionalProbDist(cfd, estimator, len(cfd))
else:
self._model = ConditionalProbDist(cfd, estimator, *estimator_args, **estimator_kwargs)
# recursively construct the lower-order models
if not self.is_unigram_model:
self._backoff = NgramModel(n-1, train,
pad_left, pad_right,
estimator,
*estimator_args,
**estimator_kwargs)
# Code below here in this method, and the _words_following and _alpha method, are from
# http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
self._backoff_alphas = dict()
# For each condition (or context)
for ctxt in cfd.conditions():
backoff_ctxt = ctxt[1:]
backoff_total_pr = 0.0
total_observed_pr = 0.0
# this is the subset of words that we OBSERVED following
# this context.
# i.e. Count(word | context) > 0
for word in self._words_following(ctxt, cfd):
total_observed_pr += self.prob(word, ctxt)
# we also need the total (n-1)-gram probability of
# words observed in this n-gram context
backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
if isclose(total_observed_pr,1.0):
total_observed_pr=1.0
else:
assert 0.0 <= total_observed_pr <= 1.0,\
"sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
# beta is the remaining probability weight after we factor out
# the probability of observed words.
# As a sanity check, both total_observed_pr and backoff_total_pr
# must be GE 0, since probabilities are never negative
beta = 1.0 - total_observed_pr
if beta!=0.0:
assert (0.0 <= backoff_total_pr < 1.0), \
"sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
alpha_ctxt = beta / (1.0 - backoff_total_pr)
else:
assert ((0.0 <= backoff_total_pr < 1.0) or
isclose(1.0,backoff_total_pr)), \
"sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
alpha_ctxt = 0.0
self._backoff_alphas[ctxt] = alpha_ctxt
def _words_following(self, context, cond_freq_dist):
return cond_freq_dist[context].keys()
def prob(self, word, context, verbose=False):
"""
Evaluate the probability of this word in this context using Katz Backoff.
:param word: the word to get the probability of
:type word: str
:param context: the context the word is in
:type context: list(str)
"""
assert(isinstance(word,compat.string_types))
context = tuple(context)
if self.is_unigram_model:
if not(self._model.SUM_TO_ONE):
# Smoothing models should do the right thing for unigrams
# even if they're 'absent'
return self._model.prob(word)
else:
try:
return self._model.prob(word)
except:
raise RuntimeError("No probability mass assigned"
"to unigram %s" % (word))
if context + (word,) in self._ngrams:
return self[context].prob(word)
else:
alpha=self._alpha(context)
if alpha>0.0:
if verbose:
print("backing off for %s"%(context+(word,),))
return alpha * self._backoff.prob(word, context[1:],verbose)
else:
if verbose:
print('no backoff for "%s" as model doesn\'t do any smoothing so prob=0.0'%word)
return alpha
def _alpha(self, context,verbose=False):
"""Get the backoff alpha value for the given context
"""
error_message = "Alphas and backoff are not defined for unigram models"
assert not self.is_unigram_model, error_message
if context in self._backoff_alphas:
res = self._backoff_alphas[context]
else:
res = 1
if verbose:
print(" alpha: %s = %s"%(context,res))
return res
def logprob(self, word, context,verbose=False):
"""
Evaluate the (negative) log probability of this word in this context.
:param word: the word to get the probability of
:type word: str
:param context: the context the word is in
:type context: list(str)
"""
return -log(self.prob(word, context,verbose), 2)
@property
def ngrams(self):
return self._ngrams
@property
def backoff(self):
return self._backoff
@property
def model(self):
return self._model
# NB, this will always start with same word since model
# is trained on a single text
def generate(self, num_words, context=()):
'''
Generate random text based on the language model.
:param num_words: number of words to generate
:type num_words: int
:param context: initial words in generated string
:type context: list(str)
'''
orig = list(context)
res=[]
text = list(orig) # take a copy
for i in range(num_words):
one=self._generate_one(text)
text.append(one)
if one=='</s>' or i==num_words-1:
if self._lpad is not ():
res.append(list(self._lpad)[:(len(self._lpad)+len(context))-(self._n-2)]+text)
else:
res.append(text)
text=list(orig)
return res
def _generate_one(self, context):
context = (self._lpad + tuple(context))[-self._n+1:]
# print "Context (%d): <%s>" % (self._n, ','.join(context))
if context in self:
return self[context].generate()
elif self._n > 1:
return self._backoff._generate_one(context[1:])
else:
return self._model.max()
def entropy(self, text, pad_left=False, pad_right=False,
verbose=False, perItem=False):
"""
Calculate the approximate cross-entropy of the n-gram model for a
given evaluation text.
This is the average log probability of each item in the text.
:param text: items to use for evaluation
:type text: iterable(str)
:param pad_left: whether to pad the left of each text with an (n-1)-gram of <s> markers
:type pad_left: bool
:param pad_right: whether to pad the right of each sentence with an </s> marker
:type pad_right: bool
:param perItem: normalise for length if True
:type perItem: bool
"""
# This version takes account of padding for greater accuracy
# Note that if input is a string, it will be exploded into characters
e = 0.0
for ngram in ingrams(chain(self._lpad, text, self._rpad), self._n):
context = tuple(ngram[:-1])
token = ngram[-1]
cost=self.logprob(token, context, verbose) # _negative_
# log2 prob == cost!
if verbose:
print("p(%s|%s) = [%s-gram] %7f"%(token,context,self._n,2**-cost))
e += cost
if perItem:
return e/((len(text)+self._padLen)-(self._n - 1))
else:
return e
def perplexity(self, text, pad_left=False, pad_right=False, verbose=False):
"""
Calculates the perplexity of the given text.
This is simply 2 ** cross-entropy for the text.
:param text: words to calculate perplexity of
:type text: list(str)
:param pad_left: whether to pad the left of each sentence with an (n-1)-gram of empty strings
:type pad_left: bool
:param pad_right: whether to pad the right of each sentence with an (n-1)-gram of empty strings
:type pad_right: bool
"""
return pow(2.0, self.entropy(text), pad_left=pad_left,
pad_right=pad_right, perItem=True)
def dump(self, file, logBase=None, precision=7):
"""Dump this model in SRILM/ARPA/Doug Paul format
Use logBase=10 and the default precision to get something comparable
to SRILM ngram-model -lm output
@param file to dump to
@type file file
@param logBase If not None, output logBases to the specified base
@type logBase int|None"""
file.write('\n\\data\\\n')
self._writeLens(file)
self._writeModels(file,logBase,precision,None)
file.write('\\end\\\n')
def _writeLens(self,file):
if self._n>1:
self._backoff._writeLens(file)
file.write('ngram %s=%s\n'%(self._n,
sum(len(self._model[c].samples())\
for c in self._model.keys())))
else:
file.write('ngram 1=%s\n'%len(self._model.samples()))
def _writeModels(self,file,logBase,precision,alphas):
if self._n>1:
self._backoff._writeModels(file,logBase,precision,self._backoff_alphas)
file.write('\n\\%s-grams:\n'%self._n)
if self._n==1:
self._writeProbs(self._model,file,logBase,precision,(),alphas)
else:
for c in sorted(self._model.conditions()):
self._writeProbs(self._model[c],file,logBase,precision,
c,alphas)
def _writeProbs(self,pd,file,logBase,precision,ctxt,alphas):
if self._n==1:
for k in sorted(pd.samples()+['<unk>','<s>']):
if k=='<s>':
file.write('-99')
elif k=='<unk>':
_writeProb(file,logBase,precision,1-pd.discount())
else:
_writeProb(file,logBase,precision,pd.prob(k))
file.write('\t%s'%k)
if k not in ('</s>','<unk>'):
file.write('\t')
_writeProb(file,logBase,precision,alphas[ctxt+(k,)])
file.write('\n')
else:
ctxtString=' '.join(ctxt)
for k in sorted(pd.samples()):
_writeProb(file,logBase,precision,pd.prob(k))
file.write('\t%s %s'%(ctxtString,k))
if alphas is not None:
file.write('\t')
_writeProb(file,logBase,precision,alphas[ctxt+(k,)])
file.write('\n')
def __contains__(self, item):
item=tuple(item)
try:
return item in self._model
except:
try:
# hack if model is an MLEProbDist, more efficient
return item in self._model._freqdist
except:
return item in self._model.samples()
def __getitem__(self, item):
return self._model[tuple(item)]
def __repr__(self):
return '<NgramModel with %d %d-grams>' % (self._N, self._n)
def _writeProb(file,logBase,precision,p):
file.write('%.*g'%(precision,
p if logBase is None else log(p,logBase)))
class LgramModel(NgramModel):
def __init__(self, n, train, pad_left=False, pad_right=False,
estimator=None, *estimator_args, **estimator_kwargs):
"""
NgramModel (q.v.) slightly tweaked to produce char-grams,
not word-grams, with a WittenBell default estimator
:param train: List of strings, which will be converted to list of lists of characters, but more efficiently
:type train: iter(str)
"""
if estimator is None:
assert (not(estimator_args)) and (not(estimator_kwargs)),\
"estimator_args (%s) or _kwargs (%s) supplied, but no estimator"%(estimator_args,estimator_kwargs)
estimator=_estimator
super(LgramModel,self).__init__(n,
(iter(word) for word in train),
pad_left, pad_right,
estimator,
*estimator_args, **estimator_kwargs)
def teardown_module(module=None):
from nltk.corpus import brown
brown._unload()
from nltk.probability import LidstoneProbDist, WittenBellProbDist
def demo(estimator_function=LidstoneProbDist):
from nltk.corpus import brown
estimator = lambda fdist, bins: estimator_function(fdist, 0.2, bins+1)
lm = NgramModel(3, brown.sents(categories='news'), estimator=estimator,
pad_left=True, pad_right=True)
print("Built %s using %s as estimator"%(lm,estimator_function))
txt="There is no such thing as a free lunch ."
print("Computing average per-token entropy for \"%s\", showing the computation:"%txt)
e=lm.entropy(txt.split(),True,True,True,True)
print("Per-token average: %.2f"%e)
text = lm.generate(100)
import textwrap
print("--------\nA randomly generated 100-token sequence:")
for sent in text:
print('\n'.join(textwrap.wrap(' '.join(sent))))
return lm
if __name__ == '__main__':
demo()