forked from wwwtyro/keyzen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigrams.py
148 lines (111 loc) · 3.67 KB
/
bigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from nltk.corpus import brown, gutenberg
from pprint import pprint
from unipath import Path
import json
import tempfile
import re
import numpy
import time
import tempfile
import urllib2
import sys
import subprocess as sub
import os
import zipfile
import string, sys
def download_zipped_corpus():
tempdir = tempfile.mkdtemp()
url = 'https://github.com/torvalds/linux/archive/master.zip'
test_url = 'https://github.com/facebook/libphenom/archive/master.zip'
file_name = url.split('/')[-1]
# download zipfile with output to console
def clear(): os.system('cls' if os.name=='nt' else 'clear')
wget_out_file = Path(tempdir, file_name)
wget = sub.Popen(['wget', url,'-O', wget_out_file], stdout=sub.PIPE, stderr=sub.STDOUT)
while True:
line = wget.stdout.readline()
if not line: break
clear()
print line
wget.wait()
return wget_out_file.absolute()
def istext(s):
    """Heuristically classify string s as text (returns 1) or binary (0).

    A NUL byte means binary; an empty string counts as text.  Otherwise
    the string is binary when more than 30% of its characters fall
    outside printable ASCII plus common whitespace.
    """
    # printable ASCII (32..126) plus whitespace/control characters
    text_characters = set(map(chr, range(32, 127))) | set("\n\r\t\b")
    if "\0" in s:
        return 0
    if not s:  # Empty files are considered text
        return 1
    # Count the non-text characters directly (the original built this
    # via string.maketrans/str.translate, removed in Python 3).
    n_binary = len([c for c in s if c not in text_characters])
    # BUG FIX: the original computed len(t)/len(s) with integer
    # division, which truncates to 0 for any partially-binary string,
    # so the 30% threshold could never fire.  Use float division.
    if float(n_binary) / len(s) > 0.30:
        return 0
    return 1
def main(corpus=None):
# time program execution
start_time = time.time()
# ########################
chars = " jfkdlsahgyturieowpqbnvmcxz6758493021`-=[]\;',./ABCDEFGHIJKLMNOPQRSTUVWXYZ~!@#$%^&*()_+{}|:" + '"<>?';
# ########################
# Generate all the patterns we want to count
bigrams = {}
for i in range(len(chars)):
for j in range(len(chars)):
bigram = chars[i] + chars[j]
bigrams[bigram] = 1 # add fake data
# ########################
CORPUS = ''
# ########################
if corpus is None:
# fetch linux kernal, add all the text files to the Corpus
corpus = download_zipped_corpus()
# unzip corpus
print "unzipping ..."
zfile = zipfile.ZipFile(corpus.absolute())
for name in zfile.namelist():
f = zfile.read(name)
if istext(f):
CORPUS += f
# ########################
max_size = 100000000
if len(CORPUS) > max_size:
CORPUS = CORPUS[:max_size]
# add words from Brown corpus to our Corpus
CORPUS += ' '.join(brown.words())
#CORPUS += gutenberg.raw().replace('\n', ' ').replace('\r', '')
n_chars = len(CORPUS)
CORPUS = re.sub('\s+',' ',CORPUS)
print 'CORPUS length: %s' % len(CORPUS)
print "adding bigrams..."
for i in range(2, n_chars):
try:
bigrams[CORPUS[i-2:i]] += 1
except KeyError as e:
pass
#print e
print "finished"
# ########################
checksum = 0
not_found = []
for key, value in bigrams.iteritems():
if value > 0:
normalised = float(value) / n_chars
bigrams[key] = normalised
checksum += normalised
if value == 1: # remember fake data
not_found.append(key)
print "checksum: %s " % checksum
print '%s out of %s patterns not found' % (len(not_found), len(bigrams))
# save bigram frequencies to file
f = open('genreated_bigrams.json', 'w')
f.write(json.dumps(bigrams))
f.close()
print "exection took:", time.time() - start_time, "seconds"
if __name__ == "__main__":
    # A command-line argument names a local zip corpus; with no
    # argument, main() falls back to downloading the default corpus.
    corpus_arg = Path(sys.argv[1]) if len(sys.argv) > 1 else None
    main(corpus=corpus_arg)