-
Notifications
You must be signed in to change notification settings - Fork 92
/
nbsvm.py
96 lines (85 loc) · 3.33 KB
/
nbsvm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import argparse
import os
import pdb
import subprocess
from collections import Counter

import numpy as np
def tokenize(sentence, grams):
    """Return the n-gram tokens of a whitespace-separated sentence.

    The sentence is split on whitespace. For each size ``n`` in ``grams``
    (processed in the order given), every run of ``n`` consecutive words is
    joined with ``"_*_"`` and appended to the result list.

    Args:
        sentence: text to tokenize.
        grams: iterable of integer n-gram sizes.

    Returns:
        A list of token strings; sizes larger than the number of words
        contribute no tokens.
    """
    words = sentence.split()
    word_count = len(words)
    tokens = []
    for n in grams:
        # One token per start offset; the range is empty when n > word_count.
        tokens.extend(
            "_*_".join(words[start:start + n])
            for start in range(word_count - n + 1)
        )
    return tokens
def build_dict(f, grams):
    """Count the n-gram tokens found in a text file.

    Args:
        f: path of the text file; each line is tokenized independently.
        grams: iterable of integer n-gram sizes passed on to ``tokenize``.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences over the whole file.
    """
    dic = Counter()
    # The context manager closes the handle even if tokenizing fails, and
    # iterating the file object streams lines lazily, which is what the
    # deprecated xreadlines() call was used for.
    with open(f) as handle:
        for sentence in handle:
            dic.update(tokenize(sentence, grams))
    return dic
def process_files(file_pos, file_neg, dic, r, outfn, grams):
    """Convert a positive and a negative text file into one sparse feature file.

    Every input line produces one output line made of a label followed by
    space-separated ``index:value`` fields. The label is ``1`` for lines of
    ``file_pos`` and ``-1`` for lines of ``file_neg``; all positive lines are
    written before the negative ones.

    Args:
        file_pos: path of the text file holding positive examples.
        file_neg: path of the text file holding negative examples.
        dic: mapping from token to 0-based feature index.
        r: sequence indexed by feature index giving each feature's value.
        outfn: path of the file to write (overwritten if it exists).
        grams: iterable of integer n-gram sizes passed on to ``tokenize``.

    Tokens missing from ``dic`` are ignored. Each distinct feature appears
    once per line, in ascending index order, with the index written 1-based.
    Output lines are separated by a newline; none follows the last line.
    """
    output = []
    for beg_line, path in (("1", file_pos), ("-1", file_neg)):
        # Close each input deterministically instead of leaking the handle.
        with open(path) as handle:
            for l in handle:
                # The set drops repeated tokens; sorting gives ascending ids.
                indexes = sorted(
                    {dic[t] for t in tokenize(l, grams) if t in dic}
                )
                fields = [beg_line]
                fields.extend("%i:%f" % (i + 1, r[i]) for i in indexes)
                output.append(" ".join(fields))
    with open(outfn, "w") as out:
        # A single write of the joined text; writelines() expects an iterable
        # of lines, not one already-joined string.
        out.write("\n".join(output))
def compute_ratio(poscounts, negcounts, alpha=1):
    """Build the token index and the smoothed log-count ratio of two classes.

    Args:
        poscounts: mapping from token to its count in the positive class.
        negcounts: mapping from token to its count in the negative class.
        alpha: value added to every token's count in both classes before
            normalizing.

    Returns:
        A tuple ``(dic, r)``. ``dic`` maps every token present in either
        mapping to a 0-based index, assigned in sorted token order. ``r`` is
        a numpy array where ``r[dic[t]]`` is ``log(p_t / q_t)``, with ``p``
        and ``q`` the smoothed positive and negative counts, each normalized
        to sum to one.
    """
    # Sorting makes the token -> index assignment reproducible across runs
    # instead of depending on set iteration order.
    alltokens = sorted(set(poscounts) | set(negcounts))
    dic = dict((t, i) for i, t in enumerate(alltokens))
    d = len(dic)
    print("computing r...")
    p, q = np.ones(d) * alpha, np.ones(d) * alpha
    for t, i in dic.items():
        # .get() treats a token missing from one class as a zero count, so
        # plain dicts work as well as Counters.
        p[i] += poscounts.get(t, 0)
        q[i] += negcounts.get(t, 0)
    p /= abs(p).sum()
    q /= abs(q).sum()
    r = np.log(p / q)
    return dic, r
def main(ptrain, ntrain, ptest, ntest, out, liblinear, ngram):
    """Run the whole pipeline: count, featurize, train and predict.

    Args:
        ptrain: path of the positive training text file.
        ntrain: path of the negative training text file.
        ptest: path of the positive test text file.
        ntest: path of the negative test text file.
        out: path handed to liblinear's ``predict`` as its output file.
        liblinear: directory containing the ``train`` and ``predict``
            executables.
        ngram: iterable of digits, e.g. the string ``"123"``; each element is
            converted with ``int`` to give one n-gram size.

    The intermediate files ``train-nbsvm.txt``, ``test-nbsvm.txt`` and
    ``model.logreg`` are created in the current working directory and removed
    before returning.

    Raises:
        OSError: if a liblinear executable cannot be started.
    """
    ngram = [int(i) for i in ngram]
    print("counting...")
    poscounts = build_dict(ptrain, ngram)
    negcounts = build_dict(ntrain, ngram)

    dic, r = compute_ratio(poscounts, negcounts)
    print("processing files...")
    process_files(ptrain, ntrain, dic, r, "train-nbsvm.txt", ngram)
    process_files(ptest, ntest, dic, r, "test-nbsvm.txt", ngram)

    trainsvm = os.path.join(liblinear, "train")
    predictsvm = os.path.join(liblinear, "predict")
    try:
        # Argument lists bypass the shell, so paths containing spaces or
        # shell metacharacters are passed through verbatim.
        subprocess.call(
            [trainsvm, "-s", "0", "train-nbsvm.txt", "model.logreg"]
        )
        subprocess.call(
            [predictsvm, "-b", "1", "test-nbsvm.txt", "model.logreg", out]
        )
    finally:
        # Best-effort cleanup: a file that was never created is not an error.
        for leftover in ("model.logreg", "train-nbsvm.txt", "test-nbsvm.txt"):
            try:
                os.remove(leftover)
            except OSError:
                pass
if __name__ == "__main__":
    # Command-line entry point: parse the options and forward them to main()
    # as keyword arguments (option names match main()'s parameter names).
    #
    # Usage :
    # python nbsvm.py --liblinear /PATH/liblinear-1.96 \
    #     --ptrain /PATH/data/full-train-pos.txt \
    #     --ntrain /PATH/data/full-train-neg.txt \
    #     --ptest /PATH/data/test-pos.txt \
    #     --ntest /PATH/data/test-neg.txt \
    #     --ngram 123 --out TEST-SCORE
    parser = argparse.ArgumentParser(description='Run NB-SVM on some text files.')
    # Every option is needed by main(); marking them required makes argparse
    # report a missing one up front instead of passing None further down.
    parser.add_argument('--liblinear', required=True, help='path of liblinear install e.g. */liblinear-1.96')
    parser.add_argument('--ptrain', required=True, help='path of the text file TRAIN POSITIVE')
    parser.add_argument('--ntrain', required=True, help='path of the text file TRAIN NEGATIVE')
    parser.add_argument('--ptest', required=True, help='path of the text file TEST POSITIVE')
    parser.add_argument('--ntest', required=True, help='path of the text file TEST NEGATIVE')
    parser.add_argument('--out', required=True, help='path and fileename for score output')
    parser.add_argument('--ngram', required=True, help='N-grams considered e.g. 123 is uni+bi+tri-grams')
    args = vars(parser.parse_args())
    main(**args)