-
Notifications
You must be signed in to change notification settings - Fork 0
/
SVM_Classification.py
137 lines (96 loc) · 4.41 KB
/
SVM_Classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import Eval_Matrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
#import time
# a dummy function that just returns its input
def identity(x):
    """Return ``x`` unchanged.

    Serves as a no-op ``preprocessor`` and ``tokenizer`` for the vectorizers
    built in ``tf_idf_func``. It is kept as a named module-level function
    (rather than a lambda) so it can be passed around by reference.
    """
    return x
# Choose the feature vectorizer based on the value of tfidf:
# TfidfVectorizer when True, CountVectorizer when False.
def tf_idf_func(tfidf):
    """Build the (unfitted) vectorizer used for feature extraction.

    Args:
        tfidf: Truthy selects ``TfidfVectorizer``; falsy selects
            ``CountVectorizer``.

    Returns:
        A new, unfitted vectorizer that uses ``identity`` as both its
        preprocessor and its tokenizer.
    """
    # TODO - change the values
    # The texts are already preprocessed and tokenized, so a dummy function
    # is plugged in for both steps and each document is used as-is.
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
    return vectorizer_cls(
        preprocessor=identity,
        tokenizer=identity,
        # token_pattern is only consulted by the built-in tokenizer; passing
        # None states that it is unused and avoids scikit-learn's
        # "token_pattern will not be used" warning on every fit.
        token_pattern=None,
    )
def _softmax_rows(scores):
    """Convert decision scores into row-wise probabilities (stable softmax)."""
    scores = np.asarray(scores, dtype=float)
    if scores.ndim == 1:
        # Binary problems yield a single signed score per sample. Pairing it
        # with a zero logit makes the softmax below equal to the logistic
        # sigmoid, giving columns [P(classes_[0]), P(classes_[1])].
        scores = np.column_stack((np.zeros_like(scores), scores))
    # Subtracting each row's maximum does not change the softmax value but
    # keeps np.exp from overflowing on large decision scores.
    exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
    return exp_scores / exp_scores.sum(axis=1, keepdims=True)


def SVM_Normal(trainDoc, trainClass, testDoc, testClass, tfIdf):
    """Train a LinearSVC and return softmax-normalised scores per test document.

    For every test label the actual label, the predicted label and the
    normalised decision scores are printed.

    Args:
        trainDoc: Training documents (per the file's comments, already
            preprocessed and tokenized). Must be iterable more than once.
        trainClass: Class labels for ``trainDoc``.
        testDoc: Test documents, in the same format as ``trainDoc``.
        testClass: Actual labels for ``testDoc``; only used for printing and
            to bound how many results are returned.
        tfIdf: Passed to ``tf_idf_func``; True selects TfidfVectorizer,
            False selects CountVectorizer.

    Returns:
        A list with one ``numpy`` array of shape ``(1, n_classes)`` per test
        label, each row summing to one. Columns follow the classifier's
        ``classes_`` order. The list is empty when there are no test
        documents.
    """
    vectorizer = tf_idf_func(tfIdf)

    # NOTE(review): the vocabulary (and, for TF-IDF, the IDF weights) is
    # fitted on train + test together, exactly as before. This leaks test-set
    # statistics into the features; fit on trainDoc only if a strict
    # evaluation is required.
    vectorizer.fit(list(trainDoc) + list(testDoc))

    # Keep the matrices sparse: LinearSVC accepts sparse input directly and
    # densifying a bag-of-words matrix can exhaust memory.
    train_features = vectorizer.transform(trainDoc)
    test_features = vectorizer.transform(testDoc)

    # LinearSVC() works as a one-vs-rest classifier.
    classifier = svm.LinearSVC()
    classifier.fit(train_features, trainClass)

    if test_features.shape[0] == 0:
        # Nothing to score; predict() would reject an empty matrix.
        return []

    # Score the whole test set at once instead of one row per call.
    predictions = classifier.predict(test_features)
    # The decision scores are squashed with a softmax so that each row can be
    # read as a probability distribution over the classes.
    class_probs = _softmax_rows(classifier.decision_function(test_features))

    probability = []
    # zip stops at the shorter input, so a surplus of labels can no longer
    # index past the last test document.
    for row, actual in zip(range(test_features.shape[0]), testClass):
        print("Actual prediction:{}".format(actual))
        print("prediction by classifier:{}".format(predictions[row:row + 1]))
        # Slice (rather than index) to keep the (1, n_classes) shape callers
        # received before.
        prob = class_probs[row:row + 1]
        probability.append(prob)
        print("Decision function:{}".format(prob))
    return probability
# print("check of probability:{}".format(probability[399]))
# print("Actual prediction:{}".format(testClass[61]))
# g=testDoc_tfidf[61].reshape(1, -1)
# d=classifier.predict(g)
# print("predicition for d:{}".format(d))
# e=classifier.decision_function(g)
# prob=np.exp(e)/np.sum(np.exp(e),axis=1)
# print("decision function for d:{}".format(prob))
# Just to know which version of Tfidf is being used
'''slight change here again'''
# tfIDF_type = "TfidfVectorizer" if(tfIdf) else "CountVectorizer"
#
# print("\n########### Default SVM Classifier For (", tfIDF_type, ") ###########")
#
# title = "Linear SVM (C = 1.0)"
# #for evaluation
# Eval_Matrics.calculate_measures(classifier, testClass, testGuess, title)
# return classifier
#