# Hotel Reviews Sentiment Analysis Project
# This project uses a TF-IDF vectorizer to build features for the models
# @author Simran
# @version 1.0
import pandas as pd # analyse data
import numpy as np  # numerical utilities (used here to seed the random sampler)
# machine learning library
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
# >>- importing dataset ->>
print('>>- importing dataset ->>')
df = pd.read_csv('Datafiniti_Hotel_Reviews.csv')
# >>- getting rid of null values ->>
print('>>- getting rid of null values ->>')
df = df.dropna()
# >>- taking a 30% random sample ->>
print('>>- taking a 30% random sample ->>')
np.random.seed(34)
df1 = df.sample(frac=0.3)
# >>- adding sentiments column ->>
print('>>- adding sentiments column ->>')
# ratings of 1 or 2 are labelled negative (0); everything else is labelled positive (1)
df1['sentiments'] = df1.rating.apply(lambda x: 0 if x in [1, 2] else 1)
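# Optional sanity check (an added sketch, not part of the original pipeline):
# collapsing star ratings to a binary label usually leaves the positive class
# heavily over-represented, so printing the label counts makes the class
# balance visible before training.
print(df1['sentiments'].value_counts())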
# >>- defining input training features and labels ->>
print('>>- defining input training features and labels ->>')
X = df1['reviews'] # input feature
Y = df1['sentiments'] # label
# >>- TF-IDF Vectorizer ->>
print('>>- TF-IDF Vectorizer ->>')
# TF-IDF weights each word by its frequency within a review, scaled down by how
# common the word is across all reviews (term frequency * inverse document frequency)
X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, test_size=0.5, random_state=45)
# Vectorizing the text data
print('Vectorizing the text data')
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
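# Optional check (added sketch, not in the original script): the fitted
# vectorizer exposes its learned vocabulary, so printing its size along with
# the shape of the training matrix confirms the text was vectorized as expected.
print('TF-IDF vocabulary size:', len(vectorizer.vocabulary_))
print('Training matrix shape:', X_train_vec.shape)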
# >- Logistic Regression ->
print('>- Logistic Regression ->')
# Training the model
print('Training the model')
lr = LogisticRegression()
lr.fit(X_train_vec, Y_train)
# Generating Accuracy score
print('Generating Accuracy score')
lr_score = lr.score(X_test_vec, Y_test)
print("Results for Logistic Regression with TF-IDF Vectorizer")
print(lr_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_lr = lr.predict(X_test_vec)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
cm_lr = confusion_matrix(Y_test, Y_pred_lr)
tn, fp, fn, tp = cm_lr.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
tpr_lr = round(tp / (tp + fn), 4)
tnr_lr = round(tn / (tn + fp), 4)
print(tpr_lr, tnr_lr)
# >- Support Vector Machine ->
print('>- Support Vector Machine ->')
# Training the model
print('Training the model')
# params = {'kernel':('linear', 'rbf'), 'C':[1, 10, 100]}
svcl = svm.SVC(kernel='rbf')
# clf_sv = GridSearchCV(svcl, params)
svcl.fit(X_train_vec, Y_train)
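# Hedged sketch of the grid search hinted at in the commented lines above
# (kept commented out so the script's behaviour is unchanged; GridSearchCV
# would also need to be imported from sklearn.model_selection):
# from sklearn.model_selection import GridSearchCV
# params = {'kernel': ('linear', 'rbf'), 'C': [1, 10, 100]}
# clf_sv = GridSearchCV(svm.SVC(), params, cv=3)
# clf_sv.fit(X_train_vec, Y_train)
# svcl = clf_sv.best_estimator_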
# Generating Accuracy score
print('Generating Accuracy score')
svcl_score = svcl.score(X_test_vec, Y_test)
print("Results for Support Vector Machine with TF-IDF")
print(svcl_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_sv = svcl.predict(X_test_vec)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
cm_sv = confusion_matrix(Y_test, Y_pred_sv)
tn, fp, fn, tp = cm_sv.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
tpr_sv = round(tp / (tp + fn), 4)
tnr_sv = round(tn / (tn + fp), 4)
print(tpr_sv, tnr_sv)
# >- K Nearest Neighbor ->
print('>- K Nearest Neighbor ->')
# Training the model
print('Training the model')
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_vec, Y_train)
# Generating Accuracy score
print('Generating Accuracy score')
knn_score = knn.score(X_test_vec, Y_test)
print("Results for Support Vector Machine with TF-IDF")
print(knn_score)
# Predicting the labels for the test data
print('Predicting the labels for the test data')
Y_pred_knn = knn.predict(X_test_vec)
# Setting up Confusion matrix
print('Setting up Confusion matrix')
cm_knn = confusion_matrix(Y_test, Y_pred_knn)
tn, fp, fn, tp = cm_knn.ravel()
print(tn, fp, fn, tp)
# Printing True Positive and Negative rates
print('Printing True Positive and Negative rates')
tpr_knn = round(tp / (tp + fn), 4)
tnr_knn = round(tn / (tn + fp), 4)
print(tpr_knn, tnr_knn)
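# >- Model comparison ->
# Added summary (a sketch using only variables defined above): collects the
# accuracy, true positive rate and true negative rate of the three models
# into a single table for easier side-by-side comparison.
print('>- Model comparison ->')
summary = pd.DataFrame({
    'model': ['Logistic Regression', 'SVM (RBF)', 'KNN (k=5)'],
    'accuracy': [lr_score, svcl_score, knn_score],
    'tpr': [tpr_lr, tpr_sv, tpr_knn],
    'tnr': [tnr_lr, tnr_sv, tnr_knn],
})
print(summary)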