# naive_bayes.py
# ---------------
# Licensing Information: You are free to use or extend these projects for
# educational purposes provided that (1) you do not distribute or publish
# solutions, (2) you retain this notice, and (3) you provide clear
# attribution to the University of Illinois at Urbana-Champaign
#
# Created by Justin Lizama ([email protected]) on 09/28/2018
import numpy as np
from collections import Counter
"""
This is the main entry point for Part 1 of this MP. You should only modify code
within this file for Part 1 -- the unrevised staff files will be used for all other
files and classes when code is run, so be careful to not modify anything else.
"""
def naiveBayes(train_set, train_labels, dev_set, smoothing_parameter, pos_prior):
"""
train_set - List of list of words corresponding with each email
example: suppose I had two emails 'like this movie' and 'i fall asleep' in my training set
Then train_set := [['like','this','movie'], ['i','fall','asleep']]
train_labels - List of labels corresponding with train_set
example: Suppose I had two emails, first one was ham and second one was spam.
Then train_labels := [1, 0]
dev_set - List of list of words corresponding with each email that we are testing on
It follows the same format as train_set
smoothing_parameter - The smoothing parameter --laplace (1.0 by default)
pos_prior - positive prior probability (between 0 and 1)
"""
    # *----Train Model----*
    # Initialize Counters to store the frequency of every word in ham/spam emails
    ham_word_counter = Counter()
    spam_word_counter = Counter()
    # Initialize dictionaries to store the probability of each word in ham/spam emails
    ham_word_probability = {}
    spam_word_probability = {}
    # Initialize a list to store the predicted development set labels
    dev_labels = []
    # Populate the frequency of every word in ham/spam emails
    for email, label in zip(train_set, train_labels):
        if label == 1:
            ham_word_counter.update(email)
        else:
            spam_word_counter.update(email)
    # Display frequency
    print("Ham Word Counter:")
    print(ham_word_counter.most_common(10))
    print()
    print("Spam Word Counter:")
    print(spam_word_counter.most_common(10))
    print()
    # Determine the total number of words in the ham/spam training email sets
    ham_total_words = sum(ham_word_counter.values())
    spam_total_words = sum(spam_word_counter.values())
    # Display totals BEFORE Laplace smoothing
    print("Total Number of Words in Ham Emails BEFORE Laplace:")
    print(ham_total_words)
    print()
    print("Total Number of Words in Spam Emails BEFORE Laplace:")
    print(spam_total_words)
    print()
    # Add words that appear in the development set but are absent from the ham counter, with a frequency of zero
    for email in dev_set:
        for word in email:
            if word not in ham_word_counter:
                ham_word_counter[word] = 0
    # Add words that appear in the development set but are absent from the spam counter, with a frequency of zero
    for email in dev_set:
        for word in email:
            if word not in spam_word_counter:
                spam_word_counter[word] = 0
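    # Note: seeding these unseen development-set words with an explicit zero count lets the Laplace
    # smoothing below assign them a small non-zero probability; otherwise a word missing from the
    # Counter would silently contribute 0 to the log score during testing.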
    # Display the smallest ham word frequency after the addition of words with zero frequency
    print("Smallest Ham Word Frequency:")
    print(min(ham_word_counter.values()))
    print()
    # Display the smallest spam word frequency after the addition of words with zero frequency
    print("Smallest Spam Word Frequency:")
    print(min(spam_word_counter.values()))
    print()
    # Copy ham word counter content into ham word probability dictionary
    ham_word_probability = ham_word_counter.copy()
    # Copy spam word counter content into spam word probability dictionary
    spam_word_probability = spam_word_counter.copy()
    # Display dictionaries before the addition of the Laplace smoothing constant
    print("Ham Word Probability BEFORE Laplace:")
    # print(ham_word_probability)
    print()
    print("Spam Word Probability BEFORE Laplace:")
    # print(spam_word_probability)
    print()
    # Apply Laplace smoothing
    for word in ham_word_probability:
        ham_word_probability[word] += smoothing_parameter
    for word in spam_word_probability:
        spam_word_probability[word] += smoothing_parameter
    # Display the dictionaries after the addition of the Laplace smoothing constant
    print("Laplace Constant:")
    print(smoothing_parameter)
    print()
    print("Ham Word Probability AFTER Laplace:")
    # print(ham_word_probability)
    print()
    print("Spam Word Probability AFTER Laplace:")
    # print(spam_word_probability)
    print()
    # Determine the total number of words after Laplace smoothing
    ham_word_total = sum(ham_word_probability.values())
    spam_word_total = sum(spam_word_probability.values())
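    # Note: because the smoothing constant was added to every vocabulary entry (including the
    # zero-count development-set words), each total equals the pre-smoothing total plus
    # smoothing_parameter * vocabulary size for that class.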
    # Display totals AFTER Laplace smoothing
    print("Total Number of Words in Ham Emails AFTER Laplace:")
    print(ham_word_total)
    print()
    print("Total Number of Words in Spam Emails AFTER Laplace:")
    print(spam_word_total)
    print()
    # Determine each word's log likelihood in ham/spam emails (taking the log of each probability avoids underflow)
    for word in ham_word_probability:
        ham_word_probability[word] = np.log(ham_word_probability[word] / ham_word_total)
    for word in spam_word_probability:
        spam_word_probability[word] = np.log(spam_word_probability[word] / spam_word_total)
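    # Each entry is now log((count(word, class) + k) / (total_words(class) + k * |V_class|)), the
    # Laplace-smoothed estimate of log P(word | class), where k = smoothing_parameter and |V_class|
    # is the vocabulary size for that class (training words plus the added development-set words).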
    # Determine the log prior for ham/spam [i.e., log(P(Ham)) and log(P(Spam))]
    likelihood_of_ham = np.log(pos_prior)
    likelihood_of_spam = np.log(1.0 - pos_prior)
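    # Each development email is then scored with the standard Naive Bayes decision rule:
    #   score(class) = log P(class) + sum over the email's words of log P(word | class)
    # and labeled with whichever class scores higher (1 = ham, 0 = spam).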
    # *----Test Model----*
    for email in dev_set:
        # Start each email from the log priors
        likelihood_email_is_ham = likelihood_of_ham
        likelihood_email_is_spam = likelihood_of_spam
        # Based on the words in the email, accumulate the log likelihood that it is ham and spam
        for word in email:
            likelihood_email_is_ham += ham_word_probability[word]
            likelihood_email_is_spam += spam_word_probability[word]
        # Classify the email as ham (1) or spam (0) based on the larger likelihood value
        if likelihood_email_is_ham > likelihood_email_is_spam:
            dev_labels.append(1)
        else:
            dev_labels.append(0)
print("Development Labels:")
print(dev_labels)
print()
# return predicted labels of development set
return dev_labels
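

# A minimal smoke test on made-up toy emails. The real course harness supplies train_set,
# train_labels, and dev_set from the email corpus; the values below are hypothetical and
# exist only to illustrate how naiveBayes is called.
if __name__ == "__main__":
    toy_train = [['like', 'this', 'movie'], ['i', 'fall', 'asleep']]
    toy_labels = [1, 0]  # 1 = ham, 0 = spam
    toy_dev = [['like', 'movie'], ['fall', 'asleep']]
    print(naiveBayes(toy_train, toy_labels, toy_dev, smoothing_parameter=1.0, pos_prior=0.5))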