# Naive Bayes Classification
# clear global environment
rm(list = ls(all.names = TRUE))
# libraries ----
library(tidyverse) # str_*() string helpers and read_csv()
library(tm) # corpus handling and document-term matrices
library(SnowballC) # word stemming
library(htmlwidgets)
library(wordcloud) # word cloud visualizations
library(e1071) # provides naiveBayes(); chosen over naive_bayes() from the naivebayes package
library(gmodels) # CrossTable() for model evaluation
# custom functions ----
replacePunctuation <- function(vector) {
  result <- str_replace_all(vector, "[:punct:]", " ")
  return(result)
}
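# illustrative check (not in the original script): punctuation becomes a space
# rather than being deleted, so "win,win" does not fuse into "winwin"; this is
# a likely motivation for a custom function over tm's removePunctuation()
replacePunctuation("win,win!") # expected: "win win "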
# load data ----
nb_data <- read_csv("~/Documents/Machine-Learning-Data/sms_spam.csv", col_names = TRUE)
# recode the message type as a factor
nb_data$type <- as.factor(nb_data$type)
# create a corpus from the message text
nb_corpus <- VCorpus(VectorSource(nb_data$text))
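# optional peek (illustrative, not in the original script): view the first
# two raw messages before cleaning
lapply(nb_corpus[1:2], as.character)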
# clean the corpus
nb_corpus_clean <- tm_map(nb_corpus, content_transformer(str_to_lower)) # convert text to lower case
nb_corpus_clean <- tm_map(nb_corpus_clean, content_transformer(removeNumbers)) # remove numbers from text
nb_corpus_clean <- tm_map(nb_corpus_clean, removeWords, stopwords()) # remove stopwords() from text
nb_corpus_clean <- tm_map(nb_corpus_clean, content_transformer(replacePunctuation)) # replace punctuation with spaces
# use stemming from the SnowballC package to reduce words to their stems and avoid learning word variants
nb_corpus_clean <- tm_map(nb_corpus_clean, stemDocument) # strip suffixes, e.g. "learning" -> "learn"
nb_corpus_clean <- tm_map(nb_corpus_clean, stripWhitespace) # collapse repeated white space
# create a document term matrix (one row per message, one column per term; stored sparsely)
nb_dtm <- DocumentTermMatrix(nb_corpus_clean)
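# quick sanity check (illustrative, not in the original script): print the DTM
# for its dimensions and sparsity, and inspect a small slice of the counts
print(nb_dtm)
inspect(nb_dtm[1:5, 1:5])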
# create train and test sets; the data were randomized prior to loading,
# so a simple 75/25 split by row position is sufficient
nb_data_train <- nb_dtm[1:4169,]
nb_data_test <- nb_dtm[4170:5559,]
# save labels
nb_train_labels <- nb_data$type[1:4169]
nb_test_labels <- nb_data$type[4170:5559]
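# illustrative check (not in the original script): given the random ordering,
# the spam/ham proportions in train and test should be close to each other
prop.table(table(nb_train_labels))
prop.table(table(nb_test_labels))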
# create a word visualization of the cleaned corpus using wordcloud
wordcloud(nb_corpus_clean, min.freq = .01 * length(nb_data$text), random.order = FALSE)
# To get an idea of how strong the Naive Bayes classifier will be, generate a word cloud
# per type: a stark difference in prominent words suggests the classifier will perform well.
# spam vs ham cloud
s <- subset(nb_data, type == "spam")
h <- subset(nb_data, type == "ham")
wordcloud(s$text, max.words = 40, random.order = FALSE)
wordcloud(h$text, max.words = 40, random.order = FALSE)
# wordcloud comparison suggests the model will work quite well
# create indicator features using the training data ----
# use findFreqTerms() to keep only terms appearing in at least 5 training messages (roughly 0.1 percent of the data)
frequent_terms <- findFreqTerms(nb_data_train, lowfreq = 5)
# change train and test data to include only frequent terms
nb_data_train_f <- nb_data_train[,frequent_terms]
nb_data_test_f <- nb_data_test[,frequent_terms]
# convert the word counts to categorical "Yes"/"No" indicators for Naive Bayes
nb_train <- apply(nb_data_train_f, MARGIN = 2, function(x) ifelse(x > 0, "Yes", "No"))
nb_test <- apply(nb_data_test_f, MARGIN = 2, function(x) ifelse(x > 0, "Yes", "No"))
# create the model using naiveBayes() from the e1071 package
classification_model <- naiveBayes(nb_train, nb_train_labels, laplace = 0)
# predict classifications for the test data
nb_test_predict <- predict(classification_model, nb_test)
# evaluate model performance with a confusion matrix
CrossTable(nb_test_predict, nb_test_labels, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("predicted", "actual"))
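# possible refinement (illustrative, not part of the original script): with
# laplace = 0, a term seen in only one class during training zeroes out the
# other class's likelihood for any message containing it; laplace = 1 gives
# every term a small nonzero count in each class
classification_model2 <- naiveBayes(nb_train, nb_train_labels, laplace = 1)
nb_test_predict2 <- predict(classification_model2, nb_test)
CrossTable(nb_test_predict2, nb_test_labels, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("predicted", "actual"))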