# Naive Bayes Classification
# clear global environment
rm(list = ls(all.names = TRUE))
# libraries ----
library(tidyverse) # str_*() string helpers and read_csv()
library(tm) # corpus handling and document-term matrices
library(SnowballC) # word stemming
library(htmlwidgets)
library(wordcloud) # word cloud visualizations
library(e1071) # provides naiveBayes(); chosen over naive_bayes() from the naivebayes package
library(gmodels) # CrossTable() for model evaluation
# custom functions ----
replacePunctuation <- function(vector) {
  result <- str_replace_all(vector, "[:punct:]", " ")
  return(result)
}
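# illustrative check (not in the original script): punctuation becomes a space
# rather than being deleted, so "win,win" does not fuse into "winwin"; this is
# a likely motivation for a custom function over tm's removePunctuation()
replacePunctuation("win,win!") # expected: "win win "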
# load data ----
nb_data <- read_csv("~/Documents/Machine-Learning-Data/sms_spam.csv", col_names = TRUE)
# recode the message type as a factor
nb_data$type <- as.factor(nb_data$type)
# create a corpus from the message text
nb_corpus <- VCorpus(VectorSource(nb_data$text))
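# optional peek (illustrative, not in the original script): view the first
# two raw messages before cleaning
lapply(nb_corpus[1:2], as.character)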
# clean the corpus
nb_corpus_clean <- tm_map(nb_corpus, content_transformer(str_to_lower)) # convert text to lower case
nb_corpus_clean <- tm_map(nb_corpus_clean, content_transformer(removeNumbers)) # remove numbers from text
nb_corpus_clean <- tm_map(nb_corpus_clean, removeWords, stopwords()) # remove stopwords() from text
nb_corpus_clean <- tm_map(nb_corpus_clean, content_transformer(replacePunctuation)) # replace punctuation with spaces
# use stemming from the SnowballC package to reduce words to their stems and avoid learning word variants
nb_corpus_clean <- tm_map(nb_corpus_clean, stemDocument) # strip suffixes, e.g. "learning" -> "learn"
nb_corpus_clean <- tm_map(nb_corpus_clean, stripWhitespace) # collapse repeated white space
# create a document term matrix (one row per message, one column per term; stored sparsely)
nb_dtm <- DocumentTermMatrix(nb_corpus_clean)
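# quick sanity check (illustrative, not in the original script): print the DTM
# for its dimensions and sparsity, and inspect a small slice of the counts
print(nb_dtm)
inspect(nb_dtm[1:5, 1:5])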
# create train and test sets; the data were randomized prior to loading,
# so a simple 75/25 split by row position is sufficient
nb_data_train <- nb_dtm[1:4169,]
nb_data_test <- nb_dtm[4170:5559,]
# save labels
nb_train_labels <- nb_data$type[1:4169]
nb_test_labels <- nb_data$type[4170:5559]
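# illustrative check (not in the original script): given the random ordering,
# the spam/ham proportions in train and test should be close to each other
prop.table(table(nb_train_labels))
prop.table(table(nb_test_labels))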
# create a word visualization of the cleaned corpus using wordcloud
wordcloud(nb_corpus_clean, min.freq = .01 * length(nb_data$text), random.order = FALSE)
# To get an idea of how strong the Naive Bayes classifier will be, generate a word cloud
# per type: a stark difference in prominent words suggests the classifier will perform well.
# spam vs ham cloud
s <- subset(nb_data, type == "spam")
h <- subset(nb_data, type == "ham")
wordcloud(s$text, max.words = 40, random.order = FALSE)
wordcloud(h$text, max.words = 40, random.order = FALSE)
# wordcloud comparison suggests the model will work quite well
# create indicator features using the training data ----
# use findFreqTerms() to keep only terms appearing in at least 5 training messages (roughly 0.1 percent of the data)
frequent_terms <- findFreqTerms(nb_data_train, lowfreq = 5)
# change train and test data to include only frequent terms
nb_data_train_f <- nb_data_train[,frequent_terms]
nb_data_test_f <- nb_data_test[,frequent_terms]
# convert the word counts to categorical "Yes"/"No" indicators for Naive Bayes
nb_train <- apply(nb_data_train_f, MARGIN = 2, function(x) ifelse(x > 0, "Yes", "No"))
nb_test <- apply(nb_data_test_f, MARGIN = 2, function(x) ifelse(x > 0, "Yes", "No"))
# create the model using naiveBayes() from the e1071 package
classification_model <- naiveBayes(nb_train, nb_train_labels, laplace = 0)
# predict classifications for the test data
nb_test_predict <- predict(classification_model, nb_test)
# evaluate model performance with a confusion matrix
CrossTable(nb_test_predict, nb_test_labels, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("predicted", "actual"))
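# possible refinement (illustrative, not part of the original script): with
# laplace = 0, a term seen in only one class during training zeroes out the
# other class's likelihood for any message containing it; laplace = 1 gives
# every term a small nonzero count in each class
classification_model2 <- naiveBayes(nb_train, nb_train_labels, laplace = 1)
nb_test_predict2 <- predict(classification_model2, nb_test)
CrossTable(nb_test_predict2, nb_test_labels, prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("predicted", "actual"))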