-
Notifications
You must be signed in to change notification settings - Fork 5
/
baseFunctions.R
71 lines (59 loc) · 2.18 KB
/
baseFunctions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
library(dplyr)
# Profanity word list: one element per line of "en_profanity_words.txt",
# resolved relative to the working directory at the time this file is sourced.
# In the code visible here it is referenced only by a commented-out
# removeWords() step inside cleanCorpus().
profanity.words <- readLines("en_profanity_words.txt")
#' Perplexity of a vector of probabilities.
#'
#' Evaluates 2 raised to the negative average base-2 logarithm of `prob`.
#'
#' @param prob Numeric vector of probabilities.
#' @return A single numeric value; `NaN` when `prob` is empty and `Inf` when
#'   any probability is zero.
perplexity <- function(prob) {
  n_obs <- length(prob)
  # Negative mean log2-probability, i.e. the exponent of the perplexity.
  exponent <- (-1 / n_obs) * sum(log2(prob))
  2^exponent
}
#' Add conditional n-gram probabilities to a term-frequency table.
#'
#' Every `Term` is split on single spaces. The N-th token becomes `target`,
#' the first N - 1 tokens (re-joined with one space) become `key`, `total` is
#' the summed `Freq` over all rows sharing a `key`, and `prob` is
#' `Freq / total`.
#'
#' @param data Data frame with a `Term` column of space-separated n-grams and
#'   a numeric `Freq` column.
#' @param N Order of the n-grams held in `Term` (1 = unigram, 2 = bigram, ...).
#' @return `data` with the columns `target`, `key`, `total` and `prob`
#'   appended, rows kept in their input order. Terms with fewer than `N`
#'   tokens get an `NA` target.
generateNgramProb <- function(data, N) {
  # Split once and reuse the tokens for both the target and the key.
  # as.character() lets a factor `Term` column through as well.
  tokens <- strsplit(as.character(data$Term), " ", fixed = TRUE)

  # vapply() guarantees a character vector even for zero-row input, where
  # sapply() would return an empty list.
  targets <- vapply(tokens, function(a) a[N], character(1))

  # seq_len(N - 1) states the intended prefix explicitly. The earlier
  # `1:N-1` parsed as `(1:N) - 1` and only worked because index 0 is dropped.
  keys <- vapply(
    tokens,
    function(a) paste(a[seq_len(N - 1)], collapse = " "),
    character(1)
  )

  data <- data.frame(data, target = targets, key = keys,
                     stringsAsFactors = FALSE)
  rownames(data) <- NULL

  # ave() returns the per-key total already aligned with each row, so no
  # separate summary table and join are needed.
  data$total <- ave(data$Freq, data$key, FUN = sum)
  data$prob <- data$Freq / data$total
  data
}
#' Build a term/frequency table of N-grams from a corpus.
#'
#' Tokenizes `corpus` into N-grams of exactly `N` words and returns each
#' distinct N-gram with its total count across all documents.
#'
#' NOTE(review): TermDocumentMatrix(), NGramTokenizer() and Weka_control() are
#' not loaded in this file; presumably tm and RWeka are attached by the
#' calling script -- confirm.
#'
#' @param corpus Corpus object accepted by `TermDocumentMatrix()`.
#' @param N Number of words per N-gram (used as both `min` and `max`).
#' @return Data frame with a character column `Term` and a numeric column
#'   `Freq`, one row per distinct N-gram.
generateNgramDf <- function(corpus, N) {
  ngram_tokenizer <- function(x) {
    NGramTokenizer(x, Weka_control(min = N, max = N))
  }
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = ngram_tokenizer))

  terms <- tdm$dimnames$Terms

  # The matrix is stored as sparse triplets: `v` has one entry per non-zero
  # (term, document) cell and `i` is the term (row) index of that entry.
  # Using `v` directly as the frequency column is only aligned with `terms`
  # for a single-document corpus, so sum `v` per term index instead. This
  # also avoids densifying the matrix with as.matrix().
  freq <- numeric(length(terms))
  if (length(tdm$v) > 0) {
    per_term <- tapply(tdm$v, tdm$i, sum)
    freq[as.integer(names(per_term))] <- as.vector(per_term)
  }

  data.frame(Term = terms, Freq = freq, stringsAsFactors = FALSE)
}
#' Normalize the text of a corpus.
#'
#' Applies, in order: lower-casing, optional English stopword removal,
#' punctuation removal, number removal, whitespace collapsing and optional
#' stemming.
#'
#' NOTE(review): tm_map() and the transformation functions used here are not
#' loaded in this file; presumably tm is attached by the calling script --
#' confirm.
#'
#' @param corpus Corpus object accepted by `tm_map()`.
#' @param removeStopWords If `TRUE`, remove `stopwords("english")`.
#' @param steam If `TRUE`, stem the documents (the parameter name is kept
#'   as-is for existing callers).
#' @return The transformed corpus.
cleanCorpus <- function(corpus, removeStopWords = FALSE, steam = FALSE) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  if (removeStopWords) {
    print("Removing stopwords!")
    # Stopwords must go before punctuation is stripped: the English list
    # contains contractions such as "don't", which would otherwise already
    # have been rewritten to "dont" and never match.
    corpus <- tm_map(corpus, removeWords, stopwords("english"))
  }
  corpus <- tm_map(corpus, content_transformer(removePunctuation))
  corpus <- tm_map(corpus, content_transformer(removeNumbers))
  #corpus <- tm_map(corpus, removeWords, profanity.words)
  # Collapse the gaps left behind by the removals above.
  corpus <- tm_map(corpus, stripWhitespace)
  if (steam) {
    print("Steamin document!")
    corpus <- tm_map(corpus, stemDocument, language = 'english')
  }
  corpus
}
#' Create a corpus from raw text, optionally cleaning it.
#'
#' @param text Character vector of text, treated as latin1 on input.
#' @param clean If `TRUE`, pass the corpus through `cleanCorpus()`.
#' @param removeStopWords Forwarded to `cleanCorpus()`.
#' @param steam Forwarded to `cleanCorpus()`.
#' @return The corpus built from `text`.
createCorpus <- function(text, clean = TRUE, removeStopWords = FALSE,
                         steam = FALSE) {
  # Convert latin1 -> ASCII; sub = "" drops bytes that cannot be converted.
  text <- iconv(text, "latin1", "ASCII", sub = "")
  # list(text) wraps the whole vector as a single source element --
  # presumably so all lines end up in one document; confirm against callers.
  corpus <- Corpus(VectorSource(list(text)))
  if (clean) {
    corpus <- cleanCorpus(corpus,
                          removeStopWords = removeStopWords,
                          steam = steam)
  }
  corpus
}