-
Notifications
You must be signed in to change notification settings - Fork 0
/
common-words.R
48 lines (40 loc) · 1.79 KB
/
common-words.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Word stats: count the most common words in the tweets and draw a word cloud
library(tm)
library(SnowballC)
library(wordcloud)
# Break tweet text into tokens.
#
# Any run of characters other than ASCII letters, digits, '#', '@',
# apostrophes (straight or curly), double quotes and '-' is treated as a
# separator. `tweet` is a character vector; the tokens of all its elements
# are returned as one flat character vector.
split_tweet <- function(tweet) {
  token_separator <- "[^#@'’\"a-zA-Z0-9\\-]+"
  pieces <- strsplit(tweet, token_separator)
  unlist(pieces)
}
# Use raw_dat from tweet-viz.R. Only its `tweet` and `timestamp_ms` columns
# are read here. NOTE(review): `%>%`, filter() and select() are not attached
# by this file -- presumably tweet-viz.R loads dplyr; confirm.
monsanto_tz <- "America/Chicago"

# Keep tweets mentioning Monsanto, GMO or Bayer, ignoring case.
dat <- raw_dat %>%
  filter(grepl("(monsanto|gmo|bayer)", tolower(tweet))) %>%
  select(tweet, timestamp_ms)

# timestamp_ms is handled as milliseconds since the Unix epoch; convert it to
# date-times displayed in monsanto_tz.
dat$timestamp_ms <- as.POSIXct(
  as.numeric(dat$timestamp_ms) / 1000,
  origin = "1970-01-01",
  tz = monsanto_tz
)

# Analysis window: 2016-09-11 through 2016-09-16, local time.
lims <- as.POSIXct(
  strptime(
    c("2016-09-11 00:00", "2016-09-16 23:59"),
    tz = monsanto_tz,
    format = "%Y-%m-%d %H:%M"
  )
)

# lims[[2]] is the *start* of the final minute, so an inclusive `<=` would
# discard tweets sent during 23:59:00.001-23:59:59.999. Compare against the
# instant one minute later with a strict `<` instead.
dat <- dat %>%
  filter(timestamp_ms >= lims[[1]] & timestamp_ms < lims[[2]] + 60)
# Tokenise each tweet and lower-case its tokens in a single pass. lapply()
# always yields a list with one character vector per tweet, whereas sapply()
# may return a list, a vector or a matrix depending on the token counts.
words <- lapply(dat$tweet, function(tweet) tolower(split_tweet(tweet)))
# Flatten to one character vector; as.character() keeps the result a
# character(0) rather than NULL when there are no tweets.
words_ul <- as.character(unlist(words, use.names = FALSE))
# Strip '#' so a hashtag is counted together with the plain word.
words_ul <- gsub("#", "", words_ul, fixed = TRUE)
# Tokens left over from links and retweet markup.
twitter_words <- c("t", "co", "http", "https", "rt", "amp", "#")

# Hand-picked filler words. The empty string catches empty tokens. Tokens are
# lower-cased before they are filtered, so the capitalised entries never match
# in this script; they are kept as-is.
common_words <- c(
  "the", "The", "to", "of", "a", "in", "is",
  "and", "", "de", "by", "for", "la", "has",
  "on", "se", "it", "with", "that", "que",
  "al", "as", "y", "en", "s", "t", "el", "es",
  "a", "at", "S", "r", "un", "m", "u", "n", "por",
  "d", "te", "via", "le", "an", "i", "la", "para",
  "los", "-"
)

# Stop-word lists looked up with stopwords() for every entry of
# top_lang$tweet_lang. NOTE(review): top_lang is not defined in this file --
# presumably it comes from tweet-viz.R; confirm.
built_in_stop <- top_lang$tweet_lang %>%
  sapply(stopwords) %>%
  unlist()

# Everything that should be excluded from the word counts.
my_stop_words <- c(built_in_stop, twitter_words, common_words)
# Frequency table of all tokens: one row per distinct token, with the token in
# `words` (character) and its count in `Freq`.
words.df <- table(words_ul) %>%
  data.frame() %>%
  rename(words = words_ul) %>%
  mutate(words = as.character(words))

# Drop stop words and keep the 50 most frequent remaining tokens.
words.filtered <- words.df %>%
  filter(!(words %in% my_stop_words)) %>%
  arrange(desc(Freq)) %>%
  head(50)
# Save the word cloud as TweetCloud.png (12 x 8 inches at 300 dpi).
png("TweetCloud.png", width = 12, height = 8, units = "in", res = 300)
# Close the PNG device even when wordcloud() fails; otherwise the device stays
# open and later plots would silently go to this file.
tryCatch(
  # words.filtered holds up to 50 rows, but max.words caps the plot at 25.
  wordcloud(
    words.filtered$words, words.filtered$Freq,
    scale = c(6, .5),
    max.words = 25, random.order = FALSE,
    colors = brewer.pal(8, "Dark2")
  ),
  finally = dev.off()
)