-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTwitterWordCloud.R
78 lines (67 loc) · 2.27 KB
/
TwitterWordCloud.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"
1.Install package Twitter,RCurl
install.packages('twitteR', dependencies=TRUE)
install.packages('RCurl')
install.packages('bitops')
install.packages('base64enc')
install.packages('httpuv')
install.packages('tm')
install.packages('wordcloud')
install.packages('stringr')
library(twitteR)
library(RCurl)
library(bitops)
library(base64enc)
library(httpuv)
library(tm)
library(wordcloud)
library(stringr)
"
# Fetch tweets and build the corpus ----

# Keep text columns as character vectors instead of factors (this is already
# the default from R 4.0 on; the option matters on older versions).
options(stringsAsFactors = FALSE)

# Credentials file layout: a whitespace-delimited table with a header row and
# one data row, e.g.
#   consumer_key consumer_secret access_token        access_secret
#   'your key'   'your access'   'your access token' 'your access secret'
# `cred_file` must be set to the path of that file before this section runs;
# the script itself never defines it.
if (!exists("cred_file")) {
  stop(
    "`cred_file` is not defined; set it to the path of the credentials table.",
    call. = FALSE
  )
}

# read.table reads csv and delimited files. colClasses forces every column to
# character so a key that happens to look numeric is not converted.
oauthCreds <- read.table(cred_file, header = TRUE, colClasses = "character")

# Register the OAuth credentials with twitteR. The credentials were previously
# read but never used, so the search below ran without authentication.
setup_twitter_oauth(
  consumer_key = oauthCreds$consumer_key[1],
  consumer_secret = oauthCreds$consumer_secret[1],
  access_token = oauthCreds$access_token[1],
  access_secret = oauthCreds$access_secret[1]
)

# Ask for up to 300 recent English tweets matching the query.
tweets_list <- searchTwitter(
  "India + South Africa + IND + SA",
  lang = "en",
  n = 300,
  resultType = "recent"
)
class(tweets_list)
tweets_list

# Pull the text out of every status object. vapply() guarantees a character
# vector even when the search returns nothing (sapply() would return a list).
tweets_text <- vapply(tweets_list, function(x) x$getText(), character(1))
class(tweets_text)
tweets_text[1]

# Wrap the raw texts in a tm corpus, one document per tweet.
tweets_corpus <- Corpus(VectorSource(tweets_text))
class(tweets_corpus)
tweets_corpus

# Peek at the first documents; guard the index so fewer than three results do
# not trigger a subscript error.
inspect(tweets_corpus[seq_len(min(3L, length(tweets_corpus)))])
# Preprocessing pipeline ----
# The order of the steps matters:
#   1. lower-case first, so capitalised stop words ("The", "And") match the
#      lower-case stopwords("english") list;
#   2. drop URLs while "://", "." and "/" are still in the text. Once
#      punctuation is stripped, a link collapses into a single token such as
#      "httpstcoabc" and the old pattern "https*|youtu*" only removed the
#      literal "http"/"yout" prefix, leaving the rest behind as a bogus term;
#   3. remove stop words before punctuation, because the English list contains
#      entries with apostrophes (e.g. "don't") that no longer match afterwards;
#   4. remove punctuation and numbers;
#   5. collapse whitespace last, after every step that can leave gaps.

# Replace every match of `pattern` with a single space. A space (rather than
# the empty string) keeps the words on either side of a match from fusing.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

tweets_corpus_clean <- tm_map(tweets_corpus, content_transformer(tolower))
tweets_corpus_clean <- tm_map(
  tweets_corpus_clean,
  toSpace,
  "http\\S+|youtu\\S*"
)
tweets_corpus_clean <- tm_map(
  tweets_corpus_clean,
  removeWords,
  stopwords("english")
)
tweets_corpus_clean <- tm_map(tweets_corpus_clean, removePunctuation)
tweets_corpus_clean <- tm_map(tweets_corpus_clean, removeNumbers)
tweets_corpus_clean <- tm_map(tweets_corpus_clean, stripWhitespace)
tweets_corpus_clean
# Term frequencies and word cloud ----

# TermDocumentMatrix() returns tm's own object, not a base R matrix.
tweets_tdm <- TermDocumentMatrix(tweets_corpus_clean)

# Show how that object is represented internally.
str(tweets_tdm)

# Replace it with an ordinary matrix so rowSums() can be applied.
tweets_tdm <- as.matrix(tweets_tdm)
tweets_tdm

# Total count per term, sorted ascending and descending.
tdm_term_freq_sort_inc <- sort(rowSums(tweets_tdm), decreasing = FALSE)
tdm_term_freq_sort <- sort(rowSums(tweets_tdm), decreasing = TRUE)

# One row per term, most frequent first.
tdm_term_freq_df <- data.frame(
  word = names(tdm_term_freq_sort),
  freq = tdm_term_freq_sort
)
str(tdm_term_freq_df)
head(tdm_term_freq_df, 10)

# Draw the cloud: terms seen at least 8 times, at most 300 words, most
# frequent terms placed first, 35% of words rotated.
wordcloud(
  words = tdm_term_freq_df[["word"]],
  freq = tdm_term_freq_df[["freq"]],
  min.freq = 8,
  max.words = 300,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2"),
  scale = c(3, 0.5)
)