####################################
# Intro to Text Analysis
# DRI 2023
# 01/25/2023
# for data manipulation and plotting
library(tidyverse)
# for working with text data
library(tidytext)
# for obtaining the sentiment analysis lexicons
library(textdata)
# for file path management
library(here)
# for word cloud
library(wordcloud)
library(wordcloud2)
# for color palettes
library(RColorBrewer)
# load data online
lyrics <- read_csv("https://raw.githubusercontent.com/YuxiaoLuo/r_analysis_dri_2022/main/data/genius.csv")
# or load data locally (only one of the two reads is needed)
lyrics <- read_csv(here("data", "genius.csv"))
glimpse(lyrics)
# convert the dataset to a one-token-per-row format
# unnest_tokens(output column, input column, type of token)
# token types: words, characters, ngrams, sentences, lines,
#   paragraphs, regex
# split lyrics into words
tidy_lyrics <- lyrics %>%
  unnest_tokens(word, line, token = "words")
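# Aside: unnest_tokens() can also split text into n-grams; a minimal
# sketch with token = "ngrams" and n = 2 (bigrams):
tidy_bigrams <- lyrics %>%
  unnest_tokens(bigram, line, token = "ngrams", n = 2)
tidy_bigrams %>% count(bigram, sort = TRUE)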
# count function
# count(data, variable, sort = TRUE/FALSE)
lyrics %>%
  group_by(artist_name) %>%
  count(song_name, sort = TRUE) %>%
  View()
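# Aside: a sketch of a related dplyr idiom; slice_max() keeps the song
# with the most lyric lines for each artist (ties return multiple rows).
lyrics %>%
  group_by(artist_name) %>%
  count(song_name) %>%
  slice_max(n, n = 1)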
####################################
# Wordcloud
# Why use word clouds
# 1. descriptive tool: presents text data in a simple, clear format
# 2. communication tool: handy for capturing conversations on
#    social media or in customer reviews
# 3. basic qualitative insights: flexible to interpret
## Generate wordcloud
set.seed(2023) # for reproducibility
# count(dataset, column name, sort = TRUE/FALSE)
# count words
words_wordcloud <- tidy_lyrics %>% count(word, sort = TRUE)
words_wordcloud
glimpse(tidy_lyrics)
tidy_lyrics %>% count(word, sort = TRUE)
# max.words: maximum number of words to be plotted
# min.freq: words with frequency below min.freq will not be plotted
# rot.per: proportion of words plotted with 90-degree rotation
# random.order: plot words in random order;
#   if FALSE, they are plotted in decreasing frequency
library(wordcloud)
library(RColorBrewer) # color template
wordcloud(words = words_wordcloud$word, freq = words_wordcloud$n,
          min.freq = 5, max.words = 200, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
help(brewer.pal)
# a common mistake: showing too many low-frequency words;
# adjust min.freq to avoid it
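# For instance, a sketch with a higher threshold: only words appearing
# at least 20 times are plotted (20 is an arbitrary choice).
wordcloud(words = words_wordcloud$word, freq = words_wordcloud$n,
          min.freq = 20, max.words = 100, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Dark2"))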
# wordcloud2, more advanced wordcloud
library(wordcloud2)
wordcloud2(data = words_wordcloud, size = 1, color = 'random-dark')
help(wordcloud2)
# shape = "cardioid", an apple- or heart-shaped curve
wordcloud2(data = words_wordcloud, size = .8, shape = "cardioid")
# other shape: diamond, star, pentagon, triangle, triangle-forward
wordcloud2(data = words_wordcloud, size = .8, shape = "pentagon")
# color: random-dark, random-light
wordcloud2(data = words_wordcloud, size = .8, color = 'random-light')
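# Aside: color also accepts a vector of color names, one per word; a
# sketch with an arbitrary palette recycled over all words.
wordcloud2(data = words_wordcloud, size = .8,
           color = rep_len(c("steelblue", "darkorange", "gray40"),
                           nrow(words_wordcloud)))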
# Stopwords: words that are not useful for an analysis,
# typically extremely common words such as "the", "of", "to"
# anti_join(x, y, by = "..."):
#   keeps only the rows of x that have no match in y
# tidytext package has built-in stopwords
stop_words
anti_join(words_wordcloud, stop_words, by = "word")
# browse the dictionary; it continues well beyond the first rows
stop_words %>% View()
# remove stopwords
words_no_stop <- words_wordcloud %>%
  anti_join(stop_words, by = "word")
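# Aside: lyrics often contain filler words ("yeah", "ooh", ...) that
# stop_words does not cover; a sketch of extending the dictionary with
# a custom list (the words below are arbitrary examples).
my_stops <- tibble(word = c("yeah", "ooh", "la", "na"),
                   lexicon = "custom")
words_no_filler <- words_wordcloud %>%
  anti_join(bind_rows(stop_words, my_stops), by = "word")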
# compare with or without stopwords
glimpse(words_wordcloud)
glimpse(words_no_stop)
wordcloud(words = words_no_stop$word, freq = words_no_stop$n,
          min.freq = 1, max.words = 150, random.order = FALSE,
          rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
wordcloud2(data = words_no_stop, size = .8, color = 'random-dark')
# changing size changes how many words fit in the plot
help(wordcloud2)
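# For instance, a smaller size fits more words onto the canvas (a
# sketch; the exact fit also depends on the viewer window).
wordcloud2(data = words_no_stop, size = .4, color = 'random-dark')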
#############################
# Explore the lyrics dataset
lyrics_no_stop <- tidy_lyrics %>% anti_join(stop_words, by = "word")
lyrics_no_stop %>% count(word, sort = TRUE)
lyrics_no_stop %>%
  filter(artist_name == "Buck Meek") %>%
  count(word, sort = TRUE)
# Visualization
lyrics_no_stop %>%
  filter(artist_name == "Buck Meek") %>%
  count(word, sort = TRUE) %>%
  filter(n > 3) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL,
       title = "Word Frequency in Buck Meek's Songs")
#############
# Exercise
# Find out Full of Hell's most common words
lyrics_no_stop %>%
  filter(artist_name == "Full of Hell") %>%
  count(word, sort = TRUE) %>%
  filter(n > 3) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL,
       title = "Word Frequency in Full of Hell's Songs")
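# Aside: a sketch comparing several artists at once; reorder_within()
# and scale_y_reordered() are tidytext helpers for ordering bars
# within facets.
lyrics_no_stop %>%
  count(artist_name, word, sort = TRUE) %>%
  group_by(artist_name) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, artist_name)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~ artist_name, scales = "free_y") +
  labs(y = NULL, title = "Top Words by Artist")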
###################################
# Challenge: explore another dataset
# Data: Spotify
# Make a wordcloud (removing the stopwords) of
# the lyrics in the Spotify dataset
spotify <- read_csv("https://raw.githubusercontent.com/GC-DRI/r_analysis_dri_2022/main/data/spotify_lyrics.csv")
glimpse(spotify)
# ?unnest_tokens
spotify_lyrics <- spotify %>% unnest_tokens(word, lyrics, token = "words")
spotify_wordcloud <- spotify_lyrics %>% count(word, sort = TRUE)
spotify_wordcloud
spotify_no_stop <- spotify_wordcloud %>%
  anti_join(stop_words, by = "word")
spotify_no_stop
wordcloud2(data = spotify_no_stop, size = .2, color = 'random-light')
# wordcloud without removing stop words
wordcloud2(data = spotify_wordcloud, size = .8, color = 'random-light')
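# Optional sketch (assumes the htmlwidgets package is installed):
# wordcloud2 returns an htmlwidget, which can be saved as a standalone
# HTML file for sharing.
# library(htmlwidgets)
# saveWidget(wordcloud2(data = spotify_no_stop, size = .2),
#            "spotify_wordcloud.html", selfcontained = FALSE)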
###########################################################################