-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdata_wrangling_analysis.R
235 lines (180 loc) · 6.75 KB
/
data_wrangling_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
##################################################
# DRI 2023 R Track
# data wrangling and analysis
# Github repo:
# https://github.com/GC-DRI/r_data_analysis_2021/blob/main/data-wrangling.md
library(tidyverse)
# 2 ways of input data
# input data from local
# spotify <- read_csv('/kaggle/input/dri2023/data/spotify.csv')
# input data from online resource
spotify <- read_csv('https://raw.githubusercontent.com/YuxiaoLuo/r_analysis_dri_2022/main/data/spotify.csv')
# see top 10 rows of data
head(spotify)
# see summary of variables
glimpse(spotify)
# select() - extracts columns from data frame
# filter() - extracts rows from data frame
####################################
# select() function
# if you want to select the loudness variable
select(spotify, loudness)
# if you want to select more than one variable
# include the column names, separated by commas, as arguments to the function
select(spotify, loudness, instrumentalness, key)
## select helper
# - , remove the columns after it
# remove everything but the rank & track columns
select(spotify, -c(artist:time_signature))
# contains(), column names that contains a string
select(spotify, contains('time'))
# ends_with(), column names that ends with letter k
select(spotify, ends_with('k'))
###################################################
# filter() function
# extracts rows from dataframe and returns them as new dataframe
# filter(x,y)
# x: a data frame to extract rows from
# y: a logical test that returns TRUE or FALSE
glimpse(spotify)
# a == b : a equal to b?
filter(spotify, time_signature == 5)
# a > b : is x greater than y
filter(spotify, time_signature > 3)
# is.na(a): is a missing value?
filter(spotify, is.na(time_signature))
#############################
# Exercise
# 1. filter all songs in the 'post_covid' covid_period
# 2. Filter all songs where tempo is less than or equal to 80
# 3. Stretch challenge: all songs that have more than 20 characters in title
# Hint: use str_length() to calculate # of characters
# two common mistakes:
# use single equal sign
# forget to use quotations for strings
## Boolean operators
# & : are both a and b true?
filter(spotify, time_signature > 4 & mode == 'major')
# | : are one or both of a and b true?
filter(spotify, season == 'Summer' | season == 'Spring')
# ! : is a not true
filter(spotify, ! season == "Winter")
# %in%: is a in the set?
filter(spotify, season %in% c('Summer', 'Spring'))
#########################################
# %>% Piping
# f(x,y) --> x %>% f(y)
# remember | in command line? This does the same thing
# Think about this task:
# extract rows that are in 'pre_covid' and
# select loudness & energy features from the data
select(filter(spotify, covid_period == "pre_covid"), loudness, energy)
spotify %>% filter(covid_period == 'pre_covid') %>% select(loudness, energy)
###########
# Exercise
## Filter spotify to the songs that are above .5 danceability
# select the tempo and energy columns
#######################################
# Summarizing data
# summarize: pass it a data frame and then one or more
# named arguments, it will turn each named argument into a column in the new data frame
spotify %>%
filter(season == "Spring") %>%
summarize(avg_energy = mean(energy))
spotify %>%
filter(season == "Winter") %>%
summarize(
avg_energy = mean(energy),
max_energy = max(energy),
total_energy = sum(energy)
)
################
# Exercise
# the average of loudness of songs in the fall (mean)
# the max danceability of any song in fall (max)
# the min energy of any song in fall (min)
########################
# Summarize by groups
# group_by() takes a data frame and returns a copy of the data frame
# that has been grouped into sets of rows
spotify %>%
group_by(covid_period) %>%
summarize(min_loud = min(loudness))
###############
# exercise
# Calculate the average danceability of top 40 songs for each season
spotify %>%
filter(rank <= ****) %>%
group_by(****) %>%
summarize(mean_dance = mean(****))
#################################################
# Create, modify, and delete columns
# mutate()
# change variable type
class(spotify$rank)
spotify_rank <- spotify %>%
mutate(rank = as.ordered(rank))
class(spotify_rank$rank)
spotify_rank$rank
# create new rows from existing rows
# let's create a new composite metric: raw_power (energy, loudness)
# raw_power = energy x (loudness - min(loudness))
spotify_power <- spotify %>%
mutate(raw_power = energy*(loudness-min(loudness)))
# take a look at the new composite metric raw_power
spotify_power %>% select(energy, loudness, raw_power)
##############################
# Exercise
# Create a new variable called boogie = danceability * tempo
# call the new data frame spotify_boogie
spotify_boogie <- spotify %>%
mutate(boogie = danceability*tempo)
spotify_boogie %>% select(danceability, tempo, boogie)
##########################################
# Investigate the mode in different tracks
select(spotify, track, mode, valence) %>% count(mode, wt = valence)
select(spotify, track, mode, valence, tempo, speechiness, energy, covid_period) %>%
group_by(mode) %>%
summarize(meanValence = mean(valence),
meanTempo = mean(tempo),
meanSpeechiness = mean(speechiness),
meanEnergy = mean(energy),
Pre_Covid = sum(covid_period == "pre_covid"),
Post_Covid = sum(covid_period == "post_covid"))
select(spotify, track, mode, valence, tempo, speechiness, energy, covid_period) %>%
group_by(mode) %>%
summarize(meanValence = mean(valence),
meanTempo = mean(tempo),
meanSpeechiness = mean(speechiness),
meanEnergy = mean(energy),
Pre_Covid = sum(covid_period == "pre_covid")/sum(spotify$covid_period == "pre_covid"),
Post_Covid = sum(covid_period == "post_covid")/sum(spotify$covid_period == "post_covid"))
#########################################
# Weekly summary stats
library(lubridate) # package for handling date-time data
spotify_sum <- spotify %>%
group_by(week) %>%
summarize(
valence = median(valence),
tempo = median(tempo),
speechiness = median(speechiness),
instrumentalness = median(instrumentalness),
liveness = median(liveness),
loudness = median(loudness),
acousticness = median(acousticness),
danceability = median(danceability),
energy = median(energy),
major = sum(mode == "major"),
minor = sum(mode == "minor"),
duration_ms = median(duration_ms),
time_since_covid = median(time_since_covid),
covid_period = max(covid_period)
) %>%
mutate(
# get the proportion of minor songs for the week
prop_minor = minor / (major + minor)
) %>%
# convert character to date format
mutate(week = mdy(week)) %>%
# reorder the rows from early time to later time
arrange(-desc(week))