-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdescriptive_statistics.R
148 lines (126 loc) · 5.97 KB
/
descriptive_statistics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#Load packages
# ggplot2 supplies the plotting calls used throughout this script; scales
# supplies date_breaks() and date_format() used for the time axis of the
# county plot. Loading them here lets the script run in a fresh session.
library(ggplot2)
library(scales)

#Read in data
# The first row of the file holds the column names.
data <- read.table("data/processedData.txt", header = TRUE)
# Statistical analysis
## t-test
# Two-sample t-test of finishing time between the two gender groups. The
# columns are named in the formula and looked up through `data =`, so the
# printed output is labelled "time by gender".
t.test(time ~ gender, data = data)
### Result: finishing time is significantly different between genders
ggplot(data, aes(x = gender, y = time, fill = gender)) +
  geom_boxplot() +
  labs(title = "Time by gender", y = "time (sec)")

# Same test with nationality as the grouping column.
t.test(time ~ nationality, data = data)
### Result: finishing time is significantly different between Estonians and foreigners
ggplot(data, aes(x = nationality, y = time, fill = nationality)) +
  geom_boxplot() +
  labs(title = "Time by nationality", y = "time (sec)")
## ANOVA
# age.group2 is treated as a number by the Kendall correlation further down,
# so it is wrapped in factor() here. Left numeric, aov() would fit a single
# linear slope (1 df) instead of comparing the age-group means.
# NOTE(review): re-check the recorded result below against the new output.
summary(aov(time ~ factor(age.group2), data = data))
### Result: finishing time is significantly different between age groups
# One box per age group, split by gender through the fill aesthetic.
ggplot(data, aes(x = factor(age.group2), y = time, fill = gender)) +
  geom_boxplot() +
  labs(title = "Time by age group", x = "age group", y = "time (sec)")

summary(aov(time ~ country, data = data))
### Result: finishing time is significantly different between countries
ggplot(data, aes(x = factor(country), y = time, fill = country)) +
  geom_boxplot() +
  labs(title = "Time by countries", x = "countries", y = "time (sec)") +
  theme(axis.text.x = element_text(angle = 45))
# Same plot drawn from the countryCategory column instead of country.
ggplot(data, aes(x = factor(countryCategory), y = time, fill = countryCategory)) +
  geom_boxplot() +
  labs(title = "Time by countries", x = "countries", y = "time (sec)", fill = "countries") +
  theme(axis.text.x = element_text(angle = 45))

summary(aov(time ~ county, data = data))
###Result: finishing time is significantly different between counties
# time is turned into a date-time (an offset from the given origin) so that the
# y axis can be broken every 30 minutes and labelled as HH:MM.
ggplot(data, aes(x = county, y = as.POSIXct(time, tz = "GMT", origin = "2014-09-21"), fill = county)) +
  geom_boxplot() +
  labs(title = "Time by counties", x = "County", y = "Time") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        plot.title = element_text(lineheight = .8, face = "bold", vjust = 1),
        axis.text.x = element_text(angle = 45, vjust = 0.7),
        legend.position = "none") +
  scale_y_datetime(breaks = date_breaks("30 min"), labels = date_format("%H:%M"))
## Chi-square test
# Gender by age-group contingency table.
tbl <- table(data$gender, data$age.group2)
# Some age-group columns are pooled before testing: "17" with "20" at one end
# and "50" through "75" at the other; the four columns in between stay as is.
ctbl <- cbind(
  rowSums(tbl[, c("17", "20"), drop = FALSE]),
  tbl[, "21"],
  tbl[, "35"],
  tbl[, "40"],
  tbl[, "45"],
  rowSums(tbl[, c("50", "55", "60", "65", "70", "75"), drop = FALSE])
)
colnames(ctbl) <- c("[15-21)", "[21-22)", "[22-36)", "[36-41)", "[41-46)", "[46-76)")
chisq.test(ctbl)
### Result: no significant difference in frequency distribution of age groups between genders
# Unpooled counts per age group, genders side by side.
ggplot(data, aes(x = factor(age.group2), fill = gender)) +
  geom_bar(position = "dodge") +
  labs(title = "Cyclists per age group", x = "age group")
# Base-graphics view of the pooled table that was tested.
barplot(ctbl, col = c("red", "blue"), legend.text = TRUE)
## Correlation
# Kendall rank correlation between finishing time and age group, using only
# the rows where both values are present.
with(data, cor(time, age.group2, use = "complete.obs", method = "kendall"))
#alternative
# Split table consumed by the speed calculations that follow; the first row of
# the file holds the column names.
splits <- read.table("data/splits.txt", header = TRUE)
#Find speed based on given subset of data
#
# x:      data frame of cumulative times, one column per checkpoint in course
#         order and one row per rider. Presumably seconds: values are divided
#         by 3600 and the callers label the result km/h -- confirm.
# splits: data frame with one column per leg holding that leg's distance
#         (presumably km -- confirm), in the same order as the columns of x.
#
# Returns a numeric vector of length 2 * length(splits): the mean speed of
# each leg, repeated twice in a row so it can be drawn as a step line against
# paired x positions. Works for any number of legs, including one.
findSpeeds <- function(x, splits) {
  legSpeeds <- vapply(seq_along(splits), function(i) {
    # The first column already is the time spent on the first leg; every later
    # leg is the difference between two consecutive cumulative times.
    elapsed <- if (i == 1) x[, 1] else x[, i] - x[, i - 1]
    mean(splits[, i] / (elapsed / 3600))
  }, numeric(1))
  rep(legSpeeds, each = 2)
}
#Calculate speeds for each subset
# Every call passes the six split columns plus the finish time, and splits
# without its first column.
overallSpeeds <- findSpeeds(data[, c(paste0("split.", 1:6), "time")], splits[-1])
menSpeeds <- findSpeeds(
  data[data[, "gender"] == "male", c(paste0("split.", 1:6), "time")],
  splits[-1]
)
womenSpeeds <- findSpeeds(
  data[data[, "gender"] == "female", c(paste0("split.", 1:6), "time")],
  splits[-1]
)

#Draw plots
# Interior positions appear twice so that each pair of equal speeds returned by
# findSpeeds is drawn as a flat segment between two neighbouring positions.
xValues <- c(0, rep(1:6, each = 2), 7)
plot(xValues, overallSpeeds,
     type = "l", col = "red", xaxt = "n",
     xlab = "Splits", ylab = "Speed (km/h)",
     main = "Average speeds between splits")
axis(1, at = 0:7, labels = colnames(splits))

#By gender
plot(xValues, menSpeeds,
     type = "l", col = "red", xaxt = "n", ylim = c(18, 28),
     xlab = "Splits", ylab = "Speed (km/h)",
     main = "Average speeds between splits by gender")
lines(xValues, womenSpeeds, col = "blue")
axis(1, at = 0:7, labels = colnames(splits))
legend("topleft", legend = c("Men", "Women"),
       lty = rep(1, 2), lwd = rep(2.5, 2), col = c("red", "blue"), cex = .7)

#All together
plot(xValues, menSpeeds,
     type = "l", col = "red", xaxt = "n", ylim = c(18, 28),
     xlab = "Splits", ylab = "Speed (km/h)",
     main = "Average speeds between splits")
lines(xValues, womenSpeeds, col = "blue")
lines(xValues, overallSpeeds, col = "forestgreen")
axis(1, at = 0:7, labels = colnames(splits))
# NOTE(review): cex is 1.7 here but .7 in the by-gender legend -- confirm this
# is intended.
legend("topleft", legend = c("Men", "Women", "Overall"),
       lty = rep(1, 3), lwd = rep(2.5, 3),
       col = c("red", "blue", "forestgreen"), cex = 1.7)
#Average finish times per age group
#
# Mean finishing time of one age group, formatted as "H:MM".
#
# data:  data frame with a "time" column (presumably seconds: it is divided by
#        3600 and 60 to get hours and minutes -- confirm) and an "age.group"
#        column.
# class: a value of the "age.group" column, or "Total" to average all rows.
#
# Returns a character string such as "2:05" (minutes always two digits, seconds
# dropped), or NA_character_ when there is no time to average.
meanClass <- function(data, class) {
  if (class == "Total") {
    times <- data[, "time"]
  } else {
    # which() skips rows whose age.group is NA instead of selecting NA rows.
    times <- data[which(data[, "age.group"] == class), "time"]
  }
  res <- mean(times, na.rm = TRUE)
  if (!is.finite(res)) {
    return(NA_character_)
  }
  hours <- floor(res / 3600)
  minutes <- floor((res - 3600 * hours) / 60)
  # Zero-pad the minutes so that 2 h 5 min reads "2:05" rather than "2:5".
  sprintf("%d:%02d", as.integer(hours), as.integer(minutes))
}
# Riders per age group; the counts go into the axis labels and the names list
# the groups to average.
tab <- table(data$age.group)
# Axis labels of the form "group(count)", followed by a "Total(count)" entry.
x <- c(paste0(names(tab), "(", tab, ")"), paste0("Total(", sum(tab), ")"))
# Mean time of each group and of the total, parsed from the "H:MM" text
# returned by meanClass into date-times so they fit a date-time y scale.
y <- as.POSIXct(
  vapply(c(names(tab), "Total"), function(cls) meanClass(data, cls), character(1)),
  format = "%H:%M"
)
xy <- data.frame(x, y)
# Columns are referenced by bare name inside aes(); ggplot2 discourages `$`
# there.
ggplot(xy, aes(x = x, y = y, width = 0.5)) +
  # All bars share one colour except the last entry (the total).
  geom_bar(stat = "identity",
           fill = c(rep("deepskyblue", length(xy$y) - 1), "chartreuse")) +
  # Characters 13-16 of the date-time text are the "H:MM" part; this assumes the
  # default "YYYY-MM-DD HH:MM:SS" rendering.
  geom_text(aes(label = substr(y, 13, 16)), vjust = -1, size = 4) +
  xlab("Age groups with counts") + ylab("Time") +
  ggtitle("Average finish times per age group") +
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        plot.title = element_text(lineheight = .8, face = "bold", vjust = 1),
        axis.text.x = element_text(angle = 45, vjust = 0.7)) +
  scale_y_datetime(limits = c(as.POSIXct('0:00', format = "%H:%M"),
                              as.POSIXct('6:00', format = "%H:%M"))) +
  # Horizontal reference line at the last value of y (the total).
  geom_hline(aes(yintercept = as.numeric(y[length(y)])), colour = "chartreuse", size = 0.8)
#Distance vs participants per population
# NOTE(review): `dist` is not created anywhere in this script. It has to exist
# beforehand as a data frame with distance, participants and population
# columns -- confirm where it is loaded.
# Kendall rank correlation between distance and the participants-to-population
# ratio, using only rows where both are present.
with(dist, cor(distance, participants / population,
               use = "complete.obs", method = "kendall"))
##Result: medium correlation between distance from Tartu and participants per population
ggplot(dist, aes(x = distance, y = participants / population)) +
  geom_point()