project_Rcode.Rmd

---
title: "Final_Project"
author: "Saniya Abushakimova"
output: html_document
---

## Exploratory Data Analysis

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r message=FALSE, warning=FALSE, paged.print=FALSE}
# Libraries
library(tidyverse)
library(ggpubr)
library(rcompanion)
library(car)
library(randomcoloR)
library(ggplot2)
library(olsrr)
library(faraway)
library(lmtest)
library(MASS)
library(lars)
```

### Data Understanding

```{r message=FALSE, warning=FALSE}
# Loading the dataset
df = read.csv("Survey.csv")
head(df)
```

```{r}
# Check for missing values
sapply(df, function(x) sum(is.na(x)))
```

```{r}
# Summary statistic for numerical variables
summary(select_if(df, is.numeric))
```

### Data Insights

```{r}
# Scatterplots for numerical variables
s1 = ggplot(data=df, aes(x=Age, y=Reaction.time)) + geom_point(col=randomColor(1))
s2 = ggplot(data=df, aes(x=Avg.sleep.time, y=Reaction.time)) + geom_point(col=randomColor(1))
s3 = ggplot(data=df, aes(x=last.night.sleep.time, y=Reaction.time)) + geom_point(col=randomColor(1))
s4 = ggplot(data=df, aes(x=Awake.hours, y=Reaction.time)) + geom_point(col=randomColor(1))
s5 = ggplot(data=df, aes(x=Avg.hours.exercise, y=Reaction.time)) + geom_point(col=randomColor(1))
s6 = ggplot(data=df, aes(x=Noise.level, y=Reaction.time)) + geom_point(col=randomColor(1))
ggarrange(s1,s2,s3,s4,s5,s6,ncol = 2, nrow = 3)
```

```{r}
# Check numerical variables for the outliers
df_num = select_if(df, is.numeric)

p1 = ggplot() + 
  geom_boxplot(aes(y = df_num$Reaction.time)) + 
  scale_x_discrete( ) + labs(y = "Reaction.time")

p2 = ggplot(stack(df_num[-c(1)]), aes(x = ind, y = values)) +
  geom_boxplot() + scale_x_discrete(guide = guide_axis(angle = 45))

figure <- ggarrange(p1, p2 ,
                    ncol = 2, nrow = 1)
figure
```

```{r}
# Bar plots for categorical variables
p1 = ggplot(df, aes(x=Class, fill = Class)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p2 = ggplot(df, aes(x=Fatigue.level, fill = Fatigue.level)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p3 = ggplot(df, aes(x=Stress.level, fill = Stress.level)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p4 = ggplot(df, aes(x=Distraction, fill = Distraction)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p6 = ggplot(df, aes(x=Temp.level, fill = Temp.level)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p7 = ggplot(df, aes(x=Game.freq, fill = Game.freq)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p8 = ggplot(df, aes(x=Sport.freq, fill = Sport.freq)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p9 = ggplot(df, aes(x=Caffein.intake, fill = Caffein.intake)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p10 = ggplot(df, aes(x=Alcohol.intake, fill = Alcohol.intake)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p11 = ggplot(df, aes(x=Visual.acuity, fill = Visual.acuity)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p12 = ggplot(df, aes(x=Primary.hand, fill = Primary.hand)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p13 = ggplot(df, aes(x=Use.primary.hand, fill = Use.primary.hand)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p14 = ggplot(df, aes(x=Cautious.level, fill = Cautious.level)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p15 = ggplot(df, aes(x=Input.device, fill = Input.device)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p16 = ggplot(df, aes(x=Device.OS, fill = Device.OS)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())
p17 = ggplot(df, aes(x=WiFi.stable, fill = WiFi.stable)) + geom_bar() + theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())

ggarrange(p1,p2,p3,p4,ncol = 2, nrow = 2)
```

```{r}
ggarrange(p6,p7,p8,p9,ncol = 2, nrow = 2)
```

```{r}
ggarrange(p10,p11,p12,p13,ncol = 2, nrow = 2)
```

```{r}
ggarrange(p14,p15,p16,p17,ncol = 2, nrow = 2)
```

### Data Pre-processing

#### a) Re-leveling redundant columns

```{r}
# Checking some of the counts, to remove irrelevant columns, or reduce factors:
df %>% count(Fatigue.level) # Take to Low, Moderate, High fatigue levels

df <- df %>%
mutate(Fatigues = case_when(
Fatigue.level %in% c("Extremely fatigued", "Very fatigued")~ "H.Fatigue",
Fatigue.level %in% c("Moderately fatigued") ~ "M.Fatigue",
Fatigue.level %in% c("Not fatigued at all", "Slightly Fatigued")~ "L.Fatigue"),
Fatigues = factor(Fatigues, levels=c("H.Fatigue", "M.Fatigue", "L.Fatigue")))

df$Fatigue.level = NULL
names(df)
```

```{r}
# Changing Temp.level
df <- df %>%
mutate(Temperature = case_when(
Temp.level %in% c("Neutral")~ "Neutral",
Temp.level %in% c("Cold","Warm","Very Cold","Very Warm") ~ "Non Neutral"),
Temperature = factor(Temperature, levels=c("Neutral", "Non Neutral")))

df$Temp.level = NULL
names(df)
```

```{r}
# Changing Visual.Acuity
df <- df %>%
mutate(Vision = case_when(
Visual.acuity %in% c("Excellent")~ "Excellent",
Visual.acuity %in% c("Good") ~ "Good",
Visual.acuity %in% c("Average","Poor","Very Poor") ~ "Suboptimal"),
Vision = factor(Vision, levels=c("Excellent", "Good","Suboptimal")))

df$Visual.acuity = NULL
names(df)
```

#### b) Removing low information columns

```{r}
# Removing low information columns
df$WiFi.stable = NULL
df$Alcohol.intake = NULL
df$Primary.hand = NULL
df$Use.primary.hand = NULL

names(df)
```

#### c) Removing correlated columns

```{r, echo=FALSE}
# Reference: https://stackoverflow.com/questions/52554336/plot-the-equivalent-of-correlation-matrix-for-factors-categorical-data-and-mi

# Calculate a pairwise association between all variables in a data-frame. In particular nominal vs nominal with Chi-square, numeric vs numeric with Pearson correlation, and nominal vs numeric with ANOVA.
# Adopted from https://stackoverflow.com/a/52557631/590437
mixed_assoc = function(df, cor_method="spearman", adjust_cramersv_bias=TRUE){
    df_comb = expand.grid(names(df), names(df),  stringsAsFactors = F) %>% set_names("X1", "X2")
   
    is_nominal = function(x) class(x) %in% c("factor", "character")
    # https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
    # https://github.com/r-lib/rlang/issues/781
    is_numeric <- function(x) { is.integer(x) || is_double(x)}

    f = function(xName,yName) {
        x =  pull(df, xName)
        y =  pull(df, yName)

        result = if(is_nominal(x) && is_nominal(y)){
            # use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
            cv = cramerV(as.character(x), as.character(y), bias.correct = adjust_cramersv_bias)
            data.frame(xName, yName, assoc=cv, type="cramersV")

        }else if(is_numeric(x) && is_numeric(y)){
            correlation = cor(x, y, method=cor_method, use="complete.obs")
            data.frame(xName, yName, assoc=correlation, type="correlation")

        }else if(is_numeric(x) && is_nominal(y)){
            # from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
            r_squared = summary(lm(x ~ y))$r.squared
            data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")

        }else if(is_nominal(x) && is_numeric(y)){
            r_squared = summary(lm(y ~x))$r.squared
            data.frame(xName, yName, assoc=sqrt(r_squared), type="anova")

        }else {
            warning(paste("unmatched column type combination: ", class(x), class(y)))
        }

        # finally add complete obs number and ratio to table
        result %>% mutate(complete_obs_pairs=sum(!is.na(x) & !is.na(y)), complete_obs_ratio=complete_obs_pairs/length(x)) %>% rename(x=xName, y=yName)
    }

    # apply function to each variable combination
    map2_df(df_comb$X1, df_comb$X2, f)
}

mixed_corr = mixed_assoc(df)
reduced_mixed_corr = mixed_corr[round(mixed_corr$assoc, 4) != 1 & abs(mixed_corr$assoc) > 0.4, ]

odd_rows = seq_len(nrow(reduced_mixed_corr)) %% 2
reduced_mixed_corr_2 = reduced_mixed_corr[odd_rows == 1,]
```

```{r}
print(reduced_mixed_corr_2[order(reduced_mixed_corr_2$assoc ,decreasing = TRUE),])
```

```{r}
# Remove one of the variables with high correlation/association (>0.7): 'Device.OS' and 'Class'
df$Device.OS = NULL
df$Class = NULL
```

```{r}
# Saving new dataset to csv
write.csv(df,"Survey_postEDA.csv", row.names = FALSE) 
```

## Model Building

```{r}
df_reduced = read.csv("Survey_postEDA.csv")
```

### Model Fitting

```{r}
full_lm = lm(Reaction.time ~ ., data = df_reduced)
summary(full_lm)
```

#### 1) Checking for multicollinearity

```{r}
vif(full_lm)
```

#### 2) Variable selection

```{r}
ols_step_forward_p(full_lm, details = FALSE, p_val=0.15)
```

```{r}
reduced_lm = lm(Reaction.time ~ Vision + Age + Temperature + Input.device + Avg.sleep.time +
                Distraction + Sport.freq + Noise.level, data = df_reduced)
summary(reduced_lm)
```

### Diagnostics and Remedials

#### 1) Unusual observations

##### 1.1) High Leverage Points

```{r}
n = 141
p = 17
lev = influence(reduced_lm)$hat
lev[lev>2*p/n]
```

```{r}
halfnorm(lev, labs=row.names(df_reduced), ylab="Leverages")
```

```{r}
df_reduced[lev>2*p/n, ]
```

```{r}
df_reduced2 = df_reduced %>% filter(!row_number() %in% c(112))
reduced_lm2 = lm(Reaction.time ~ Vision + Age + Temperature + Input.device + Avg.sleep.time +
                 Distraction + Sport.freq + Noise.level, data = df_reduced2)
summary(reduced_lm2)
```

##### 1.2) Outliers

```{r}
n=n-1
stu.res = rstudent(reduced_lm2)
cutoff = qt(.05/(2*n), n-p-1)
plot(stu.res, ylim=c(-5,5))
abline(h=c(cutoff,-cutoff), col='red')
```

##### 1.3) Highly influential points

```{r}
cook = cooks.distance(reduced_lm2)
halfnorm(cook, labs=row.names(df_reduced2), ylab="Cook's distances")
```

#### 2) Error assumptions

##### 2.1) Normality

Graphically:

```{r}
# Q-Q Plot: violated
qqnorm(residuals(reduced_lm2),ylab="Residuals")
qqline(residuals(reduced_lm2), col = "red")
```

Formal test:

```{r}
# Shapiro-Wilk Test: violated
shapiro.test(residuals(reduced_lm2))
```

```{r}
# Box-Cox transformation
sr.trans=boxcox(reduced_lm2, lambda=seq(-3, 0, length=400))
```

```{r}
lambda = sr.trans$x[sr.trans$y == max(sr.trans$y)]
lambda
```

```{r}
df_reduced2$Reaction.time.trans = (df_reduced2$Reaction.time^(lambda)-1)/lambda
                               
reduced_lm3 = lm(Reaction.time.trans ~ Vision + Age + Temperature + Input.device + Avg.sleep.time +
                 Distraction + Sport.freq + Noise.level, data = df_reduced2)
summary(reduced_lm3)
```

Graphically:

```{r}
# Q-Q Plot: satisfied
qqnorm(residuals(reduced_lm3),ylab="Residuals")
qqline(residuals(reduced_lm3), col = "red")
```

Formal test:

```{r}
# Shapiro-Wilk Test: satisfied
shapiro.test(residuals(reduced_lm3))
```

##### 2.2) Constant variance

Graphically:

```{r}
# Residuals vs Fitted Plot: satisfied
plot(reduced_lm3, which = 1)
```

Formal test:

```{r}
# Breusch-Pagan Test: satisfied
bptest(reduced_lm3)
```

##### 2.3) Correlated errors

Graphically:

```{r}
# Residuals vs Index Plot: : satisfied
plot(summary(reduced_lm3)$res, type='o', ylab="Residuals")
abline(h=0, lty=2, col="blue", lwd=2)
```

```{r}
# Durbin-Watson Test: satisfied
dwtest(reduced_lm3)
```
#### 3) Structure assumptions

##### 3.1) Linearity

Graphically:
```{r}
# Added-Variable Plots
avPlots(reduced_lm3)
```

### Model Comparison and Selection

Baseline model

```{r}
summary(reduced_lm3)
```

#### a) A model with interactions

```{r}
interact_model = lm(Reaction.time.trans ~ Vision + Age + Temperature + Input.device + Avg.sleep.time +
                 Distraction + Sport.freq + Noise.level + Distraction:Noise.level + Sport.freq:Avg.sleep.time + 
                 Sport.freq:Distraction, data = df_reduced2)
summary(interact_model)
```

Check assumptions:

```{r}
# Normality: satisfied
shapiro.test(residuals(interact_model))
```
```{r}
# Constant variance: satisfied
bptest(interact_model)
```
```{r}
# Correlated errors: satisfied
dwtest(interact_model)
```
Comparison: Adjusted R2

```{r}
# Baseline model
summary(reduced_lm3)
```

```{r}
# Model with interaction
summary(interact_model)
```

Comparison: Partial F-test

```{r}
# Partial F-test
anova(reduced_lm3, interact_model)
```

#### b) LASSO Regression

```{r}
set.seed(1)

A = model.matrix(Reaction.time ~ ., data = df_reduced)[,-1]
modlasso = lars(A, df_reduced$Reaction.time)

# Cross Validation to choose the best lambda
cvmodlasso = cv.lars(A, df_reduced$Reaction.time)
sv = cvmodlasso$index[which.min(cvmodlasso$cv)]
sv
```

```{r}
coeffs = predict(modlasso, s=sv, type="coef", mode="fraction")$coef 
round(coeffs[coeffs!=0], 3)
```

```{r}
lasso_model = lm(Reaction.time ~ Age + Stress.level + Vision, data = df_reduced)
summary(lasso_model)
```

Check assumptions:

```{r}
# Normality: violated
shapiro.test(residuals(lasso_model))
```

```{r}
# Box-Cox transformation
sr.trans=boxcox(lasso_model, lambda=seq(-3, 0, length=400))
lambda2 = sr.trans$x[sr.trans$y == max(sr.trans$y)]

df_reduced2$Reaction.time.trans2 = (df_reduced2$Reaction.time^(lambda2)-1)/lambda2
lasso_model2 = lm(Reaction.time.trans2 ~ Age + Stress.level + Vision, data = df_reduced2)
```

```{r}
# Normality: satisfied
shapiro.test(residuals(lasso_model2))
```

```{r}
# Constant variance: satisfied
bptest(lasso_model2)
```

```{r}
# Correlated errors: satisfied
dwtest(lasso_model2)
```

Comparison: Adjusted R2

```{r}
# Baseline model
summary(reduced_lm3)
```

```{r}
# Linear model with a set of predictors chosen by LASSO Regression
summary(lasso_model2)
```