Skip to content

Commit

Permalink
Final published update
Browse files Browse the repository at this point in the history
  • Loading branch information
altonrus committed Nov 19, 2021
1 parent 36ab557 commit 950d3d6
Show file tree
Hide file tree
Showing 31 changed files with 991 additions and 508 deletions.
103 changes: 103 additions & 0 deletions 2_scripts/0_preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -492,10 +492,12 @@ d.all[, outcome := fifelse(DER_VisitResult == 3,
0#1=no adverse event
))))]

##Main analysis: Index donations must have >150ml RBC lost, not including 2RBC
d.all[, iron_loss_visit := ifelse(DER_RBCLoss_mL > 55, 1, 0)]
d.all[, index_donation := ifelse(DER_RBCLoss_mL > 150 & DER_RBCLoss_Units < 2, 1, 0)]



d.labeled <- cbind(d.all[is.na(RandID)],
"time_to_fu" = numeric(),
"fu_outcome"= character())
Expand Down Expand Up @@ -524,6 +526,75 @@ for (row_idx in 1:nrow(d.all)){
}
}



# ##Separate analysis for 2RBC: Donations with >150ml RBC lost and are classified as 2 units lost
# d.all[, iron_loss_visit := ifelse(DER_RBCLoss_mL > 55, 1, 0)]
# d.all[, index_donation := ifelse(DER_RBCLoss_mL > 150 & DER_RBCLoss_Units == 2, 1, 0)]
#
#
#
# d.labeled <- cbind(d.all[is.na(RandID)],
# "time_to_fu" = numeric(),
# "fu_outcome"= character())
#
#
# for (row_idx in 1:nrow(d.all)){
# if(d.all[row_idx, index_donation]==1){
# #if qualifies as index visit, get all subsequent visits from donor
# d.index_visit <- d.all[row_idx]
# d.fu_visits <- d.all[RandID==d.index_visit$RandID & VisitNum > d.index_visit$VisitNum]
# reached_iron_loss_visit <- FALSE
# row_fu <- 1
# while (reached_iron_loss_visit == FALSE & row_fu <= nrow(d.fu_visits)){
# d.labeled <- rbind(
# d.labeled,
# cbind(d.index_visit,
# "time_to_fu" = d.fu_visits[row_fu, VisitDate] - d.index_visit$VisitDate,
# "fu_outcome" = d.fu_visits[row_fu, outcome]
# )
# )
#
# reached_iron_loss_visit <- fifelse(d.fu_visits[row_fu]$iron_loss_visit == 1, TRUE, FALSE)
# row_fu = row_fu+1
# }
#
# }
# }
# d.firstreturn <- cbind(d.all[is.na(RandID)],
# "time_to_fu" = numeric(),
# "fu_outcome"= character())
#
# for (row_idx in 1:nrow(d.all)){
# if(d.all[row_idx, index_donation]==1){
# #if qualifies as index visit, get all subsequent visits from donor
# d.index_visit <- d.all[row_idx]
# d.fu_visits <- d.all[RandID==d.index_visit$RandID & VisitNum > d.index_visit$VisitNum]
# fu_visit_added <- FALSE
# row_fu <- 1
# while (fu_visit_added == FALSE & row_fu <= nrow(d.fu_visits)){
# if(
# d.fu_visits[row_fu, DER_VisitResult == 3 | DER_RBCLoss_mL >= 55]
# ){
# d.firstreturn <- rbind(
# d.firstreturn,
# cbind(d.index_visit,
# "time_to_fu" = d.fu_visits[row_fu, VisitDate] - d.index_visit$VisitDate,
# "fu_outcome" = d.fu_visits[row_fu, outcome]
# )
# )
# fu_visit_added <- TRUE
#
# }
#
# row_fu = row_fu+1
# }
#
# }
# }



#Distribution of outcome including undetermined
table(d.labeled$fu_outcome)
d.labeled[, .N/nrow(d.labeled), by=fu_outcome]
Expand Down Expand Up @@ -587,3 +658,35 @@ view(dfSummary(d.firstreturn), file="./3_intermediate/data_summary_firstreturn.h


fwrite(d.firstreturn, "./1_data/first_return_dataset.csv")


# Covariance matrix ----------------------------
dt.md <- fread( "./1_data/model_dev_data/ml_training_data.csv")
identifiers <- c("RandID", "VisitDate", "VisitNum", "DER_VisitResult", "DV_Donproc", "DER_RBCLoss_Units")
#remove extraneous fields
dt.md[, c(identifiers) := NULL]
#remove index donations missing hemoglobin
dt.md<- dt.md[!is.na(FingerstickHGB_equiv)]

dt.md.OH <- data.table(model.matrix(fu_outcome~ ., data=dt.md))
dt.md.OH[,`(Intercept)`:=NULL]
#run a correlation and drop the insignificant ones
corr <- cor(dt.md.OH)
#prepare to drop duplicates and correlations of 1
corr[lower.tri(corr,diag=TRUE)] <- NA
#drop perfect correlations
corr[corr == 1] <- NA
#turn into a 3-column table
corr <- as.data.frame(as.table(corr))
#remove the NA values from above
corr <- na.omit(corr)
#select significant values
corr <- subset(corr, abs(Freq) > .5)
#sort by highest correlation
corr <- corr[order(-abs(corr$Freq)),]
#print table
corr <- as.data.table(corr)

setnames(corr, c("Var1", "Var2", "Freq"),c("Variable 1", "Variable 2","Correlation coefficient"))

fwrite(corr, "./3_intermediate/corr_table_data.csv")
43 changes: 28 additions & 15 deletions 2_scripts/2_ensemble_and_model_selection.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fwrite(tune_results_all, "./3_intermediate/tuning_results/tuning_results_all.csv
# ylab("Mean overall AUC\ncross validation across 15 tuning sets")+xlab("Model types")+
# scale_x_discrete(labels = c("Elastic Net", "Elastic net\nwith interactions", "Random forest", "Regression trees", "Gradient\nboosted trees"))
#

tune_results_all <- fread("./3_intermediate/tuning_results/tuning_results_all.csv")
mod_names <- c("Gradient boosted trees",
"Random forest",
"Elastic net with interactions",
Expand Down Expand Up @@ -264,7 +264,8 @@ ggsave("./4_output/figs/AUC_top_models.png",
# # # # #
#### ASSESS TOP MODELS (withXB and noXB) on outer folds ####
# # # # #

base_mod_spec_XB <- readRDS("./1_data/base_mod_spec_XB.RDS")
base_mod_spec_noXB <- readRDS("./1_data/base_mod_spec_noXB.RDS")
#outer_fold_assess(base_mod_spec = base_mod_spec_noXB$random_forest.39, version="withXB")
#outer_fold_assess(base_mod_spec = base_mod_spec_noXB$random_forest.39, version="noXB")
outer_fold_assess_ensemble(base_model_specs = base_mod_spec_XB,
Expand All @@ -273,6 +274,9 @@ outer_fold_assess_ensemble(base_model_specs = base_mod_spec_XB,
outer_fold_assess_ensemble(base_model_specs = base_mod_spec_noXB,
path = "./3_intermediate/",
version="noXB")



#TOP OVERALL MODEL
#Multiclass AUCs (avg pairwise Hand and Till 2001)
dt_outer_preds_all_repeats <- rbind(
Expand Down Expand Up @@ -317,16 +321,16 @@ for (vsn in c("withXB", "noXB")){



ROC_1vall_withXB<- ggroc(roc_objects$withXB) +
ROC_1vall_withXB<- ggroc(roc_objects$withXB, aes=c("linetype", "color")) +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
theme(legend.position = "none")+
ggtitle("Extra biomarkers")+
#guides(col = guide_legend(nrow = 2))+
scale_color_manual(
values = c("turquoise2", "turquoise2", "turquoise2",
"yellow2", "yellow2", "yellow2",
"darkorange1", "darkorange1", "darkorange1",
"red1","red1", "red1"),
values = rep(c("#4affff", "#e8c700", "#d97900", "#b80202"),each=3),
labels = rep(c("None", "HGB deferral", "Low iron donation", "Absent iron donation"),each=3))+
scale_linetype_manual(
values = rep(c("solid", "longdash", "dotdash", "twodash"),each=3),
labels = rep(c("None", "HGB deferral", "Low iron donation", "Absent iron donation"),each=3))+
xlab("Specificity")+ylab("Sensitivity")+
geom_point(aes(x=0.75, y=0.75), fill="black", color="black", size = 2)
Expand All @@ -336,16 +340,16 @@ ROC_1vall_withXB<- ggroc(roc_objects$withXB) +
# height = 4.5,
# unit = "in")

ROC_1vall_noXB<- ggroc(roc_objects$noXB) +
ROC_1vall_noXB<- ggroc(roc_objects$noXB, aes=c("linetype", "color")) +
geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="grey", linetype="dashed") +
theme(legend.position = "none")+
ggtitle("Standard biomarkers")+
#guides(col = guide_legend(nrow = 2))+
scale_color_manual(
values = c("turquoise2", "turquoise2", "turquoise2",
"yellow2", "yellow2", "yellow2",
"darkorange1", "darkorange1", "darkorange1",
"red1","red1", "red1"),
values = rep(c("#4affff", "#e8c700", "#d97900", "#b80202"),each=3),
labels = rep(c("None", "HGB deferral", "Low iron donation", "Absent iron donation"),each=3))+
scale_linetype_manual(
values = rep(c("solid", "longdash", "dotdash", "twodash"),each=3),
labels = rep(c("None", "HGB deferral", "Low iron donation", "Absent iron donation"),each=3))+
xlab("Specificity")+ylab("Sensitivity")+
geom_point(aes(x=0.75, y=0.75), fill="black", color="black", size = 2)
Expand All @@ -361,10 +365,13 @@ g_legend<-function(a.gplot){
mylegend<-g_legend(
ggplot(data = data.table(Cat=factor(c("No adverse outcome", "HGB deferral", "Low iron donation", "Absent iron donation"),
levels = c("No adverse outcome", "HGB deferral", "Low iron donation", "Absent iron donation"))))+
geom_bar(aes(x=Cat, fill=Cat))+
scale_fill_manual(values=c("turquoise2","yellow2","darkorange1","red1"),
geom_line(aes(x=Cat, color=Cat, linetype=Cat, y=Cat))+
scale_color_manual(values=c("#4affff", "#e8c700", "#d97900", "#b80202"),
name="")+
guides(fill=guide_legend(nrow=1))
scale_linetype_manual(values=c("solid", "longdash","dotdash", "twodash"),
name="")+
guides(color=guide_legend(nrow=1),
linetype = guide_legend(nrow=1))
)


Expand All @@ -375,6 +382,12 @@ ggsave("./4_output/figs/ROC_compare.png",
width = 6.5,
height = 4,
unit = "in")
ggsave("./4_output/figs/ROC_compare.pdf",
plot=grid.arrange(arrangeGrob(ROC_1vall_noXB,ROC_1vall_withXB,nrow=1),
mylegend,nrow=2, heights=c(7,1)),
width = 6.5,
height = 4,
unit = "in")



Expand Down
14 changes: 14 additions & 0 deletions 2_scripts/3_feature_importance.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,17 @@ featimp_median <- rbind(
)
fwrite(featimp_median, "./4_output/feature_importance_medians.csv")


#Mini plot for Grant
feats_for_mini <- c("Ferritin", median_noXB[median_imp >.005]$display_name)
dt_mini <- featimp_both[display_name %in% feats_for_mini]
dt_mini[, mod:=ifelse(mod=="Extra biomarkers", "With\nferritin", "Without")]

ggplot(dt_mini)+
geom_boxplot(aes(x=reorder(display_name, AUC_multi_pctchg, FUN = median), y=AUC_multi_pctchg))+
facet_grid(cols = vars(mod))+
coord_flip()+geom_hline(yintercept=0, color="red")+
scale_y_continuous(labels = label_percent(accuracy=1), breaks = c(0, .03, .06), name = "Relative variable\nimportance")+
xlab("")
ggsave("./4_output/figs/feat_imp_mini.png",
width = 3.4, height = 2, units = "in")
Loading

0 comments on commit 950d3d6

Please sign in to comment.