.Rhistory

# Compute variable importance with mrvip() from the bootstrap object
# `bs_malaria`, then display the third element of the result.
# NOTE(review): elsewhere in this history `bs_malaria` is built from `yhats_rf`,
# while `yhats` and `ModelPerf` here come from the down-sampled fit -- confirm
# that mixing the two fits is intended.
bs_impVI <- mrvip(
mrBootstrap_obj = bs_malaria,
yhats = yhats_rf_downSamp,
X = X,
X1 = X1,
Y = Y,
mode = 'classification',
threshold = 0.0,
global_top_var = 10,
local_top_var = 5,
taxa = NULL,
ModelPerf = ModelPerf_rf_downSamp
)
bs_impVI[[3]]  #importance plot. There are plenty of other insights possible
# 'global_top_var' limits how many predictors are included in the community-wide plot.
# 'local_top_var' limits the number of individual taxa plots.
# 'threshold' excludes individual importance plots for taxa not well predicted by the model.
# Load the packages used in this session via pacman::p_load().
# NOTE: the same p_load() call is recorded several times in this history;
# one call is sufficient.
pacman::p_load('MRFcov', 'tidyverse', 'future.apply','tidymodels', 'finetune', 'themis', 'vip', 'flashlight', 'iml', 'vivid', 'igraph', 'ggnetwork', 'network','gridExtra', 'xgboost', 'brulee', 'fastshap', 'tabnet', 'bonsai', 'parsnip', 'cowplot', 'progress', 'hstats', 'geosphere', 'mrIML')
pacman::p_load('MRFcov', 'tidyverse', 'future.apply','tidymodels', 'finetune', 'themis', 'vip', 'flashlight', 'iml', 'vivid', 'igraph', 'ggnetwork', 'network','gridExtra', 'xgboost', 'brulee', 'fastshap', 'tabnet', 'bonsai', 'parsnip', 'cowplot', 'progress', 'hstats', 'geosphere', 'mrIML')
# Y: every column of Bird.parasites except scale.prop.zos, sorted by column name.
Y <- dplyr::select(Bird.parasites, -scale.prop.zos) %>%
dplyr::select(sort(names(.)))#response variables eg. SNPs, pathogens, species....
# X: the single column scale.prop.zos.
X <- dplyr::select(Bird.parasites, scale.prop.zos) # feature set
# X1: the same columns as Y (Y is already sorted, so the re-sort leaves the order unchanged).
X1 <- Y %>%
dplyr::select(sort(names(.)))
# X1_fact: X1 recoded to "absent"/"present" labels.
# NOTE(review): ifelse() returns character vectors, so the preceding as.factor()
# step does not survive -- confirm whether factor columns are actually required.
X1_fact <- X1 %>%
mutate_all(as.factor) %>%
mutate_all(~ifelse(. == 0, "absent", "present"))
# The commands below repeat the setup above verbatim (recorded a second time in the history).
pacman::p_load('MRFcov', 'tidyverse', 'future.apply','tidymodels', 'finetune', 'themis', 'vip', 'flashlight', 'iml', 'vivid', 'igraph', 'ggnetwork', 'network','gridExtra', 'xgboost', 'brulee', 'fastshap', 'tabnet', 'bonsai', 'parsnip', 'cowplot', 'progress', 'hstats', 'geosphere', 'mrIML')
Y <- dplyr::select(Bird.parasites, -scale.prop.zos) %>%
dplyr::select(sort(names(.)))#response variables eg. SNPs, pathogens, species....
X <- dplyr::select(Bird.parasites, scale.prop.zos) # feature set
X1 <- Y %>%
dplyr::select(sort(names(.)))
X1_fact <- X1 %>%
mutate_all(as.factor) %>%
mutate_all(~ifelse(. == 0, "absent", "present"))
# Model specifications: a random forest with tunable mtry/min_n and a logistic
# regression, both set to classification mode.
model_rf <-
rand_forest(trees = 100, mode = "classification", mtry = tune(), min_n = tune()) %>% #100 trees are set for brevity. Aim to start with 1000
set_engine("randomForest")
model_lm <- #model used to generate yhat
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") #just for your response
# Create a 5-worker cluster and register it with plan().
# NOTE(review): no stopCluster() call appears in the visible history -- consider
# parallel::stopCluster(cl) once the analysis is finished.
cl <- parallel::makeCluster(5)
plan(cluster, workers=cl)
#random forest
# NOTE: seed = sample.int(1e8, 1) draws a fresh seed on every run, so results are
# not reproducible between runs. T/F below are shorthands for TRUE/FALSE.
yhats_rf <- mrIMLpredicts(X=X, Y=Y,
X1=X1,
Model=model_rf,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
morans=F,
prop=0.7, k=5, racing=T)
#linear model
# Uses the "absent"/"present" recoded predictors (X1_fact) and prop=0.6.
yhats_lm <- mrIMLpredicts(X=X,Y=Y,
X1=X1_fact,
Model=model_lm ,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
prop=0.6, racing=F, k=5)
# Performance summaries for both fits; elements [[1]] and [[2]] are printed.
ModelPerf_rf <- mrIMLperformance(yhats_rf, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf[[1]] #across all parasites
ModelPerf_rf[[2]] #overall
ModelPerf_lm <- mrIMLperformance(yhats_lm, Model=model_lm, Y=Y, mode='classification')
ModelPerf_lm[[1]]
ModelPerf_lm[[2]]
# Compare the two models' performance in a plot.
plots <- mrPerformancePlot(ModelPerf1 =ModelPerf_lm, ModelPerf2 = ModelPerf_rf, mod_names=c('linear_reg','rand_forest'), mode='classification' )
plots
# Random forest refit with X1=NULL, for comparison against the fit that includes X1.
yhats_rf_noAssoc <- mrIMLpredicts(X=X, Y=Y,
X1=NULL, #no associations for this one
Model=model_rf,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
prop=0.7, k=5, racing=T)
ModelPerf_rf_noAssoc <- mrIMLperformance(yhats_rf_noAssoc, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf_noAssoc[[1]]
ModelPerf_rf[[1]] #performance including associations
# Random forest refit with balance_data='down' and prop=0.75.
yhats_rf_downSamp <- mrIMLpredicts(X=X, Y=Y,
X1=X1,
Model=model_rf ,
balance_data='down', #down sampling
mode='classification',
seed = sample.int(1e8, 1),
prop=0.75, k=5, racing=T)
ModelPerf_rf_downSamp <- mrIMLperformance(yhats_rf_downSamp, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf_downSamp[[1]]
# Load a local development copy of mrBootstrap() from a user-specific path.
# NOTE: this path only exists on the original author's machine.
source("~/MrIML/mrIML/R/mrBootstrap.R")
# NOTE(review): each makeCluster() call below starts 5 more workers and overwrites
# `cl` without stopping the previous cluster -- reuse the existing cluster or call
# parallel::stopCluster(cl) first.
cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
plan(cluster, workers=cl)
#do bootstraps.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = FALSE, mode='classification')
#make sure downsample=TRUE as this did improve performance
# NOTE(review): the call above passes downsample = FALSE, contradicting the comment above.
#just 10 bootstraps to keep this short. We suggest using more for a final analysis (100 is reasonable but depends on how big the data is)
#up to here -not working properly
# Variable importance from the bootstraps; `yhats` and `ModelPerf` come from the
# down-sampled fit while `bs_malaria` was built from `yhats_rf` (confirm intended).
bs_impVI <- mrvip(
mrBootstrap_obj = bs_malaria,
yhats = yhats_rf_downSamp,
X = X,
X1 = X1,
Y = Y,
mode = 'classification',
threshold = 0.0,
global_top_var = 10,
local_top_var = 5,
taxa = NULL,
ModelPerf = ModelPerf_rf_downSamp
)
bs_impVI[[3]]  #importance plot. There are plenty of other insights possible
#the 'global_top_var' provides a limit to how many predictors are included in the community-wide plot. 'local_top_var' provides a limit to the number of individual taxa plots. The threshold excludes plotting individual importance plots for taxa not well predicted by the model.
# Remove the sourced copy of mrBootstrap from the workspace and rerun the bootstrap.
rm(mrBootstrap)
#source("~/MrIML/mrIML/R/mrBootstrap.R")
cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
plan(cluster, workers=cl)
#do bootstraps.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = FALSE, mode='classification')
#source("~/MrIML/mrIML/R/mrBootstrap.R")
cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
plan(cluster, workers=cl)
#do bootstraps.
# This rerun switches to downsample = TRUE and overwrites the previous bs_malaria.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = TRUE, mode='classification')
#make sure downsample=TRUE as this did improve performance
#just 10 bootstraps to keep this short. We suggest using more for a final analysis (100 is reasonable but depends on how big the data is)
#up to here -not working properly
bs_impVI <- mrvip(
mrBootstrap_obj = bs_malaria,
yhats = yhats_rf_downSamp,
X = X,
X1 = X1,
Y = Y,
mode = 'classification',
threshold = 0.0,
global_top_var = 10,
local_top_var = 5,
taxa = NULL,
ModelPerf = ModelPerf_rf_downSamp
)
bs_impVI[[3]]  #importance plot. There are plenty of other insights possible
#the 'global_top_var' provides a limit to how many predictors are included in the community-wide plot. 'local_top_var' provides a limit to the number of individual taxa plots. The threshold excludes plotting individual importance plots for taxa not well predicted by the model.
# Bootstrapped partial dependence for the response 'Plas', limited to the top 5 variables.
pds <- mrPD_bootstrap(mrBootstrap_obj=bs_malaria,
vi_obj=bs_impVI, X, Y,
target='Plas',
global_top_var=5)
#do bootstraps.
#source("~/MrIML/mrIML/R/MrBootstrap.R")
# Rerun relying on mrBootstrap()'s defaults for downsample and mode.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y, #n
num_bootstrap = 10)
# Remove the installed mrIML package, reinstall it from GitHub (nfj1380/mrIML),
# and regenerate the package documentation with devtools::document().
# NOTE: the repeated install_github()/document() calls appear to be console
# retries captured by the history; a single remove -> install sequence suffices.
remove.packages("mrIML")
devtools::install_github('nfj1380/mrIML')
devtools::document()
devtools::install_github('nfj1380/mrIML')
devtools::install_github('nfj1380/mrIML')
devtools::install_github('nfj1380/mrIML')
devtools::install_github('nfj1380/mrIML')
devtools::install_github('nfj1380/mrIML')
devtools::install_github('nfj1380/mrIML')
devtools::document()
devtools::document()
devtools::document()
devtools::document()
devtools::document()
devtools::install_github('nfj1380/mrIML')
remove.packages("mrIML")
devtools::install_github('nfj1380/mrIML')
# Full rerun of the workflow after reinstalling mrIML: load packages, build the
# data objects, fit the models, bootstrap, and compute importance / partial dependence.
pacman::p_load('MRFcov', 'tidyverse', 'future.apply','tidymodels', 'finetune', 'themis', 'vip', 'flashlight', 'iml', 'vivid', 'igraph', 'ggnetwork', 'network','gridExtra', 'xgboost', 'brulee', 'fastshap', 'tabnet', 'bonsai', 'parsnip', 'cowplot', 'progress', 'hstats', 'geosphere', 'mrIML')
# Y: every column of Bird.parasites except scale.prop.zos, sorted by column name.
Y <- dplyr::select(Bird.parasites, -scale.prop.zos) %>%
dplyr::select(sort(names(.)))#response variables eg. SNPs, pathogens, species....
X <- dplyr::select(Bird.parasites, scale.prop.zos) # feature set
# X1: the same columns as Y.
X1 <- Y %>%
dplyr::select(sort(names(.)))
# X1_fact: X1 recoded to "absent"/"present" labels (ifelse() yields character
# vectors, so the as.factor() step does not survive -- confirm this is acceptable).
X1_fact <- X1 %>%
mutate_all(as.factor) %>%
mutate_all(~ifelse(. == 0, "absent", "present"))
# Model specifications: tunable random forest and logistic regression.
model_rf <-
rand_forest(trees = 100, mode = "classification", mtry = tune(), min_n = tune()) %>% #100 trees are set for brevity. Aim to start with 1000
set_engine("randomForest")
model_lm <- #model used to generate yhat
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification") #just for your response
# Create a 5-worker cluster and register it with plan().
# NOTE(review): clusters created earlier in the session are not stopped first.
cl <- parallel::makeCluster(5)
plan(cluster, workers=cl)
#random forest
# NOTE: the seed is drawn at random on each run, so fits are not reproducible.
yhats_rf <- mrIMLpredicts(X=X, Y=Y,
X1=X1,
Model=model_rf,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
morans=F,
prop=0.7, k=5, racing=T)
#linear model
yhats_lm <- mrIMLpredicts(X=X,Y=Y,
X1=X1_fact,
Model=model_lm ,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
prop=0.6, racing=F, k=5)
# Performance summaries for both fits.
ModelPerf_rf <- mrIMLperformance(yhats_rf, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf[[1]] #across all parasites
ModelPerf_rf[[2]] #overall
ModelPerf_lm <- mrIMLperformance(yhats_lm, Model=model_lm, Y=Y, mode='classification')
ModelPerf_lm[[1]]
ModelPerf_lm[[2]]
plots <- mrPerformancePlot(ModelPerf1 =ModelPerf_lm, ModelPerf2 = ModelPerf_rf, mod_names=c('linear_reg','rand_forest'), mode='classification' )
plots
# Random forest refit with X1=NULL, for comparison against the fit that includes X1.
yhats_rf_noAssoc <- mrIMLpredicts(X=X, Y=Y,
X1=NULL, #no associations for this one
Model=model_rf,
balance_data='no',
mode='classification',
seed = sample.int(1e8, 1),
prop=0.7, k=5, racing=T)
ModelPerf_rf_noAssoc <- mrIMLperformance(yhats_rf_noAssoc, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf_noAssoc[[1]]
ModelPerf_rf[[1]] #performance including associations
# Random forest refit with balance_data='down' and prop=0.75.
yhats_rf_downSamp <- mrIMLpredicts(X=X, Y=Y,
X1=X1,
Model=model_rf ,
balance_data='down', #down sampling
mode='classification',
seed = sample.int(1e8, 1),
prop=0.75, k=5, racing=T)
ModelPerf_rf_downSamp <- mrIMLperformance(yhats_rf_downSamp, Model=model_rf, Y=Y, mode='classification')
ModelPerf_rf_downSamp[[1]]
#source("~/MrIML/mrIML/R/mrBootstrap.R")
# NOTE(review): a second cluster is created here while the one created above is still registered.
cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
plan(cluster, workers=cl)
#do bootstraps.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = TRUE, mode='classification')
#make sure downsample=TRUE as this did improve performance
#just 10 bootstraps to keep this short. We suggest using more for a final analysis (100 is reasonable but depends on how big the data is)
#up to here -not working properly
# NOTE(review): bootstraps come from `yhats_rf`, while `yhats`/`ModelPerf` below
# come from the down-sampled fit -- confirm intended.
bs_impVI <- mrvip(
mrBootstrap_obj = bs_malaria,
yhats = yhats_rf_downSamp,
X = X,
X1 = X1,
Y = Y,
mode = 'classification',
threshold = 0.0,
global_top_var = 10,
local_top_var = 5,
taxa = NULL,
ModelPerf = ModelPerf_rf_downSamp
)
bs_impVI[[3]]  #importance plot. There are plenty of other insights possible
#the 'global_top_var' provides a limit to how many predictors are included in the community-wide plot. 'local_top_var' provides a limit to the number of individual taxa plots. The threshold excludes plotting individual importance plots for taxa not well predicted by the model.
# Bootstrapped partial dependence for the response 'Plas', limited to the top 5 variables.
pds <- mrPD_bootstrap(mrBootstrap_obj=bs_malaria,
vi_obj=bs_impVI, X, Y,
target='Plas',
global_top_var=5)
# Covariate response for 'scale.prop.zos' from the down-sampled fit.
covar <- mr_Covar(yhats=yhats_rf_downSamp, X=X, X1=X1, Y=Y,
var='scale.prop.zos', sdthresh =0.01)
#sdthresh just plots taxa responding the most.
# Debug setup: bind mrPD_bootstrap()'s arguments and its first local variables
# at top level so the function body can be stepped through line by line.
# NOTE: `=` is used for assignment here; `<-` is the conventional form.
i=1
mrBootstrap_obj=bs_malaria
vi_obj=bs_impVI
sdthresh =0.01
target='Plas'
global_top_var=5
# Number of response columns, the combined response + predictor table, and its column count.
n_response <- ncol(Y)
complete_df <- cbind(Y, X)
n_data <- ncol(complete_df)
# Row-bind the elements of `list_obj` whose names appear in `object_name`.
# Elements with other names are ignored.
bind_rows_by_name <- function(list_obj, object_name) {
  keep <- names(list_obj) %in% object_name
  bind_rows(list_obj[keep])
}
# Top-level replay of mrPD_bootstrap()'s internal combine step for column `i`
# of `complete_df`: for each response j, gather the bootstrap entries named
# `object_name`, tag them with the response name, and stack everything.
#
# Fixes relative to the recorded commands:
#   * `return(combined_df)` was called outside any function, which is an error
#     at top level, so it is dropped.
#   * the second loop contained a mis-pasted nested copy of itself, repeating
#     the whole pass once per response; a single pass gives the same result.
#   * `1:n_response` is replaced by seq_len() so zero responses means zero iterations.
object_name <- names(complete_df[i])
combined_list <- list()  # One entry per response that has data for `object_name`
for (j in seq_len(n_response)) {
  combined_object <- map_dfr(mrBootstrap_obj[[j]], bind_rows_by_name, object_name)
  if (nrow(combined_object) > 0) {
    combined_metadata <- data.frame(target = rep(names(Y)[j], nrow(combined_object)))
    combined_object <- cbind(combined_object, combined_metadata)
    combined_list[[j]] <- combined_object
  }
}
# Responses without data leave NULL slots, which rbind() skips.
combined_df <- do.call(rbind, combined_list)
View(combined_df)
# Collect, for column `i` of `complete_df`, the bootstrap entries stored under
# that column's name across every response, tagging each row with the response
# it belongs to. Reads `complete_df`, `n_response`, `mrBootstrap_obj`, `Y` and
# `bind_rows_by_name` from the enclosing (here: global) environment.
#
# i: column index into `complete_df`.
# Returns the stacked rows, or NULL when no response has data for that column.
internal_fit_function <- function(i) {
  object_name <- names(complete_df[i])
  combined_list <- list()
  # seq_len() instead of 1:n_response, so zero responses means zero iterations
  # rather than iterating over c(1, 0).
  for (j in seq_len(n_response)) {
    combined_object <- map_dfr(mrBootstrap_obj[[j]], bind_rows_by_name, object_name)
    if (nrow(combined_object) > 0) {
      combined_metadata <- data.frame(target = rep(names(Y)[j], nrow(combined_object)))
      combined_object <- cbind(combined_object, combined_metadata)
      combined_list[[j]] <- combined_object
    }
  }
  # NULL slots (responses without data) are skipped by rbind().
  do.call(rbind, combined_list)
}
# Debug replay of the rest of mrPD_bootstrap(): build the per-column data in
# parallel, then work out how to extract the variable-importance table.
pd_list <- future_lapply(seq_len(n_data), internal_fit_function, future.seed = TRUE)
plot_list <- list()  # Create an empty list to store individual plots
# First attempt: take element 1 of the importance object and row-bind its contents.
vi_obj <- vi_obj[[1]]
vi_obj <- do.call(rbind, vi_obj)
# Mean of sd_value per variable for the chosen response, sorted in descending order.
G_target_data_avg <- vi_obj %>%
dplyr::filter(response == {{target}}) %>%
group_by(var) %>%
dplyr::summarise(mean_imp = mean(sd_value)) %>%
arrange(desc(mean_imp))
vi_obj <- as.data.frame(do.call(rbind, vi_obj))
# Second attempt: reset vi_obj and use element 1 directly, without row-binding
# (this is the form the function definition later in this file uses).
vi_obj=bs_impVI
vi_obj <- vi_obj[[1]]
View(vi_obj)
G_target_data_avg <- vi_obj %>%
dplyr::filter(response == {{target}}) %>%
group_by(var) %>%
dplyr::summarise(mean_imp = mean(sd_value)) %>%
arrange(desc(mean_imp))
#' Bootstrap partial dependence plots
#'
#' Gathers the bootstrapped partial dependence data stored in
#' `mrBootstrap_obj` for every column of `cbind(Y, X)` and plots the profiles
#' of the `global_top_var` most important predictors of the `target` response.
#' Predictors that are factors or contain only 0/1 are drawn as boxplots
#' ("absent"/"present"); all other predictors are drawn as one line per
#' bootstrap.
#'
#' @param mrBootstrap_obj A list of model bootstraps generated using the
#'   `mrBootstrap` function. It is indexed by response, in the column order of
#'   `Y`.
#' @param vi_obj Variable importance object. Its first element must be a data
#'   frame with the columns `response`, `var` and `sd_value`.
#' @param X The predictor data.
#' @param Y The response data.
#' @param target Name of the response variable to generate plots for.
#' @param global_top_var The number of top variables to consider (default: 2).
#'
#' @return A list of length two: (1) a list with one element per column of
#'   `cbind(Y, X)` holding the combined bootstrap data for that column (`NULL`
#'   when no bootstrap data exists for it), and (2) the combined plot.
#' @export
#'
#' @examples
#' \dontrun{
#' # Set up the analysis
#' Y <- dplyr::select(Bird.parasites, -scale.prop.zos) %>%
#'   dplyr::select(sort(names(.))) # response variables eg. SNPs, pathogens, species....
#' X <- dplyr::select(Bird.parasites, scale.prop.zos) # feature set
#' X1 <- Y %>%
#'   dplyr::select(sort(names(.)))
#' # 100 trees are set for brevity. Aim to start with 1000
#' model_rf <-
#'   rand_forest(trees = 100, mode = "classification", mtry = tune(), min_n = tune()) %>%
#'   set_engine("randomForest")
#' yhats_rf <- mrIMLpredicts(
#'   X = X, Y = Y, X1 = X1, Model = model_rf,
#'   balance_data = "no", mode = "classification",
#'   tune_grid_size = 5, seed = sample.int(1e8, 1), morans = FALSE,
#'   prop = 0.7, k = 5, racing = TRUE
#' )
#' ModelPerf_rf <- mrIMLperformance(yhats_rf, Model = model_rf, Y = Y, mode = "classification")
#' bs_analysis <- mrBootstrap(yhats = yhats_rf, Y = Y, num_bootstrap = 5)
#' bs_impVI <- mrvip(
#'   mrBootstrap_obj = bs_analysis, yhats = yhats_rf, X = X, X1 = X1, Y = Y,
#'   mode = "classification", threshold = 0.0, global_top_var = 10,
#'   local_top_var = 5, taxa = NULL, ModelPerf = ModelPerf_rf
#' )
#' pds <- mrPD_bootstrap(
#'   mrBootstrap_obj = bs_analysis, vi_obj = bs_impVI, X, Y,
#'   target = "Plas", global_top_var = 5
#' )
#' pd_list <- pds[[1]] # data
#' pds[[2]] # plot
#' }
mrPD_bootstrap <- function(mrBootstrap_obj, vi_obj, X, Y, target, global_top_var = 2) {
  n_response <- ncol(Y)
  complete_df <- cbind(Y, X)
  n_data <- ncol(complete_df)

  # Row-bind the elements of `list_obj` whose names appear in `object_name`.
  bind_rows_by_name <- function(list_obj, object_name) {
    filtered_list <- list_obj[names(list_obj) %in% object_name]
    bind_rows(filtered_list)
  }

  # For column `i` of `complete_df`, stack the matching bootstrap entries of
  # every response, tagging each row with the response it came from.
  internal_fit_function <- function(i) {
    object_name <- names(complete_df[i])
    combined_list <- list()
    # seq_len() rather than 1:n_response, so an empty Y means zero iterations.
    for (j in seq_len(n_response)) {
      combined_object <- map_dfr(mrBootstrap_obj[[j]], bind_rows_by_name, object_name)
      if (nrow(combined_object) > 0) {
        combined_metadata <- data.frame(target = rep(names(Y)[j], nrow(combined_object)))
        combined_object <- cbind(combined_object, combined_metadata)
        combined_list[[j]] <- combined_object
      }
    }
    # NULL slots (responses without data) are skipped by rbind(); the result is
    # NULL when nothing matched at all.
    do.call(rbind, combined_list)
  }

  pd_list <- future_lapply(seq_len(n_data), internal_fit_function, future.seed = TRUE)

  # Rank predictors of `target` by mean importance and keep the top ones.
  # `{{ target }}` injects the argument so it is not confused with a column.
  vi_data <- vi_obj[[1]]
  G_target_data_avg <- vi_data %>%
    dplyr::filter(response == {{ target }}) %>%
    group_by(var) %>%
    dplyr::summarise(mean_imp = mean(sd_value)) %>%
    arrange(desc(mean_imp))
  # Already sorted in descending order, so head() alone selects the top variables.
  G_top_vars <- head(G_target_data_avg, global_top_var)

  # One plot slot per column of complete_df; slots stay NULL for columns that
  # are skipped.
  plot_list <- vector("list", length(pd_list))
  for (k in seq_along(pd_list)) {
    pd_k <- pd_list[[k]]
    # Columns without any bootstrap data yield NULL; filtering NULL would error.
    if (is.null(pd_k)) {
      next
    }
    # Here `target` on the left is the data column, `{{ target }}` the argument.
    df <- pd_k %>%
      dplyr::filter(target == {{ target }})
    if (nrow(df) == 0 || !(names(df)[1] %in% G_top_vars$var)) {
      next
    }

    if (is.factor(df[[1]]) || all(df[[1]] %in% c(0, 1))) {
      # Binary / factor predictor: boxplot of values by class.
      d1 <- df %>%
        mutate(class = recode(.[[1]], `0` = "absent", `1` = "present"))
      plot <- ggplot(d1, aes(x = class, y = value)) +
        geom_boxplot() +
        labs(x = names(d1)[1], y = paste(target, "prob", sep = " ")) +
        theme_bw()
    } else {
      # Continuous predictor: one line per bootstrap replicate.
      d1 <- df %>%
        rename(class = 1)
      plot <- ggplot(d1, aes(x = class, y = value, group = interaction(bootstrap, target))) +
        geom_line(alpha = 0.3) +
        labs(x = names(df)[1], y = paste(target, "prob", sep = " ")) +
        theme_bw()
    }
    plot_list[[k]] <- plot
  }

  # Keep only plots that exist and contain at least one non-zero value.
  # vapply() always yields a logical vector (sapply() returns list() for empty
  # input, which is not a valid subscript), and na.rm = TRUE prevents an NA
  # mask entry from inserting a NULL grob.
  has_signal <- vapply(
    plot_list,
    function(p) !is.null(p) && any(p$data$value != 0, na.rm = TRUE),
    logical(1)
  )
  plot_list_updated <- plot_list[has_signal]
  if (length(plot_list_updated) == 0) {
    stop(
      "No partial dependence plots could be created for target '", target,
      "'. Check that `target` matches a response in `vi_obj` and `mrBootstrap_obj`.",
      call. = FALSE
    )
  }

  # grid.arrange() also draws the arranged plots on the active graphics device.
  p <- grid.arrange(grobs = plot_list_updated)
  combined_plot <- plot_grid(p, ncol = 1, rel_heights = rep(1, length(G_top_vars$var)))

  list(pd_list, combined_plot) # Both the data and the combined plot
}
# Bootstrapped partial dependence for the response 'Plas', limited to the top 5 variables.
pds <- mrPD_bootstrap(mrBootstrap_obj=bs_malaria,
vi_obj=bs_impVI, X, Y,
target='Plas',
global_top_var=5)
# Build the association table from the partial dependence object and keep only
# rows whose mean_strength exceeds 0.1.
assoc_net<- mrCoOccurNet_bootstrap (mrPD_obj=pds , Y=Y)
assoc_net_filtered <-  assoc_net %>%
filter(mean_strength > 0.1)
#based on our simulations the following rule of thumb for associations. Any association  < 0.05  for mean strength is included.
# NOTE(review): the comment above mentions 0.05 but the filter keeps mean_strength > 0.1 -- confirm which threshold is intended.
#convert to igraph
g <- graph_from_data_frame(assoc_net_filtered, directed=TRUE, vertices=names(Y)) #matching Y data
# Edge attributes: strength as `Value`; blue for "negative" direction, red otherwise.
E(g)$Value <- assoc_net_filtered$mean_strength
E(g)$Color <- ifelse(assoc_net_filtered$direction == "negative", "blue", "red")
# Convert the igraph object into a ggnetwork layout data frame for ggplot
gg <- ggnetwork(g)
# Plot the graph
# Edges are colored by `Color` and sized by `Value`; node size is half the out-degree.
ggplot(gg, aes(x = x, y = y, xend = xend, yend = yend)) +
geom_edges(aes(color = Color, linewidth = (Value)),
curvature = 0.2,
arrow = arrow(length = unit(5, "pt"),
type = "closed")) +
geom_nodes(color = "gray", size = degree(g, mode = "out")/2)+
scale_color_identity() +
theme_void() +
theme(legend.position = "none")  +
geom_nodelabel_repel(aes(label = name),
box.padding = unit(0.5, "lines"),
data = gg,
size=2,
segment.colour = "black",
colour = "white", fill = "grey36")
# Bootstrapped interactions for the feature 'Plas'.
int_ <- mrInteractions(yhats=yhats_rf, X, Y, num_bootstrap=10,
feature = 'Plas', top.int=10)
#10 bootstraps to keep it short. top.int focuses on the 10 top interactions (all of them in this case).
int_[[1]] # overall plot
int_[[2]] # individual plot for the response of choice
int_[[3]] #two way plot
# Load a local development copy of mrPD_bootstrap() (user-specific path) and rerun it.
source("~/MrIML/mrIML/R/mrPD_bootstrap.R")
pds <- mrPD_bootstrap(mrBootstrap_obj=bs_malaria,
vi_obj=bs_impVI, X, Y,
target='Plas',
global_top_var=5)
devtools::document()
# Attempts to drop the sourced function from the workspace. The first two target
# an object named after the file ('mrPD_bootstrap.R'); only the last one names
# the function object itself.
rm(mrPD_bootstrap.R)
rm('mrPD_bootstrap.R')
rm('mrPD_bootstrap')
devtools::document()
# Cluster setup is commented out here, so these runs use whatever plan() was registered earlier.
#cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
#plan(cluster, workers=cl)
#do bootstraps.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = FALSE, mode='classification')
#cl <- parallel::makeCluster(5) #can increase the number of cores as needed.
#plan(cluster, workers=cl)
#do bootstraps.
# Rerun with downsample = TRUE; this overwrites the bs_malaria created just above.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = TRUE, mode='classification')
#make sure downsample=TRUE as this did improve performance
#just 10 bootstraps to keep this short. We suggest using more for a final analysis (100 is reasonable but depends on how big the data is)
#up to here -not working properly
# NOTE(review): bootstraps come from `yhats_rf`, while `yhats`/`ModelPerf` below
# come from the down-sampled fit -- confirm intended.
bs_impVI <- mrvip(
mrBootstrap_obj = bs_malaria,
yhats = yhats_rf_downSamp,
X = X,
X1 = X1,
Y = Y,
mode = 'classification',
threshold = 0.0,
global_top_var = 10,
local_top_var = 5,
taxa = NULL,
ModelPerf = ModelPerf_rf_downSamp
)
bs_impVI[[3]]  #importance plot. There are plenty of other insights possible
#the 'global_top_var' provides a limit to how many predictors are included in the community-wide plot. 'local_top_var' provides a limit to the number of individual taxa plots. The threshold excludes plotting individual importance plots for taxa not well predicted by the model.
# Final rerun switches back to downsample = FALSE, overwriting bs_malaria again.
bs_malaria <- mrBootstrap(yhats=yhats_rf,Y=Y,
num_bootstrap = 10, downsample = FALSE, mode='classification')