diff --git a/Machine_Learning/New Tree Based pipelines/Extratrees_classification_PCA_autosplit.R b/Machine_Learning/New Tree Based pipelines/Extratrees_classification_PCA_autosplit.R deleted file mode 100644 index d8aafd0..0000000 --- a/Machine_Learning/New Tree Based pipelines/Extratrees_classification_PCA_autosplit.R +++ /dev/null @@ -1,196 +0,0 @@ -# First time running this pipeline you will need to install some R packages -# -# Various R packages that are required -library(readr) -library(dplyr) -library(stringr) -library(ranger) -library(lubridate) -library(pROC) -library(ggfortify) -library(janitor) -library(ggplot2) -library(gridExtra) - -# Only the first iteration start here -# Load in data that will be used to train -#Autosplit data -if (isFALSE(exists("random_forest_input"))){ - setwd("C:\\Users\\file\\path") - data <- read.csv("clean_data.csv", check.names=FALSE) - data <- data[,-1] - data_size=nrow(data) - indexes=sample(1:nrow(data),size = 0.8*data_size) - training=data[indexes,] - validation=data[-indexes,] - - #This line was added to have the pipeline work with categorical data - training$Diagnosis = as.factor(training$Diagnosis) - # Load in validation dataset - validation$Diagnosis = as.factor(validation$Diagnosis) -} - -##################################################################### -##### All iterations after the first start here -##################################################################### - -# Model building and feature selection will end when 3 columns aka 2 features are left -while (as.numeric(ncol(training)) >3 ){ - if (exists("random_forest_input")){ - training <- random_forest_input - } - # Selecting tuning parameter based on the size of the data - # if() statement was added to account for iterations with very few ratios - training <- as.data.frame(training) - nvars <- floor(3*sqrt(dim(training)[2])) - if(nvars >= dim(training)[2]-1){nvars <- floor(sqrt(dim(training)[2]))} - - - # Create the model using ranger - mod <- ranger(data=training, # The full dataset containing predictors and response variables - num.trees=min(30000, # number of trees in the forest - length(training)), - splitrule = "extratrees", #If we use splitrule = "extratrees", replace = FALSE, sample.fraction = 1) we are using Geurts et al. implementation. - replace = FALSE, - sample.fraction = 1, - max.depth = 8, - min.node.size = 3, - mtry=nvars, # number of variables to sample for each split - importance='impurity', # Type of variable importance measure to calculate - seed= 87, # setting a seed for reproducibility - dependent.variable.name="Diagnosis", - classification = TRUE - ) - - # Now that we built our model lets check it's accuracy and save it - # Check accuracy - train_predictions <- predict(mod,training) - auc_train<- roc(as.numeric(training$Diagnosis), as.numeric(as.factor(train_predictions$predictions))) - - # Use below for more than two classes - #auc_train<- roc(as.numeric(training$Diagnosis), as.numeric(as.factor(mod$predictions))) - # Use if more than one category - #auc_train<- multiclass.roc(as.numeric(training$Stage), as.numeric(as.factor(mod$predictions))) - # View results - auc_train$auc - mod$confusion.matrix - - # Create prediction for validation dataset - # Calculate accuracy of validation predictions using auc - predictions <- predict(mod,validation) - auc_test<- roc(as.numeric(validation$Diagnosis), as.numeric(as.factor(predictions$predictions))) - auc_test$auc - - # For multiple classes - # test_auc_multi<- multiclass.roc(as.numeric(validation$Diagnosis), as.numeric(as.factor(predictions$predictions))) - # test_auc_multi$auc - - - # Add everything to list - if (isFALSE(exists("list_test_auc"))){ - list_test_auc <- rbind(c( ncol(training), auc_test$auc)) - } - if (exists("list_test_auc")){ - list_test_auc <- rbind(list_test_auc, c(ncol(training), auc_test$auc)) - } - # Create training auc list - if (isFALSE(exists("list_train_auc"))){ - list_train_auc <- rbind(c( ncol(training), auc_train$auc)) - } - if (exists("list_train_auc")){ - list_train_auc <- rbind(list_train_auc, c(ncol(training), auc_train$auc)) - } - - # Add predicton error to list - if (isFALSE(exists("list_prediction_error"))){ - list_prediction_error <- rbind(c( ncol(training), mod$prediction.error)) - } - if (exists("list_prediction_error")){ - list_prediction_error <- rbind(list_prediction_error, c(ncol(training), mod$prediction.error)) - } - - # Save feature importance list - csv_file_name <- paste0("importance", "_", as.numeric(ncol(training)), ".csv") - write.csv(mod$variable.importance, csv_file_name) - - - # Make PCA plot - # Remove zero variance columns - input<- training - input <- input %>% - clean_names() - input$Diagnosis = as.numeric(as.factor(training$Diagnosis)) - input <- input[ , which(apply(input, 2, var) != 0)] - # Calculate PCA - pca_res <- prcomp(input, scale. = TRUE) - par(mar=c(5,6,4,2)) - title <- paste0("Training ",as.numeric(ncol(training))) - plot1 <- autoplot(pca_res, - data = training, - colour = 'Diagnosis', - main = title) - plot1<- plot1 + theme(legend.position = "bottom") - #print(plot1) - - # Make PCA plot - # Remove zero variance columns - input<- validation - input <- input %>% - clean_names() - input$Diagnosis = as.numeric(as.factor(validation$Diagnosis)) - input <- input[ , which(apply(input, 2, var) != 0)] - # Calculate PCA - pca_res <- prcomp(input, scale. = TRUE) - par(mfrow=c(1,1)) - par(mar=c(5,6,4,2)) - title <- paste0("Test ",as.numeric(ncol(training))) - plot2 <- autoplot(pca_res, - data = validation, - colour = 'Diagnosis', - main = title) - plot2<- plot2 + theme(legend.position = "bottom") - -# Create the PCA plots - par(mfrow=c(1,2)) - par(mar=c(5,6,4,2)) - grid.arrange(plot1, plot2, nrow = 1) - - - - # Recursive feature elimination - # Take the top 90% of features and create a new list - var_importance <- as.data.frame(mod$variable.importance) - ordered <- var_importance[order(-var_importance$`mod$variable.importance`), , drop = FALSE] - N <- nrow(ordered) - n <- ceiling(N*.9) - top90per <- ordered[1:n, ,drop = FALSE] - names <- unlist(row.names(top90per)) - - # Take off the one worst feature if less than 10 - if (nrow(ordered) < 10) { - ordered <- head(ordered, -1) - names <- unlist(row.names(ordered)) - } - - random_forest_input <-training %>% - select(Diagnosis,names) - - validation <-validation %>% - select(Diagnosis,names) - - - # Save model - rds_file_name <- paste0("Diagnosis", as.numeric(ncol(training)), ".rds") - saveRDS(mod, file = rds_file_name) - - -} - -################################################## -################################################## -# End of pipeline everything below is optional -# Write summary of AUC -write.csv(list_train_auc, "list_train_auc.csv") -write.csv(list_test_auc, "list_test_auc.csv") - -