diff --git a/R/generate_resampling.R b/R/generate_resampling.R deleted file mode 100644 index 1801310a..00000000 --- a/R/generate_resampling.R +++ /dev/null @@ -1,279 +0,0 @@ -#' Generate `n_boot` bootstrap samples. -#' @details -#' The bootstrap samples are generated so that it is very unlikely that they -#' will be associated with a 0 variance variable. However such situation can -#' appear in very rare cases. -#' -#' The simplest procedure to avoid such situation would be to compute the -#' variance of each variable in each bootstrap sample. However, in -#' high-dimensional cases, this is time consuming. Thus, the function starts -#' by identifying variables that present a risk of being of null variance in -#' at least one bootstrap sample. A variable is detected as such if the -#' probability of sampling `N` times (where `N` is the number of observations) -#' its most frequent observed value is higher than a specific threshold -#' named `pval` (by default set to `1e-15`) corrected by the number of bootstrap -#' sample `n_boot` and the number of variables in the whole data-set -#' (\eqn{\sum_{j=1}^Jp_{j}}). In the end, a variable is defined as risky if the -#' proportion of its most frequent observed value is strictly higher than: -#' \deqn{risky\textunderscore threshold = \left(\frac{pval}{n_{boot}* -#' \left(\sum_{j=1}^J p_{j}\right)}\right)^{1/N}.} -#' This value is necessarily lower than `1` as `pval` is lower than `1`. -#' However, it could be lower than \eqn{1/N}, which means that all variables -#' are defined as risky. That is why the maximum value between -#' \eqn{risky\textunderscore threshold} and \eqn{1/N} is taken. -#' -#' Once risky variables have been identified, the function computes the -#' variance of each risky variable in each bootstrap sample. If there isn't any -#' 0 variance risky variable, the bootstrap samples are returned as is. -#' Otherwise, three cases are possible: -#' \itemize{ -#' \item `keep_all_variables = F`, then the risky variables that are of 0 -#' variance in at least one bootstrap sample are removed. If they correspond to -#' a whole block, an error message is generated. -#' \item `keep_all_variables = T`, two possibilities : -#' \itemize{ -#' \item If `balance = T`, the procedure is repeated at most `5` times until all -#' bootstrap samples do not present any 0 variance variable. -#' \item If `balance = F`, an heuristic procedure is used to modify the sampling -#' probability of each observation in order to keep all variables. -#' } -#' } -#' -#' A short detail of this lastly mentioned heuristic consist in replacing each -#' observed value of the risky variables by \eqn{1-\frac{N_k}{N}} -#' (where \eqn{N_k} corresponds to the number of times this value is observed), -#' normalized so that the sum through all the observations (for each variable) -#' equals `1`. Then, if \eqn{prob_i} defines the sampling probability for the -#' observation associated with line \eqn{i}, it is defined as the maximum value -#' of the previous matrix through all risky variables (again normalized so that -#' \eqn{\sum_{i=1}^N prob_i = 1}). Thus the sampling probability of an -#' observation is more of less associated with `1 - its proportion in the -#' variable where this observation is in the lowest frequent group -#' (through all risky variables)`. -#' @inheritParams rgcca_bootstrap -#' @param pval For all the variables, a threshold for the proportion of the most -#' frequent value of this variable is computed. This threshold is evaluated so -#' that the probability to sample only this value is below `pval`. This -#' probability is corrected by the number of bootstrap samples and the number -#' of variables (default is `1e-15`). -#' @return \item{full_idx}{A list of size n_boot containing the observations -#' kept for each bootstrap sample.} -#' @return \item{sd_null}{A list of size the number of block -#' containing the variables that were removed from each block for all the -#' bootstrap samples. Variables are removed if they appear to be of null -#' variance in at least one bootstrap sample. If no variable is removed, -#' return NULL.} -#' @title Generate bootstrap samples. -#' @noRd - -generate_resampling <- function(rgcca_res, n_boot, balanced = TRUE, - keep_all_variables = FALSE, pval = 1e-15, - verbose = TRUE) { - if (verbose) { - packageStartupMessage("Bootstrap samples sanity check...", - appendLF = FALSE - ) - } - - # Initialization - pval <- min(pval, 1) - NO_null_sd_var <- FALSE - iter <- 0 - raw_blocks <- rgcca_res$call$blocks - N <- NROW(raw_blocks[[1]]) - prob <- rep(1 / N, N) - - # For any variable, threshold for the proportion of the most frequent value - # of a variable. This threshold is computed so that the probability to sample - # only this value is below `pval`. This probability is corrected by the number - # of bootstrap samples and the number of variables. - risky_threshold <- max( - 1 / N, - (pval / (n_boot * sum(vapply(raw_blocks, NCOL, FUN.VALUE = 1L))))^(1 / N) - ) - - # Identify variables with value having an observed proportion higher than - # risky_threshold. - risky_var <- lapply( - raw_blocks, - function(block) { - which(apply( - block, 2, - function(x) { - max(table(x) / N) > risky_threshold - } - )) - } - ) - - # Keep only risky variables for each block. - raw_blocks_filtered <- Map( - function(x, y) x[, y, drop = FALSE], - raw_blocks, risky_var - ) - # While there are variables with null variance among the risky variables. - while (!NO_null_sd_var) { - if (balanced) { # Balanced bootstrap sampling. - full_idx <- rep(x = seq(N), each = n_boot) - full_idx <- sample(x = full_idx, size = N * n_boot, replace = FALSE) - full_idx <- split(full_idx, ceiling(seq_along(full_idx) / N)) - } else { # Unbalanced bootstrap sampling. - full_idx <- lapply( - seq(n_boot), - function(x) { - sample(x = seq(N), replace = TRUE, prob = prob) - } - ) - } - # Compute blocks for each bootstrap sample (only with risky variables). - boot_blocks_filtered <- - lapply( - full_idx, - function(idx) { - lapply( - raw_blocks_filtered, - function(x) { - y <- x[idx, , drop = FALSE] - rownames(y) <- paste("S", seq_along(idx)) - return(y) - } - ) - } - ) - - # For each sample, identify variables with a single unique value. - - boot_column_sd_null <- - lapply( - boot_blocks_filtered, - function(boot) { - lapply(boot, function(boot_bl) { - which(apply(boot_bl, 2, function(x) length(unique(x)) == 1)) - }) - } - ) - - # Summarize through all the samples. - eval_boot_sample <- vapply( - boot_column_sd_null, - function(x) sum(vapply(x, length, FUN.VALUE = 1L)), - FUN.VALUE = 1L - ) - NO_null_sd_var <- (sum(eval_boot_sample) == 0) - - if (NO_null_sd_var) { - # through all samples, all variables have non null variances. - sd_null <- NULL - } else { - # If at least one sample have been identified with a null - # variance variable. - if (!keep_all_variables) { - # It is allowed to remove variables. - # Extract the troublesome variables. - sd_null <- Reduce("rbind", boot_column_sd_null) - rownames(sd_null) <- NULL - sd_null <- apply(sd_null, 2, function(x) unique(names(Reduce("c", x)))) - sd_null <- Map(function(x, y) { - z <- match(x, y) - names(z) <- x - return(z) - }, sd_null, lapply(raw_blocks, colnames)) - - # Check if a whole block is troublesome - is_full_block_removed <- unlist(Map( - function(x, y) dim(x)[2] == length(y), - raw_blocks, - sd_null - )) - if (sum(is_full_block_removed) == 0) { - # A whole block is NOT troublesome - NO_null_sd_var <- TRUE - warning(paste( - "Variables: ", - paste(names(Reduce("c", sd_null)), collapse = " - "), - "appear to be of null variance in some bootstrap samples", - "and thus were removed from all samples. \n", - "==> RGCCA is run again without these variables." - )) - } else { - # If whole block IS troublesome then STOP - stop_rgcca(paste( - "The variance of all the variables from blocks: ", - paste(names(raw_blocks)[is_full_block_removed], collapse = " - "), - "appear to be null in some bootstrap samples.", - "Please consider removing them." - )) - } - } else { # It is NOT allowed to remove variables. - # Generate at most five different re-sampling until not a single - # variable has a null variance. - if (iter > 5) { # Otherwise STOP. - # Extract the troublesome variables. - sd_null <- Reduce("rbind", boot_column_sd_null) - rownames(sd_null) <- NULL - sd_null <- apply( - sd_null, 2, - function(x) unique(names(Reduce("c", x))) - ) - sd_null <- Map(function(x, y) { - z <- match(x, y) - names(z) <- x - return(z) - }, sd_null, lapply(raw_blocks, colnames)) - - error_message <- paste( - "Impossible to define all bootstrap samples", - "without variables with null variance. Please", - "consider removing these variables: ", - paste(names(Reduce("c", sd_null)), collapse = " - ") - ) - # In the balanced case, you CANNOT play with the sampling probability - # of the different observations as it is unbalanced. - if (balanced) { - error_message <- paste0( - error_message, - ". Please, consider unbalanced bootstrap by", - " setting 'balanced' to FALSE." - ) - } - stop_rgcca(error_message) - } - # In the unbalanced case, you CAN play with the sampling probability - # of the different observations. - if (!balanced) { - if (iter == 0) { # The first time, you define your unbalancedness. - # Each observed value of the risky variables is replaced by - # `1 - the proportion of this observed value`, normalized so that - # the sum through all the observations (for each variable) - # equals `1`. - prob <- lapply( - raw_blocks_filtered, - function(block) { - apply(block, 2, function(var) { - occurences <- table(var, useNA = "ifany") / length(var) - new_idx <- match(as.character(var), names(occurences)) - new_var <- as.matrix(occurences[new_idx]) - new_var <- (1 - new_var) / sum(1 - new_var) - return(new_var) - }) - } - ) - # The sampling probability for each observation is associated with - # the maximum value of the previous matrix through all risky - # variables (again normalize so that `sum(prob) = 1`). Thus - # the sampling probability of an observation is more of less - # associated with `1 - its proportion in the variable where this - # observation is in the lowest frequent group (through all - # risky variables)`. - prob <- apply(Reduce("cbind", prob), 1, max) / sum( - apply(Reduce("cbind", prob), 1, max) - ) - } - } - iter <- iter + 1 - } - } - } - if (verbose) packageStartupMessage("OK") - return(list(full_idx = full_idx, sd_null = sd_null)) -} diff --git a/R/remove_null_sd.R b/R/remove_null_sd.R deleted file mode 100644 index eaecadaa..00000000 --- a/R/remove_null_sd.R +++ /dev/null @@ -1,58 +0,0 @@ -#' Remove columns having a 0 standard deviation -#' -#' @param list_m A list of dataframe -#' @param column_sd_null Either NULL or a list of named vectors. If NULL, the -#' function will search for variables with null variance in each block. If not -#' NULL, this list defines for each block the index of the variables that are -#' of null variance (see the 'Value' section for more details about the content -#' of this list). In both cases, these variables are removed. -#' @return \item{list_m}{A list of dataframe.} -#' @return \item{column_sd_null}{Either NULL, if not a single variable was -#' removed, or a list of the same size as the number of blocks. In the last -#' situation, each element of this list is again NULL if not a single variable -#' was removed from the current block, or a named vector indicating the former -#' index of the removed variables along with their name.} -#' @noRd - -remove_null_sd <- function(list_m, column_sd_null = NULL) { - names <- names(list_m) - - if (is.null(column_sd_null)) { - column_sd_null <- lapply( - list_m, - function(x) { - which(apply(x, 2, function(y) { - if (mode(y) != "character") { - std <- sd(y, na.rm = TRUE) - res <- is.na(std) || (std == 0) - } else { - res <- FALSE - } - return(res) - })) - } - ) - } - - blocks_index <- seq(1, length(list_m))[ - unlist( - lapply( - column_sd_null, - function(x) length(x) > 0 - ) - ) - ] - list_m <- lapply( - seq_along(list_m), - function(x) { - if (x %in% blocks_index) { - list_m[[x]][, -column_sd_null[[x]], drop = FALSE] - } else { - list_m[[x]] - } - } - ) - - names(list_m) <- names - return(list(list_m = list_m, column_sd_null = column_sd_null)) -} diff --git a/R/rgcca.R b/R/rgcca.R index 48fcaa03..d41a9a4f 100644 --- a/R/rgcca.R +++ b/R/rgcca.R @@ -469,7 +469,7 @@ rgcca <- function(blocks, connection = NULL, tau = 1, ncomp = 1, rgcca_args$quiet <- quiet rgcca_args$verbose <- verbose - blocks <- remove_null_sd(rgcca_args$blocks)$list_m + blocks <- rgcca_args$blocks if (opt$disjunction) { blocks[[rgcca_args$response]] <- as_disjunctive( diff --git a/R/rgcca_bootstrap.R b/R/rgcca_bootstrap.R index c45f0ff9..3ba651e7 100644 --- a/R/rgcca_bootstrap.R +++ b/R/rgcca_bootstrap.R @@ -5,11 +5,6 @@ #' @param rgcca_res A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}). #' @param n_boot The number of bootstrap samples (default: 100). #' @param n_cores The number of cores used for parallelization. -#' @param balanced A logical value indicating if a balanced bootstrap procedure -#' is performed or not (default is TRUE). -#' @param keep_all_variables A logical value indicating if all variables have -#' to be kept even when some of them have null variance for at least one -#' bootstrap sample (default is FALSE). #' @param verbose A logical value indicating if the progress of the bootstrap #' procedure is reported. #' @return A rgcca_bootstrap object that can be printed and plotted. @@ -77,9 +72,7 @@ #' @seealso \code{\link[RGCCA]{plot.rgcca_bootstrap}}, #' \code{\link[RGCCA]{summary.rgcca_bootstrap}} rgcca_bootstrap <- function(rgcca_res, n_boot = 100, - n_cores = 1, - balanced = TRUE, keep_all_variables = FALSE, - verbose = TRUE) { + n_cores = 1, verbose = TRUE) { stability <- is(rgcca_res, "rgcca_stability") if (stability) { message( @@ -120,36 +113,20 @@ rgcca_bootstrap <- function(rgcca_res, n_boot = 100, check_integer("n_boot", n_boot) - boot_sampling <- generate_resampling( - rgcca_res = rgcca_res, - n_boot = n_boot, - balanced = balanced, - keep_all_variables = keep_all_variables, - verbose = verbose - ) + ### Create bootstrap samples + v_inds <- lapply(seq_len(n_boot), function(i) { + sample(seq_len(NROW(rgcca_res$call$blocks[[1]])), replace = TRUE) + }) - sd_null <- boot_sampling$sd_null - - if (!is.null(sd_null)) { - rgcca_res$call$blocks <- remove_null_sd( - list_m = rgcca_res$call$blocks, - column_sd_null = sd_null - )$list_m - rgcca_res <- rgcca(rgcca_res) - } - - W <- par_pblapply( - boot_sampling$full_idx, function(b) { - rgcca_bootstrap_k( - rgcca_res = rgcca_res, - inds = b - ) - }, - n_cores = n_cores, verbose = verbose - ) - - W <- W[!vapply(W, is.null, logical(1L))] + ### Run RGCCA on the bootstrap samples + W <- par_pblapply(v_inds, function(b) { + rgcca_bootstrap_k( + rgcca_res = rgcca_res, + inds = b + ) + }, n_cores = n_cores, verbose = verbose) + ### Extract statistics from the results of the bootstrap res <- format_bootstrap_list(W, rgcca_res) stats <- rgcca_bootstrap_stats(res, rgcca_res, length(W)) diff --git a/R/rgcca_bootstrap_k.R b/R/rgcca_bootstrap_k.R index b71dec0b..0bada0a5 100644 --- a/R/rgcca_bootstrap_k.R +++ b/R/rgcca_bootstrap_k.R @@ -22,49 +22,35 @@ rgcca_bootstrap_k <- function(rgcca_res, inds = NULL, type = "loadings") { } rgcca_res_boot <- rgcca(rgcca_res) - # block-weight vector - missing_var <- unlist(lapply( - seq_along(rgcca_res_boot$a), - function(x) { - setdiff( - colnames(rgcca_res$blocks[[x]]), - rownames(rgcca_res_boot$a[[x]]) - ) - } - )) - if (length(missing_var) == 0) { - # block-loadings vector - A <- check_sign_comp(rgcca_res, rgcca_res_boot$a) + # block-loadings vector + A <- check_sign_comp(rgcca_res, rgcca_res_boot$a) - if (type == "loadings") { - Y <- lapply( - seq_along(A), - function(j) pm(rgcca_res_boot$blocks[[j]], A[[j]]) - ) - L <- lapply( - seq_along(A), - function(j) { - cor2(rgcca_res_boot$blocks[[j]], Y[[j]]) - } - ) - } else { - L <- lapply(names(A), function(n) { - if (!(n %in% names(rgcca_res$AVE$AVE_X))) { - res <- rep(-1, NCOL(rgcca_res$a[[n]])) - } else { - res <- rgcca_res_boot$AVE$AVE_X[[n]] - } - res <- matrix( - res, nrow = nrow(A[[n]]), - ncol = length(res), byrow = TRUE - ) - rownames(res) <- rownames(A[[n]]) - return(res) - }) - } - names(L) <- names(rgcca_res$a) - return(list(W = A, L = L)) + if (type == "loadings") { + Y <- lapply( + seq_along(A), + function(j) pm(rgcca_res_boot$blocks[[j]], A[[j]]) + ) + L <- lapply( + seq_along(A), + function(j) { + cor2(rgcca_res_boot$blocks[[j]], Y[[j]]) + } + ) } else { - return(NULL) + L <- lapply(names(A), function(n) { + if (!(n %in% names(rgcca_res$AVE$AVE_X))) { + res <- rep(-1, NCOL(rgcca_res$a[[n]])) + } else { + res <- rgcca_res_boot$AVE$AVE_X[[n]] + } + res <- matrix( + res, nrow = nrow(A[[n]]), + ncol = length(res), byrow = TRUE + ) + rownames(res) <- rownames(A[[n]]) + return(res) + }) } + names(L) <- names(rgcca_res$a) + return(list(W = A, L = L)) } diff --git a/R/rgcca_stability.R b/R/rgcca_stability.R index 531d1390..fd460009 100644 --- a/R/rgcca_stability.R +++ b/R/rgcca_stability.R @@ -5,11 +5,8 @@ #' (VIP) based criterion is used to identify the most stable variables. #' #' @inheritParams rgcca_bootstrap -#' @param rgcca_res A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}). #' @param keep A numeric vector indicating the proportion of variables per #' block to select. -#' @param n_boot The number of bootstrap samples (default: 100). -#' @param n_cores The number of cores for parallelization. #' @param verbose A logical value indicating if the progress of the procedure #' is reported. #' @return A rgcca_stability object that can be printed and plotted. @@ -63,42 +60,23 @@ rgcca_stability <- function(rgcca_res, ), n_boot = 100, n_cores = 1, - verbose = TRUE, - balanced = TRUE, - keep_all_variables = FALSE) { + verbose = TRUE) { stopifnot(tolower(rgcca_res$call$method) %in% sparse_methods()) check_integer("n_boot", n_boot) check_integer("n_cores", n_cores, min = 0) - boot_sampling <- generate_resampling( - rgcca_res = rgcca_res, - n_boot = n_boot, - balanced = balanced, - verbose = verbose, - keep_all_variables = keep_all_variables - ) - - sd_null <- boot_sampling$sd_null - - if (!is.null(sd_null)) { - rgcca_res$call$blocks <- remove_null_sd( - list_m = rgcca_res$call$blocks, - column_sd_null = sd_null - )$list_m - rgcca_res <- rgcca(rgcca_res) - } - - W <- par_pblapply( - boot_sampling$full_idx, function(b) { - rgcca_bootstrap_k( - rgcca_res = rgcca_res, - inds = b, type = "AVE" - ) - }, - n_cores = n_cores, verbose = verbose - ) + ### Create bootstrap samples + v_inds <- lapply(seq_len(n_boot), function(i) { + sample(seq_len(NROW(rgcca_res$call$blocks[[1]])), replace = TRUE) + }) - W <- W[!vapply(W, is.null, logical(1L))] + ### Run RGCCA on the bootstrap samples + W <- par_pblapply(v_inds, function(b) { + rgcca_bootstrap_k( + rgcca_res = rgcca_res, + inds = b, type = "AVE" + ) + }, n_cores = n_cores, verbose = verbose) res <- format_bootstrap_list(W, rgcca_res) J <- length(rgcca_res$blocks) diff --git a/R/scale2.R b/R/scale2.R index 8ff014c2..331c89cd 100644 --- a/R/scale2.R +++ b/R/scale2.R @@ -7,14 +7,15 @@ #' @title Scaling and Centering of Matrix-like Objects #' @noRd scale2 <- function(A, scale = TRUE, bias = TRUE) { + # Center the data + A <- scale(A, center = TRUE, scale = FALSE) + + # Scale if needed if (scale) { - A <- scale(A, center = TRUE, scale = FALSE) std <- sqrt(apply(A, 2, function(x) cov2(x, bias = bias))) - A <- sweep(A, 2, std, FUN = "/") - attr(A, "scaled:scale") <- std - return(A) + std <- pmax(.Machine$double.eps, std) # Account for potentially 0 std + A <- scale(A, center = FALSE, scale = std) } - A <- scale(A, center = TRUE, scale = FALSE) return(A) } diff --git a/man/rgcca_bootstrap.Rd b/man/rgcca_bootstrap.Rd index b3f5ae0b..08a2966e 100644 --- a/man/rgcca_bootstrap.Rd +++ b/man/rgcca_bootstrap.Rd @@ -4,14 +4,7 @@ \alias{rgcca_bootstrap} \title{Bootstrap confidence intervals and p-values} \usage{ -rgcca_bootstrap( - rgcca_res, - n_boot = 100, - n_cores = 1, - balanced = TRUE, - keep_all_variables = FALSE, - verbose = TRUE -) +rgcca_bootstrap(rgcca_res, n_boot = 100, n_cores = 1, verbose = TRUE) } \arguments{ \item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} @@ -20,13 +13,6 @@ rgcca_bootstrap( \item{n_cores}{The number of cores used for parallelization.} -\item{balanced}{A logical value indicating if a balanced bootstrap procedure -is performed or not (default is TRUE).} - -\item{keep_all_variables}{A logical value indicating if all variables have -to be kept even when some of them have null variance for at least one -bootstrap sample (default is FALSE).} - \item{verbose}{A logical value indicating if the progress of the bootstrap procedure is reported.} } diff --git a/man/rgcca_stability.Rd b/man/rgcca_stability.Rd index 3542d944..9b4e8c38 100644 --- a/man/rgcca_stability.Rd +++ b/man/rgcca_stability.Rd @@ -9,30 +9,21 @@ rgcca_stability( keep = vapply(rgcca_res$a, function(x) mean(x != 0), FUN.VALUE = 1), n_boot = 100, n_cores = 1, - verbose = TRUE, - balanced = TRUE, - keep_all_variables = FALSE + verbose = TRUE ) } \arguments{ -\item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} +\item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} \item{keep}{A numeric vector indicating the proportion of variables per block to select.} \item{n_boot}{The number of bootstrap samples (default: 100).} -\item{n_cores}{The number of cores for parallelization.} +\item{n_cores}{The number of cores used for parallelization.} \item{verbose}{A logical value indicating if the progress of the procedure is reported.} - -\item{balanced}{A logical value indicating if a balanced bootstrap procedure -is performed or not (default is TRUE).} - -\item{keep_all_variables}{A logical value indicating if all variables have -to be kept even when some of them have null variance for at least one -bootstrap sample (default is FALSE).} } \value{ A rgcca_stability object that can be printed and plotted. diff --git a/tests/testthat/test_generate_resampling.r b/tests/testthat/test_generate_resampling.r deleted file mode 100644 index c3932952..00000000 --- a/tests/testthat/test_generate_resampling.r +++ /dev/null @@ -1,281 +0,0 @@ -data("Russett") -############################################## -# Test on the risk of having null variance # -# variables in at least one bootstrap sample # -############################################## -blocks <- list( - agriculture = Russett[, seq(3)], - industry = Russett[, 4:5], - politic = Russett[, 6:11] -) - -ncomp <- 1 -# Rent is trapped. -blocks$agriculture$rent <- 0 -blocks$agriculture$rent[1:4] <- 1 -rgcca_out <- rgcca(blocks, ncomp = ncomp) - -# When `pval = 1`, `generate_resampling` fails to identify `rent` as a risky -# variable, both when bootstraps are balanced or not. -set.seed(8882) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, pval = 1 -) -sample_out_unbalanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE, pval = 1 -) -test_that("pVAL_high_noRiskyVAR", { - expect_null(sample_out_balanced$sd_null) - expect_null(sample_out_unbalanced$sd_null) -}) - -# Now, if `pval` is set to default, when `balanced = TRUE` and -# `keep_all_variables = FALSE`, a warning is generated to inform that variable -# `rent` is removed and `rent` is indeed removed. -set.seed(8882) -test_that("generate_resampling_missing_val_identification", { - sample_out <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - expect_equal( - names(sample_out$sd_null$agriculture), - "rent" - ) -}) - -# Same situation, but this time, `pval` is set to its default value and it is -# specifically ask that all variables are kept. It is thus checked that `rent` -# is still there. -set.seed(8882) -sample_out <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE -) -test_that("generate_resampling_keepAllVAriables", { - expect_null(sample_out$sd_null) -}) - - -############################################# -# Test with 2 null variances variables # -############################################# -# Now `rent` and `death` are trapped to be of null variance. -# Four tests are performed : -# - "generate_resampling_NUL_variance_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_NUL_variance_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_NUL_variance_3" : when `balanced = TRUE` -# and `keep_all_variables = TRUE`, an error is raised as it is impossible -# to keep all variables here because some have null variances. -# - "generate_resampling_NUL_variance_4" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = FALSE`, check that `death` and `rent` are -# indeed removed. -N <- NROW(Russett) -rgcca_out$call$blocks$agriculture[, "rent"] <- rep(0, N) -rgcca_out$call$blocks$politic[, "death"] <- rep(2, N) - -test_that("generate_resampling_NUL_variance_1", { - sample_out_balanced_1 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - sample_out_balanced_2 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = FALSE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - expect_equal(unname(unlist(lapply( - sample_out_balanced_1$sd_null, - function(x) names(x) - ))), c("rent", "death")) - expect_equal(unname(unlist(lapply( - sample_out_balanced_2$sd_null, - function(x) names(x) - ))), c("rent", "death")) -}) - -test_that( - "generate_resampling_NUL_variance_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE - ), - paste0( - "Impossible to define all bootstrap samples ", - "without variables with null variance. Please ", - "consider removing these variables: rent - death.", - " Please, consider unbalanced bootstrap by ", - "setting 'balanced' to FALSE." - ) - ) -) - -######################################### -# Test with 2 very risky variables # -######################################### -# Now `rent` and `death` are trapped to be very risky variables (only 1 -# observation differs from the others). -# Four tests are performed : -# - "generate_resampling_veryRisky_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_veryRisky_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_veryRisky_3" : when `balanced = TRUE` -# and `keep_all_variables = TRUE`, an error is raised as it is highly -# unlikely to keep all variables here because some have almost null -# variances. -# - "generate_resampling_veryRisky_4" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = FALSE`, check that `death` and `rent` are -# indeed removed. -# - "generate_resampling_veryRisky_5" : when `balanced = FALSE` -# and `keep_all_variables = TRUE`, check that no variable is removed. -N <- NROW(Russett) -rgcca_out$call$blocks$agriculture[, "rent"] <- c(1, rep(0, N - 1)) -rgcca_out$call$blocks$politic[, "death"] <- c(1, rep(2, N - 1)) -set.seed(553) -test_that("generate_resampling_veryRisky_1", { - sample_out_balanced_1 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - sample_out_balanced_2 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = FALSE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - expect_equal(unname(unlist(lapply( - sample_out_balanced_1$sd_null, - function(x) names(x) - ))), c("rent", "death")) - expect_equal(unname(unlist(lapply( - sample_out_balanced_2$sd_null, - function(x) names(x) - ))), c("rent", "death")) -}) - -set.seed(5553) -test_that( - "generate_resampling_veryRisky_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE - ), - paste0( - "Impossible to define all bootstrap samples ", - "without variables with null variance. Please ", - "consider removing these variables: rent - death.", - " Please, consider unbalanced bootstrap by ", - "setting 'balanced' to FALSE." - ) - ) -) - -set.seed(53) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE, keep_all_variables = TRUE -) -test_that("generate_resampling_veryRisky_5", { - expect_null(sample_out_balanced$sd_null) -}) - - -################################################################## -# Test with 2 very risky variables on a block of 2 variables # -################################################################## -# Now `rent` and `death` are trapped to be very risky variables (only 1 -# observation differs from the others). -# Four tests are performed : -# - "generate_resampling_ALL_Block_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, an error is raised as it want to remove -# all the variables from block `industry`. -# - "generate_resampling_ALL_Block_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, an error is raised as it want to remove -# all the variables from block `industry`. -# - "generate_resampling_ALL_Block_3" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = TRUE` with a different random initialization, -# no error is raised as no variable needs to be removed. -rgcca_out$call$blocks$industry[, "gnpr"] <- c(1, rep(0, N - 1)) -rgcca_out$call$blocks$industry[, "labo"] <- c(1, rep(2, N - 1)) -set.seed(54) -test_that( - "generate_resampling_ALL_Block_1", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE - ), - paste0( - "The variance of all the variables from blocks:", - " industry appear to be null in some bootstrap ", - "samples. Please consider removing them." - ) - ) -) -set.seed(52) -test_that( - "generate_resampling_ALL_Block_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE - ), - paste0( - "The variance of all the variables from blocks:", - " industry appear to be null in some bootstrap ", - "samples. Please consider removing them." - ) - ) -) - -set.seed(1047) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE -) -set.seed(6576) -sample_out_unbalanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE -) -test_that("generate_resampling_ALL_Block_3", { - expect_null(sample_out_balanced$sd_null) - expect_null(sample_out_unbalanced$sd_null) -}) diff --git a/tests/testthat/test_remove_null_sd.r b/tests/testthat/test_remove_null_sd.r deleted file mode 100644 index a8fb62ea..00000000 --- a/tests/testthat/test_remove_null_sd.r +++ /dev/null @@ -1,26 +0,0 @@ -#' # remove_null_sd test - -#''' -X1 <- vapply(seq(3), function(x) rnorm(10), FUN.VALUE = double(10)) -X2 <- rep(1, 10) -X3 <- rep(1, 10) -X3[1:4] <- NA -X4 <- rep(NA, 10) -df <- cbind(X1, X2, X3) -df2 <- cbind(X1, X2, X4) - -test_that("remove_null_sd removes columns with null variance", { - res <- remove_null_sd(list(df)) - expect_equal(df[, seq(3)], res$list_m[[1]]) - removed_columns <- c(4, 5) - names(removed_columns) <- c("X2", "X3") - expect_equal(removed_columns, res$column_sd_null[[1]]) -}) - -test_that("remove_null_sd removes columns of NA", { - res <- remove_null_sd(list(df2)) - expect_equal(df[, seq(3)], res$list_m[[1]]) - removed_columns <- c(4, 5) - names(removed_columns) <- c("X2", "X4") - expect_equal(removed_columns, res$column_sd_null[[1]]) -}) diff --git a/tests/testthat/test_rgcca_bootstrap.r b/tests/testthat/test_rgcca_bootstrap.r index 427a8ab5..a06bbb73 100644 --- a/tests/testthat/test_rgcca_bootstrap.r +++ b/tests/testthat/test_rgcca_bootstrap.r @@ -72,68 +72,3 @@ test_that("rgcca_bootstrap_classif", { boot <- rgcca_bootstrap(rgcca_out, n_boot = 4, n_cores = 1) test_structure(boot, 4, 1, p) }) - -############################################## -# Test on the risk of having null variance # -# variables in at least one bootstrap sample # -############################################## -# Here, the variable `rent` is trapped and should be detected in -# `generate_resampling` which is going to raise a warning. Then it -# should be removed by `bootstrap`. -blocks <- list( - agriculture = Russett[, seq(3)], - industry = Russett[, 4:5], - politic = Russett[, 6:11] -) - -ncomp <- 1 -# Rent is trapped. -blocks$agriculture$rent <- 0 -blocks$agriculture$rent[1:4] <- 1 -rgcca_out <- rgcca(blocks, ncomp = ncomp) - -set.seed(8882) -test_that( - "rgcca_bootstrap_removed_variable_1", - expect_warning( - rgcca_bootstrap(rgcca_out, n_boot = 4, n_cores = 1, balanced = TRUE), - paste0( - "Variables: rent appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) -) -# Exact same situation where we check in the output that `rent` was removed. -set.seed(8882) - -test_that("rgcca_bootstrap_removed_variable_2", { - expect_warning( - boot_out <- rgcca_bootstrap( - rgcca_out, n_boot = 4, n_cores = 1, balanced = TRUE - ), - paste0( - "Variables: rent appear to be of null variance in some bootstrap ", - "samples and thus were removed from all samples." - ), - fixed = TRUE - ) - expect_false("rent" %in% boot_out$bootstrap$var) - expect_false("rent" %in% colnames(boot_out$rgcca$blocks$agriculture)) - expect_false("rent" %in% colnames(boot_out$rgcca$call$blocks$agriculture)) -}) - -# Same situation, but this time, it is specifically ask that all variables are -# kept. It is thus checked that `rent` is still there. -set.seed(8882) -boot_out <- rgcca_bootstrap(rgcca_out, - n_boot = 4, n_cores = 1, - keep_all_variables = TRUE, balanced = TRUE -) - -test_that("rgcca_bootstrap_keep_all_variables", { - expect_true("rent" %in% boot_out$bootstrap$var) - expect_true("rent" %in% colnames(boot_out$rgcca$blocks$agriculture)) - expect_true("rent" %in% colnames(boot_out$rgcca$call$blocks$agriculture)) -}) diff --git a/tests/testthat/test_rgcca_bootstrap_k.r b/tests/testthat/test_rgcca_bootstrap_k.r index 465e03fb..3755e016 100644 --- a/tests/testthat/test_rgcca_bootstrap_k.r +++ b/tests/testthat/test_rgcca_bootstrap_k.r @@ -24,7 +24,7 @@ test_that("test_rgcca_bootstrap_k_1", { rgcca_out_2 <- rgcca(blocks, superblock = FALSE, ncomp = 2) resb_2 <- rgcca_bootstrap_k(rgcca_out_2) -test_that("test_rgcca_bootstrap_k", { +test_that("test_rgcca_bootstrap_k_2", { expect_is(resb_2, "list") expect_is(resb_2[[1]][[1]], "matrix") expect_is(resb_2[[2]][[1]], "matrix") @@ -34,9 +34,7 @@ test_that("test_rgcca_bootstrap_k", { }) # If one bootstrap sample presents at least a single variable with null -# variance, rgcca_bootstrap_k should return the name of -# the null variance variables -# in both the two lists it returns. +# variance, rgcca_bootstrap_k should still return results blocks_3 <- blocks blocks_3$agriculture$rent <- 0 blocks_3$agriculture$rent[1] <- 1 @@ -44,6 +42,11 @@ rgcca_out_3 <- rgcca(blocks_3, superblock = FALSE, ncomp = 2) inds <- c(2, 2:NROW(blocks_3$agriculture)) resb_3 <- rgcca_bootstrap_k(rgcca_res = rgcca_out_3, inds = inds) -test_that("test_rgcca_bootstrap_k_missing_var_identification", { - expect_null(resb_3) +test_that("test_rgcca_bootstrap_k_3", { + expect_is(resb_3, "list") + expect_is(resb_3[[1]][[1]], "matrix") + expect_is(resb_3[[2]][[1]], "matrix") + expect_equal(length(resb_3), 2) + expect_true(all(vapply(resb_3[[1]], NCOL, FUN.VALUE = 1L) == 2)) + expect_identical(vapply(resb_3[[1]], NROW, FUN.VALUE = 1L), p) })