diff --git a/DESCRIPTION b/DESCRIPTION index 855f3cb0..1d1fda76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,6 +28,7 @@ License: GPL-3 Depends: R (>= 3.5) Imports: + abind, caret, Deriv, ggplot2 (>= 3.4.0), @@ -53,7 +54,7 @@ Suggests: VignetteBuilder: knitr LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Encoding: UTF-8 URL: https://github.com/rgcca-factory/RGCCA, https://rgcca-factory.github.io/RGCCA/ diff --git a/NAMESPACE b/NAMESPACE index 7389338c..18119324 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,15 +4,22 @@ S3method(block_init,block) S3method(block_init,dual_block) S3method(block_init,dual_regularized_block) S3method(block_init,primal_regularized_block) +S3method(block_init,regularized_tensor_block) +S3method(block_init,separable_regularized_tensor_block) +S3method(block_init,tensor_block) S3method(block_postprocess,block) +S3method(block_postprocess,separable_regularized_tensor_block) S3method(block_postprocess,sparse_block) S3method(block_project,block) S3method(block_project,dual_block) S3method(block_project,dual_regularized_block) S3method(block_project,primal_regularized_block) S3method(block_project,sparse_block) +S3method(block_project,tensor_block) S3method(block_update,block) S3method(block_update,dual_block) +S3method(block_update,regularized_tensor_block) +S3method(block_update,tensor_block) S3method(plot,rgcca) S3method(plot,rgcca_bootstrap) S3method(plot,rgcca_cv) @@ -27,6 +34,9 @@ S3method(shave,list) S3method(shaving,default) S3method(shaving,double) S3method(shaving,matrix) +S3method(subset_block_rows,array) +S3method(subset_block_rows,data.frame) +S3method(subset_block_rows,numeric) S3method(summary,rgcca) S3method(summary,rgcca_bootstrap) S3method(summary,rgcca_cv) @@ -42,6 +52,7 @@ export(rgcca_stability) export(rgcca_transform) importFrom(Deriv,Deriv) importFrom(MASS,ginv) +importFrom(abind,abind) importFrom(caret,confusionMatrix) importFrom(caret,multiClassSummary) importFrom(caret,postResample) @@ -60,7 +71,6 @@ importFrom(matrixStats,rowSds) importFrom(matrixStats,rowSums2) importFrom(methods,is) importFrom(rlang,.data) -importFrom(stats,complete.cases) importFrom(stats,cor) importFrom(stats,median) importFrom(stats,model.matrix) @@ -71,6 +81,5 @@ importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,rnorm) importFrom(stats,sd) -importFrom(stats,setNames) importFrom(stats,var) importFrom(utils,modifyList) diff --git a/R/block.R b/R/block.R index 0246542b..f2d394ad 100644 --- a/R/block.R +++ b/R/block.R @@ -45,21 +45,61 @@ new_sparse_block <- function(x, j, sparsity, tol = 1e-08, ...) { ) } +new_tensor_block <- function(x, j, rank, mode_orth, ..., class = character()) { + new_block( + x, j, rank = rank, mode_orth = mode_orth, factors = NULL, + lambda = NULL, ..., class = c(class, "tensor_block") + ) +} + +new_regularized_tensor_block <- function(x, j, rank, mode_orth, tau, ...) { + new_tensor_block( + x, j, rank = rank, mode_orth = mode_orth, tau = tau, + M = NULL, ..., class = "regularized_tensor_block" + ) +} + +new_separable_regularized_tensor_block <- function(x, j, rank, mode_orth, + tau, ...) { + new_tensor_block( + x, j, rank = rank, mode_orth = mode_orth, tau = tau, + M = NULL, ..., class = "separable_regularized_tensor_block" + ) +} + ### Utility method to choose the adequate class -create_block <- function(x, j, bias, na.rm, tau, sparsity, tol) { - if (sparsity < 1) { - res <- new_sparse_block(x, j, sparsity, tol, bias = bias, na.rm = na.rm) - } else if (NROW(x) > NCOL(x)) { +create_block <- function(x, j, bias, na.rm, tau, sparsity, + tol, rank, mode_orth, separable) { + if (length(dim(x)) > 2) { # TGCCA if (tau < 1) { - res <- new_primal_regularized_block(x, j, tau, bias = bias, na.rm = na.rm) + if (separable) { + res <- new_separable_regularized_tensor_block( + x, j, rank, mode_orth, tau, bias = bias, na.rm = na.rm + ) + } else { + res <- new_regularized_tensor_block( + x, j, rank, mode_orth, tau, bias = bias, na.rm = na.rm + ) + } } else { - res <- new_block(x, j, bias = bias, na.rm = na.rm) + res <- new_tensor_block(x, j, rank, mode_orth, bias = bias, na.rm = na.rm) } } else { - if (tau < 1) { - res <- new_dual_regularized_block(x, j, tau, bias = bias, na.rm = na.rm) - } else { - res <- new_dual_block(x, j, bias = bias, na.rm = na.rm) + if (sparsity < 1) { # SGCCA + res <- new_sparse_block(x, j, sparsity, tol, bias = bias, na.rm = na.rm) + } else if (NROW(x) > NCOL(x)) { # Primal RGCCA + if (tau < 1) { + res <- + new_primal_regularized_block(x, j, tau, bias = bias, na.rm = na.rm) + } else { + res <- new_block(x, j, bias = bias, na.rm = na.rm) + } + } else { # Dual RGCCA + if (tau < 1) { + res <- new_dual_regularized_block(x, j, tau, bias = bias, na.rm = na.rm) + } else { + res <- new_dual_block(x, j, bias = bias, na.rm = na.rm) + } } } return(res) diff --git a/R/block_init.R b/R/block_init.R index 4b801e02..8180faab 100644 --- a/R/block_init.R +++ b/R/block_init.R @@ -39,3 +39,73 @@ block_init.dual_regularized_block <- function(x, init = "svd") { x$M <- ginv(x$tau * diag(x$n) + (1 - x$tau) * x$K / x$N) NextMethod() } + +#' @export +block_init.tensor_block <- function(x, init = "svd") { + if (init == "svd") { + x$factors <- lapply(seq_along(dim(x$x))[-1], function(m) { + svd(apply(x$x, m, c), nu = 0, nv = x$rank)$v + }) + } else { + x$factors <- lapply(seq_along(dim(x$x))[-1], function(m) { + if (m == x$mode_orth) { + svd(matrix( + rnorm(dim(x$x)[m] * x$rank), dim(x$x)[m] + ), nu = x$rank, nv = 0)$u + } else { + matrix(rnorm(dim(x$x)[m] * x$rank), dim(x$x)[m]) + } + }) + } + x$lambda <- rep(1 / sqrt(x$rank), x$rank) + + return(block_project(x)) +} + +#' @export +block_init.regularized_tensor_block <- function(x, init = "svd") { + # Compute the highest singular value of the regularization matrix + p <- prod(dim(x$x)[-1]) + if (p > x$n) { + x$M <- eigen( + pm(matrix(x$x, x$n), t(matrix(x$x, x$n)), na.rm = x$na.rm), + symmetric = TRUE, only.values = TRUE + )$values[1] + } else { + x$M <- eigen( + pm(t(matrix(x$x, x$n)), matrix(x$x, x$n), na.rm = x$na.rm), + symmetric = TRUE, only.values = TRUE + )$values[1] + } + x$M <- x$tau + (1 - x$tau) * x$M / x$N + + # Initialize the factors and lambda using the tau = 1 strategy + x <- NextMethod() + + # Change lambda to satisfy the constraints + x$lambda <- x$lambda / sqrt(x$M) + x$a <- x$a / sqrt(x$M) + x$Y <- x$Y / sqrt(x$M) + return(x) +} + +#' @export +block_init.separable_regularized_tensor_block <- function(x, init = "svd") { + # Compute separable estimation of the regularization matrix + d <- length(dim(x$x)) - 1 + x$M <- estimate_separable_covariance(x$x) + x$M <- lapply(x$M, function(y) { + sqrt_matrix( + x$tau^(1 / d) * diag(nrow(y)) + (1 - x$tau^(1 / d)) * y, + inv = TRUE + ) + }) + + # Make a change of variables + for (m in seq_len(d)) { + x$x <- mode_product(x$x, x$M[[m]], m = m + 1) + } + + # Initialize the factors and lambda using the tau = 1 strategy + NextMethod() +} \ No newline at end of file diff --git a/R/block_postprocess.R b/R/block_postprocess.R index 73585952..6a68eeb3 100644 --- a/R/block_postprocess.R +++ b/R/block_postprocess.R @@ -31,3 +31,12 @@ block_postprocess.sparse_block <- function(x, ctrl) { } NextMethod() } + +#' @export +block_postprocess.separable_regularized_tensor_block <- function(x, ctrl) { + x$factors <- lapply(seq_along(x$factors), function(m) { + x$M[[m]] %*% x$factors[[m]] + }) + x$a <- Reduce(khatri_rao, rev(x$factors)) %*% x$lambda + NextMethod() +} \ No newline at end of file diff --git a/R/block_project.R b/R/block_project.R index b6e5e660..b8e68be0 100644 --- a/R/block_project.R +++ b/R/block_project.R @@ -54,3 +54,10 @@ block_project.sparse_block <- function(x) { x$Y <- pm(x$x, x$a, na.rm = x$na.rm) return(x) } + +#' @export +block_project.tensor_block <- function(x) { + x$a <- Reduce(khatri_rao, rev(x$factors)) %*% x$lambda + x$Y <- pm(matrix(x$x, nrow = nrow(x$x)), x$a, na.rm = x$na.rm) + return(x) +} diff --git a/R/block_update.R b/R/block_update.R index 084baa02..fdec6e88 100644 --- a/R/block_update.R +++ b/R/block_update.R @@ -13,3 +13,98 @@ block_update.dual_block <- function(x, grad) { x$alpha <- grad return(block_project(x)) } + +#' @export +block_update.tensor_block <- function(x, grad) { + grad <- array( + pm(t(matrix(x$x, nrow = nrow(x$x))), grad, na.rm = x$na.rm), + dim = dim(x$x)[-1] + ) + other_factors <- NULL + # Update factors + for (m in seq_along(dim(x$x)[-1])) { + grad_m <- matrix( + aperm(grad, c(m, seq_along(dim(grad))[-m])), nrow = dim(x$x)[m + 1] + ) + grad_m <- grad_m %*% khatri_rao( + Reduce(khatri_rao, rev(x$factors[-seq_len(m)])), other_factors + ) + if (m == x$mode_orth) { + SVD <- svd( + grad_m %*% diag(x$lambda, nrow = x$rank), nu = x$rank, nv = x$rank + ) + x$factors[[m]] <- SVD$u %*% t(SVD$v) + } else { + x$factors[[m]] <- grad_m %*% diag(x$lambda, nrow = x$rank) + x$factors[[m]] <- apply( + x$factors[[m]], 2, function(y) y / norm(y, type = "2") + ) + } + + other_factors <- khatri_rao(x$factors[[m]], other_factors) + } + # Update lambda + x$lambda <- t(other_factors) %*% as.vector(grad) + x$lambda <- drop(x$lambda) / norm(drop(x$lambda), type = "2") + return(block_project(x)) +} + +#' @export +block_update.regularized_tensor_block <- function(x, grad) { + grad <- array( + pm(t(matrix(x$x, nrow = nrow(x$x))), grad, na.rm = x$na.rm), + dim = dim(x$x)[-1] + ) + other_factors <- NULL + # Update factors + for (m in seq_along(dim(x$x)[-1])) { + grad_m <- matrix( + aperm(grad, c(m, seq_along(dim(grad))[-m])), nrow = dim(x$x)[m + 1] + ) + grad_m <- grad_m %*% khatri_rao( + Reduce(khatri_rao, rev(x$factors[-seq_len(m)])), other_factors + ) %*% diag(x$lambda, nrow = x$rank) + if (m == x$mode_orth) { + SVD <- svd(grad_m, nu = x$rank, nv = x$rank) + x$factors[[m]] <- SVD$u %*% t(SVD$v) + } else { + x$factors[[m]] <- apply(grad_m, 2, function(y) y / norm(y, type = "2")) + } + + other_factors <- khatri_rao(x$factors[[m]], other_factors) + } + # Update lambda + u <- drop(t(other_factors) %*% as.vector(grad)) + + w_ref <- drop(ginv( + x$tau * diag(x$rank) + (1 - x$tau) * crossprod( + pm(matrix(x$x, x$n), other_factors, na.rm = x$na.rm) + ) / x$N + ) %*% u) + w_ref_norm <- w_ref / (norm(w_ref, type = "2") * sqrt(x$M)) + + w_opt <- u / (norm(u, type = "2") * sqrt(x$M)) + + eps <- 0.5 * drop(t(u) %*% (x$lambda + w_opt)) + + # If w_ref is satisfying, keep w_ref, otherwise find a point that + # increases the criterion and satisfies the constraints between + # w_ref and w_opt + if (all(w_ref_norm == w_opt)) { + x$lambda <- w_ref_norm + } + else if (t(u) %*% w_ref_norm >= eps) { + x$lambda <- w_ref_norm + } else { + if (1 / x$M - eps^2 / crossprod(u) > .Machine$double.eps) { + Pu <- diag(x$rank) - tcrossprod(u / norm(u, type = "2")) + mu <- norm(Pu %*% w_ref, type = "2") / drop(sqrt( + 1 / x$M - eps^2 / crossprod(u) + )) + x$lambda <- eps / drop(crossprod(u)) * u + drop(Pu %*% w_ref) / mu + } else { # collinearity between u and w_ref + x$lambda <- w_opt + } + } + return(block_project(x)) +} diff --git a/R/check_blocks.r b/R/check_blocks.r index 0563d790..6f712008 100644 --- a/R/check_blocks.r +++ b/R/check_blocks.r @@ -6,60 +6,56 @@ #' check_blocks performs the following checks and apply the following #' transformations to the blocks: #' \itemize{ -#' \item If a single block is given as a data frame or a matrix, \code{blocks} +#' \item If a single block is given as a data frame or an array, \code{blocks} #' is transformed into a list with the block as its unique element. Otherwise, #' if \code{blocks} is not a list, an error is raised. -#' \item Coerce each element of \code{blocks} to a matrix. +#' \item Coerce each element of \code{blocks} to an array. #' \item Make sure that all the blocks apart from the response block are #' quantitative. #' \item Add missing names to \code{blocks}. -#' \item Add missing column names to each block and prefix column names with -#' block names if some column names are duplicated between blocks. -#' \item Check blocks' row names. Raises an error if a block has duplicated -#' row names. Several scenario are possible: +#' \item Add missing dimnames to each block and prefix dimnames with +#' block names if some dimnames are duplicated between blocks. +#' \item Check blocks' primary dimnames. Raises an error if a block has +#' duplicated primary dimnames. Several scenario are possible: #' \itemize{ -#' \item If all blocks are missing row names, row names are created if -#' \code{allow_unnames} is TRUE, otherwise an error is raised. -#' \item If a block is missing row names and all other blocks' row names -#' match, missing row names are copied from the other blocks. -#' \item If a block is missing row names but other blocks' have none -#' matching row names, an error is raised. +#' \item If all blocks are missing primary dimnames, primary dimnames are +#' created. +#' \item If a block is missing primary dimnames and all other blocks' +#' primary dimnames match, missing primary dimnames are copied from the +#' other blocks. +#' \item If a block is missing primary dimnames but other blocks' have none +#' matching primary dimnames, an error is raised. #' } -#' \item If \code{add_NAlines} is FALSE and blocks have different number of -#' rows, an error is raised. Otherwise, lines filled with NA values are added -#' to the blocks with missing rows. Blocks' rows are permuted so that every -#' block has the same row names in the same order. +#' \item If blocks have different number of variables on the primary +#' dimension, fibers filled with NA values are added to the blocks with +#' missing variables on the primary dimension. Blocks' variables on the +#' primary dimension are permuted so that every block has the same primary +#' dimnames in the same order. #' } #' @inheritParams rgcca -#' @param add_Nalines logical, if TRUE, lines filled with NA are added to blocks -#' with missing rows -#' @param allow_unnames logical, if FALSE, an error is raised if blocks do not -#' have row names -#' @importFrom stats setNames +#' @importFrom abind abind #' @noRd -check_blocks <- function(blocks, add_NAlines = FALSE, allow_unnames = TRUE, - quiet = FALSE, response = NULL) { +check_blocks <- function(blocks, quiet = FALSE, response = NULL) { blocks <- check_blocks_is_list(blocks) - blocks <- check_blocks_matrix(blocks) + blocks <- check_blocks_data_structure(blocks) blocks <- check_blocks_quantitative(blocks, response) - blocks <- check_blocks_names(blocks, quiet) - blocks <- check_blocks_colnames(blocks, quiet) - blocks <- check_blocks_rownames(blocks, allow_unnames, quiet) - blocks <- check_blocks_align(blocks, add_NAlines, quiet) + blocks <- check_blocks_names(blocks) + blocks <- check_blocks_dimnames(blocks, 1, quiet) + blocks <- check_blocks_align(blocks, 1) invisible(blocks) } check_blocks_is_list <- function(blocks) { # Check that there is either a single block or a list of blocks - if (is.matrix(blocks) || is.data.frame(blocks)) blocks <- list(blocks) + if (is.array(blocks) || is.data.frame(blocks)) blocks <- list(blocks) if (!is.list(blocks)) stop_rgcca(paste("blocks must be a list.")) return(blocks) } -check_blocks_matrix <- function(blocks) { +check_blocks_data_structure <- function(blocks) { blocks <- lapply(blocks, function(x) { - if (is.matrix(x)) { + if (is.array(x)) { return(x) } if (is.data.frame(x)) { @@ -99,145 +95,177 @@ check_blocks_quantitative <- function(blocks, response = NULL) { return(blocks) } -check_blocks_names <- function(blocks, quiet = FALSE) { +check_blocks_names <- function(blocks) { # Add block names if some are missing - renamed <- FALSE if (is.null(names(blocks))) names(blocks) <- rep("", length(blocks)) - for (x in which(names(blocks) == "")) { - names(blocks)[x] <- paste0("block", x) - renamed <- TRUE - } - if (!quiet && renamed) { - message("Missing block names are automatically labeled.") + for (j in which(names(blocks) == "")) { + names(blocks)[j] <- paste0("block", j) } return(blocks) } -check_blocks_colnames <- function(blocks, quiet = FALSE) { - # Check for empty colnames - if (any(vapply( - blocks, function(x) is.null(colnames(x)), - FUN.VALUE = logical(1) - ))) { - if (!quiet) message("Missing colnames are automatically labeled.") - blocks <- lapply( - setNames(seq_along(blocks), names(blocks)), - function(x) { - block <- blocks[[x]] - if (is.null(colnames(block))) { - if (NCOL(block) == 1) { - colnames(block) <- names(blocks)[x] - } else { - colnames(block) <- paste0("V", x, "_", seq_len(NCOL(block))) - } - } - return(block) - } - ) - } +check_blocks_dimnames <- function(blocks, primary = 1, quiet = FALSE) { + # Create dimnames if missing + missing_dimnames <- vapply(blocks, function(x) { + is.null(dimnames(x)) + }, FUN.VALUE = logical(1L)) + blocks[missing_dimnames] <- lapply(blocks[missing_dimnames], function(x) { + dimnames(x) <- list(NULL) + return(x) + }) - # Check for duplicated colnames - if (any(duplicated(unlist(lapply(blocks, colnames))))) { - if (!quiet) message("Duplicated colnames are modified to avoid confusion.") - - blocks <- lapply( - setNames(seq_along(blocks), names(blocks)), - function(x) { - block <- blocks[[x]] - colnames(block) <- paste(names(blocks)[x], - colnames(blocks[[x]]), - sep = "_" - ) - return(block) - } + # Check dimnames on primary dimension + blocks <- check_blocks_primary_dimnames(blocks, m = primary) + + # Check dimnames on other dimensions + blocks <- Map(function(block, name) { + check_block_secondary_dimnames(block, name, primary, quiet) + }, blocks, names(blocks)) + + # Check for duplicated dimnames across blocks (except primary dim) + if (any(duplicated(unlist( + lapply(blocks, function(x) dimnames(x)[-primary]) + )))) { + if (!quiet) message( + "Duplicated dimnames across blocks are modified to avoid confusion. \n" ) + + # Add block name as a prefix to avoid confusion + blocks[seq_along(blocks)] <- lapply(seq_along(blocks), function(j) { + block <- blocks[[j]] + dimnames(block)[-primary] <- lapply( + seq_along(dim(block))[-primary], function(m) { + dimnames(block)[[m]] <- paste( + names(blocks)[j], dimnames(blocks[[j]])[[m]], sep = "_" + ) + } + ) + return(block) + }) } return(blocks) } -check_blocks_rownames <- function(blocks, allow_unnames = TRUE, quiet = FALSE) { - # Raise error if duplicated rownames +check_blocks_primary_dimnames <- function(blocks, m = 1) { + # Raise error if dimension m does not exist in one of the blocks + lapply(seq_along(blocks), function(j) { + if (m > length(dim(blocks[[j]]))) stop_rgcca( + "wrong number of dimensions. Dimension ", m, " is the dimension shared ", + "by all the blocks but is missing for block ", j, "." + ) + }) + + # Raise error if there are duplicated dimnames lapply(blocks, function(x) { - if (!is.null(row.names(x)) && any(duplicated(row.names(x)))) { - stop_rgcca( - "blocks have duplicated rownames." - ) - } + duplicated_names <- + !is.null(dimnames(x)[[m]]) && any(duplicated(dimnames(x)[[m]])) + if (duplicated_names) stop_rgcca( + "blocks have duplicated names on dimension ", m, "." + ) }) - # Create rownames for all blocks if all missing - if (all(vapply( - blocks, function(x) is.null(row.names(x)), - FUN.VALUE = logical(1) - ))) { - if (allow_unnames) { - blocks <- lapply( - blocks, - function(x) { - rownames(x) <- paste0("S", seq_len(NROW(x))) - return(x) - } - ) - if (!quiet) message("Missing rownames are automatically labeled.") - } else { - stop_rgcca(paste("blocks must have rownames.")) - } - } + empty_names <- vapply( + blocks, function(x) is.null(dimnames(x)[[m]]), FUN.VALUE = logical(1L) + ) - # If at least one block does not have rownames, 2 cases arise: - # - if all blocks with names have the same rownames, in the same order, - # we fill the missing rownames with the rownames of the other blocks; - # - otherwise we raise an error. - if (any( - vapply(blocks, function(x) is.null(row.names(x)), FUN.VALUE = logical(1)) - )) { - matrix_of_rownames <- Reduce(cbind, lapply(blocks, row.names)) + # Create dimnames for all blocks if all missing + if (all(empty_names)) { + blocks <- lapply(blocks, function(x) { + dimnames(x)[[m]] <- paste0("S", seq_len(dim(x)[[m]])) + return(x) + }) + } else if (any(empty_names)) { + # If at least one block does not have dimnames, 2 cases arise: + # - if all blocks with names have the same dimnames, in the same order, + # we fill the missing dimnames with the dimnames of the other blocks; + # - otherwise we raise an error. + matrix_of_names <- do.call( + cbind, lapply(blocks[!empty_names], function(x) dimnames(x)[[m]]) + ) is_valid <- all( - apply(matrix_of_rownames, 2, identical, matrix_of_rownames[, 1]) + apply(matrix_of_names, 2, identical, matrix_of_names[, 1]) ) if (is_valid) { - blocks <- lapply( - blocks, - function(x) { - row.names(x) <- matrix_of_rownames[, 1] - return(x) - } - ) - if (!quiet) message("Missing rownames are automatically labeled.") + blocks[empty_names] <- lapply(blocks[empty_names], function(x) { + dimnames(x)[[m]] <- matrix_of_names[, 1] + return(x) + }) } else { stop_rgcca( - "some blocks are missing rownames, and the other blocks' ", - "rownames are not consistent." + "some blocks are missing names on dimension ", m, ", and the other ", + "blocks' names on dimension ", m, " are not consistent." ) } } + return(blocks) } -check_blocks_align <- function(blocks, add_NAlines = FALSE, quiet = FALSE) { - # Construct union of rownames - all_names <- Reduce(union, lapply(blocks, row.names)) - - # If add_NAlines is FALSE and one block doesn't have as many rows as there - # are names in all_names, we stop. Otherwise we complete the blocks by - # adding rows full of NA. - - if (any(vapply(blocks, nrow, FUN.VALUE = integer(1)) != length(all_names))) { - if (add_NAlines) { - blocks <- lapply(blocks, function(x) { - missing_names <- setdiff(all_names, rownames(x)) - y <- matrix(NA, nrow = length(missing_names), ncol = ncol(x)) - rownames(y) <- missing_names - return(rbind(x, y)) - }) +check_block_secondary_dimnames <- function(x, n, primary = 1, quiet = FALSE) { + n_dim <- length(dim(x)[-primary]) + # Set default dimnames if missing + dimnames(x)[-primary] <- lapply(seq_along(dim(x))[-primary], function(m) { + if (is.null(dimnames(x)[[m]])) { + if (n_dim == 1) { + if (dim(x)[m] == 1) { + return(n) + } else { + return(paste(n, seq_len(dim(x)[m]), sep = "_")) + } + } else { + return(paste(n, m, seq_len(dim(x)[m]), sep = "_")) + } } else { - stop_rgcca("blocks must have the same rownames.") + return(dimnames(x)[[m]]) } + }) + + # Add mode and variable number as a suffix if there are duplicates + duplicated_names <- duplicated(unlist(dimnames(x)[-primary])) + if (any(duplicated_names)) { + dimnames(x)[-primary] <- lapply(seq_along(dim(x))[-primary], function(m) { + if (n_dim == 1) { + return(paste(dimnames(x)[[m]], seq_len(dim(x)[m]), sep = "_")) + } else { + return(paste(dimnames(x)[[m]], m, seq_len(dim(x)[m]), sep = "_")) + } + }) + if (!quiet) message( + "Duplicated dimnames are modified to avoid confusion. \n" + ) } - # Align blocks using rownames - blocks <- lapply( - blocks, function(x) x[row.names(blocks[[1]]), , drop = FALSE] - ) + return(x) +} + +check_blocks_align <- function(blocks, m = 1) { + # Construct union of names on dimension m + all_names <- Reduce(union, lapply(blocks, function(x) dimnames(x)[[m]])) + + # If one block doesn't have as many fibers on mode m as there + # are names in all_names, we complete the blocks by + # adding fibers full of NA on mode m. + lacking_values <- vapply(blocks, function(x) { + dim(x)[[m]] != length(all_names) + }, FUN.VALUE = logical(1L)) + + blocks[lacking_values] <- lapply(blocks[lacking_values], function(x) { + missing_names <- setdiff(all_names, dimnames(x)[[m]]) + extra_dim <- dim(x) + extra_dim[m] <- length(missing_names) + y <- array(NA, dim = extra_dim) + dimnames(y)[[m]] <- missing_names + rownames(y) <- missing_names + return(abind(x, y, along = m)) + }) + + # Align blocks using names on dimension m + blocks <- lapply(blocks, function(x) { + perm <- seq_along(dim(x)) + perm[c(1, m)] <- c(m, 1) + x <- aperm(x, perm) + x <- subset_block_rows(x, all_names, drop = FALSE) + aperm(x, perm) + }) return(blocks) } diff --git a/R/checks.R b/R/checks.R index 421fea7f..d5925d6c 100644 --- a/R/checks.R +++ b/R/checks.R @@ -266,10 +266,10 @@ check_ncomp <- function(ncomp, blocks, min = 1, superblock = FALSE, check_sign_comp <- function(rgcca_res, w) { y <- lapply( seq_along(rgcca_res$a), - function(i) pm(rgcca_res$blocks[[i]], w[[i]]) + function(i) pm(to_mat(rgcca_res$blocks[[i]]), w[[i]]) ) - w <- lapply(setNames(seq_along(w), names(w)), function(i) { + w[seq_along(w)] <- lapply(seq_along(w), function(i) { if (NROW(w[[i]]) < NROW(y[[i]])) { res <- as.matrix(cor2(rgcca_res$Y[[i]], y[[i]])) } else { @@ -378,6 +378,49 @@ check_tau <- function(tau) { invisible(tau) } +check_mode_orth <- function(mode_orth, blocks) { + mode_orth <- elongate_arg(mode_orth, blocks) + mode_orth <- check_integer("mode_orth", mode_orth, min = 1, type = "vector") + check_size_blocks(blocks, "mode_orth", mode_orth) + lapply(seq_along(blocks), function(j) { + if (length(dim(blocks[[j]])) > 2 && + mode_orth[j] > length(dim(blocks[[j]])) - 1) { + stop_rgcca( + "mode_orth should be comprise between ", 1 , + " and ", length(dim(blocks[[j]])) - 1, " (that is the number of + modes of block ", j, " minus 1)." + ) + } + }) + return(mode_orth) +} + +check_rank <- function(rank, blocks, mode_orth, ncomp) { + rank <- elongate_arg(rank, blocks) + p <- ifelse(is.vector(rank), 1, nrow(rank)) + J <- length(blocks) + rank <- vapply(seq_along(rank), function(x) { + y <- check_integer("rank", rank[x], min = 1) + j <- (x - 1) %% J + 1 + if (length(dim(blocks[[j]])) > 2 && + y > dim(blocks[[j]])[mode_orth[j] + 1]) { + stop_rgcca( + "rank[", x, "] should be comprise between ", 1 , + " and ", dim(blocks[[j]])[mode_orth[j] + 1], " (that is the number of", + " variables of the mode bearing the orthogonality for block ", j, ")." + ) + } + return(y) + }, FUN.VALUE = 1L) + + rank <- matrix(rank, nrow = p) + check_size_blocks(blocks, "rank", rank, n_row = ncomp) + if (p < ncomp) { + rank <- matrix(rank, nrow = ncomp, ncol = J, byrow = TRUE) + } + return(rank) +} + check_scheme <- function(scheme) { if (mode(scheme) != "function") { scheme <- tolower(scheme) diff --git a/R/deflate.R b/R/deflate.R index 183bbcc2..503134ec 100644 --- a/R/deflate.R +++ b/R/deflate.R @@ -34,20 +34,25 @@ deflate <- function(a, Y, R, P, ndefl, n, superblock, cumsum_pjs <- cumsum(pjs)[seq_len(J - 1)] inf_pjs <- c(0, cumsum_pjs[seq_len(J - 2)]) + 1 R <- lapply(seq(J - 1), function(b) { - x <- defl_result$R[, inf_pjs[b]:cumsum_pjs[b], drop = FALSE] - colnames(x) <- colnames(defl_result$R)[inf_pjs[b]:cumsum_pjs[b]] + x <- defl_result$R[, seq(inf_pjs[b], cumsum_pjs[b]), drop = FALSE] + colnames(x) <- colnames(defl_result$R)[seq(inf_pjs[b], cumsum_pjs[b])] return(x) }) R[[J]] <- defl_result$R # Otherwise, the individual blocks are deflated } else { - defl_result <- defl_select(var_defl, R, + defl_result <- defl_select(var_defl, + lapply(R, function(x) matrix(x, nrow = nrow(x))), ndefl, n - 1, J, na.rm = na.rm, response = response, left = left ) - R <- defl_result$resdefl + R[seq_along(R)] <- lapply(seq_along(R), function(j) { + x <- array(defl_result$resdefl[[j]], dim = dim(R[[j]])) + dimnames(x) <- dimnames(R[[j]]) + return(x) + }) P <- lapply(seq(J), function(b) { cbind(P[[b]], defl_result$pdefl[[b]]) }) diff --git a/R/estimate_separable_covariance.R b/R/estimate_separable_covariance.R new file mode 100644 index 00000000..299abb18 --- /dev/null +++ b/R/estimate_separable_covariance.R @@ -0,0 +1,28 @@ + +#' estimate_separable_covariance estimates the covariance matrix of a set of +#' random variables with an underlying tensor structure making the assumption +#' that the real covariance matrix has a separable structure. +#' @param x A numerical array with at least 3 dimensions. +#' @return The list composed of the estimated terms in the separable covariance. +#' @references Hoff, P. D. (2011), Separable covariance arrays via the Tucker +#' product, with applications to multivariate relational data. +#' Eun Jeong Min et al (2019), Tensor canonical correlation analysis. +#' @title Separable covariance estimator +#' @noRd +estimate_separable_covariance <- function(x) { + dim_x <- dim(x) + n <- dim_x[1] + d <- length(dim_x) - 1 + x_bar <- apply(x, -1, mean) + r <- (1 / n) * norm( + matrix(x, nrow = n) - matrix(rep(x_bar, n), nrow = n, byrow = TRUE), + type = "F" + )^2 + x_bar <- array(x_bar, dim = dim_x[-1]) + lapply(seq_len(d), function(m) { + x_bar_m <- t(apply(x_bar, m, c)) + x_bar_m <- x_bar_m %x% t(rep(1, n)) + x_m <- t(apply(x, m + 1, c)) + (1 / (n * r^((d - 1) / d))) * tcrossprod(x_m - x_bar_m) + }) +} diff --git a/R/format_output.R b/R/format_output.R index b1e77f10..a7850d1a 100644 --- a/R/format_output.R +++ b/R/format_output.R @@ -23,7 +23,7 @@ format_output <- function(func_out, rgcca_args, opt, blocks) { }, FUN.VALUE = double(1L)) AVE <- lapply(blocks_AVE, function(j) { - ave(blocks[[j]], func_out$Y[[j]]) + ave(matrix(blocks[[j]], nrow = nrow(blocks[[j]])), func_out$Y[[j]]) }) AVE_X <- lapply(AVE, "[[", "AVE_X") AVE_X_cor <- lapply(AVE, "[[", "AVE_X_cor") @@ -43,8 +43,28 @@ format_output <- function(func_out, rgcca_args, opt, blocks) { names(func_out$AVE$AVE_X_cor) <- names_AVE ### Set names and shave - for (j in seq_along(blocks)) { + array_idx <- vapply( + blocks, function(x) length(dim(x)) > 2, FUN.VALUE = logical(1L) + ) + + for (j in seq_along(blocks)[array_idx]) { + func_out$factors[[j]] <- lapply( + seq_along(func_out$factors[[j]]), + function(m) { + x <- func_out$factors[[j]][[m]] + rownames(x) <- dimnames(blocks[[j]])[[m + 1]] + return(x) + } + ) + grid <- expand.grid(dimnames(blocks[[j]])[-1]) + rownames(func_out$a[[j]]) <- do.call(paste, c(grid, sep = "_")) + } + + for (j in seq_along(blocks)[!array_idx]) { rownames(func_out$a[[j]]) <- colnames(blocks[[j]]) + } + + for (j in seq_along(blocks)) { rownames(func_out$Y[[j]]) <- rownames(blocks[[j]]) colnames(func_out$Y[[j]]) <- paste0("comp", seq_len(max(rgcca_args$ncomp))) } @@ -56,7 +76,7 @@ format_output <- function(func_out, rgcca_args, opt, blocks) { rownames(func_out$astar) <- colnames(blocks[[length(blocks)]]) } else { for (j in seq_along(blocks)) { - rownames(func_out$astar[[j]]) <- colnames(blocks[[j]]) + rownames(func_out$astar[[j]]) <- rownames(func_out$a[[j]]) } func_out$astar <- shave(func_out$astar, rgcca_args$ncomp) names(func_out$astar) <- names(blocks) diff --git a/R/generate_resampling.R b/R/generate_resampling.R deleted file mode 100644 index 1801310a..00000000 --- a/R/generate_resampling.R +++ /dev/null @@ -1,279 +0,0 @@ -#' Generate `n_boot` bootstrap samples. -#' @details -#' The bootstrap samples are generated so that it is very unlikely that they -#' will be associated with a 0 variance variable. However such situation can -#' appear in very rare cases. -#' -#' The simplest procedure to avoid such situation would be to compute the -#' variance of each variable in each bootstrap sample. However, in -#' high-dimensional cases, this is time consuming. Thus, the function starts -#' by identifying variables that present a risk of being of null variance in -#' at least one bootstrap sample. A variable is detected as such if the -#' probability of sampling `N` times (where `N` is the number of observations) -#' its most frequent observed value is higher than a specific threshold -#' named `pval` (by default set to `1e-15`) corrected by the number of bootstrap -#' sample `n_boot` and the number of variables in the whole data-set -#' (\eqn{\sum_{j=1}^Jp_{j}}). In the end, a variable is defined as risky if the -#' proportion of its most frequent observed value is strictly higher than: -#' \deqn{risky\textunderscore threshold = \left(\frac{pval}{n_{boot}* -#' \left(\sum_{j=1}^J p_{j}\right)}\right)^{1/N}.} -#' This value is necessarily lower than `1` as `pval` is lower than `1`. -#' However, it could be lower than \eqn{1/N}, which means that all variables -#' are defined as risky. That is why the maximum value between -#' \eqn{risky\textunderscore threshold} and \eqn{1/N} is taken. -#' -#' Once risky variables have been identified, the function computes the -#' variance of each risky variable in each bootstrap sample. If there isn't any -#' 0 variance risky variable, the bootstrap samples are returned as is. -#' Otherwise, three cases are possible: -#' \itemize{ -#' \item `keep_all_variables = F`, then the risky variables that are of 0 -#' variance in at least one bootstrap sample are removed. If they correspond to -#' a whole block, an error message is generated. -#' \item `keep_all_variables = T`, two possibilities : -#' \itemize{ -#' \item If `balance = T`, the procedure is repeated at most `5` times until all -#' bootstrap samples do not present any 0 variance variable. -#' \item If `balance = F`, an heuristic procedure is used to modify the sampling -#' probability of each observation in order to keep all variables. -#' } -#' } -#' -#' A short detail of this lastly mentioned heuristic consist in replacing each -#' observed value of the risky variables by \eqn{1-\frac{N_k}{N}} -#' (where \eqn{N_k} corresponds to the number of times this value is observed), -#' normalized so that the sum through all the observations (for each variable) -#' equals `1`. Then, if \eqn{prob_i} defines the sampling probability for the -#' observation associated with line \eqn{i}, it is defined as the maximum value -#' of the previous matrix through all risky variables (again normalized so that -#' \eqn{\sum_{i=1}^N prob_i = 1}). Thus the sampling probability of an -#' observation is more of less associated with `1 - its proportion in the -#' variable where this observation is in the lowest frequent group -#' (through all risky variables)`. -#' @inheritParams rgcca_bootstrap -#' @param pval For all the variables, a threshold for the proportion of the most -#' frequent value of this variable is computed. This threshold is evaluated so -#' that the probability to sample only this value is below `pval`. This -#' probability is corrected by the number of bootstrap samples and the number -#' of variables (default is `1e-15`). -#' @return \item{full_idx}{A list of size n_boot containing the observations -#' kept for each bootstrap sample.} -#' @return \item{sd_null}{A list of size the number of block -#' containing the variables that were removed from each block for all the -#' bootstrap samples. Variables are removed if they appear to be of null -#' variance in at least one bootstrap sample. If no variable is removed, -#' return NULL.} -#' @title Generate bootstrap samples. -#' @noRd - -generate_resampling <- function(rgcca_res, n_boot, balanced = TRUE, - keep_all_variables = FALSE, pval = 1e-15, - verbose = TRUE) { - if (verbose) { - packageStartupMessage("Bootstrap samples sanity check...", - appendLF = FALSE - ) - } - - # Initialization - pval <- min(pval, 1) - NO_null_sd_var <- FALSE - iter <- 0 - raw_blocks <- rgcca_res$call$blocks - N <- NROW(raw_blocks[[1]]) - prob <- rep(1 / N, N) - - # For any variable, threshold for the proportion of the most frequent value - # of a variable. This threshold is computed so that the probability to sample - # only this value is below `pval`. This probability is corrected by the number - # of bootstrap samples and the number of variables. - risky_threshold <- max( - 1 / N, - (pval / (n_boot * sum(vapply(raw_blocks, NCOL, FUN.VALUE = 1L))))^(1 / N) - ) - - # Identify variables with value having an observed proportion higher than - # risky_threshold. - risky_var <- lapply( - raw_blocks, - function(block) { - which(apply( - block, 2, - function(x) { - max(table(x) / N) > risky_threshold - } - )) - } - ) - - # Keep only risky variables for each block. - raw_blocks_filtered <- Map( - function(x, y) x[, y, drop = FALSE], - raw_blocks, risky_var - ) - # While there are variables with null variance among the risky variables. - while (!NO_null_sd_var) { - if (balanced) { # Balanced bootstrap sampling. - full_idx <- rep(x = seq(N), each = n_boot) - full_idx <- sample(x = full_idx, size = N * n_boot, replace = FALSE) - full_idx <- split(full_idx, ceiling(seq_along(full_idx) / N)) - } else { # Unbalanced bootstrap sampling. - full_idx <- lapply( - seq(n_boot), - function(x) { - sample(x = seq(N), replace = TRUE, prob = prob) - } - ) - } - # Compute blocks for each bootstrap sample (only with risky variables). - boot_blocks_filtered <- - lapply( - full_idx, - function(idx) { - lapply( - raw_blocks_filtered, - function(x) { - y <- x[idx, , drop = FALSE] - rownames(y) <- paste("S", seq_along(idx)) - return(y) - } - ) - } - ) - - # For each sample, identify variables with a single unique value. - - boot_column_sd_null <- - lapply( - boot_blocks_filtered, - function(boot) { - lapply(boot, function(boot_bl) { - which(apply(boot_bl, 2, function(x) length(unique(x)) == 1)) - }) - } - ) - - # Summarize through all the samples. - eval_boot_sample <- vapply( - boot_column_sd_null, - function(x) sum(vapply(x, length, FUN.VALUE = 1L)), - FUN.VALUE = 1L - ) - NO_null_sd_var <- (sum(eval_boot_sample) == 0) - - if (NO_null_sd_var) { - # through all samples, all variables have non null variances. - sd_null <- NULL - } else { - # If at least one sample have been identified with a null - # variance variable. - if (!keep_all_variables) { - # It is allowed to remove variables. - # Extract the troublesome variables. - sd_null <- Reduce("rbind", boot_column_sd_null) - rownames(sd_null) <- NULL - sd_null <- apply(sd_null, 2, function(x) unique(names(Reduce("c", x)))) - sd_null <- Map(function(x, y) { - z <- match(x, y) - names(z) <- x - return(z) - }, sd_null, lapply(raw_blocks, colnames)) - - # Check if a whole block is troublesome - is_full_block_removed <- unlist(Map( - function(x, y) dim(x)[2] == length(y), - raw_blocks, - sd_null - )) - if (sum(is_full_block_removed) == 0) { - # A whole block is NOT troublesome - NO_null_sd_var <- TRUE - warning(paste( - "Variables: ", - paste(names(Reduce("c", sd_null)), collapse = " - "), - "appear to be of null variance in some bootstrap samples", - "and thus were removed from all samples. \n", - "==> RGCCA is run again without these variables." - )) - } else { - # If whole block IS troublesome then STOP - stop_rgcca(paste( - "The variance of all the variables from blocks: ", - paste(names(raw_blocks)[is_full_block_removed], collapse = " - "), - "appear to be null in some bootstrap samples.", - "Please consider removing them." - )) - } - } else { # It is NOT allowed to remove variables. - # Generate at most five different re-sampling until not a single - # variable has a null variance. - if (iter > 5) { # Otherwise STOP. - # Extract the troublesome variables. - sd_null <- Reduce("rbind", boot_column_sd_null) - rownames(sd_null) <- NULL - sd_null <- apply( - sd_null, 2, - function(x) unique(names(Reduce("c", x))) - ) - sd_null <- Map(function(x, y) { - z <- match(x, y) - names(z) <- x - return(z) - }, sd_null, lapply(raw_blocks, colnames)) - - error_message <- paste( - "Impossible to define all bootstrap samples", - "without variables with null variance. Please", - "consider removing these variables: ", - paste(names(Reduce("c", sd_null)), collapse = " - ") - ) - # In the balanced case, you CANNOT play with the sampling probability - # of the different observations as it is unbalanced. - if (balanced) { - error_message <- paste0( - error_message, - ". Please, consider unbalanced bootstrap by", - " setting 'balanced' to FALSE." - ) - } - stop_rgcca(error_message) - } - # In the unbalanced case, you CAN play with the sampling probability - # of the different observations. - if (!balanced) { - if (iter == 0) { # The first time, you define your unbalancedness. - # Each observed value of the risky variables is replaced by - # `1 - the proportion of this observed value`, normalized so that - # the sum through all the observations (for each variable) - # equals `1`. - prob <- lapply( - raw_blocks_filtered, - function(block) { - apply(block, 2, function(var) { - occurences <- table(var, useNA = "ifany") / length(var) - new_idx <- match(as.character(var), names(occurences)) - new_var <- as.matrix(occurences[new_idx]) - new_var <- (1 - new_var) / sum(1 - new_var) - return(new_var) - }) - } - ) - # The sampling probability for each observation is associated with - # the maximum value of the previous matrix through all risky - # variables (again normalize so that `sum(prob) = 1`). Thus - # the sampling probability of an observation is more of less - # associated with `1 - its proportion in the variable where this - # observation is in the lowest frequent group (through all - # risky variables)`. - prob <- apply(Reduce("cbind", prob), 1, max) / sum( - apply(Reduce("cbind", prob), 1, max) - ) - } - } - iter <- iter + 1 - } - } - } - if (verbose) packageStartupMessage("OK") - return(list(full_idx = full_idx, sd_null = sd_null)) -} diff --git a/R/get_rgcca_args.R b/R/get_rgcca_args.R index 1ba9cc1a..94c6a41a 100644 --- a/R/get_rgcca_args.R +++ b/R/get_rgcca_args.R @@ -23,6 +23,7 @@ get_rgcca_args <- function(object, default_args = list()) { tol = default_args$tol, init = tolower(default_args$init), bias = default_args$bias, + rank = default_args$rank, quiet = default_args$quiet, scale = default_args$scale, ncomp = default_args$ncomp, @@ -34,6 +35,8 @@ get_rgcca_args <- function(object, default_args = list()) { response = default_args$response, NA_method = tolower(default_args$NA_method), comp_orth = default_args$comp_orth, + mode_orth = default_args$mode_orth, + separable = default_args$separable, n_iter_max = default_args$n_iter_max, connection = default_args$connection, superblock = default_args$superblock, @@ -52,14 +55,15 @@ get_rgcca_args <- function(object, default_args = list()) { } rgcca_args$blocks <- check_blocks( - rgcca_args$blocks, add_NAlines = TRUE, - quiet = rgcca_args$quiet, response = rgcca_args$response + rgcca_args$blocks, quiet = rgcca_args$quiet, + response = rgcca_args$response ) check_integer("tol", rgcca_args$tol, float = TRUE, min = 0) check_integer("n_iter_max", rgcca_args$n_iter_max, min = 1) for (i in c( - "superblock", "verbose", "scale", "bias", "quiet", "comp_orth" + "superblock", "verbose", "scale", "bias", + "quiet", "comp_orth", "separable" )) { check_boolean(i, rgcca_args[[i]]) } diff --git a/R/handle_NA.R b/R/handle_NA.R index a082f305..7570155d 100644 --- a/R/handle_NA.R +++ b/R/handle_NA.R @@ -6,7 +6,7 @@ handle_NA <- function(blocks, NA_method = "na.ignore") { if (NA_method == "na.omit") blocks <- intersection_list(blocks) if (NA_method == "na.ignore") { blocks <- blocks - na.rm <- Reduce("||", lapply(blocks, function(x) any(is.na(x)))) + na.rm <- any(is.na(unlist(blocks, use.names = FALSE))) } return(list(blocks = blocks, na.rm = na.rm)) } diff --git a/R/intersection.R b/R/intersection.R index 98a1343f..144709d3 100755 --- a/R/intersection.R +++ b/R/intersection.R @@ -5,7 +5,9 @@ #' @noRd intersection_list <- function(A) { # Find rows without missing values in each block - valid_rows <- lapply(A, complete.cases) + valid_rows <- lapply(A, function(x) { + apply(x, 1, function(y) all(!is.na(y))) + }) # Take the intersection common_valid_rows <- apply( matrix(unlist(valid_rows), length(valid_rows[[1]]), length(valid_rows)), @@ -19,5 +21,5 @@ intersection_list <- function(A) { )) } # Extract the rows from the different blocks - lapply(A, function(x) subset_rows(x, as.logical(common_valid_rows))) + lapply(A, subset_block_rows, as.logical(common_valid_rows), drop = FALSE) } diff --git a/R/khatri_rao.R b/R/khatri_rao.R new file mode 100644 index 00000000..519d9ea7 --- /dev/null +++ b/R/khatri_rao.R @@ -0,0 +1,20 @@ +#' khatri_rao computes the Khatri-Rao products between two matrices +#' @param x A numeric matrix. +#' @param y A numeric matrix. +#' @return The Khatri-Rao product between x and y. +#' @title Khatri-Rao product +#' @noRd +khatri_rao <- function(x, y = NULL){ + if (is.null(x)) { + return(y) + } + if (is.null(y)) { + return(x) + } + r <- NCOL(x) + vapply( + seq_len(r), + function(i) x[, i] %x% y[, i], + FUN.VALUE = double(NROW(x) * NROW(y)) + ) +} diff --git a/R/methods.R b/R/methods.R index 3dec7a59..e109d418 100644 --- a/R/methods.R +++ b/R/methods.R @@ -7,7 +7,7 @@ #' @export available_methods <- function() { c( - "rgcca", "sgcca", "pca", "spca", "pls", "spls", "cca", + "rgcca", "sgcca", "tgcca", "pca", "spca", "pls", "spls", "cca", "ifa", "ra", "gcca", "maxvar", "maxvar-b", "maxvar-a", "mfa", "mcia", "mcoa", "cpca-1", "cpca-2", "cpca-4", "hpca", "maxbet-b", "maxbet", "maxdiff-b", "maxdiff", "sabscor", @@ -72,3 +72,7 @@ x4_methods <- function() { sparse_methods <- function() { c("sgcca", "spls", "spca") } + +tensor_methods <- function() { + c("tgcca") +} \ No newline at end of file diff --git a/R/mode_product.R b/R/mode_product.R new file mode 100644 index 00000000..2002f2d5 --- /dev/null +++ b/R/mode_product.R @@ -0,0 +1,29 @@ +#' Compute the mode product between a tensor X and a matrix y on a given mode m. +#' @param X An array. +#' @param y A matrix. +#' @param m A scalar designating a mode of the tensor X. +#' @examples +#' X <- array(rnorm(40 * 5 * 7), dim = c(40, 5, 7)) +#' y <- rnorm(5) +#' res <- mode_product(X, y, m = 2) +#' print(dim(X)) +#' print(dim(res)) +#' @noRd +mode_product <- function(X, y, m = 1) { + dim_X <- dim(X) + + # Unfold the tensor on dimension m + perm <- c(m, seq_along(dim_X)[-m]) + X <- aperm(X, perm) + + # Compute the product + X <- matrix(X, nrow = nrow(X)) + X <- t(y) %*% X + + # Reshape the result back to a tensor + dim_X[m] <- NCOL(y) + X <- array(X, dim = dim_X[perm]) + + X <- aperm(X, c(1 + seq_len(m - 1), 1, m + seq_len(length(dim_X) - m))) + return(X) +} diff --git a/R/plot.rgcca.R b/R/plot.rgcca.R index 7f5ecec6..ab72158c 100644 --- a/R/plot.rgcca.R +++ b/R/plot.rgcca.R @@ -222,12 +222,14 @@ plot.rgcca <- function(x, type = "weights", df <- data.frame( x = do.call(rbind, Map(function(i, j) { cor2( - x$blocks[[i]][rownames(x$Y[[j]]), ], + subset_block_rows(to_mat(x$blocks[[i]]), rownames(x$Y[[j]])), x$Y[[j]][, comp] ) }, display_blocks, block)), response = num_block, - y = do.call(c, lapply(x$blocks[display_blocks], colnames)) + y = do.call( + c, lapply(display_blocks, function(j) colnames(to_mat(x$blocks[[j]]))) + ) ) idx <- unlist( @@ -239,7 +241,9 @@ plot.rgcca <- function(x, type = "weights", df_weight <- function(x, block, comp, num_block, display_order) { df <- data.frame( x = unlist(lapply(x$a[block], function(z) z[, comp[1]])), - y = do.call(c, lapply(x$blocks[block], colnames)), + y = do.call(c, lapply( + block, function(j) colnames(to_mat(x$blocks[[j]])) + )), response = num_block ) df <- df[df$x != 0, ] @@ -397,7 +401,7 @@ plot.rgcca <- function(x, type = "weights", # Construct response vector for correlation circle, weights and loadings num_block <- as.factor(unlist(lapply( display_blocks, - function(j) rep(names(x$blocks)[j], NCOL(x$blocks[[j]])) + function(j) rep(names(x$blocks)[j], prod(dim(x$blocks[[j]])[-1])) ))) switch(type, @@ -481,7 +485,7 @@ plot.rgcca <- function(x, type = "weights", ) # Rescale weigths - var_tot <- sum(diag(var(x$blocks[[block[1]]], na.rm = TRUE))) + var_tot <- sum(diag(var(to_mat(x$blocks[[block[1]]]), na.rm = TRUE))) a <- data.matrix(df$a[, c(1, 2)]) %*% diag(sqrt( var_tot * x$AVE$AVE_X[[block[1]]][comp] )) diff --git a/R/remove_null_sd.R b/R/remove_null_sd.R deleted file mode 100644 index eaecadaa..00000000 --- a/R/remove_null_sd.R +++ /dev/null @@ -1,58 +0,0 @@ -#' Remove columns having a 0 standard deviation -#' -#' @param list_m A list of dataframe -#' @param column_sd_null Either NULL or a list of named vectors. If NULL, the -#' function will search for variables with null variance in each block. If not -#' NULL, this list defines for each block the index of the variables that are -#' of null variance (see the 'Value' section for more details about the content -#' of this list). In both cases, these variables are removed. -#' @return \item{list_m}{A list of dataframe.} -#' @return \item{column_sd_null}{Either NULL, if not a single variable was -#' removed, or a list of the same size as the number of blocks. In the last -#' situation, each element of this list is again NULL if not a single variable -#' was removed from the current block, or a named vector indicating the former -#' index of the removed variables along with their name.} -#' @noRd - -remove_null_sd <- function(list_m, column_sd_null = NULL) { - names <- names(list_m) - - if (is.null(column_sd_null)) { - column_sd_null <- lapply( - list_m, - function(x) { - which(apply(x, 2, function(y) { - if (mode(y) != "character") { - std <- sd(y, na.rm = TRUE) - res <- is.na(std) || (std == 0) - } else { - res <- FALSE - } - return(res) - })) - } - ) - } - - blocks_index <- seq(1, length(list_m))[ - unlist( - lapply( - column_sd_null, - function(x) length(x) > 0 - ) - ) - ] - list_m <- lapply( - seq_along(list_m), - function(x) { - if (x %in% blocks_index) { - list_m[[x]][, -column_sd_null[[x]], drop = FALSE] - } else { - list_m[[x]] - } - } - ) - - names(list_m) <- names - return(list(list_m = list_m, column_sd_null = column_sd_null)) -} diff --git a/R/rgcca.R b/R/rgcca.R index acccb808..aabc13ca 100644 --- a/R/rgcca.R +++ b/R/rgcca.R @@ -68,8 +68,8 @@ #' @param scale_block A logical value or a string indicating if each block is #' scaled. #' -#' If TRUE or "inertia", each block is divided by the sum of eigenvalues -#' of its empirical covariance matrix. +#' If TRUE or "inertia", each block is divided by the square root of the sum +#' of eigenvalues of its empirical covariance matrix. #' #' If "lambda1", each block is divided by #' the square root of the highest eigenvalue of its empirical covariance matrix. @@ -172,6 +172,21 @@ #' iterations. #' @param comp_orth A logical value indicating if the deflation should lead to #' orthogonal block components or orthogonal block weight vectors. +#' @param rank Either an integer, an integer vector of +#' size \eqn{J} or an integer matrix +#' of dimension \eqn{\textrm{max}(\textrm{ncomp}) \times J} giving the rank +#' of the decomposition sought for the canonical vectors in TGCCA. +#' If block \eqn{j} is an array with at least three dimensions, rank must be +#' comprised between 1 and the number of variables on the mode bearing the +#' orthogonality constraint. See \eqn{\textrm{mode_orth}}. +#' @param mode_orth Either an integer or an integer vector of size \eqn{J} +#' designating the mode which associated set of factors will be orthogonal +#' in the decomposition sought by TGCCA. If block \eqn{j} is an array with +#' \eqn{d > 2} dimensions, \eqn{\textrm{mode_orth}} must be comprised between +#' 1 and \eqn{d - 1}. +#' @param separable A logical value if the regularization matrices must be +#' estimated as separable matrices (i.e. products of Kronecker products +#' matching the dimensions of the modes in the data). #' @param A Deprecated argument, please use blocks instead. #' @param C Deprecated argument, please use connection instead. #' @return A fitted rgcca object. @@ -433,6 +448,7 @@ rgcca <- function(blocks, connection = NULL, tau = 1, ncomp = 1, superblock = FALSE, NA_method = "na.ignore", quiet = TRUE, n_iter_max = 1000, comp_orth = TRUE, + rank = 1, mode_orth = 1, separable = TRUE, A = NULL, C = NULL) { # Check for deprecated arguments if (!missing(A)) { @@ -453,7 +469,7 @@ rgcca <- function(blocks, connection = NULL, tau = 1, ncomp = 1, rgcca_args$quiet <- quiet rgcca_args$verbose <- verbose - blocks <- remove_null_sd(rgcca_args$blocks)$list_m + blocks <- rgcca_args$blocks if (opt$disjunction) { blocks[[rgcca_args$response]] <- as_disjunctive( @@ -480,7 +496,8 @@ rgcca <- function(blocks, connection = NULL, tau = 1, ncomp = 1, ### Call the gcca function gcca_args <- rgcca_args[c( "connection", "ncomp", "scheme", "init", "bias", "tol", - "verbose", "superblock", "response", "n_iter_max", "comp_orth" + "verbose", "superblock", "response", "n_iter_max", "comp_orth", + "rank", "mode_orth", "separable" )] gcca_args[["na.rm"]] <- na.rm gcca_args[["blocks"]] <- blocks diff --git a/R/rgcca_bootstrap.R b/R/rgcca_bootstrap.R index c45f0ff9..3ba651e7 100644 --- a/R/rgcca_bootstrap.R +++ b/R/rgcca_bootstrap.R @@ -5,11 +5,6 @@ #' @param rgcca_res A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}). #' @param n_boot The number of bootstrap samples (default: 100). #' @param n_cores The number of cores used for parallelization. -#' @param balanced A logical value indicating if a balanced bootstrap procedure -#' is performed or not (default is TRUE). -#' @param keep_all_variables A logical value indicating if all variables have -#' to be kept even when some of them have null variance for at least one -#' bootstrap sample (default is FALSE). #' @param verbose A logical value indicating if the progress of the bootstrap #' procedure is reported. #' @return A rgcca_bootstrap object that can be printed and plotted. @@ -77,9 +72,7 @@ #' @seealso \code{\link[RGCCA]{plot.rgcca_bootstrap}}, #' \code{\link[RGCCA]{summary.rgcca_bootstrap}} rgcca_bootstrap <- function(rgcca_res, n_boot = 100, - n_cores = 1, - balanced = TRUE, keep_all_variables = FALSE, - verbose = TRUE) { + n_cores = 1, verbose = TRUE) { stability <- is(rgcca_res, "rgcca_stability") if (stability) { message( @@ -120,36 +113,20 @@ rgcca_bootstrap <- function(rgcca_res, n_boot = 100, check_integer("n_boot", n_boot) - boot_sampling <- generate_resampling( - rgcca_res = rgcca_res, - n_boot = n_boot, - balanced = balanced, - keep_all_variables = keep_all_variables, - verbose = verbose - ) + ### Create bootstrap samples + v_inds <- lapply(seq_len(n_boot), function(i) { + sample(seq_len(NROW(rgcca_res$call$blocks[[1]])), replace = TRUE) + }) - sd_null <- boot_sampling$sd_null - - if (!is.null(sd_null)) { - rgcca_res$call$blocks <- remove_null_sd( - list_m = rgcca_res$call$blocks, - column_sd_null = sd_null - )$list_m - rgcca_res <- rgcca(rgcca_res) - } - - W <- par_pblapply( - boot_sampling$full_idx, function(b) { - rgcca_bootstrap_k( - rgcca_res = rgcca_res, - inds = b - ) - }, - n_cores = n_cores, verbose = verbose - ) - - W <- W[!vapply(W, is.null, logical(1L))] + ### Run RGCCA on the bootstrap samples + W <- par_pblapply(v_inds, function(b) { + rgcca_bootstrap_k( + rgcca_res = rgcca_res, + inds = b + ) + }, n_cores = n_cores, verbose = verbose) + ### Extract statistics from the results of the bootstrap res <- format_bootstrap_list(W, rgcca_res) stats <- rgcca_bootstrap_stats(res, rgcca_res, length(W)) diff --git a/R/rgcca_bootstrap_k.R b/R/rgcca_bootstrap_k.R index b71dec0b..af57e9a9 100644 --- a/R/rgcca_bootstrap_k.R +++ b/R/rgcca_bootstrap_k.R @@ -15,56 +15,42 @@ rgcca_bootstrap_k <- function(rgcca_res, inds = NULL, type = "loadings") { if (length(inds) > 0) { rgcca_res$call$blocks <- lapply(rgcca_res$call$blocks, function(x) { - y <- x[inds, , drop = FALSE] + y <- subset_block_rows(x, inds, drop = FALSE) rownames(y) <- paste("S", seq_along(inds)) return(y) }) } rgcca_res_boot <- rgcca(rgcca_res) - # block-weight vector - missing_var <- unlist(lapply( - seq_along(rgcca_res_boot$a), - function(x) { - setdiff( - colnames(rgcca_res$blocks[[x]]), - rownames(rgcca_res_boot$a[[x]]) - ) - } - )) - if (length(missing_var) == 0) { - # block-loadings vector - A <- check_sign_comp(rgcca_res, rgcca_res_boot$a) + # block-loadings vector + A <- check_sign_comp(rgcca_res, rgcca_res_boot$a) - if (type == "loadings") { - Y <- lapply( - seq_along(A), - function(j) pm(rgcca_res_boot$blocks[[j]], A[[j]]) - ) - L <- lapply( - seq_along(A), - function(j) { - cor2(rgcca_res_boot$blocks[[j]], Y[[j]]) - } - ) - } else { - L <- lapply(names(A), function(n) { - if (!(n %in% names(rgcca_res$AVE$AVE_X))) { - res <- rep(-1, NCOL(rgcca_res$a[[n]])) - } else { - res <- rgcca_res_boot$AVE$AVE_X[[n]] - } - res <- matrix( - res, nrow = nrow(A[[n]]), - ncol = length(res), byrow = TRUE - ) - rownames(res) <- rownames(A[[n]]) - return(res) - }) - } - names(L) <- names(rgcca_res$a) - return(list(W = A, L = L)) + if (type == "loadings") { + Y <- lapply( + seq_along(A), + function(j) pm(to_mat(rgcca_res_boot$blocks[[j]]), A[[j]]) + ) + L <- lapply( + seq_along(A), + function(j) { + cor2(to_mat(rgcca_res_boot$blocks[[j]]), Y[[j]]) + } + ) } else { - return(NULL) + L <- lapply(names(A), function(n) { + if (!(n %in% names(rgcca_res$AVE$AVE_X))) { + res <- rep(-1, NCOL(rgcca_res$a[[n]])) + } else { + res <- rgcca_res_boot$AVE$AVE_X[[n]] + } + res <- matrix( + res, nrow = nrow(A[[n]]), + ncol = length(res), byrow = TRUE + ) + rownames(res) <- rownames(A[[n]]) + return(res) + }) } + names(L) <- names(rgcca_res$a) + return(list(W = A, L = L)) } diff --git a/R/rgcca_bootstrap_stats.R b/R/rgcca_bootstrap_stats.R index ec62df36..b2e013d4 100644 --- a/R/rgcca_bootstrap_stats.R +++ b/R/rgcca_bootstrap_stats.R @@ -53,7 +53,7 @@ rgcca_bootstrap_stats <- function(res, rgcca_res, n_boot) { unlist(rgcca_res$a, use.names = FALSE), unlist(lapply(unique(stats$block), function(block) { cor2( - rgcca_res$blocks[[block]], + to_mat(rgcca_res$blocks[[block]]), rgcca_res$Y[[block]] ) }), use.names = FALSE) diff --git a/R/rgcca_cv.r b/R/rgcca_cv.r index 945b4ad4..d7abf5fa 100644 --- a/R/rgcca_cv.r +++ b/R/rgcca_cv.r @@ -187,6 +187,9 @@ rgcca_cv <- function(blocks, verbose = TRUE, n_iter_max = 1000, comp_orth = TRUE, + rank = 1, + mode_orth = 1, + separable = TRUE, ...) { ### Try to retrieve parameters from a rgcca object rgcca_args <- as.list(environment()) diff --git a/R/rgcca_cv_k.R b/R/rgcca_cv_k.R index 12c866f7..28f96462 100644 --- a/R/rgcca_cv_k.R +++ b/R/rgcca_cv_k.R @@ -12,17 +12,15 @@ rgcca_cv_k <- function(rgcca_args, inds, prediction_model, blocks <- rgcca_args[["blocks"]] rgcca_args[["blocks"]] <- lapply( - blocks, function(x) x[-inds, , drop = FALSE] + blocks, function(x) subset_block_rows(x, -inds, drop = FALSE) ) # Fit RGCCA on the training blocks res <- do.call(rgcca, rgcca_args) # Evaluate RGCCA on the validation blocks - blocks_test <- lapply(seq_along(blocks), function(j) { - x <- blocks[[j]][inds, , drop = FALSE] - colnames(x) <- colnames(res$call$blocks[[j]]) - return(x) - }) + blocks_test <- lapply( + blocks, function(x) subset_block_rows(x, inds, drop = FALSE) + ) names(blocks_test) <- names(res$blocks) return(rgcca_predict( diff --git a/R/rgcca_inner_loop.R b/R/rgcca_inner_loop.R index bd5a92fe..5717ad61 100644 --- a/R/rgcca_inner_loop.R +++ b/R/rgcca_inner_loop.R @@ -1,7 +1,10 @@ rgcca_inner_loop <- function(A, C, g, dg, tau = rep(1, length(A)), sparsity = rep(1, length(A)), verbose = FALSE, init = "svd", bias = TRUE, - tol = 1e-08, na.rm = TRUE, n_iter_max = 1000) { + tol = 1e-08, na.rm = TRUE, n_iter_max = 1000, + rank = rep(1, length(A)), + mode_orth = rep(1, length(A)), + separable = TRUE) { if (!is.numeric(tau)) { # From Schafer and Strimmer, 2005 tau <- vapply(A, tau.estimate, na.rm = na.rm, FUN.VALUE = 1.0) @@ -15,7 +18,8 @@ rgcca_inner_loop <- function(A, C, g, dg, tau = rep(1, length(A)), ### Initialization block_objects <- lapply(seq_along(A), function(j) { - create_block(A[[j]], j, bias, na.rm, tau[j], sparsity[j], tol) + create_block(A[[j]], j, bias, na.rm, tau[j], sparsity[j], + tol, rank[j], mode_orth[j], separable) }) block_objects <- lapply(block_objects, block_init, init = init) @@ -84,6 +88,10 @@ rgcca_inner_loop <- function(A, C, g, dg, tau = rep(1, length(A)), block_objects <- lapply(block_objects, block_postprocess, ctrl) a <- lapply(block_objects, "[[", "a") Y <- do.call(cbind, lapply(block_objects, "[[", "Y")) + factors <- lapply(block_objects, "[[", "factors") + lambda <- lapply(block_objects, "[[", "lambda") - return(list(Y = Y, a = a, crit = crit, tau = tau)) + return(list( + Y = Y, a = a, factors = factors, lambda = lambda, crit = crit, tau = tau + )) } diff --git a/R/rgcca_outer_loop.R b/R/rgcca_outer_loop.R index d7f2e20c..ff74c6dc 100644 --- a/R/rgcca_outer_loop.R +++ b/R/rgcca_outer_loop.R @@ -8,7 +8,8 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), verbose = TRUE, na.rm = TRUE, superblock = FALSE, response = NULL, disjunction = NULL, - n_iter_max = 1000, comp_orth = TRUE) { + n_iter_max = 1000, comp_orth = TRUE, + rank = 1, mode_orth = 1, separable = TRUE) { if (verbose) { scheme_str <- ifelse(is(scheme, "function"), "user-defined", scheme) cat( @@ -49,8 +50,10 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), crit <- list() R <- blocks - a <- lapply(seq(J), function(b) c()) - Y <- lapply(seq(J), function(b) c()) + a <- Y <- lambda <- lapply(seq(J), function(b) c()) + factors <- lapply(seq(J), function(b) { + lapply(seq_along(dim(R[[b]])[-1]), function(m) NULL) + }) if (superblock && comp_orth) { P <- c() @@ -74,11 +77,18 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), ) } + if (is.vector(rank)) { + rank <- matrix( + rep(rank, N + 1), + nrow = N + 1, J, byrow = TRUE + ) + } + # Whether primal or dual primal_dual <- matrix("primal", nrow = N + 1, ncol = J) primal_dual[which((sparsity == 1) & (nb_ind < matrix( pjs, nrow = N + 1, ncol = J, byrow = TRUE - )))] + )))] <- "dual" ##### Computation of RGCCA components ##### for (n in seq(N + 1)) { @@ -93,16 +103,26 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), sparsity = sparsity[n, ], init = init, bias = bias, tol = tol, verbose = verbose, na.rm = na.rm, - n_iter_max = n_iter_max + n_iter_max = n_iter_max, + rank = rank[n, ], mode_orth = mode_orth, + separable = separable ) # Store tau, crit computed_tau[n, ] <- gcca_result$tau crit[[n]] <- gcca_result$crit - # Store Y, a, factors and weights + # Store Y, a, factors and lambda a <- lapply(seq(J), function(b) cbind(a[[b]], gcca_result$a[[b]])) Y <- lapply(seq(J), function(b) cbind(Y[[b]], gcca_result$Y[, b])) + factors <- lapply(seq(J), function(b) { + lapply(seq_along(factors[[b]]), function(m) { + cbind(factors[[b]][[m]], gcca_result$factors[[b]][[m]]) + }) + }) + lambda <- lapply( + seq(J), function(b) cbind(lambda[[b]], gcca_result$lambda[[b]]) + ) # Deflation procedure if (n == N + 1) break @@ -113,7 +133,7 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), } # If there is a superblock and weight vectors are orthogonal, it is possible - # to have non meaningful weights associated to blocks that have been set to + # to have non meaningful lambda associated to blocks that have been set to # zero by the deflation if (superblock && !comp_orth) { a <- lapply(a, function(x) { @@ -138,6 +158,8 @@ rgcca_outer_loop <- function(blocks, connection = 1 - diag(length(blocks)), Y = Y, a = a, astar = astar, + factors = factors, + lambda = lambda, tau = computed_tau, crit = crit, primal_dual = primal_dual ) diff --git a/R/rgcca_permutation.R b/R/rgcca_permutation.R index b004ed02..1233d373 100644 --- a/R/rgcca_permutation.R +++ b/R/rgcca_permutation.R @@ -235,7 +235,8 @@ rgcca_permutation <- function(blocks, par_type = "tau", par_value = NULL, response = NULL, superblock = FALSE, NA_method = "na.ignore", rgcca_res = NULL, verbose = TRUE, n_iter_max = 1000, - comp_orth = TRUE) { + comp_orth = TRUE, rank = 1, mode_orth = 1, + separable = TRUE) { ### Try to retrieve parameters from a rgcca object rgcca_args <- as.list(environment()) tmp <- get_rgcca_args(blocks, rgcca_args) diff --git a/R/rgcca_permutation_k.R b/R/rgcca_permutation_k.R index f9ab0816..4fc68d5a 100644 --- a/R/rgcca_permutation_k.R +++ b/R/rgcca_permutation_k.R @@ -9,7 +9,7 @@ rgcca_permutation_k <- function(rgcca_args, inds, perm, par_type, par_value) { if (perm) { blocks <- lapply(seq_along(rgcca_args$blocks), function(i) { x <- rgcca_args$blocks[[i]] - block <- as.matrix(x)[inds[[i]], , drop = FALSE] + block <- subset_block_rows(x, inds[[i]], drop = FALSE) rownames(block) <- rownames(x) return(block) }) diff --git a/R/rgcca_predict.R b/R/rgcca_predict.R index 21afeb52..056d3265 100644 --- a/R/rgcca_predict.R +++ b/R/rgcca_predict.R @@ -97,8 +97,8 @@ rgcca_predict <- function(rgcca_res, metric <- match.arg(metric, available_metrics) ### Get train and test target (if present) - y_train <- rgcca_res$call$blocks[[response]] - y_test <- as.data.frame(blocks_test[[test_idx]]) + y_train <- to_mat(rgcca_res$call$blocks[[response]]) + y_test <- as.data.frame(to_mat(blocks_test[[test_idx]])) if (any(dim(y_test)[-1] != dim(y_train)[-1])) { stop_rgcca( @@ -114,7 +114,7 @@ rgcca_predict <- function(rgcca_res, X_test <- reformat_projection(projection) # Keep same lines in X_train and y_train - y_train <- as.data.frame(subset_rows(y_train, rownames(X_train))) + y_train <- as.data.frame(subset_block_rows(y_train, rownames(X_train))) # Test that in classification, variables are not constant within groups if (classification) { diff --git a/R/rgcca_stability.R b/R/rgcca_stability.R index 531d1390..fd460009 100644 --- a/R/rgcca_stability.R +++ b/R/rgcca_stability.R @@ -5,11 +5,8 @@ #' (VIP) based criterion is used to identify the most stable variables. #' #' @inheritParams rgcca_bootstrap -#' @param rgcca_res A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}). #' @param keep A numeric vector indicating the proportion of variables per #' block to select. -#' @param n_boot The number of bootstrap samples (default: 100). -#' @param n_cores The number of cores for parallelization. #' @param verbose A logical value indicating if the progress of the procedure #' is reported. #' @return A rgcca_stability object that can be printed and plotted. @@ -63,42 +60,23 @@ rgcca_stability <- function(rgcca_res, ), n_boot = 100, n_cores = 1, - verbose = TRUE, - balanced = TRUE, - keep_all_variables = FALSE) { + verbose = TRUE) { stopifnot(tolower(rgcca_res$call$method) %in% sparse_methods()) check_integer("n_boot", n_boot) check_integer("n_cores", n_cores, min = 0) - boot_sampling <- generate_resampling( - rgcca_res = rgcca_res, - n_boot = n_boot, - balanced = balanced, - verbose = verbose, - keep_all_variables = keep_all_variables - ) - - sd_null <- boot_sampling$sd_null - - if (!is.null(sd_null)) { - rgcca_res$call$blocks <- remove_null_sd( - list_m = rgcca_res$call$blocks, - column_sd_null = sd_null - )$list_m - rgcca_res <- rgcca(rgcca_res) - } - - W <- par_pblapply( - boot_sampling$full_idx, function(b) { - rgcca_bootstrap_k( - rgcca_res = rgcca_res, - inds = b, type = "AVE" - ) - }, - n_cores = n_cores, verbose = verbose - ) + ### Create bootstrap samples + v_inds <- lapply(seq_len(n_boot), function(i) { + sample(seq_len(NROW(rgcca_res$call$blocks[[1]])), replace = TRUE) + }) - W <- W[!vapply(W, is.null, logical(1L))] + ### Run RGCCA on the bootstrap samples + W <- par_pblapply(v_inds, function(b) { + rgcca_bootstrap_k( + rgcca_res = rgcca_res, + inds = b, type = "AVE" + ) + }, n_cores = n_cores, verbose = verbose) res <- format_bootstrap_list(W, rgcca_res) J <- length(rgcca_res$blocks) diff --git a/R/rgcca_transform.R b/R/rgcca_transform.R index 9a9b492f..ddc2763d 100644 --- a/R/rgcca_transform.R +++ b/R/rgcca_transform.R @@ -45,8 +45,8 @@ rgcca_transform <- function(rgcca_res, blocks_test = rgcca_res$call$blocks) { } X_train <- rgcca_res$blocks[names(blocks_test)] blocks_test <- lapply(seq_along(blocks_test), function(j) { - x <- as.matrix(blocks_test[[j]]) - y <- as.matrix(X_train[[j]]) + x <- to_mat(blocks_test[[j]]) + y <- to_mat(X_train[[j]]) # Deal with qualitative block if (rgcca_res$opt$disjunction) { j_train <- which(names(rgcca_res$blocks) == names(blocks_test)[j]) diff --git a/R/scale2.R b/R/scale2.R index 8ff014c2..331c89cd 100644 --- a/R/scale2.R +++ b/R/scale2.R @@ -7,14 +7,15 @@ #' @title Scaling and Centering of Matrix-like Objects #' @noRd scale2 <- function(A, scale = TRUE, bias = TRUE) { + # Center the data + A <- scale(A, center = TRUE, scale = FALSE) + + # Scale if needed if (scale) { - A <- scale(A, center = TRUE, scale = FALSE) std <- sqrt(apply(A, 2, function(x) cov2(x, bias = bias))) - A <- sweep(A, 2, std, FUN = "/") - attr(A, "scaled:scale") <- std - return(A) + std <- pmax(.Machine$double.eps, std) # Account for potentially 0 std + A <- scale(A, center = FALSE, scale = std) } - A <- scale(A, center = TRUE, scale = FALSE) return(A) } diff --git a/R/scale_inertia.R b/R/scale_inertia.R index 2f213d8a..11172c4b 100644 --- a/R/scale_inertia.R +++ b/R/scale_inertia.R @@ -1,25 +1,22 @@ -#' Scale a list of blocks by dividing each block by its Frobenius norm +#' Scale a block by dividing it by its Frobenius norm #' @inheritParams scale_lambda1 #' @noRd -scale_inertia <- function(blocks, sqrt_N, scale, na.rm) { - blocks <- lapply(blocks, function(x) { - if (scale) { - fac <- sqrt(NCOL(x)) - } else { - if (na.rm) { - z <- x - z[is.na(z)] <- 0 - } else { - z <- x - } - fac <- 1 / sqrt_N * norm(z, type = "F") - } - y <- x / fac - if (scale) { - attr(y, "scaled:scale") <- attr(x, "scaled:scale") * fac - } else { - attr(y, "scaled:scale") <- rep(fac, NCOL(x)) - } - return(y) - }) +scale_inertia <- function(A, sqrt_N, scale, na.rm) { + if (na.rm) { + z <- A + z[is.na(z)] <- 0 + } else { + z <- A + } + fac <- 1 / sqrt_N * norm(z, type = "F") + if (fac == 0) { + return(A) + } + y <- A / fac + if (scale) { + attr(y, "scaled:scale") <- attr(A, "scaled:scale") * fac + } else { + attr(y, "scaled:scale") <- rep(fac, NCOL(A)) + } + return(y) } diff --git a/R/scale_lambda1.R b/R/scale_lambda1.R index 5a7d6401..8d4d476c 100644 --- a/R/scale_lambda1.R +++ b/R/scale_lambda1.R @@ -1,21 +1,28 @@ -#' Scale a list of blocks by dividing each block by its first singular value -#' @inheritParams scale +#' Scale a of block by dividing it by its first singular value +#' @inheritParams scaling #' @param sqrt_N A numeric which corresponds to the square root of the number of #' subjects, taking the bias into account. #' @noRd -scale_lambda1 <- function(blocks, sqrt_N, scale, na.rm) { - blocks <- lapply(blocks, function(x) { - lambda <- sqrt(ifelse( - NCOL(x) < NROW(x), - eigen(pm(t(x / sqrt_N), x / sqrt_N, na.rm = na.rm))$values[1], - eigen(pm(x / sqrt_N, t(x / sqrt_N), na.rm = na.rm))$values[1] - )) - y <- x / lambda - if (scale) { - attr(y, "scaled:scale") <- attr(x, "scaled:scale") * lambda - } else { - attr(y, "scaled:scale") <- rep(lambda, NCOL(x)) - } - return(y) - }) +scale_lambda1 <- function(A, sqrt_N, scale, na.rm) { + lambda <- sqrt(ifelse( + NCOL(A) < NROW(A), + eigen( + pm(t(A / sqrt_N), A / sqrt_N, na.rm = na.rm), + symmetric = TRUE, only.values = TRUE + )$values[1], + eigen( + pm(A / sqrt_N, t(A / sqrt_N), na.rm = na.rm), + symmetric = TRUE, only.values = TRUE + )$values[1] + )) + if (lambda == 0) { + return(A) + } + y <- A / lambda + if (scale) { + attr(y, "scaled:scale") <- attr(A, "scaled:scale") * lambda + } else { + attr(y, "scaled:scale") <- rep(lambda, NCOL(A)) + } + return(y) } diff --git a/R/scaling.R b/R/scaling.R index 49b5b6e0..d11abf12 100644 --- a/R/scaling.R +++ b/R/scaling.R @@ -8,18 +8,34 @@ scaling <- function(blocks, scale = TRUE, bias = TRUE, if (isTRUE(scale_block)) scale_block <- "inertia" sqrt_N <- sqrt(NROW(blocks[[1]]) + bias - 1) - # Center and eventually scale the blocks - blocks <- lapply( - blocks, - function(x) scale2(x, scale = scale, bias = bias) - ) + blocks <- lapply(blocks, function(x) { + # Store dim and dimnames + dim_x <- dim(x) + dimnames_x <- dimnames(x) - # Scale each block by a constant if requested - if (scale_block == "lambda1") { - blocks <- scale_lambda1(blocks, sqrt_N, scale, na.rm = na.rm) - } else if (scale_block == "inertia") { - blocks <- scale_inertia(blocks, sqrt_N, scale, na.rm = na.rm) - } + # Unfold the array if needed + if (length(dim_x) > 2) { + x <- matrix(x, nrow = nrow(x)) + } + + # Center and eventually scale the blocks + x <- scale2(x, scale = scale, bias = bias) + + # Scale each block by a constant if requested + if (scale_block == "lambda1") { + x <- scale_lambda1(x, sqrt_N, scale, na.rm = na.rm) + } else if (scale_block == "inertia") { + x <- scale_inertia(x, sqrt_N, scale, na.rm = na.rm) + } + + # Go back to a tensor + y <- array(x, dim = dim_x) + dimnames(y) <- dimnames_x + attr(y, "scaled:center") <- attr(x, "scaled:center") + attr(y, "scaled:scale") <- attr(x, "scaled:scale") + + return(y) + }) return(blocks) } diff --git a/R/select_analysis.R b/R/select_analysis.R index fd1bb490..77745687 100644 --- a/R/select_analysis.R +++ b/R/select_analysis.R @@ -21,6 +21,7 @@ #' @noRd select_analysis <- function(rgcca_args, blocks) { tau <- rgcca_args$tau + rank <- rgcca_args$rank ncomp <- rgcca_args$ncomp quiet <- rgcca_args$quiet scheme <- rgcca_args$scheme @@ -28,6 +29,7 @@ select_analysis <- function(rgcca_args, blocks) { response <- rgcca_args$response sparsity <- rgcca_args$sparsity comp_orth <- rgcca_args$comp_orth + mode_orth <- rgcca_args$mode_orth connection <- rgcca_args$connection superblock <- rgcca_args$superblock scale_block <- rgcca_args$scale_block @@ -40,6 +42,10 @@ select_analysis <- function(rgcca_args, blocks) { } } + if (any(vapply(blocks, function(x) length(dim(x)), FUN.VALUE = 1L) > 2)) { + method <- "tgcca" + } + method <- check_method(method) call <- list( @@ -58,6 +64,13 @@ select_analysis <- function(rgcca_args, blocks) { param <- "sparsity" penalty <- sparsity }, + "tgcca" = { + mode_orth <- check_mode_orth(mode_orth, blocks) + param <- "tau" + penalty <- tau + comp_orth <- TRUE + superblock <- FALSE + }, "pca" = { check_nblocks(blocks, "pca") param <- "tau" @@ -401,7 +414,7 @@ select_analysis <- function(rgcca_args, blocks) { } } - if (method %in% c("rgcca", "sgcca")) { + if (method %in% c("rgcca", "sgcca", "tgcca")) { scheme <- check_scheme(scheme) if (any(sparsity != 1)) { param <- "sparsity" @@ -438,6 +451,7 @@ select_analysis <- function(rgcca_args, blocks) { penalty <- c(penalty[seq(J)], pen) } } else { + rank <- check_rank(rank, blocks, mode_orth, ncomp = max(ncomp)) if (is.null(connection)) { connection <- connection_matrix(blocks, type = "pair") } else { @@ -453,11 +467,13 @@ select_analysis <- function(rgcca_args, blocks) { rgcca_args[[param]] <- penalty rgcca_args <- modifyList(rgcca_args, list( + rank = rank, ncomp = ncomp, scheme = scheme, method = method, response = response, comp_orth = comp_orth, + mode_orth = mode_orth, connection = connection, superblock = superblock, scale_block = scale_block diff --git a/R/sqrt_matrix.R b/R/sqrt_matrix.R new file mode 100644 index 00000000..b5fab53c --- /dev/null +++ b/R/sqrt_matrix.R @@ -0,0 +1,19 @@ +#' Compute the square root or the inverse of the square root of a +#' symmetric matrix. +#' @param X A symmetric matrix. +#' @param tol A relative tolerance to detect zero singular values. +#' @param inv A boolean indicating if the inverse of the square root must be +#' computed. +#' @noRd +sqrt_matrix <- function(X, tol = sqrt(.Machine$double.eps), inv = FALSE) { + eig <- eigen(X, symmetric = TRUE) + positive <- eig$values > max(tol * eig$values[1], 0) + d <- eig$values + if (inv) { + d[positive] <- 1 / sqrt(d[positive]) + } else { + d[positive] <- sqrt(d[positive]) + } + d[!positive] <- 0 + eig$vectors %*% diag(d, nrow = length(d)) %*% t(eig$vectors) +} diff --git a/R/subset_block_rows.R b/R/subset_block_rows.R new file mode 100644 index 00000000..adc7e65a --- /dev/null +++ b/R/subset_block_rows.R @@ -0,0 +1,28 @@ +subset_block_rows <- function(x, rows, drop = TRUE) { + UseMethod("subset_block_rows") +} + +#' @export +subset_block_rows.numeric <- function(x, rows, drop = TRUE) { + return(x[rows, drop = drop]) +} + +#' @export +subset_block_rows.data.frame <- function(x, rows, drop = TRUE) { + row.names <- attr(x, "row.names")[rows] + x <- apply(x, -1, "[", rows, drop = drop) + data.frame(x, row.names = row.names) +} + +#' @export +subset_block_rows.array <- function(x, rows, drop = TRUE) { + dim_x <- dim(x) + dn <- dimnames(x) + x <- apply(x, -1, "[", rows, drop = drop) + if (!drop && length(dim(x)) < length(dim_x)) { + x <- array(x, dim = c(1, dim_x[-1])) + rownames(x) <- ifelse(is.numeric(rows), dn[[1]][rows], rows) + dimnames(x)[-1] <- dn[-1] + } + return(x) +} diff --git a/R/subset_rows.R b/R/subset_rows.R deleted file mode 100644 index 49408036..00000000 --- a/R/subset_rows.R +++ /dev/null @@ -1,21 +0,0 @@ -#' The function subset_rows extracts rows from an object (vector, matrix, array, -#' data.frame). -#' @param x An object from which we want to extract rows -#' @param rows A set of rows -#' @importFrom stats complete.cases -#' @noRd -subset_rows <- function(x, rows) { - is.x.data.frame <- is.data.frame(x) - if (is.x.data.frame) { - row.names <- attr(x, "row.names")[rows] - } - if (is.vector(x)) { - x <- x[rows] - } else { - x <- apply(x, -1, "[", rows) - } - if (is.x.data.frame) { - x <- data.frame(x, row.names = row.names) - } - return(x) -} diff --git a/R/to_mat.R b/R/to_mat.R new file mode 100644 index 00000000..9ff87c39 --- /dev/null +++ b/R/to_mat.R @@ -0,0 +1,16 @@ +#' Return a matrix version of a block. If it is already a matrix, +#' nothing is done, if it is an array, the first dimension is +#' kept while the others are concatenated and colnames are adapted. +#' @param block a block seen by RGCCA. +#' @return A matrix version of the block +#' @noRd +to_mat <- function(block) { + if (is.matrix(block) || is.data.frame(block)) { + return(block) + } + z <- matrix(block, nrow = NROW(block)) + rownames(z) <- rownames(block) + grid <- expand.grid(dimnames(block)[-1]) + colnames(z) <- do.call(paste, c(grid, sep = "_")) + return(z) +} diff --git a/man/rgcca.Rd b/man/rgcca.Rd index 4e3c3df4..a81066a7 100644 --- a/man/rgcca.Rd +++ b/man/rgcca.Rd @@ -24,6 +24,9 @@ rgcca( quiet = TRUE, n_iter_max = 1000, comp_orth = TRUE, + rank = 1, + mode_orth = 1, + separable = TRUE, A = NULL, C = NULL ) @@ -99,8 +102,8 @@ algorithm is reported while computing.} \item{scale_block}{A logical value or a string indicating if each block is scaled. -If TRUE or "inertia", each block is divided by the sum of eigenvalues -of its empirical covariance matrix. +If TRUE or "inertia", each block is divided by the square root of the sum +of eigenvalues of its empirical covariance matrix. If "lambda1", each block is divided by the square root of the highest eigenvalue of its empirical covariance matrix. @@ -165,6 +168,24 @@ iterations.} \item{comp_orth}{A logical value indicating if the deflation should lead to orthogonal block components or orthogonal block weight vectors.} +\item{rank}{Either an integer, an integer vector of +size \eqn{J} or an integer matrix +of dimension \eqn{\textrm{max}(\textrm{ncomp}) \times J} giving the rank +of the decomposition sought for the canonical vectors in TGCCA. +If block \eqn{j} is an array with at least three dimensions, rank must be +comprised between 1 and the number of variables on the mode bearing the +orthogonality constraint. See \eqn{\textrm{mode_orth}}.} + +\item{mode_orth}{Either an integer or an integer vector of size \eqn{J} +designating the mode which associated set of factors will be orthogonal +in the decomposition sought by TGCCA. If block \eqn{j} is an array with +\eqn{d > 2} dimensions, \eqn{\textrm{mode_orth}} must be comprised between +1 and \eqn{d - 1}.} + +\item{separable}{A logical value if the regularization matrices must be +estimated as separable matrices (i.e. products of Kronecker products +matching the dimensions of the modes in the data).} + \item{A}{Deprecated argument, please use blocks instead.} \item{C}{Deprecated argument, please use connection instead.} diff --git a/man/rgcca_bootstrap.Rd b/man/rgcca_bootstrap.Rd index b3f5ae0b..08a2966e 100644 --- a/man/rgcca_bootstrap.Rd +++ b/man/rgcca_bootstrap.Rd @@ -4,14 +4,7 @@ \alias{rgcca_bootstrap} \title{Bootstrap confidence intervals and p-values} \usage{ -rgcca_bootstrap( - rgcca_res, - n_boot = 100, - n_cores = 1, - balanced = TRUE, - keep_all_variables = FALSE, - verbose = TRUE -) +rgcca_bootstrap(rgcca_res, n_boot = 100, n_cores = 1, verbose = TRUE) } \arguments{ \item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} @@ -20,13 +13,6 @@ rgcca_bootstrap( \item{n_cores}{The number of cores used for parallelization.} -\item{balanced}{A logical value indicating if a balanced bootstrap procedure -is performed or not (default is TRUE).} - -\item{keep_all_variables}{A logical value indicating if all variables have -to be kept even when some of them have null variance for at least one -bootstrap sample (default is FALSE).} - \item{verbose}{A logical value indicating if the progress of the bootstrap procedure is reported.} } diff --git a/man/rgcca_cv.Rd b/man/rgcca_cv.Rd index da447e30..5e96460a 100644 --- a/man/rgcca_cv.Rd +++ b/man/rgcca_cv.Rd @@ -34,6 +34,9 @@ rgcca_cv( verbose = TRUE, n_iter_max = 1000, comp_orth = TRUE, + rank = 1, + mode_orth = 1, + separable = TRUE, ... ) } @@ -114,8 +117,8 @@ superblock option is used.} \item{scale_block}{A logical value or a string indicating if each block is scaled. -If TRUE or "inertia", each block is divided by the sum of eigenvalues -of its empirical covariance matrix. +If TRUE or "inertia", each block is divided by the square root of the sum +of eigenvalues of its empirical covariance matrix. If "lambda1", each block is divided by the square root of the highest eigenvalue of its empirical covariance matrix. @@ -223,6 +226,24 @@ iterations.} \item{comp_orth}{A logical value indicating if the deflation should lead to orthogonal block components or orthogonal block weight vectors.} +\item{rank}{Either an integer, an integer vector of +size \eqn{J} or an integer matrix +of dimension \eqn{\textrm{max}(\textrm{ncomp}) \times J} giving the rank +of the decomposition sought for the canonical vectors in TGCCA. +If block \eqn{j} is an array with at least three dimensions, rank must be +comprised between 1 and the number of variables on the mode bearing the +orthogonality constraint. See \eqn{\textrm{mode_orth}}.} + +\item{mode_orth}{Either an integer or an integer vector of size \eqn{J} +designating the mode which associated set of factors will be orthogonal +in the decomposition sought by TGCCA. If block \eqn{j} is an array with +\eqn{d > 2} dimensions, \eqn{\textrm{mode_orth}} must be comprised between +1 and \eqn{d - 1}.} + +\item{separable}{A logical value if the regularization matrices must be +estimated as separable matrices (i.e. products of Kronecker products +matching the dimensions of the modes in the data).} + \item{...}{Additional parameters to be passed to prediction_model.} } \value{ diff --git a/man/rgcca_permutation.Rd b/man/rgcca_permutation.Rd index d8e528a1..f07d17e2 100644 --- a/man/rgcca_permutation.Rd +++ b/man/rgcca_permutation.Rd @@ -29,7 +29,10 @@ rgcca_permutation( rgcca_res = NULL, verbose = TRUE, n_iter_max = 1000, - comp_orth = TRUE + comp_orth = TRUE, + rank = 1, + mode_orth = 1, + separable = TRUE ) } \arguments{ @@ -77,8 +80,8 @@ are reported.} \item{scale_block}{A logical value or a string indicating if each block is scaled. -If TRUE or "inertia", each block is divided by the sum of eigenvalues -of its empirical covariance matrix. +If TRUE or "inertia", each block is divided by the square root of the sum +of eigenvalues of its empirical covariance matrix. If "lambda1", each block is divided by the square root of the highest eigenvalue of its empirical covariance matrix. @@ -199,6 +202,24 @@ iterations.} \item{comp_orth}{A logical value indicating if the deflation should lead to orthogonal block components or orthogonal block weight vectors.} + +\item{rank}{Either an integer, an integer vector of +size \eqn{J} or an integer matrix +of dimension \eqn{\textrm{max}(\textrm{ncomp}) \times J} giving the rank +of the decomposition sought for the canonical vectors in TGCCA. +If block \eqn{j} is an array with at least three dimensions, rank must be +comprised between 1 and the number of variables on the mode bearing the +orthogonality constraint. See \eqn{\textrm{mode_orth}}.} + +\item{mode_orth}{Either an integer or an integer vector of size \eqn{J} +designating the mode which associated set of factors will be orthogonal +in the decomposition sought by TGCCA. If block \eqn{j} is an array with +\eqn{d > 2} dimensions, \eqn{\textrm{mode_orth}} must be comprised between +1 and \eqn{d - 1}.} + +\item{separable}{A logical value if the regularization matrices must be +estimated as separable matrices (i.e. products of Kronecker products +matching the dimensions of the modes in the data).} } \value{ A rgcca_permutation object that can be printed and plotted. diff --git a/man/rgcca_stability.Rd b/man/rgcca_stability.Rd index 3542d944..9b4e8c38 100644 --- a/man/rgcca_stability.Rd +++ b/man/rgcca_stability.Rd @@ -9,30 +9,21 @@ rgcca_stability( keep = vapply(rgcca_res$a, function(x) mean(x != 0), FUN.VALUE = 1), n_boot = 100, n_cores = 1, - verbose = TRUE, - balanced = TRUE, - keep_all_variables = FALSE + verbose = TRUE ) } \arguments{ -\item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} +\item{rgcca_res}{A fitted RGCCA object (see \code{\link[RGCCA]{rgcca}}).} \item{keep}{A numeric vector indicating the proportion of variables per block to select.} \item{n_boot}{The number of bootstrap samples (default: 100).} -\item{n_cores}{The number of cores for parallelization.} +\item{n_cores}{The number of cores used for parallelization.} \item{verbose}{A logical value indicating if the progress of the procedure is reported.} - -\item{balanced}{A logical value indicating if a balanced bootstrap procedure -is performed or not (default is TRUE).} - -\item{keep_all_variables}{A logical value indicating if all variables have -to be kept even when some of them have null variance for at least one -bootstrap sample (default is FALSE).} } \value{ A rgcca_stability object that can be printed and plotted. diff --git a/tests/testthat/test_check_blocks.r b/tests/testthat/test_check_blocks.r index ffdd75bd..693703a5 100644 --- a/tests/testthat/test_check_blocks.r +++ b/tests/testthat/test_check_blocks.r @@ -51,17 +51,6 @@ test_that("check_blocks renames blocks if names are missing", { names(check_blocks(list(agriculture = X_agric, industry = X_ind))), c("agriculture", "industry") ) - - # Check for messages as well - expect_message( - check_blocks(list(agriculture = X_agric, X_ind), quiet = FALSE), - "Missing block names are automatically labeled.", - fixed = TRUE - ) - expect_message( - check_blocks(list(agriculture = X_agric, industry = X_ind), quiet = FALSE), - NA - ) }) test_that("check_blocks add colnames with blocks with no colnames", { @@ -70,13 +59,8 @@ test_that("check_blocks add colnames with blocks with no colnames", { expect_equal(colnames(check_blocks(blocks)[[1]]), colnames(X_agric)) colnames(blocks[[1]]) <- NULL expect_equal( - colnames(check_blocks(blocks)[[1]]), paste0("V1_", seq_len(NCOL(X_agric))) + colnames(check_blocks(blocks)[[1]]), paste0("agri_", seq_len(NCOL(X_agric))) ) - expect_message(check_blocks(blocks, quiet = FALSE), - "Missing colnames are automatically labeled.", - fixed = TRUE - ) - expect_message(check_blocks(list(agric = X_agric), quiet = FALSE), NA) }) test_that("check_blocks add prefixes to avoid duplicated colnames", { @@ -85,7 +69,7 @@ test_that("check_blocks add prefixes to avoid duplicated colnames", { colnames(blocks[[2]]) <- c("gini", "labo") expect_message( check_blocks(blocks, quiet = FALSE), - "Duplicated colnames are modified to avoid confusion.", + "Duplicated dimnames across blocks are modified to avoid confusion.", fixed = TRUE ) blocks2 <- check_blocks(blocks, quiet = FALSE) @@ -100,7 +84,7 @@ test_that("check_blocks add prefixes to avoid duplicated colnames", { test_that("check_blocks raises an error if there are duplicated rownames", { blocks <- list(rbind(X_agric, X_agric), rbind(X_ind, X_ind)) expect_error( - check_blocks(blocks), "blocks have duplicated rownames.", + check_blocks(blocks), "blocks have duplicated names on dimension 1.", fixed = TRUE ) }) @@ -109,28 +93,14 @@ test_that("check_blocks creates rownames if no block has rownames", { expect_equal( rownames(check_blocks(X_polit)[[1]]), paste0("S", seq_along(X_polit)) ) - expect_message( - check_blocks(list(polit = X_polit), quiet = FALSE), - "Missing rownames are automatically labeled.", - fixed = TRUE - ) - expect_message(check_blocks(list(agric = X_agric), quiet = FALSE), NA) - expect_error( - check_blocks(X_polit, allow_unnames = FALSE), "blocks must have rownames.", - fixed = TRUE - ) + expect_message(check_blocks(list(polit = X_polit), quiet = FALSE), NA) }) test_that("check_blocks add rownames if a block lacks rownames and other rownames are compatible", { blocks <- list(agric = X_agric, ind = X_ind, polit = X_polit) expect_equal(rownames(check_blocks(blocks)[[3]]), rownames(blocks[[1]])) - expect_message( - check_blocks(blocks, quiet = FALSE), - "Missing rownames are automatically labeled.", - fixed = TRUE - ) - expect_message(check_blocks(blocks[-3], quiet = FALSE), NA) + expect_message(check_blocks(blocks, quiet = FALSE), NA) }) test_that("check_blocks raises an error if a block lacks rownames and other @@ -138,28 +108,70 @@ test_that("check_blocks raises an error if a block lacks rownames and other blocks <- list(agric = X_agric, ind = X_ind, polit = X_polit) rownames(blocks[[2]]) <- rownames(blocks[[1]])[c(2, 1, seq(3, 47))] expect_error(check_blocks(blocks), paste0( - "some blocks are missing rownames, and the other blocks' ", - "rownames are not consistent." + "some blocks are missing names on dimension 1, and the other blocks' ", + "names on dimension 1 are not consistent." ), fixed = TRUE) }) -test_that("check_blocks allows only blocks with the same rownames if - add_NAlines is FALSE", { - blocks <- list(agric = X_agric, ind = X_ind) - rownames(blocks[[2]]) <- c("xxx", rownames(blocks[[1]])[seq(2, 47)]) - expect_error(check_blocks(blocks, add_NAlines = FALSE), - "blocks must have the same rownames", - fixed = TRUE - ) - rownames(blocks[[2]]) <- rownames(blocks[[1]])[c(2, 1, seq(3, 47))] - expect_error(check_blocks(blocks, add_NAlines = FALSE), NA) -}) - test_that("check_blocks returns blocks with the same rownames in the same order", { blocks <- list(agric = X_agric, ind = X_ind) rownames(blocks[[2]]) <- c("xxx", rownames(blocks[[1]])[seq(2, 47)]) - blocks2 <- check_blocks(blocks, add_NAlines = TRUE) + blocks2 <- check_blocks(blocks) expect_equal(rownames(blocks2[[1]]), rownames(blocks2[[2]])) expect_equal(rownames(blocks2[[1]]), union(rownames(blocks[[1]]), "xxx")) }) + +### Make sure check_blocks works for arrays +n <- nrow(X_agric) +array_block <- array(rnorm(n * 12 * 15), dim = c(n, 12, 15)) + +test_that("check_blocks still works when a block is an array", { + blocks <- list(agric = X_agric, ind = X_ind, array = array_block) + blocks <- check_blocks(blocks) + expect_equal(rownames(blocks[[1]]), rownames(blocks[[3]])) + expect_equal( + dimnames(check_blocks(blocks)[[3]])[[2]], + paste0("array_2_", seq_len(dim(array_block)[2])) + ) + expect_equal( + dimnames(check_blocks(blocks)[[3]])[[3]], + paste0("array_3_", seq_len(dim(array_block)[3])) + ) + + blocks <- list(agric = X_agric, ind = X_ind, array = array_block) + dimnames(blocks[[3]])[c(2, 3)] <- list( + seq_len(dim(array_block)[2]), seq_len(dim(array_block)[3]) + ) + expect_message( + check_blocks(blocks, quiet = FALSE), + "Duplicated dimnames are modified to avoid confusion.", fixed = TRUE + ) + + dimnames(blocks[[3]])[c(2, 3)] <- list( + paste0("2", seq_len(dim(array_block)[2])), + paste0("3", seq_len(dim(array_block)[3])) + ) + blocks2 <- check_blocks(blocks) + expect_equal(dimnames(blocks2[[3]])[-1], dimnames(blocks[[3]])[-1]) + + rownames(blocks[[3]]) <- sample( + rownames(blocks[[1]]), size = n, replace = FALSE + ) + blocks2 <- check_blocks(blocks) + expect_equal(rownames(blocks2[[1]]), rownames(blocks2[[3]])) + expect_equal( + blocks2[[3]][rownames(blocks2[[3]]), , ], + blocks[[3]][rownames(blocks2[[3]]), , ] + ) + + rownames(blocks[[3]])[1] <- "xxx" + blocks2 <- check_blocks(blocks) + expect_equal(rownames(blocks2[[1]]), union(rownames(blocks[[1]]), "xxx")) + + rownames(blocks[[2]]) <- NULL + expect_error(check_blocks(blocks), paste0( + "some blocks are missing names on dimension 1, and the other blocks' ", + "names on dimension 1 are not consistent." + ), fixed = TRUE) +}) diff --git a/tests/testthat/test_checks.r b/tests/testthat/test_checks.r index 9a07fb51..adc45075 100644 --- a/tests/testthat/test_checks.r +++ b/tests/testthat/test_checks.r @@ -458,3 +458,37 @@ test_that("check_scheme passes when scheme is valid", { g <- function(x) x^2 expect_error(check_scheme(g), NA) }) + +# Test check_rank +n <- nrow(blocks[[1]]) +array_blocks <- blocks +array_blocks[[4]] <- array(seq(n * 10 * 5), dim = c(n, 10, 5)) +test_that("check_rank raises an error for invalid rank", { + expect_error( + check_rank("toto", array_blocks, rep(1, 4), 1), + "rank must be numeric.", fixed = TRUE + ) + expect_error( + check_rank(rep(1, 5), array_blocks, rep(1, 4), 1), + paste0("rank must have the same number of columns (actually 5) ", + "as the number of blocks (4)."), + fixed = TRUE + ) + expect_error( + check_rank(c(1, 1, 1, 7), array_blocks, c(1, 1, 1, 2), 1), + paste0("rank[4] should be comprise between 1 and 5 (that is the number of ", + "variables of the mode bearing the orthogonality for block 4)."), + fixed = TRUE + ) + expect_error( + check_rank(matrix(rep(1, 8), nrow = 2), array_blocks, rep(1, 4), 1), + "rank must have 1 rows.", fixed = TRUE + ) +}) +test_that("check_rank passes when rank is valid", { + expect_equal(check_rank(1, array_blocks, rep(1, 4), 1), matrix(1, 1, 4)) + r <- matrix(1, 2, 4) + expect_equal(check_rank(r, array_blocks, rep(1, 4), 2), r) + r[2, 4] <- 7 + expect_equal(check_rank(r, array_blocks, rep(1, 4), 2), r) +}) diff --git a/tests/testthat/test_generate_resampling.r b/tests/testthat/test_generate_resampling.r deleted file mode 100644 index c3932952..00000000 --- a/tests/testthat/test_generate_resampling.r +++ /dev/null @@ -1,281 +0,0 @@ -data("Russett") -############################################## -# Test on the risk of having null variance # -# variables in at least one bootstrap sample # -############################################## -blocks <- list( - agriculture = Russett[, seq(3)], - industry = Russett[, 4:5], - politic = Russett[, 6:11] -) - -ncomp <- 1 -# Rent is trapped. -blocks$agriculture$rent <- 0 -blocks$agriculture$rent[1:4] <- 1 -rgcca_out <- rgcca(blocks, ncomp = ncomp) - -# When `pval = 1`, `generate_resampling` fails to identify `rent` as a risky -# variable, both when bootstraps are balanced or not. -set.seed(8882) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, pval = 1 -) -sample_out_unbalanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE, pval = 1 -) -test_that("pVAL_high_noRiskyVAR", { - expect_null(sample_out_balanced$sd_null) - expect_null(sample_out_unbalanced$sd_null) -}) - -# Now, if `pval` is set to default, when `balanced = TRUE` and -# `keep_all_variables = FALSE`, a warning is generated to inform that variable -# `rent` is removed and `rent` is indeed removed. -set.seed(8882) -test_that("generate_resampling_missing_val_identification", { - sample_out <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - expect_equal( - names(sample_out$sd_null$agriculture), - "rent" - ) -}) - -# Same situation, but this time, `pval` is set to its default value and it is -# specifically ask that all variables are kept. It is thus checked that `rent` -# is still there. -set.seed(8882) -sample_out <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE -) -test_that("generate_resampling_keepAllVAriables", { - expect_null(sample_out$sd_null) -}) - - -############################################# -# Test with 2 null variances variables # -############################################# -# Now `rent` and `death` are trapped to be of null variance. -# Four tests are performed : -# - "generate_resampling_NUL_variance_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_NUL_variance_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_NUL_variance_3" : when `balanced = TRUE` -# and `keep_all_variables = TRUE`, an error is raised as it is impossible -# to keep all variables here because some have null variances. -# - "generate_resampling_NUL_variance_4" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = FALSE`, check that `death` and `rent` are -# indeed removed. -N <- NROW(Russett) -rgcca_out$call$blocks$agriculture[, "rent"] <- rep(0, N) -rgcca_out$call$blocks$politic[, "death"] <- rep(2, N) - -test_that("generate_resampling_NUL_variance_1", { - sample_out_balanced_1 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - sample_out_balanced_2 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = FALSE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - expect_equal(unname(unlist(lapply( - sample_out_balanced_1$sd_null, - function(x) names(x) - ))), c("rent", "death")) - expect_equal(unname(unlist(lapply( - sample_out_balanced_2$sd_null, - function(x) names(x) - ))), c("rent", "death")) -}) - -test_that( - "generate_resampling_NUL_variance_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE - ), - paste0( - "Impossible to define all bootstrap samples ", - "without variables with null variance. Please ", - "consider removing these variables: rent - death.", - " Please, consider unbalanced bootstrap by ", - "setting 'balanced' to FALSE." - ) - ) -) - -######################################### -# Test with 2 very risky variables # -######################################### -# Now `rent` and `death` are trapped to be very risky variables (only 1 -# observation differs from the others). -# Four tests are performed : -# - "generate_resampling_veryRisky_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_veryRisky_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, a warning to inform that `rent` -# and `death` are removed is raised. -# - "generate_resampling_veryRisky_3" : when `balanced = TRUE` -# and `keep_all_variables = TRUE`, an error is raised as it is highly -# unlikely to keep all variables here because some have almost null -# variances. -# - "generate_resampling_veryRisky_4" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = FALSE`, check that `death` and `rent` are -# indeed removed. -# - "generate_resampling_veryRisky_5" : when `balanced = FALSE` -# and `keep_all_variables = TRUE`, check that no variable is removed. -N <- NROW(Russett) -rgcca_out$call$blocks$agriculture[, "rent"] <- c(1, rep(0, N - 1)) -rgcca_out$call$blocks$politic[, "death"] <- c(1, rep(2, N - 1)) -set.seed(553) -test_that("generate_resampling_veryRisky_1", { - sample_out_balanced_1 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = TRUE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - sample_out_balanced_2 <- expect_warning( - generate_resampling(rgcca_res = rgcca_out, n_boot = 4, balanced = FALSE), - paste0( - "Variables: rent - death appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) - - expect_equal(unname(unlist(lapply( - sample_out_balanced_1$sd_null, - function(x) names(x) - ))), c("rent", "death")) - expect_equal(unname(unlist(lapply( - sample_out_balanced_2$sd_null, - function(x) names(x) - ))), c("rent", "death")) -}) - -set.seed(5553) -test_that( - "generate_resampling_veryRisky_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE, keep_all_variables = TRUE - ), - paste0( - "Impossible to define all bootstrap samples ", - "without variables with null variance. Please ", - "consider removing these variables: rent - death.", - " Please, consider unbalanced bootstrap by ", - "setting 'balanced' to FALSE." - ) - ) -) - -set.seed(53) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE, keep_all_variables = TRUE -) -test_that("generate_resampling_veryRisky_5", { - expect_null(sample_out_balanced$sd_null) -}) - - -################################################################## -# Test with 2 very risky variables on a block of 2 variables # -################################################################## -# Now `rent` and `death` are trapped to be very risky variables (only 1 -# observation differs from the others). -# Four tests are performed : -# - "generate_resampling_ALL_Block_1" : when `balanced = TRUE` -# and `keep_all_variables = FALSE`, an error is raised as it want to remove -# all the variables from block `industry`. -# - "generate_resampling_ALL_Block_2" : when `balanced = FALSE` -# and `keep_all_variables = FALSE`, an error is raised as it want to remove -# all the variables from block `industry`. -# - "generate_resampling_ALL_Block_3" : when `balanced = TRUE or FALSE` -# and `keep_all_variables = TRUE` with a different random initialization, -# no error is raised as no variable needs to be removed. -rgcca_out$call$blocks$industry[, "gnpr"] <- c(1, rep(0, N - 1)) -rgcca_out$call$blocks$industry[, "labo"] <- c(1, rep(2, N - 1)) -set.seed(54) -test_that( - "generate_resampling_ALL_Block_1", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE - ), - paste0( - "The variance of all the variables from blocks:", - " industry appear to be null in some bootstrap ", - "samples. Please consider removing them." - ) - ) -) -set.seed(52) -test_that( - "generate_resampling_ALL_Block_2", - expect_error( - generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE - ), - paste0( - "The variance of all the variables from blocks:", - " industry appear to be null in some bootstrap ", - "samples. Please consider removing them." - ) - ) -) - -set.seed(1047) -sample_out_balanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = TRUE -) -set.seed(6576) -sample_out_unbalanced <- generate_resampling( - rgcca_res = rgcca_out, n_boot = 4, - balanced = FALSE -) -test_that("generate_resampling_ALL_Block_3", { - expect_null(sample_out_balanced$sd_null) - expect_null(sample_out_unbalanced$sd_null) -}) diff --git a/tests/testthat/test_handle_NA.r b/tests/testthat/test_handle_NA.r index cd50e5b8..cbd126ca 100644 --- a/tests/testthat/test_handle_NA.r +++ b/tests/testthat/test_handle_NA.r @@ -7,7 +7,7 @@ blocks <- list( agriculture = Russett[, seq(3)], industry = Russett[, 4:5], politic = Russett[, 6:11], - target = Russett[, 11] + target = matrix(Russett[, 11]) ) # Add missing values @@ -23,7 +23,7 @@ test_that("handle_NA selects the common rows without missing values when NA_method is \"na.omit\"", { tmp <- handle_NA(blocks, NA_method = "na.omit") for (j in seq_along(blocks)) { - expect_equal(tmp$blocks[[j]], subset_rows(blocks[[j]], -ind_NA)) + expect_equal(tmp$blocks[[j]], subset_block_rows(blocks[[j]], -ind_NA)) expect_false(tmp$na.rm) } }) diff --git a/tests/testthat/test_intersection.r b/tests/testthat/test_intersection.r index c2f605c6..d6a1bd67 100644 --- a/tests/testthat/test_intersection.r +++ b/tests/testthat/test_intersection.r @@ -7,7 +7,7 @@ blocks <- list( agriculture = Russett[, seq(3)], industry = Russett[, 4:5], politic = Russett[, 6:11], - target = Russett[, 11] + target = matrix(Russett[, 11]) ) # Add missing values @@ -22,7 +22,7 @@ ind_NA <- c(2, 4, 8, 12, 17, 23, 30, 32, 40, 42) test_that("intersection_list selects the common rows without missing values", { blocks_inter <- intersection_list(blocks) for (j in seq_along(blocks)) { - expect_equal(blocks_inter[[j]], subset_rows(blocks[[j]], -ind_NA)) + expect_equal(blocks_inter[[j]], subset_block_rows(blocks[[j]], -ind_NA)) } }) test_that("intersection_list raises an error if there is less than 3 subjects @@ -38,3 +38,19 @@ test_that("intersection_list raises an error if there is less than 3 subjects fixed = TRUE ) }) + +# Test with an array block +n <- nrow(blocks[[1]]) +blocks[[5]] <- array(rnorm(n * 10 * 7), dim = c(n, 10, 7)) +blocks[[5]][c(3, 7, 8), , ] <- NA +blocks[[5]][c(12, 24), 1, ] <- NA +blocks[[5]][19, , 6] <- NA +ind_NA <- c(2, 3, 4, 7, 8, 12, 17, 19, 23, 24, 30, 32, 40, 42) + +test_that("intersection_list selects the common rows without missing + values with arrays", { + blocks_inter <- intersection_list(blocks) + for (j in seq_along(blocks)) { + expect_equal(blocks_inter[[j]], subset_block_rows(blocks[[j]], -ind_NA)) + } +}) diff --git a/tests/testthat/test_khatri_rao.R b/tests/testthat/test_khatri_rao.R new file mode 100644 index 00000000..3b08c45e --- /dev/null +++ b/tests/testthat/test_khatri_rao.R @@ -0,0 +1,24 @@ +# Expect to match computed by hand Khatri-Rao product +x <- matrix(c( + 1, 0, 2, + 0, -1, 1 +), nrow = 2, ncol = 3, byrow = TRUE) + +y <- matrix(c( + 0, 3, 1, + 1, 2, -5, + -1, 0, 6 +), nrow = 3, ncol = 3, byrow = TRUE) + +res <- matrix(c( + 0, 0, 2, + 1, 0, -10, + -1, 0, 12, + 0, -3, 1, + 0, -2, -5, + 0, 0, 6 +), nrow = 6, ncol = 3, byrow = T) + +test_that("test_khatri_rao", { + expect_identical(khatri_rao(x = x, y = y), res) +}) diff --git a/tests/testthat/test_mode_product.R b/tests/testthat/test_mode_product.R new file mode 100644 index 00000000..f67bb705 --- /dev/null +++ b/tests/testthat/test_mode_product.R @@ -0,0 +1,48 @@ +test_that("mode_product works the same way as the matrix product", { + X <- matrix(rnorm(30 * 12), nrow = 30, ncol = 12) + y <- rnorm(12) + res <- mode_product(X, y, m = 2) + expect_equal(length(res), 30) + expect_equal(res, X %*% y) + + y <- matrix(rnorm(12 * 17), nrow = 12, ncol = 17) + res <- mode_product(X, y, m = 2) + expect_equal(dim(res), c(30, 17)) + expect_equal(res, X %*% y) +}) + +test_that("mode_product works between a tensor and a vector", { + X <- array(rnorm(40 * 5 * 7), dim = c(40, 5, 7)) + y <- rnorm(5) + res <- mode_product(X, y, m = 2) + expect_equal(dim(res), c(40, 1, 7)) + expect_equal( + as.vector(res), drop(t(as.vector(X)) %*% (diag(7) %x% y %x% diag(40))) + ) + + y <- rnorm(7) + res <- mode_product(X, y, m = 3) + expect_equal(dim(res), c(40, 5, 1)) + expect_equal( + as.vector(res), drop(t(as.vector(X)) %*% (y %x% diag(5) %x% diag(40))) + ) +}) + +test_that("mode_product works between a tensor and a matrix", { + X <- array(rnorm(20 * 6 * 4), dim = c(20, 6, 4)) + y <- matrix(rnorm(6 * 7), nrow = 6, ncol = 7) + res <- mode_product(X, y, m = 2) + expect_equal(dim(res), c(20, 7, 4)) + expect_equal( + matrix(aperm(res, c(2, 1, 3)), nrow = 7), + t(y) %*% matrix(aperm(X, c(2, 1, 3)), nrow = 6) + ) + + y <- matrix(rnorm(4 * 13), nrow = 4, ncol = 13) + res <- mode_product(X, y, m = 3) + expect_equal(dim(res), c(20, 6, 13)) + expect_equal( + matrix(aperm(res, c(3, 1, 2)), nrow = 13), + t(y) %*% matrix(aperm(X, c(3, 1, 2)), nrow = 4) + ) +}) diff --git a/tests/testthat/test_remove_null_sd.r b/tests/testthat/test_remove_null_sd.r deleted file mode 100644 index a8fb62ea..00000000 --- a/tests/testthat/test_remove_null_sd.r +++ /dev/null @@ -1,26 +0,0 @@ -#' # remove_null_sd test - -#''' -X1 <- vapply(seq(3), function(x) rnorm(10), FUN.VALUE = double(10)) -X2 <- rep(1, 10) -X3 <- rep(1, 10) -X3[1:4] <- NA -X4 <- rep(NA, 10) -df <- cbind(X1, X2, X3) -df2 <- cbind(X1, X2, X4) - -test_that("remove_null_sd removes columns with null variance", { - res <- remove_null_sd(list(df)) - expect_equal(df[, seq(3)], res$list_m[[1]]) - removed_columns <- c(4, 5) - names(removed_columns) <- c("X2", "X3") - expect_equal(removed_columns, res$column_sd_null[[1]]) -}) - -test_that("remove_null_sd removes columns of NA", { - res <- remove_null_sd(list(df2)) - expect_equal(df[, seq(3)], res$list_m[[1]]) - removed_columns <- c(4, 5) - names(removed_columns) <- c("X2", "X4") - expect_equal(removed_columns, res$column_sd_null[[1]]) -}) diff --git a/tests/testthat/test_rgcca.R b/tests/testthat/test_rgcca.R index 7f9bf684..2e768fb0 100644 --- a/tests/testthat/test_rgcca.R +++ b/tests/testthat/test_rgcca.R @@ -196,33 +196,35 @@ test_that("Block weights can be retrieved using the superblock weights", { }) ##### Retrieve MFA with RGCCA ##### -df <- Russett[, c( - "gini", "farm", "rent", "gnpr", "labo", - "inst", "ecks", "death", "demostab", "dictator" -)] -fit.mfa <- FactoMineR::MFA(df, - group = c(3, 2, 5), ncp = 2, type = rep("s", 3), - graph = FALSE -) +if ("FactoMineR" %in% rownames(installed.packages())) { + df <- Russett[, c( + "gini", "farm", "rent", "gnpr", "labo", + "inst", "ecks", "death", "demostab", "dictator" + )] + fit.mfa <- FactoMineR::MFA(df, + group = c(3, 2, 5), ncp = 2, type = rep("s", 3), + graph = FALSE + ) -X_agric <- Russett[, c("gini", "farm", "rent")] -X_ind <- Russett[, c("gnpr", "labo")] -X_polit <- Russett[, c( - "inst", "ecks", "death", - "demostab", "dictator" -)] -A <- list(Agric = X_agric, Ind = X_ind, Polit = X_polit) + X_agric <- Russett[, c("gini", "farm", "rent")] + X_ind <- Russett[, c("gnpr", "labo")] + X_polit <- Russett[, c( + "inst", "ecks", "death", + "demostab", "dictator" + )] + A <- list(Agric = X_agric, Ind = X_ind, Polit = X_polit) -test_that("RGCCA is equivalent to MFA with right parameters", { - fit.mcoa <- rgcca( - blocks = A, tau = 1, scheme = "factorial", scale = TRUE, - scale_block = "lambda1", bias = TRUE, ncomp = 2, - superblock = TRUE, tol = 1e-16 - ) + test_that("RGCCA is equivalent to MFA with right parameters", { + fit.mcoa <- rgcca( + blocks = A, tau = 1, scheme = "factorial", scale = TRUE, + scale_block = "lambda1", bias = TRUE, ncomp = 2, + superblock = TRUE, tol = 1e-16 + ) - expect_lte(max(abs(fit.mcoa$Y[[4]][, 1] - fit.mfa$ind$coord[, 1])), tol) - expect_lte(max(abs(fit.mcoa$Y[[4]][, 2] - fit.mfa$ind$coord[, 2])), tol) -}) + expect_lte(max(abs(fit.mcoa$Y[[4]][, 1] - fit.mfa$ind$coord[, 1])), tol) + expect_lte(max(abs(fit.mcoa$Y[[4]][, 2] - fit.mfa$ind$coord[, 2])), tol) + }) +} ##### Test AVE ##### X_agric <- Russett[, c("gini", "farm", "rent")] diff --git a/tests/testthat/test_rgcca_bootstrap.r b/tests/testthat/test_rgcca_bootstrap.r index 427a8ab5..a06bbb73 100644 --- a/tests/testthat/test_rgcca_bootstrap.r +++ b/tests/testthat/test_rgcca_bootstrap.r @@ -72,68 +72,3 @@ test_that("rgcca_bootstrap_classif", { boot <- rgcca_bootstrap(rgcca_out, n_boot = 4, n_cores = 1) test_structure(boot, 4, 1, p) }) - -############################################## -# Test on the risk of having null variance # -# variables in at least one bootstrap sample # -############################################## -# Here, the variable `rent` is trapped and should be detected in -# `generate_resampling` which is going to raise a warning. Then it -# should be removed by `bootstrap`. -blocks <- list( - agriculture = Russett[, seq(3)], - industry = Russett[, 4:5], - politic = Russett[, 6:11] -) - -ncomp <- 1 -# Rent is trapped. -blocks$agriculture$rent <- 0 -blocks$agriculture$rent[1:4] <- 1 -rgcca_out <- rgcca(blocks, ncomp = ncomp) - -set.seed(8882) -test_that( - "rgcca_bootstrap_removed_variable_1", - expect_warning( - rgcca_bootstrap(rgcca_out, n_boot = 4, n_cores = 1, balanced = TRUE), - paste0( - "Variables: rent appear to be of null ", - "variance in some bootstrap samples and thus ", - "were removed from all samples. \n", - " ==> RGCCA is run again without these variables." - ) - ) -) -# Exact same situation where we check in the output that `rent` was removed. -set.seed(8882) - -test_that("rgcca_bootstrap_removed_variable_2", { - expect_warning( - boot_out <- rgcca_bootstrap( - rgcca_out, n_boot = 4, n_cores = 1, balanced = TRUE - ), - paste0( - "Variables: rent appear to be of null variance in some bootstrap ", - "samples and thus were removed from all samples." - ), - fixed = TRUE - ) - expect_false("rent" %in% boot_out$bootstrap$var) - expect_false("rent" %in% colnames(boot_out$rgcca$blocks$agriculture)) - expect_false("rent" %in% colnames(boot_out$rgcca$call$blocks$agriculture)) -}) - -# Same situation, but this time, it is specifically ask that all variables are -# kept. It is thus checked that `rent` is still there. -set.seed(8882) -boot_out <- rgcca_bootstrap(rgcca_out, - n_boot = 4, n_cores = 1, - keep_all_variables = TRUE, balanced = TRUE -) - -test_that("rgcca_bootstrap_keep_all_variables", { - expect_true("rent" %in% boot_out$bootstrap$var) - expect_true("rent" %in% colnames(boot_out$rgcca$blocks$agriculture)) - expect_true("rent" %in% colnames(boot_out$rgcca$call$blocks$agriculture)) -}) diff --git a/tests/testthat/test_rgcca_bootstrap_k.r b/tests/testthat/test_rgcca_bootstrap_k.r index 465e03fb..3755e016 100644 --- a/tests/testthat/test_rgcca_bootstrap_k.r +++ b/tests/testthat/test_rgcca_bootstrap_k.r @@ -24,7 +24,7 @@ test_that("test_rgcca_bootstrap_k_1", { rgcca_out_2 <- rgcca(blocks, superblock = FALSE, ncomp = 2) resb_2 <- rgcca_bootstrap_k(rgcca_out_2) -test_that("test_rgcca_bootstrap_k", { +test_that("test_rgcca_bootstrap_k_2", { expect_is(resb_2, "list") expect_is(resb_2[[1]][[1]], "matrix") expect_is(resb_2[[2]][[1]], "matrix") @@ -34,9 +34,7 @@ test_that("test_rgcca_bootstrap_k", { }) # If one bootstrap sample presents at least a single variable with null -# variance, rgcca_bootstrap_k should return the name of -# the null variance variables -# in both the two lists it returns. +# variance, rgcca_bootstrap_k should still return results blocks_3 <- blocks blocks_3$agriculture$rent <- 0 blocks_3$agriculture$rent[1] <- 1 @@ -44,6 +42,11 @@ rgcca_out_3 <- rgcca(blocks_3, superblock = FALSE, ncomp = 2) inds <- c(2, 2:NROW(blocks_3$agriculture)) resb_3 <- rgcca_bootstrap_k(rgcca_res = rgcca_out_3, inds = inds) -test_that("test_rgcca_bootstrap_k_missing_var_identification", { - expect_null(resb_3) +test_that("test_rgcca_bootstrap_k_3", { + expect_is(resb_3, "list") + expect_is(resb_3[[1]][[1]], "matrix") + expect_is(resb_3[[2]][[1]], "matrix") + expect_equal(length(resb_3), 2) + expect_true(all(vapply(resb_3[[1]], NCOL, FUN.VALUE = 1L) == 2)) + expect_identical(vapply(resb_3[[1]], NROW, FUN.VALUE = 1L), p) }) diff --git a/tests/testthat/test_scaling.r b/tests/testthat/test_scaling.r index ed08654b..4add181f 100644 --- a/tests/testthat/test_scaling.r +++ b/tests/testthat/test_scaling.r @@ -4,27 +4,39 @@ blocks <- list( industry = Russett[, 4:5], politic = Russett[, 6:11] ) +blocks <- lapply(blocks, data.matrix) + +n <- NROW(blocks[[1]]) +blocks[[4]] <- array(rnorm(n * 10 * 7), dim = c(n, 10, 7)) bias <- FALSE -sqrt_N <- sqrt(NROW(blocks[[1]]) + bias - 1) +sqrt_N <- sqrt(n + bias - 1) +tolerance <- 1e-12 -blocks3 <- lapply(blocks, scale) -blocks3 <- lapply(blocks3, function(x) { - return(x / sqrt(ncol(x))) -}) -blocks2 <- scaling(blocks, scale = TRUE, scale_block = TRUE, bias = bias) test_that("scaling_default_1", { - expect_true(sum(abs(blocks3[[2]] - blocks2[[2]])) < 1e-14) + blocks3 <- lapply(blocks, function(x) scale(matrix(x, nrow = nrow(x)))) + blocks3 <- lapply(blocks3, function(x) { + return(x / sqrt(ncol(x))) + }) + blocks3[[4]] <- array(blocks3[[4]], dim = dim(blocks[[4]])) + blocks2 <- scaling(blocks, scale = TRUE, scale_block = TRUE, bias = bias) + expect_true(sum(abs(blocks3[[2]] - blocks2[[2]])) < tolerance) }) test_that("scale_block = 'inertia' leads to unit Frobenius norm", { b <- scaling(blocks, scale = TRUE, scale_block = TRUE, bias = bias) for (j in seq_along(b)) { - expect_equal(norm(b[[j]] / sqrt_N, type = "F"), 1, tolerance = 1e-14) + expect_equal( + norm(matrix(b[[j]], nrow = nrow(b[[j]])) / sqrt_N, type = "F"), + 1, tolerance = tolerance + ) } b <- scaling(blocks, scale = FALSE, scale_block = TRUE, bias = bias) for (j in seq_along(b)) { - expect_equal(norm(b[[j]] / sqrt_N, type = "F"), 1, tolerance = 1e-14) + expect_equal( + norm(matrix(b[[j]], nrow = nrow(b[[j]])) / sqrt_N, type = "F"), + 1, tolerance = tolerance + ) } }) @@ -32,29 +44,31 @@ test_that("scale_block = 'lambda1' leads to top eigenvalue of covariance matrix being equal to one", { b <- scaling(blocks, scale = TRUE, scale_block = "lambda1", bias = bias) for (j in seq_along(b)) { - expect_equal(eigen(crossprod(b[[j]] / sqrt_N))$values[1], - 1, - tolerance = 1e-14 - ) + expect_equal(eigen(crossprod( + matrix(b[[j]], nrow = nrow(b[[j]])) / sqrt_N + ))$values[1], 1, tolerance = tolerance) } b <- scaling(blocks, scale = FALSE, scale_block = "lambda1", bias = bias) for (j in seq_along(b)) { - expect_equal(eigen(crossprod(b[[j]] / sqrt_N))$values[1], - 1, - tolerance = 1e-14 - ) + expect_equal(eigen(crossprod( + matrix(b[[j]], nrow = nrow(b[[j]])) / sqrt_N + ))$values[1], 1, tolerance = tolerance) } }) test_that("another value of scale_block does not lead to further scaling", { b <- scaling(blocks, scale = TRUE, scale_block = "none", bias = bias) - b_ref <- lapply(blocks, scale) + b_ref <- lapply(blocks, function(x) scale(matrix(x, nrow = nrow(x)))) + b_ref[[4]] <- array(b_ref[[4]], dim = dim(b[[4]])) for (j in seq_along(b)) { - expect_true(sum(abs(b[[j]] - b_ref[[j]])) < 1e-14) + expect_lte(sum(abs(b[[j]] - b_ref[[j]])), tolerance) } b <- scaling(blocks, scale = FALSE, scale_block = "none", bias = bias) - b_ref <- lapply(blocks, scale, center = TRUE, scale = FALSE) + b_ref <- lapply(blocks, function(x) scale( + matrix(x, nrow = nrow(x)), scale = FALSE) + ) + b_ref[[4]] <- array(b_ref[[4]], dim = dim(b[[4]])) for (j in seq_along(b)) { - expect_true(sum(abs(b[[j]] - b_ref[[j]])) < 1e-14) + expect_lte(sum(abs(b[[j]] - b_ref[[j]])), tolerance) } }) diff --git a/tests/testthat/test_select_analysis.R b/tests/testthat/test_select_analysis.R index 82690d61..ed52a5e5 100644 --- a/tests/testthat/test_select_analysis.R +++ b/tests/testthat/test_select_analysis.R @@ -13,12 +13,14 @@ run_selection <- function(method, quiet = TRUE, ...) { rgcca_args <- list( tau = rep(1, J), + rank = rep(1, J), ncomp = rep(1, J), quiet = quiet, scheme = "centroid", method = method, response = NULL, sparsity = rep(1, J), + mode_orth = rep(1, J), connection = 1 - diag(J), superblock = FALSE ) diff --git a/tests/testthat/test_sqrt_matrix.R b/tests/testthat/test_sqrt_matrix.R new file mode 100644 index 00000000..3998846d --- /dev/null +++ b/tests/testthat/test_sqrt_matrix.R @@ -0,0 +1,13 @@ +X <- crossprod(matrix(rnorm(10 * 5), 10, 5)) +X_inv <- ginv(X) + +test_that("sqrt_matrix retrieves the square root of a symmetric matrix", { + sqrt_X <- sqrt_matrix(X, inv = FALSE) + expect_equal(X, sqrt_X %*% sqrt_X) +}) + +test_that("sqrt_matrix retrieves the inverse of a square root of a + symmetric matrix", { + sqrt_inv_X <- sqrt_matrix(X, inv = TRUE) + expect_equal(X_inv, sqrt_inv_X %*% sqrt_inv_X) +}) diff --git a/tests/testthat/test_subset_block_rows.R b/tests/testthat/test_subset_block_rows.R new file mode 100644 index 00000000..bf668d2e --- /dev/null +++ b/tests/testthat/test_subset_block_rows.R @@ -0,0 +1,32 @@ +# '# Test subset_block_rows +# +# ''' +test_that("subset_block_rows successfully extract rows of vectors", { + x <- 1:10 + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6)]) + names(x) <- paste0("V", 1:10) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6)]) +}) + +test_that("subset_block_rows successfully extract rows of matrices", { + x <- matrix(1:21, 7, 3) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) + rownames(x) <- paste0("R", 1:7) + colnames(x) <- paste0("C", 1:3) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) +}) + +test_that("subset_block_rows successfully extract rows of arrays", { + x <- array(1:72, dim = c(6, 3, 2, 2)) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6), , , ]) + dimnames(x)[[1]] <- paste0("A", 1:6) + dimnames(x)[[2]] <- paste0("B", 1:3) + dimnames(x)[[3]] <- paste0("C", 1:2) + dimnames(x)[[4]] <- paste0("D", 1:2) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6), , , ]) +}) + +test_that("subset_block_rows successfully extract rows of data.frames", { + x <- as.data.frame(matrix(1:21, 7, 3)) + expect_equal(subset_block_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) +}) diff --git a/tests/testthat/test_subset_rows.r b/tests/testthat/test_subset_rows.r deleted file mode 100644 index ebdbcfa0..00000000 --- a/tests/testthat/test_subset_rows.r +++ /dev/null @@ -1,32 +0,0 @@ -# '# Test subset_rows -# -# ''' -test_that("subset_rows successfully extract rows of vectors", { - x <- 1:10 - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6)]) - names(x) <- paste0("V", 1:10) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6)]) -}) - -test_that("subset_rows successfully extract rows of matrices", { - x <- matrix(1:21, 7, 3) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) - rownames(x) <- paste0("R", 1:7) - colnames(x) <- paste0("C", 1:3) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) -}) - -test_that("subset_rows successfully extract rows of arrays", { - x <- array(1:72, dim = c(6, 3, 2, 2)) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6), , , ]) - dimnames(x)[[1]] <- paste0("A", 1:6) - dimnames(x)[[2]] <- paste0("B", 1:3) - dimnames(x)[[3]] <- paste0("C", 1:2) - dimnames(x)[[4]] <- paste0("D", 1:2) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6), , , ]) -}) - -test_that("subset_rows successfully extract rows of data.frames", { - x <- as.data.frame(matrix(1:21, 7, 3)) - expect_equal(subset_rows(x, c(3, 5, 6)), x[c(3, 5, 6), ]) -})