diff --git a/NAMESPACE b/NAMESPACE index 437f5c8ef..6725e1efc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,7 +53,6 @@ export(pos_neg_marker) export(pos_neg_select) export(ref_feature_select) export(ref_marker_select) -export(remove_background) export(reverse_marker_matrix) export(run_gsea) export(seurat_meta) diff --git a/R/common_dplyr.R b/R/common_dplyr.R new file mode 100644 index 000000000..33551badc --- /dev/null +++ b/R/common_dplyr.R @@ -0,0 +1,384 @@ +#' get best calls for each cluster +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with tsne or umap coordinates and cluster ids +#' @param cluster_col metadata column, can be cluster or cellid +#' @param collapse_to_cluster if a column name is provided, takes the most +#' frequent call of entire cluster to color in plot +#' @param threshold minimum correlation coefficent cutoff for calling clusters +#' @param rename_prefix prefix to add to type and r column names +#' @param carry_r whether to include threshold in unassigned names +#' @return dataframe of cluster, new ident, and r info +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' ref_mat = cbmc_ref +#' ) +#' +#' cor_to_call(res) +#' @export +cor_to_call <- function(cor_mat, + metadata = NULL, + cluster_col = "cluster", + collapse_to_cluster = FALSE, + threshold = 0, + rename_prefix = NULL, + carry_r = FALSE) { + correlation_matrix <- cor_mat + if (threshold == "auto") { + threshold <- round(0.75 * max(correlation_matrix), 2) + message(paste0("using threshold of ", threshold)) + } + correlation_matrix[is.na(correlation_matrix)] <- 0 + df_temp <- + tibble::as_tibble(correlation_matrix, rownames = cluster_col) + df_temp <- tidyr::gather( + df_temp, + key = !!dplyr::sym("type"), + value = !!dplyr::sym("r"), -!!cluster_col + ) + + if (carry_r) { + df_temp[["type"]][df_temp$r < threshold] <- + paste0("r<", threshold, ", unassigned") + } else { + df_temp[["type"]][df_temp$r < threshold] <- "unassigned" + } + + df_temp <- + dplyr::top_n(dplyr::group_by_at(df_temp, 1), 1, !!dplyr::sym("r")) + if (nrow(df_temp) != nrow(correlation_matrix)) { + clash <- dplyr::summarize(dplyr::group_by_at(df_temp, 1), n = n()) + clash <- dplyr::filter(clash, n > 1) + clash <- dplyr::pull(clash, 1) + df_temp[lapply( + df_temp[, 1], + FUN = function(x) { + x %in% clash + } + )[[1]], 2] <- + paste0(df_temp[["type"]][lapply( + df_temp[, 1], + FUN = function(x) { + x %in% clash + } + )[[1]]], "-CLASH!") + df_temp2 <- df_temp + df_temp_full <- + dplyr::distinct_at(df_temp, + vars(-!!dplyr::sym("type")), + .keep_all = TRUE) + } else { + df_temp_full <- df_temp + } + + if (collapse_to_cluster != FALSE) { + if (!(cluster_col %in% colnames(metadata))) { + metadata <- tibble::as_tibble(metadata, rownames = "rn") + } + df_temp_full <- + collapse_to_cluster( + df_temp_full, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + } + + if (!is.null(rename_prefix)) { + if (collapse_to_cluster) { + eval(parse( + text = paste0( + "df_temp_full <- dplyr::rename(df_temp_full, ", + paste0(rename_prefix, "_type"), + " = type, ", + paste0(rename_prefix, "_sum"), + " = sum, ", + paste0(rename_prefix, "_n"), + " = n)" + ) + )) + } else { + eval(parse( + text = paste0( + "df_temp_full <- dplyr::rename(df_temp_full, ", + paste0(rename_prefix, "_type"), + " = type, ", + paste0(rename_prefix, "_r"), + " = r)" + ) + )) + } + } + df_temp_full +} + +#' Insert called ident results into metadata +#' +#' @param res dataframe of idents, such as output of cor_to_call +#' @param metadata input metadata with tsne or umap coordinates and cluster ids +#' @param cluster_col metadata column, can be cluster or cellid +#' @param per_cell whether the res dataframe is listed per cell +#' @param rename_prefix prefix to add to type and r column names +#' @return new metadata with added columns +#' @examples +#' \donttest{ +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' ref_mat = cbmc_ref +#' ) +#' +#' res2 <- cor_to_call(res, cluster_col = "classified") +#' +#' call_to_metadata( +#' res = res2, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' rename_prefix = "assigned" +#' ) +#' } +#' @export +call_to_metadata <- function(res, + metadata, + cluster_col, + per_cell = FALSE, + rename_prefix = NULL) { + temp_col_id <- get_unique_column(metadata, "rn") + + df_temp <- res + if (!is.null(rename_prefix)) { + eval(parse( + text = paste0( + "df_temp <- dplyr::rename(df_temp, ", + paste0(rename_prefix, "_type"), + " = type, ", + paste0(rename_prefix, "_r"), + " = r)" + ) + )) + } + + if (per_cell == FALSE) { + if (!(cluster_col %in% colnames(metadata))) { + stop("cluster_col is not a column of metadata", + call. = FALSE) + } + + if (!(cluster_col %in% colnames(res))) { + stop("cluster_col is not a column ", + "of called cell type dataframe", + call. = FALSE + ) + } + + if (!(all(unique(df_temp[[cluster_col]]) %in% + unique(metadata[[cluster_col]])))) { + stop("cluster_col from clustify step and", + "joining to metadata step are not the same", + call. = FALSE + ) + } + + df_temp_full <- + suppressWarnings( + dplyr::left_join( + tibble::rownames_to_column( + metadata, + temp_col_id + ), + df_temp, + by = cluster_col, + suffix = c("", ".clustify") + ) + ) + + df_temp_full <- tibble::column_to_rownames( + df_temp_full, + temp_col_id + ) + } else { + colnames(df_temp)[1] <- cluster_col + names(cluster_col) <- temp_col_id + + df_temp_full <- + suppressWarnings( + dplyr::left_join( + tibble::rownames_to_column( + metadata, + temp_col_id + ), + df_temp, + by = cluster_col, + suffix = c("", ".clustify") + ) + ) + + df_temp_full <- + tibble::column_to_rownames(df_temp_full, + temp_col_id) + } + df_temp_full +} + +#' From per-cell calls, take highest freq call in each cluster +#' +#' @param res dataframe of idents, such as output of cor_to_call +#' @param metadata input metadata with tsne or umap coordinates and cluster ids +#' @param cluster_col metadata column for cluster +#' @param threshold minimum correlation coefficent cutoff for calling clusters +#' @return new metadata with added columns +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' ref_mat = cbmc_ref, +#' per_cell = TRUE +#' ) +#' +#' res2 <- cor_to_call(res) +#' +#' collapse_to_cluster( +#' res2, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' threshold = 0 +#' ) +#' @export +collapse_to_cluster <- function(res, + metadata, + cluster_col, + threshold = 0) { + res_temp <- res + colnames(res_temp)[1] <- "rn" + df_temp_full <- as.data.frame(res_temp) + df_temp_full <- + dplyr::mutate(df_temp_full, + cluster = metadata[[cluster_col]]) + df_temp_full2 <- + dplyr::group_by(df_temp_full, + !!dplyr::sym("type"), + !!dplyr::sym("cluster")) + df_temp_full2 <- + dplyr::summarize(df_temp_full2, + sum = sum(!!dplyr::sym("r")), + n = n() + ) + df_temp_full2 <- + dplyr::group_by(df_temp_full2, + !!dplyr::sym("cluster")) + df_temp_full2 <- + dplyr::arrange(df_temp_full2, + desc(n), + desc(sum)) + df_temp_full2 <- + dplyr::filter(df_temp_full2, + !!dplyr::sym("type") != paste0("r<", + threshold, + ", unassigned")) + df_temp_full2 <- dplyr::slice(df_temp_full2, 1) + df_temp_full2 <- + dplyr::rename(df_temp_full2, + !!cluster_col := cluster) + dplyr::select(df_temp_full2, 2, 1, + tidyr::everything()) +} + +#' get ranked calls for each cluster +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with tsne or umap coordinates +#' and cluster ids +#' @param cluster_col metadata column, can be cluster or cellid +#' @param collapse_to_cluster if a column name is provided, takes the most +#' frequent call of entire cluster to color in plot +#' @param threshold minimum correlation coefficent cutoff for calling clusters +#' @param rename_prefix prefix to add to type and r column names +#' @param top_n the number of ranks to keep, the rest will be set to 100 +#' @return dataframe of cluster, new ident, and r info +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' ref_mat = cbmc_ref +#' ) +#' +#' cor_to_call_rank(res, threshold = "auto") +#' @export +cor_to_call_rank <- function(cor_mat, + metadata = NULL, + cluster_col = "cluster", + collapse_to_cluster = FALSE, + threshold = 0, + rename_prefix = NULL, + top_n = NULL) { + correlation_matrix <- cor_mat + if (threshold == "auto") { + threshold <- round(0.75 * max(correlation_matrix), 2) + message(paste0("using threshold of ", threshold)) + } + df_temp <- tibble::as_tibble(correlation_matrix, + rownames = cluster_col + ) + df_temp <- + tidyr::gather( + df_temp, + key = !!dplyr::sym("type"), + value = !!dplyr::sym("r"), -!!cluster_col + ) + df_temp <- + dplyr::mutate(dplyr::group_by_at(df_temp, 1), + rank = dplyr::dense_rank(desc(!!dplyr::sym("r")))) + df_temp[["rank"]][df_temp$r < threshold] <- 100 + if (!(is.null(top_n))) { + df_temp <- dplyr::filter(df_temp, rank <= top_n) + } + df_temp_full <- df_temp + if (!is.null(rename_prefix)) { + eval(parse( + text = paste0( + "df_temp_full <- dplyr::rename(df_temp_full, ", + paste0(rename_prefix, "_type"), + " = type, ", + paste0(rename_prefix, "_r"), + " = r)" + ) + )) + } + df_temp_full +} + +#' get concensus calls for a list of cor calls +#' +#' @param list_of_res list of call dataframes from cor_to_call_rank +#' @return dataframe of cluster, new ident, and mean rank +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' ref_mat = cbmc_ref +#' ) +#' +#' res2 <- cor_to_call_rank(res, threshold = "auto") +#' res3 <- cor_to_call_rank(res) +#' call_consensus(list(res2, res3)) +#' @export +call_consensus <- function(list_of_res) { + + res <- do.call("rbind", list_of_res) + df_temp <- dplyr::group_by_at(res, c(1, 2)) + df_temp <- dplyr::summarize_at(df_temp, 2, mean) + df_temp <- dplyr::top_n(df_temp, -1) + df_temp <- dplyr::group_by_at(df_temp, c(1, 3)) + df_temp <- + dplyr::summarize_at(df_temp, 1, function(x) { + stringr::str_c(x, collapse = "__") + }) + df_temp <- dplyr::select(df_temp, c(1, 3, 2)) +} diff --git a/R/common_dplyr.R.REMOVED.git-id b/R/common_dplyr.R.REMOVED.git-id deleted file mode 100644 index 5e1a0d35c..000000000 --- a/R/common_dplyr.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -33551badc150f6a351dcab30e2ea7caf8706f88b \ No newline at end of file diff --git a/R/compare_genelist.R b/R/compare_genelist.R new file mode 100644 index 000000000..676d95ce0 --- /dev/null +++ b/R/compare_genelist.R @@ -0,0 +1,328 @@ +#' Binarize scRNAseq data +#' +#' @param mat single-cell expression matrix +#' @param n number of top expressing genes to keep +#' @param cut cut off to set to 0 +#' @return matrix of 1s and 0s +#' @examples +#' pbmc_avg <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' +#' mat <- binarize_expr(pbmc_avg) +#' mat[1:3, 1:3] +#' @export +binarize_expr <- function(mat, + n = 1000, + cut = 0) { + expr_mat <- mat + if (n < nrow(expr_mat)) { + expr_df <-as.matrix(expr_mat) + filterout <- t(matrixStats::colRanks(-expr_df, + ties.method = "average")) > n + expr_df[filterout] <- 0 + expr_df[expr_df > cut] <- 1 + expr_df[expr_df < cut] <- 0 + as.matrix(expr_df) + } else { + expr_mat[expr_mat > cut] <- 1 + expr_mat[expr_mat < cut] <- 0 + expr_mat + } +} + +#' Convert candidate genes list into matrix +#' +#' @param marker_df dataframe of candidate genes, must contain +#' "gene" and "cluster" columns, or a matrix of gene names to +#' convert to ranked +#' @param ranked unranked gene list feeds into hyperp, the ranked +#' gene list feeds into regular corr_coef +#' @param n number of genes to use +#' @param step_weight ranked genes are tranformed into pseudo +#' expression by descending weight +#' @param background_weight ranked genes are tranformed into pseudo +#' expression with added weight +#' @param unique whether to use only unique markers to 1 cluster +#' @param metadata vector or dataframe of cluster names, should +#' have column named cluster +#' @param cluster_col column for cluster names to replace original +#' cluster, if metadata is dataframe +#' @param remove_rp do not include rps, rpl, rp1-9 in markers +#' @return matrix of unranked gene marker names, or matrix of +#' ranked expression +#' @examples +#' matrixize_markers(pbmc_markers) +#' @export +matrixize_markers <- function(marker_df, + ranked = FALSE, + n = NULL, + step_weight = 1, + background_weight = 0, + unique = FALSE, + metadata = NULL, + cluster_col = "classified", + remove_rp = FALSE) { + # takes marker in dataframe form + # equal number of marker genes per known cluster + marker_df <- dplyr::as_tibble(marker_df) + + # if "gene" not present in column names, + # assume df is a matrix to be converted to ranked + if (!("gene" %in% colnames(marker_df))) { + marker_df <- + data.frame(lapply(marker_df, as.character), + stringsAsFactors = FALSE) + marker_df <- + tidyr::gather(marker_df, + factor_key = TRUE, + key = "cluster", + value = "gene" + ) + } + + if (remove_rp) { + marker_df <- + dplyr::filter(marker_df, + !(stringr::str_detect(gene, + "^RP[0-9,L,S]|^Rp[0-9,l,s]"))) + } + + if (unique) { + nonunique <- dplyr::group_by(marker_df, gene) + nonunique <- dplyr::summarize(nonunique, n = dplyr::n()) + nonunique <- dplyr::filter(nonunique, n > 1) + marker_df <- dplyr::anti_join(marker_df, nonunique, by = "gene") + } + + cut_temp <- dplyr::group_by(marker_df, cluster) + cut_temp <- dplyr::summarize(cut_temp, n = n()) + cut_num <- min(cut_temp$n) + + if (!is.null(n)) { + if (n < cut_num) { + cut_num <- n + } + } + + marker_temp <- dplyr::select(marker_df, gene, cluster) + marker_temp <- dplyr::group_by(marker_temp, cluster) + marker_temp <- dplyr::slice(marker_temp, seq_len(cut_num)) + if (ranked) { + marker_temp <- + dplyr::mutate( + marker_temp, + n = seq( + step_weight * cut_num, + by = -step_weight, + length.out = cut_num + ) + background_weight + ) + marker_temp2 <- + tidyr::spread(marker_temp, key = "cluster", value = n) + marker_temp2 <- + as.data.frame(replace(marker_temp2, is.na(marker_temp2), 0)) + rownames(marker_temp2) <- marker_temp2$gene + marker_temp2 <- dplyr::select(marker_temp2, -gene) + } else { + marker_temp <- dplyr::mutate(marker_temp, n = seq_len(cut_num)) + marker_temp2 <- + tidyr::spread(marker_temp, key = "cluster", value = "gene") + marker_temp2 <- as.data.frame(dplyr::select(marker_temp2, -n)) + } + + # if metadata is vector, adopt names in vector; + # if metadata is a metadata dataframe, pulls names from cluster_col column + if (!is.null(metadata)) { + if (typeof(metadata) != "character") { + metadata <- + suppressWarnings(dplyr::left_join( + tibble::tibble(cluster = colnames(marker_temp2)), + unique( + tibble::tibble( + cluster = metadata$cluster, + classified = metadata[[cluster_col]] + ) + ), + by = "cluster" + )) + metadata <- metadata[[cluster_col]] + } + colnames(marker_temp2) <- metadata + } + + marker_temp2 +} + +#' Generate variable gene list from marker matrix +#' +#' @description Variable gene list is required for `clustify` main function. +#' This function parses variables genes from a matrix input. +#' +#' @param marker_mat matrix or dataframe of candidate genes for each cluster +#' @return vector of marker gene names +#' @examples +#' get_vargenes(cbmc_m) +#' @export +get_vargenes <- function(marker_mat) { + if (rownames(marker_mat)[1] != "1") { + unique(rownames(marker_mat)) + } else { + unique(unlist(marker_mat, use.names = FALSE)) + } +} + +#' Calculate adjusted p-values for hypergeometric test of gene lists +#' or jaccard index +#' +#' @param bin_mat binarized single-cell expression matrix, +#' feed in by_cluster mat, if desired +#' @param marker_mat matrix or dataframe of candidate genes for each cluster +#' @param n number of genes in the genome +#' @param metric adjusted p-value for hypergeometric test, or jaccard index +#' @param output_high if true (by default to fit with rest of package), +#' -log10 transform p-value +#' @return matrix of numeric values, clusters from expr_mat as row names, +#' cell types from marker_mat as column names +#' @examples +#' pbmc_mm <- matrixize_markers(pbmc_markers) +#' +#' pbmc_avg <- average_clusters( +#' pbmc_matrix_small, +#' pbmc_meta, +#' cluster_col = "classified" +#' ) +#' +#' pbmc_avgb <- binarize_expr(pbmc_avg) +#' +#' compare_lists( +#' pbmc_avgb, +#' pbmc_mm, +#' metric = "spearman" +#' ) +#' @export +compare_lists <- function(bin_mat, + marker_mat, + n = 30000, + metric = "hyper", + output_high = TRUE) { + # check if matrix is binarized + if ((length(unique(bin_mat[, 1])) > 2) & (metric != "gsea")) { + warning("non-binarized data, running spearman instead") + metric <- "spearman" + } + + if (metric == "hyper") { + out <- lapply( + colnames(bin_mat), + function(x) { + per_col <- lapply( + colnames(marker_mat), + function(y) { + marker_list <- unlist(marker_mat[, y], + use.names = FALSE) + bin_temp <- bin_mat[, x][bin_mat[, x] == 1] + list_top <- names(bin_temp) + + t <- length(intersect(list_top, marker_list)) + a <- max(length(list_top), length(marker_list)) + b <- min(length(list_top), length(marker_list)) + sum(dhyper(seq(t,b), a, n - a, b)) + } + ) + do.call(cbind, as.list(p.adjust(per_col))) + } + ) + if (any(vapply(out, is.na, FUN.VALUE = logical(length(out[[1]]))))) { + error("NaN produced, possibly due to wrong n") + } + } else if (metric == "jaccard") { + out <- lapply( + colnames(bin_mat), + function(x) { + per_col <- lapply( + colnames(marker_mat), + function(y) { + marker_list <- unlist(marker_mat[, y], + use.names = FALSE) + bin_temp <- bin_mat[, x][bin_mat[, x] == 1] + list_top <- names(bin_temp) + + I <- length(intersect(list_top, marker_list)) + I / (length(list_top) + length(marker_list) - I) + } + ) + do.call(cbind, per_col) + } + ) + } else if (metric == "spearman") { + out <- lapply( + colnames(bin_mat), + function(x) { + per_col <- lapply( + colnames(marker_mat), + function(y) { + marker_list <- unlist(marker_mat[, y], + use.names = FALSE) + v1 <- marker_list[marker_list %in% + names(as.matrix(bin_mat)[, x])] + + bin_temp <- as.matrix(bin_mat)[, x] + bin_temp <- bin_temp[order(bin_temp, + decreasing = TRUE)] + list_top <- names(bin_temp) + v2 <- list_top[list_top %in% v1] + v1 <- v1 + v2 <- v2 + sum(vapply(seq_along(v1), function(i) { + abs(i - ( + which(v2 == v1[i]) + )) + }, FUN.VALUE = numeric(1))) + } + ) + do.call(cbind, per_col) + } + ) + } else if (metric == "gsea") { + out <- lapply( + colnames(marker_mat), + function(y) { + marker_list <- list() + marker_list[[1]] <- marker_mat[, y] + names(marker_list) <- y + v1 <- marker_list + run_gsea(bin_mat, v1, n_perm = 1000, per_cell = TRUE) + } + ) + res <- do.call(cbind, out) + n <- ncol(res) + res2 <- res[, 3 * seq_len((ncol(res) / 3)) - 1, drop = FALSE] + rownames(res2) <- rownames(res) + colnames(res2) <- colnames(marker_mat) + res <- res2 + if (output_high) { + res <- -log10(res) + } + } else { + stop("unrecognized metric", call. = FALSE) + } + + if (metric != "gsea") { + res <- do.call(rbind, out) + rownames(res) <- colnames(bin_mat) + colnames(res) <- colnames(marker_mat) + } + + if (output_high) { + if (metric == "hyper") { + res <- -log10(res) + } else if (metric == "spearman") { + res <- -res + max(res) + } + } + + res +} diff --git a/R/compare_genelist.R.REMOVED.git-id b/R/compare_genelist.R.REMOVED.git-id deleted file mode 100644 index 2d5252c8f..000000000 --- a/R/compare_genelist.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -676d95ce0c6e90adb719788452584636e713cb23 \ No newline at end of file diff --git a/R/compute_similarity.R b/R/compute_similarity.R new file mode 100644 index 000000000..a47e09e0d --- /dev/null +++ b/R/compute_similarity.R @@ -0,0 +1,312 @@ +#' Compute similarity of matrices +#' +#' @param expr_mat single-cell expression matrix +#' @param ref_mat reference expression matrix +#' @param cluster_ids vector of cluster ids for each cell +#' @param compute_method method(s) for computing similarity scores +#' @param per_cell run per cell? +#' @param rm0 consider 0 as missing data, recommended for per_cell +#' @param ... additional parameters not used yet +#' @return matrix of numeric values, clusters from expr_mat as row names, +#' cell types from ref_mat as column names +get_similarity <- function(expr_mat, + ref_mat, + cluster_ids, + compute_method, + per_cell = FALSE, + rm0 = FALSE, + ...) { + if (nrow(expr_mat) == 0) { + stop("after subsetting to shared genes", + "query expression matrix has 0 rows", + call. = FALSE + ) + } + + if (ncol(expr_mat) == 0) { + stop("query expression matrix has 0 cols", + call. = FALSE) + } + + if (nrow(ref_mat) == 0) { + stop("after subsetting to shared genes", + "reference expression matrix has 0 rows", + call. = FALSE + ) + } + + if (ncol(ref_mat) == 0) { + stop("reference expression matrix has 0 cols", + call. = FALSE) + } + + ref_clust <- colnames(ref_mat) + if (ncol(expr_mat) != length(cluster_ids)) { + stop("number of cells in expression matrix not equal", + "to metadata/cluster_col", + call. = FALSE + ) + } + + if (sum(is.na(cluster_ids)) > 0) { + message("reassigning NAs to unknown") + cluster_ids <- factor(cluster_ids) + cluster_ids <- + factor( + cluster_ids, + levels = c(levels(cluster_ids), NA), + labels = c(levels(cluster_ids), "unknown"), + exclude = NULL + ) + cluster_ids <- as.character(cluster_ids) + } + + if (!per_cell) { + sc_clust <- sort(unique(cluster_ids)) + clust_avg <- average_clusters( + expr_mat, + cluster_ids + ) + } else { + sc_clust <- cluster_ids + clust_avg <- expr_mat + } + + assigned_score <- calc_similarity( + clust_avg, + ref_mat, + compute_method, + rm0 = rm0, + ... + ) + + rownames(assigned_score) <- sc_clust + colnames(assigned_score) <- ref_clust + + return(assigned_score) +} + +#' Compute a p-value for similarity using permutation +#' +#' @description Permute cluster labels to calculate empirical p-value +#' +#' +#' @param expr_mat single-cell expression matrix +#' @param cluster_ids clustering info of single-cell data assume that +#' genes have ALREADY BEEN filtered +#' @param ref_mat reference expression matrix +#' @param n_perm number of permutations +#' @param per_cell run per cell? +#' @param compute_method method(s) for computing similarity scores +#' @param rm0 consider 0 as missing data, recommended for per_cell +#' @param ... additional parameters +#' @return matrix of numeric values +permute_similarity <- function(expr_mat, + ref_mat, + cluster_ids, + n_perm, + per_cell = FALSE, + compute_method, + rm0 = FALSE, + ...) { + ref_clust <- colnames(ref_mat) + + if (!per_cell) { + sc_clust <- sort(unique(cluster_ids)) + clust_avg <- average_clusters( + expr_mat, + cluster_ids + ) + } else { + sc_clust <- colnames(expr_mat) + clust_avg <- expr_mat + } + + assigned_score <- calc_similarity(clust_avg, + ref_mat, + compute_method, + rm0 = rm0, + ... + ) + + # perform permutation + sig_counts <- + matrix(0L, nrow = length(sc_clust), + ncol = length(ref_clust)) + + for (i in seq_len(n_perm)) { + resampled <- sample(cluster_ids, + length(cluster_ids), + replace = FALSE + ) + + if (!per_cell) { + permuted_avg <- average_clusters( + expr_mat, + resampled + ) + } else { + permuted_avg <- expr_mat[, resampled, drop = FALSE] + } + + # permutate assignment + new_score <- calc_similarity(permuted_avg, + ref_mat, + compute_method, + rm0 = rm0, + ... + ) + sig_counts <- + sig_counts + as.numeric(new_score > assigned_score) + } + + rownames(assigned_score) <- sc_clust + colnames(assigned_score) <- ref_clust + rownames(sig_counts) <- sc_clust + colnames(sig_counts) <- ref_clust + + return(list( + score = assigned_score, + p_val = sig_counts / n_perm + )) +} + +#' compute similarity +#' @param query_mat query data matrix +#' @param ref_mat reference data matrix +#' @param compute_method method(s) for computing similarity scores +#' @param rm0 consider 0 as missing data, recommended for per_cell +#' @param ... additional parameters +#' @return matrix of numeric values +calc_similarity <- function(query_mat, + ref_mat, + compute_method, + rm0 = FALSE, + ...) { + # remove 0s ? + if (rm0) { + message("considering 0 as missing data") + query_mat[query_mat == 0] <- NA + similarity_score <- suppressWarnings(stats::cor(as.matrix(query_mat), + ref_mat, + method = compute_method, + use = "pairwise.complete.obs" + )) + return(similarity_score) + } else { + if (any(compute_method %in% c("pearson", + "spearman", + "kendall"))) { + similarity_score <- suppressWarnings( + stats::cor(as.matrix(query_mat), + ref_mat, + method = compute_method + )) + return(similarity_score) + } + } + + sc_clust <- colnames(query_mat) + ref_clust <- colnames(ref_mat) + features <- intersect(rownames(query_mat), rownames(ref_mat)) + query_mat <- query_mat[features, ] + ref_mat <- ref_mat[features, ] + similarity_score <- matrix(NA, + nrow = length(sc_clust), + ncol = length(ref_clust) + ) + for (i in seq_along(sc_clust)) { + for (j in seq_along(ref_clust)) { + similarity_score[i, j] <- vector_similarity( + query_mat[, sc_clust[i]], + ref_mat[, ref_clust[j]], + compute_method, ... + ) + } + } + return(similarity_score) +} + +#' Compute similarity between two vectors +#' +#' @description Compute the similarity score between two vectors using a +#' customized scoring function +#' Two vectors may be from either scRNA-seq or bulk RNA-seq data. +#' The lengths of vec1 and vec2 must match, and must be arranged in the +#' same order of genes. +#' Both vectors should be provided to this function after pre-processing, +#' feature selection and dimension reduction. +#' +#' @param vec1 test vector +#' @param vec2 reference vector +#' @param compute_method method to run i.e. corr_coef +#' @param ... arguments to pass to compute_method function +#' @return numeric value of desired correlation or distance measurement +vector_similarity <- function(vec1, vec2, compute_method, ...) { + # examine whether two vectors are of the same size + if (!is.numeric(vec1) || + !is.numeric(vec2) || length(vec1) != length(vec2)) { + stop( + "compute_similarity: two input vectors", + " are not numeric or of different sizes.", + call. = FALSE + ) + } + + if (!(compute_method %in% c("cosine", "kl_divergence"))) { + stop(paste(compute_method, "not implemented"), call. = FALSE) + } + + if (compute_method == "kl_divergence") { + res <- kl_divergence(vec1, vec2, ...) + } else if (compute_method == "cosine") { + res <- cosine(vec1, vec2, ...) + } + # return the similarity score, must be + return(res) +} + +#' Cosine distance +#' @param vec1 test vector +#' @param vec2 reference vector +#' @return numeric value of cosine distance between the vectors +cosine <- function(vec1, vec2) { + sum(vec1 * vec2) / sqrt(sum(vec1^2) * sum(vec2^2)) +} +#' KL divergence +#' +#' @description Use package entropy to compute Kullback-Leibler divergence. +#' The function first converts each vector's reads to pseudo-number of +#' transcripts by normalizing the total reads to total_reads. +#' The normalized read for each gene is then rounded to serve as the +#' pseudo-number of transcripts. +#' Function entropy::KL.shrink is called to compute the KL-divergence between +#' the two vectors, and the maximal allowed divergence is set to max_KL. +#' Finally, a linear transform is performed to convert the KL divergence, +#' which is between 0 and max_KL, to a similarity score between -1 and 1. +#' +#' @param vec1 Test vector +#' @param vec2 Reference vector +#' @param if_log Whether the vectors are log-transformed. If so, the +#' raw count should be computed before computing KL-divergence. +#' @param total_reads Pseudo-library size +#' @param max_KL Maximal allowed value of KL-divergence. +#' @return numeric value, with additional attributes, of kl divergence +#' between the vectors +kl_divergence <- function(vec1, + vec2, + if_log = FALSE, + total_reads = 1000, + max_KL = 1) { + if (if_log) { + vec1 <- expm1(vec1) + vec2 <- expm1(vec2) + } + count1 <- round(vec1 * total_reads / sum(vec1)) + count2 <- round(vec2 * total_reads / sum(vec2)) + est_KL <- entropy::KL.shrink(count1, count2, + unit = "log", + verbose = FALSE + ) + return((max_KL - est_KL) / max_KL * 2 - 1) +} diff --git a/R/compute_similarity.R.REMOVED.git-id b/R/compute_similarity.R.REMOVED.git-id deleted file mode 100644 index 9419e14e2..000000000 --- a/R/compute_similarity.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -a47e09e0d25dac5310ae44a3351c100c21edd6ae \ No newline at end of file diff --git a/R/main.R b/R/main.R new file mode 100644 index 000000000..3ec14fb95 --- /dev/null +++ b/R/main.R @@ -0,0 +1,949 @@ +#' Compare scRNA-seq data to reference data. +#' +#' @export +clustify <- function(input, ...) { + UseMethod("clustify", input) +} + +#' @rdname clustify +#' @param input single-cell expression matrix or Seurat object +#' @param metadata cell cluster assignments, +#' supplied as a vector or data.frame. +#' If data.frame is supplied then `cluster_col` needs to be set. +#' Not required if running correlation per cell. +#' @param ref_mat reference expression matrix +#' @param cluster_col column in metadata that contains cluster ids per cell. +#' Will default to first column of metadata if not supplied. +#' Not required if running correlation per cell. +#' @param query_genes A vector of genes of interest to compare. If NULL, then +#' common genes between the expr_mat and ref_mat +#' will be used for comparision. +#' @param per_cell if true run per cell, otherwise per cluster. +#' @param n_perm number of permutations, set to 0 by default +#' @param compute_method method(s) for computing similarity scores +#' @param use_var_genes if providing a seurat object, use the variable genes +#' (stored in seurat_object@var.genes) as the query_genes. +#' @param dr stored dimension reduction +#' @param seurat_out output cor matrix or called seurat object +#' (deprecated, use obj_out instead) +#' @param verbose whether to report certain variables chosen +#' @param lookuptable if not supplied, will look in built-in table +#' for object parsing +#' @param rm0 consider 0 as missing data, recommended for per_cell +#' @param obj_out whether to output object instead of cor matrix +#' @param rename_prefix prefix to add to type and r column names +#' @param threshold identity calling minimum correlation score threshold, +#' only used when obj_out = TRUE +#' @param low_threshold_cell option to remove clusters with too few cells +#' @param exclude_genes a vector of gene names to throw out of query +#' @param ... additional arguments to pass to compute_method function +#' +#' @return single cell object with identity assigned in metadata, +#' or matrix of correlation values, clusters from input as row names, cell +#' types from ref_mat as column names +#' +#' @examples +#' # Annotate a matrix and metadata +#' clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified", +#' verbose = TRUE +#' ) +#' +#' # Annotate using a different method +#' clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified", +#' compute_method = "cosine" +#' ) +#' +#' # Annotate a Seurat object +#' clustify( +#' s_small, +#' cbmc_ref, +#' cluster_col = "res.1", +#' obj_out = TRUE, +#' per_cell = FALSE, +#' dr = "tsne" +#' ) +#' +#' # Annotate (and return) a Seurat object per-cell +#' clustify( +#' input = s_small, +#' ref_mat = cbmc_ref, +#' cluster_col = "res.1", +#' obj_out = TRUE, +#' per_cell = TRUE, +#' dr = "tsne" +#' ) +#' @export +clustify.default <- function(input, + ref_mat, + metadata = NULL, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + verbose = FALSE, + lookuptable = NULL, + rm0 = FALSE, + obj_out = TRUE, + seurat_out = TRUE, + rename_prefix = NULL, + threshold = "auto", + low_threshold_cell = 10, + exclude_genes = c(), + ...) { + if (!compute_method %in% clustifyr_methods) { + stop(paste(compute_method, "correlation method not implemented"), + call. = FALSE + ) + } + + input_original <- input + if (!inherits(input_original, c("matrix", "Matrix", "data.frame"))) { + temp <- parse_loc_object( + input, + type = class(input), + expr_loc = NULL, + meta_loc = NULL, + var_loc = NULL, + cluster_col = cluster_col, + lookuptable = lookuptable + ) + + if (!(is.null(temp[["expr"]]))) { + message(paste0("recognized object type - ", class(input))) + } + + input <- temp[["expr"]] + metadata <- temp[["meta"]] + if (is.null(query_genes)) { + query_genes <- temp[["var"]] + } + + if (is.null(cluster_col)) { + cluster_col <- temp[["col"]] + } + } + + if (is.null(metadata) && !per_cell) { + stop("`metadata` needed for per cluster analysis", call. = FALSE) + } + + if (!is.null(cluster_col) && + !cluster_col %in% colnames(metadata)) { + stop("given `cluster_col` is not a column in `metadata`", call. = FALSE) + } + + if (length(query_genes) == 0) { + message("var.features not found, using all genes instead") + query_genes <- NULL + } + + expr_mat <- input + + # select gene subsets + gene_constraints <- get_common_elements( + rownames(expr_mat), + rownames(ref_mat), + query_genes + ) + + if (length(exclude_genes) > 0) { + gene_constraints <- setdiff(gene_constraints, exclude_genes) + } + + if (verbose) { + message(paste0("using # of genes: ", length(gene_constraints))) + if (length(gene_constraints) >= 10000) { + message( + "using a high number genes to calculate correlation ", + "please consider feature selection to improve performance" + ) + } + } + + expr_mat <- expr_mat[gene_constraints, , drop = FALSE] + ref_mat <- ref_mat[gene_constraints, , drop = FALSE] + + if (!per_cell) { + if (is.vector(metadata)) { + cluster_ids <- metadata + } else if (is.factor(metadata)) { + cluster_ids <- as.character(metadata) + } else if (is.data.frame(metadata) & !is.null(cluster_col)) { + cluster_ids <- metadata[[cluster_col]] + } else { + stop( + "metadata not formatted correctly, + supply either a character vector or a dataframe", + call. = FALSE + ) + } + if (is.factor(cluster_ids)) { + cluster_ids <- as.character(cluster_ids) + } + cluster_ids[is.na(cluster_ids)] <- "orig.NA" + } + + if (per_cell) { + cluster_ids <- colnames(expr_mat) + } + + if (n_perm == 0) { + res <- get_similarity( + expr_mat, + ref_mat, + cluster_ids = cluster_ids, + per_cell = per_cell, + compute_method = compute_method, + rm0 = rm0, + ... + ) + } else { + # run permutation + res <- permute_similarity( + expr_mat, + ref_mat, + cluster_ids = cluster_ids, + n_perm = n_perm, + per_cell = per_cell, + compute_method = compute_method, + rm0 = rm0, + ... + ) + } + + if ((obj_out && + seurat_out) && + !inherits(input_original, c("matrix", + "Matrix", + "data.frame"))) { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + out <- insert_meta_object(input_original, + df_temp_full, + lookuptable = lookuptable + ) + + return(out) + } else { + return(res) + } +} + +#' @rdname clustify +#' @export +clustify.seurat <- function(input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ...) { + s_object <- input + # for seurat < 3.0 + expr_mat <- s_object@data + metadata <- seurat_meta(s_object, dr = dr) + + if (use_var_genes && is.null(query_genes)) { + query_genes <- s_object@var.genes + } + + res <- clustify( + expr_mat, + ref_mat, + metadata, + query_genes, + per_cell = per_cell, + n_perm = n_perm, + cluster_col = cluster_col, + compute_method = compute_method, + verbose = verbose, + rm0 = rm0, + exclude_genes = exclude_genes, + ... + ) + + if (n_perm != 0) { + res <- -log(res$p_val + .01, 10) + } + + if (!(seurat_out && obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + s_object@meta.data <- df_temp_full + return(s_object) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} + +#' @rdname clustify +#' @export +clustify.Seurat <- function(input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ...) { + s_object <- input + # for seurat 3.0 + + expr_mat <- s_object@assays$RNA@data + metadata <- seurat_meta(s_object, dr = dr) + + if (use_var_genes && is.null(query_genes)) { + query_genes <- s_object@assays$RNA@var.features + } + + res <- clustify( + expr_mat, + ref_mat, + metadata, + query_genes, + per_cell = per_cell, + n_perm = n_perm, + cluster_col = cluster_col, + compute_method = compute_method, + verbose = verbose, + rm0 = rm0, + exclude_genes = exclude_genes, + ... + ) + + if (n_perm != 0) { + res <- -log(res$p_val + .01, 10) + } + + if (!(seurat_out && obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + s_object@meta.data <- df_temp_full + return(s_object) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} + +#' @rdname clustify +#' @importFrom SingleCellExperiment logcounts colData +#' @importFrom SummarizedExperiment `colData<-` +#' @export +clustify.SingleCellExperiment <- function(input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ...) { + s_object <- input + expr_mat <- SingleCellExperiment::logcounts(s_object) + metadata <- as.data.frame(SingleCellExperiment::colData(s_object)) + + res <- clustify( + expr_mat, + ref_mat, + metadata, + query_genes, + per_cell = per_cell, + n_perm = n_perm, + cluster_col = cluster_col, + compute_method = compute_method, + verbose = verbose, + rm0 = rm0, + exclude_genes = exclude_genes, + ... + ) + + if (n_perm != 0) { + res <- -log(res$p_val + .01, 10) + } + + if (!(seurat_out && obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("SingleCellExperiment" %in% loadedNamespaces()) { + if (!(is.null(rename_prefix))) { + col_type <- stringr::str_c(rename_prefix, "_type") + col_r <- stringr::str_c(rename_prefix, "_r") + } else { + col_type <- "type" + col_r <- "r" + } + colDatatemp <- SingleCellExperiment::colData(s_object) + colDatatemp[[col_type]] <- df_temp_full[[col_type]] + colDatatemp[[col_r]] <- df_temp_full[[col_r]] + colData(s_object) <- colDatatemp + return(s_object) + } else { + message("SingleCellExperiment not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} + +#' Correlation functions available in clustifyr +#' @export +clustifyr_methods <- c( + "pearson", + "spearman", + "cosine", + "kl_divergence", + "kendall" +) + +#' Main function to compare scRNA-seq data to gene lists. +#' +#' @export +clustify_lists <- function(input, ...) { + UseMethod("clustify_lists", input) +} + +#' @rdname clustify_lists +#' @param input single-cell expression matrix or Seurat object +#' @param marker matrix or dataframe of candidate genes for each cluster +#' @param marker_inmatrix whether markers genes are already in preprocessed +#' matrix form +#' @param metadata cell cluster assignments, +#' supplied as a vector or data.frame. +#' If data.frame is supplied then `cluster_col` needs to be set. +#' Not required if running correlation per cell. +#' @param cluster_col column in metadata with cluster number +#' @param if_log input data is natural log, averaging will be done on +#' unlogged data +#' @param per_cell compare per cell or per cluster +#' @param topn number of top expressing genes to keep from input matrix +#' @param cut expression cut off from input matrix +#' @param genome_n number of genes in the genome +#' @param metric adjusted p-value for hypergeometric test, or jaccard index +#' @param output_high if true (by default to fit with rest of package), +#' -log10 transform p-value +#' @param lookuptable if not supplied, will look in built-in table +#' for object parsing +#' @param obj_out whether to output object instead of cor matrix +#' @param rename_prefix prefix to add to type and r column names +#' @param threshold identity calling minimum correlation score threshold, +#' only used when obj_out = T +#' @param low_threshold_cell option to remove clusters with too few cells +#' @param dr stored dimension reduction +#' @param seurat_out output cor matrix or called seurat object +#' (deprecated, use obj_out instead) +#' @param ... passed to matrixize_markers +#' @examples +#' # Annotate a matrix and metadata +#' clustify_lists( +#' input = pbmc_matrix_small, +#' marker = cbmc_m, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' verbose = TRUE +#' ) +#' +#' # Annotate using a different method +#' clustify_lists( +#' input = pbmc_matrix_small, +#' marker = cbmc_m, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' verbose = TRUE, +#' metric = "jaccard" +#' ) +#' @return matrix of numeric values, clusters from input as row names, +#' cell types from marker_mat as column names + +#' @export +clustify_lists.default <- function(input, + marker, + marker_inmatrix = TRUE, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + lookuptable = NULL, + obj_out = TRUE, + seurat_out = TRUE, + rename_prefix = NULL, + threshold = 0, + low_threshold_cell = 10, + ...) { + input_original <- input + if (!inherits(input, c("matrix", "Matrix", "data.frame"))) { + temp <- parse_loc_object( + input, + type = class(input), + expr_loc = NULL, + meta_loc = NULL, + var_loc = NULL, + cluster_col = cluster_col, + lookuptable = lookuptable + ) + input <- temp[["expr"]] + metadata <- temp[["meta"]] + cluster_info <- metadata + if (is.null(cluster_col)) { + cluster_col <- temp[["col"]] + } + } else { + cluster_info <- metadata + } + + if (metric %in% c("posneg", "pct")) { + per_cell <- TRUE + } + if (!(per_cell)) { + input <- average_clusters(input, + cluster_info, + if_log = if_log, + cluster_col = cluster_col, + low_threshold = low_threshold_cell + ) + } + + bin_input <- binarize_expr(input, n = topn, cut = cut) + + if (marker_inmatrix != TRUE) { + marker <- matrixize_markers( + marker, + ... + ) + } + + if (metric == "consensus") { + results <- lapply( + c("hyper", "jaccard", "pct", "posneg"), + function(x) { + clustify_lists( + input_original, + marker, + metadata = cluster_info, + cluster_col = cluster_col, + metric = x + ) + } + ) + call_list <- lapply( + results, + cor_to_call_rank + ) + res <- call_consensus(call_list) + } else if (metric == "pct") { + res <- gene_pct_markerm(input, + marker, + cluster_info, + cluster_col = cluster_col + ) + } else if (metric == "gsea") { + res <- compare_lists( + input, + marker_mat = marker, + n = genome_n, + metric = "gsea", + output_high = output_high + ) + } else if (metric != "posneg") { + res <- compare_lists( + bin_input, + marker_mat = marker, + n = genome_n, + metric = metric, + output_high = output_high + ) + } else { + if (length(marker) > 1) { + marker <- pos_neg_marker(marker) + } + res <- pos_neg_select(input, + marker, + cluster_info, + cluster_col = cluster_col, + cutoff_score = NULL + ) + } + + if ((!inherits(input_original, c("matrix", "Matrix", "data.frame")) && + obj_out && + seurat_out)) { + if (metric != "consensus") { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + } else { + df_temp_full <- res + } + + out <- insert_meta_object(input_original, + df_temp_full, + lookuptable = lookuptable + ) + + return(out) + } else { + return(res) + } +} + +#' @rdname clustify_lists +#' @export +clustify_lists.seurat <- function(input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ...) { + s_object <- input + # for seurat < 3.0 + input <- s_object@data + if (is.null(metadata)) { + cluster_info <- as.data.frame(seurat_meta(s_object, dr = dr)) + metadata <- cluster_info + } else { + cluster_info <- metadata + } + + res <- clustify_lists( + input, + per_cell = per_cell, + metadata = cluster_info, + if_log = if_log, + cluster_col = cluster_col, + topn = topn, + cut = cut, + marker, + marker_inmatrix = marker_inmatrix, + genome_n = genome_n, + metric = metric, + output_high = output_high, + ... + ) + + if (!(seurat_out && obj_out)) { + res + } else { + if (metric != "consensus") { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + } else { + df_temp <- res + colnames(df_temp)[1] <- cluster_col + } + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + s_object@meta.data <- df_temp_full + return(s_object) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} + +#' @rdname clustify_lists +#' @export +clustify_lists.Seurat <- function(input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ...) { + s_object <- input + # for seurat 3.0 + + input <- s_object@assays$RNA@data + if (is.null(metadata)) { + cluster_info <- as.data.frame(seurat_meta(s_object, dr = dr)) + metadata <- cluster_info + } else { + cluster_info <- metadata + } + + res <- clustify_lists( + input, + per_cell = per_cell, + metadata = cluster_info, + if_log = if_log, + cluster_col = cluster_col, + topn = topn, + cut = cut, + marker, + marker_inmatrix = marker_inmatrix, + genome_n = genome_n, + metric = metric, + output_high = output_high, + ... + ) + + if (!(seurat_out && obj_out)) { + res + } else { + if (metric != "consensus") { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + } else { + df_temp <- res + colnames(df_temp)[1] <- cluster_col + } + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + s_object@meta.data <- df_temp_full + return(s_object) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} + +#' @rdname clustify_lists +#' @importFrom SingleCellExperiment logcounts colData +#' @importFrom SummarizedExperiment `colData<-` +#' @export +clustify_lists.SingleCellExperiment <- function(input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ...) { + s_object <- input + expr_mat <- SingleCellExperiment::logcounts(s_object) + if (is.null(metadata)) { + metadata <- as.data.frame(SingleCellExperiment::colData(s_object)) + } + + res <- clustify_lists( + expr_mat, + per_cell = per_cell, + metadata = metadata, + if_log = if_log, + cluster_col = cluster_col, + topn = topn, + cut = cut, + marker, + marker_inmatrix = marker_inmatrix, + genome_n = genome_n, + metric = metric, + output_high = output_high, + ... + ) + + if (!(seurat_out && obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell, + rename_prefix = rename_prefix + ) + + if ("SingleCellExperiment" %in% loadedNamespaces()) { + if (!(is.null(rename_prefix))) { + col_type <- stringr::str_c(rename_prefix, "_type") + col_r <- stringr::str_c(rename_prefix, "_r") + } else { + col_type <- "type" + col_r <- "r" + } + SingleCellExperiment::colData(s_object)[[col_type]] <- + df_temp_full[[col_type]] + + SingleCellExperiment::colData(s_object)[[col_r]] <- + df_temp_full[[col_r]] + + return(s_object) + } else { + message("SingleCellExperiment not loaded, returning cor_mat instead") + return(res) + } + s_object + } +} diff --git a/R/main.R.REMOVED.git-id b/R/main.R.REMOVED.git-id deleted file mode 100644 index e66bedb2f..000000000 --- a/R/main.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -3ec14fb95a67f131664cae6bd815ec93c51d98a7 \ No newline at end of file diff --git a/R/plot.R b/R/plot.R new file mode 100644 index 000000000..f70b7643b --- /dev/null +++ b/R/plot.R @@ -0,0 +1,587 @@ +#' Plot a tSNE or umap colored by feature. +#' +#' @param data input data +#' @param x x variable +#' @param y y variable +#' @param feature feature to color by +#' @param legend_name legend name to display, defaults to no name +#' @param c_cols character vector of colors to build color gradient +#' for continuous values, defaults to [`clustifyr::pretty_palette`] +#' @param d_cols character vector of colors for discrete values. +#' defaults to RColorBrewer paired palette +#' @param pt_size point size +#' @param alpha_col whether to refer to data column for alpha values +#' @param group_col group by another column instead of feature, +#' useful for labels +#' @param scale_limits defaults to min = 0, max = max(data$x), +#' otherwise a two-element numeric vector indicating min and max to plot +#' @param do_label whether to label each cluster at median center +#' @param do_legend whether to draw legend +#' @param do_repel whether to use ggrepel on labels +#' @return ggplot object, cells projected by dr, colored by feature +#' @examples +#' plot_dims( +#' pbmc_meta, +#' feature = "classified" +#' ) +#' @export +plot_dims <- function(data, + x = "UMAP_1", + y = "UMAP_2", + feature = NULL, + legend_name = "", + c_cols = pretty_palette2, + d_cols = NULL, + pt_size = 0.25, + alpha_col = NULL, + group_col = NULL, + scale_limits = NULL, + do_label = FALSE, + do_legend = TRUE, + do_repel = TRUE) { + # add backticks to allow special characters in column names + x_col <- paste0("`", x, "`") + y_col <- paste0("`", y, "`") + + # If feature is not provided return unlabeled plot + if (is.null(feature)) { + p <- ggplot2::ggplot(data, ggplot2::aes_string(x_col, y_col)) + + geom_point(size = pt_size) + + cowplot::theme_cowplot() + + if (!is.null(d_cols)) { + p <- p + + scale_color_manual(values = d_cols) + } + + return(p) + } + + # Retrieve features from data + feature_data <- data[[feature]] + n_features <- length(unique(feature_data)) + + feature_types <- c( + "character", + "logical", + "factor" + ) + + # If there are too many features, add more colors for pretty_palette2 + if (identical(c_cols, pretty_palette2) & + (n_features > length(pretty_palette2)) & + (typeof(feature_data) %in% feature_types)) { + c_cols <- pretty_palette_ramp_d(n_features) + d_cols <- pretty_palette_ramp_d(n_features) + } + + # sort data to avoid plotting null values over colors + data <- dplyr::arrange(data, !!dplyr::sym(feature)) + + if (!is.null(alpha_col)) { + p <- ggplot2::ggplot(data, ggplot2::aes_string(x_col, y_col)) + + geom_point(ggplot2::aes_string( + color = paste0("`", feature, "`"), + alpha = paste0("`", alpha_col, "`") + ), # backticks protect special character gene names + size = pt_size + ) + scale_alpha_continuous(range = c(0, 1)) + } else { + p <- ggplot2::ggplot(data, ggplot2::aes_string(x_col, y_col)) + + geom_point(ggplot2::aes_string(color = paste0("`", + feature, + "`")), + size = pt_size + ) + } + + # discrete values + if (!is.numeric(feature_data)) { + # use colors provided + if (!is.null(d_cols)) { + p <- p + + scale_color_manual( + values = d_cols, + name = legend_name + ) + } else { + p <- p + + scale_color_brewer( + palette = "Paired", + name = legend_name + ) + } + + # continuous values + } else { + if (is.null(scale_limits)) { + scale_limits <- c( + ifelse(min(feature_data) < 0, min(feature_data), 0), + max(feature_data) + ) + } + + p <- p + + scale_color_gradientn( + colors = c_cols, + name = legend_name, + limits = scale_limits + ) + } + + if (do_label) { + if (is.null(group_col)) { + centers <- dplyr::group_by(data, !!sym(feature)) + } else { + centers <- dplyr::group_by(data, !!sym(feature), !!sym(group_col)) + } + + if (!(is.null(alpha_col))) { + centers <- + dplyr::summarize(centers, + t1 = median(!!dplyr::sym(x)), + t2 = median(!!dplyr::sym(y)), + a = median(!!dplyr::sym(alpha_col)) + ) + centers <- dplyr::ungroup(centers) + + if (!(is.null(group_col))) { + centers <- dplyr::select(centers, -!!sym(group_col)) + } + } else { + centers <- + dplyr::summarize(centers, + t1 = median(!!dplyr::sym(x)), + t2 = median(!!dplyr::sym(y)), + a = 1 + ) + centers <- dplyr::ungroup(centers) + + if (!(is.null(group_col))) { + centers <- dplyr::select(centers, -!!sym(group_col)) + } + } + + if (do_repel) { + alldata <- dplyr::select(data, + !!dplyr::sym(feature), + !!dplyr::sym(x), + !!dplyr::sym(y)) + alldata[[1]] <- "" + alldata$a <- 0 + colnames(alldata) <- colnames(centers) + alldata <- rbind(alldata, centers) + p <- p + + geom_point( + data = alldata, + mapping = aes( + x = !!dplyr::sym("t1"), + y = !!dplyr::sym("t2") + ), + size = 3, + alpha = 0 + ) + + ggrepel::geom_text_repel( + data = alldata, + mapping = aes( + x = !!dplyr::sym("t1"), + y = !!dplyr::sym("t2"), + alpha = !!dplyr::sym("a"), + label = alldata[[feature]] + ), + point.padding = 0.5, + box.padding = 0.5, + max.iter = 50000 + ) + } else { + p <- p + + geom_text( + data = centers, + mapping = aes( + x = !!dplyr::sym("t1"), + y = !!dplyr::sym("t2"), + label = centers[[feature]] + ), + alpha = centers[["a"]] + ) + } + } + + p <- p + + cowplot::theme_cowplot() + + if (!do_legend) { + p <- p + + theme(legend.position = "none") + } + + p +} + +#' Color palette for plotting continous variables +#' @return vector of colors +pretty_palette <- rev(scales::brewer_pal(palette = "RdGy")(6)) + +#' Color palette for plotting continous variables, starting at gray +#' @return vector of colors +pretty_palette2 <- scales::brewer_pal(palette = "Reds")(9) + +#' black and white palette for plotting continous variables +#' @return vector of colors +not_pretty_palette <- scales::brewer_pal(palette = "Greys")(9) + +#' Expanded color palette ramp for plotting discrete variables +#' @param n number of colors to use +#' @return color ramp +pretty_palette_ramp_d <- + grDevices::colorRampPalette(scales::brewer_pal(palette = "Paired")(12)) + +#' Plot similarity measures on a tSNE or umap +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with per cell tsne or +#' umap coordinates and cluster ids +#' @param data_to_plot colname of data to plot, defaults to all +#' @param cluster_col colname of clustering data in metadata, defaults +#' to rownames of the metadata if not supplied. +#' @param x metadata column name with 1st axis dimension. +#' defaults to "UMAP_1". +#' @param y metadata column name with 2nd axis dimension. +#' defaults to "UMAP_2". +#' @param scale_legends if TRUE scale all legends to maximum values in entire +#' correlation matrix. if FALSE scale legends to maximum for each plot. A +#' two-element numeric vector can also be passed to supply +#' custom values i.e. c(0, 1) +#' @param ... passed to plot_dims +#' @return list of ggplot objects, cells projected by dr, +#' colored by cor values +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified" +#' ) +#' +#' plot_cor( +#' cor_mat = res, +#' metadata = pbmc_meta, +#' data_to_plot = colnames(res)[1:2], +#' cluster_col = "classified", +#' x = "UMAP_1", +#' y = "UMAP_2" +#' ) +#' @export +plot_cor <- function(cor_mat, + metadata, + data_to_plot = colnames(cor_mat), + cluster_col = NULL, + x = "UMAP_1", + y = "UMAP_2", + scale_legends = FALSE, + ...) { + cor_matrix <- cor_mat + if (!any(data_to_plot %in% colnames(cor_matrix))) { + stop("cluster ids not shared between metadata and correlation matrix", + call. = FALSE + ) + } + + if (is.null(cluster_col)) { + cluster_col <- "rownames" + metadata <- tibble::rownames_to_column(metadata, cluster_col) + } + + cor_df <- as.data.frame(cor_matrix) + cor_df <- tibble::rownames_to_column(cor_df, cluster_col) + cor_df_long <- tidyr::gather( + cor_df, + "ref_cluster", + "expr", -dplyr::matches(cluster_col) + ) + + # If cluster_col is factor, convert to character + if (is.factor(metadata[, cluster_col])) { + metadata[, cluster_col] <- as.character(metadata[, cluster_col]) + } + + # checks matrix rownames, + # 2 branches for cluster number (avg) or cell bar code (each cell) + if (cor_df[[cluster_col]][1] %in% metadata[[cluster_col]]) { + plt_data <- dplyr::left_join(cor_df_long, + metadata, + by = cluster_col + ) + } else { + plt_data <- dplyr::left_join(cor_df_long, + metadata, + by = structure(names = cluster_col, "rn") + ) + } + + # determine scaling method, either same for all plots, + # or per plot (default) + if (is.logical(scale_legends) && scale_legends) { + scale_limits <- c( + ifelse(min(plt_data$expr) < 0, + min(plt_data$expr), + 0 + ), + max(max(plt_data$expr)) + ) + } else if (is.logical(scale_legends) && !scale_legends) { + scale_limits <- NULL + } else { + scale_limits <- scale_legends + } + + plts <- vector("list", length(data_to_plot)) + + for (i in seq_along(data_to_plot)) { + tmp_data <- dplyr::filter(plt_data, + !!dplyr::sym("ref_cluster") == data_to_plot[i]) + + plts[[i]] <- plot_dims( + data = tmp_data, + x = x, + y = y, + feature = "expr", + legend_name = data_to_plot[i], + scale_limits = scale_limits, + ... + ) + } + + plts +} + +#' Plot gene expression on to tSNE or umap +#' +#' @param expr_mat input single cell matrix +#' @param metadata data.frame with tSNE or umap coordinates +#' @param genes gene(s) to color tSNE or umap +#' @param cell_col column name in metadata containing cell ids, defaults +#' to rownames if not supplied +#' @param ... additional arguments passed to `[clustifyr::plot_dims()]` +#' @return list of ggplot object, cells projected by dr, +#' colored by gene expression +#' @examples +#' genes <- c( +#' "RP11-314N13.3", +#' "ARF4" +#' ) +#' +#' plot_gene( +#' expr_mat = pbmc_matrix_small, +#' metadata = tibble::rownames_to_column(pbmc_meta, "rn"), +#' genes = genes, +#' cell_col = "rn" +#' ) +#' @export +plot_gene <- function(expr_mat, + metadata, + genes, + cell_col = NULL, + ...) { + genes_to_plot <- genes[genes %in% rownames(expr_mat)] + genes_missing <- setdiff(genes, genes_to_plot) + + if (length(genes_missing) != 0) { + warning(paste0( + "the following genes were not present in the input matrix ", + paste(genes_missing, collapse = ",") + )) + } + + if (length(genes_to_plot) == 0) { + stop("no genes present to plot", call. = FALSE) + } + expr_dat <- t(as.matrix(expr_mat[genes_to_plot, , drop = FALSE])) + expr_dat <- + tibble::rownames_to_column(as.data.frame(expr_dat), "cell") + + if (is.null(cell_col)) { + mdata <- tibble::rownames_to_column(metadata, "cell") + cell_col <- "cell" + } else { + mdata <- metadata + } + + if (!cell_col %in% colnames(mdata)) { + stop("please supply a cell_col that is present in metadata", + call. = FALSE) + } + + plt_dat <- dplyr::left_join(expr_dat, mdata, + by = c("cell" = cell_col) + ) + + lapply( + genes_to_plot, + function(gene) { + plot_dims(plt_dat, + feature = gene, + legend_name = gene, + ... + ) + } + ) +} + +#' Plot called clusters on a tSNE or umap, for each reference cluster given +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with tsne or +#' umap coordinates and cluster ids +#' @param data_to_plot colname of data to plot, defaults to all +#' @param ... passed to plot_dims +#' @return list of ggplot object, cells projected by dr, +#' colored by cell type classification +plot_call <- function(cor_mat, + metadata, + data_to_plot = colnames(cor_mat), + ...) { + cor_matrix <- cor_mat + df_temp <- + as.data.frame(cor_matrix - matrixStats::rowMaxs(as.matrix(cor_matrix))) + names(df_temp) <- rownames(cor_matrix) + df_temp[df_temp == 0] <- "1" + df_temp[df_temp != "1"] <- "0" + plot_cor( + df_temp, + metadata, + data_to_plot, + ... + ) +} + +#' Plot best calls for each cluster on a tSNE or umap +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with tsne or +#' umap coordinates and cluster ids +#' @param cluster_col metadata column, can be cluster or cellid +#' @param collapse_to_cluster if a column name is provided, +#' takes the most frequent call of entire cluster to color in plot +#' @param threshold minimum correlation coefficent cutoff for calling clusters +#' @param x x variable +#' @param y y variable +#' @param plot_r whether to include second plot of cor eff for best call +#' @param per_cell whether the cor_mat was generate per cell or per cluster +#' @param ... passed to plot_dims +#' @return ggplot object, cells projected by dr, +#' colored by cell type classification +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified" +#' ) +#' +#' plot_best_call( +#' cor_mat = res, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' @export +plot_best_call <- function(cor_mat, + metadata, + cluster_col = "cluster", + collapse_to_cluster = FALSE, + threshold = 0, + x = "UMAP_1", + y = "UMAP_2", + plot_r = FALSE, + per_cell = FALSE, + ...) { + cor_matrix <- cor_mat + col_meta <- colnames(metadata) + if ("type" %in% col_meta | "type2" %in% col_meta) { + warning('metadata column name clash of "type"/"type2"') + return() + } + df_temp <- cor_to_call( + cor_matrix, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = per_cell + ) + + if (collapse_to_cluster != FALSE) { + df_temp_full <- collapse_to_cluster(df_temp_full, + metadata, + collapse_to_cluster, + threshold = threshold + ) + } + + g <- plot_dims(df_temp_full, + feature = "type", + x = x, + y = y, + ... + ) + + if (plot_r) { + l <- list() + l[[1]] <- g + l[[2]] <- plot_dims(df_temp_full, + feature = "r", + x = x, + y = y, + ... + ) + } else { + l <- g + } + + l +} + +#' Plot similarity measures on heatmap +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with per cell tsne +#' or umap cooordinates and cluster ids +#' @param cluster_col colname of clustering data in metadata, +#' defaults to rownames of the metadata if not supplied. +#' @param col color ramp to use +#' @param legend_title legend title to pass to Heatmap +#' @param ... passed to Heatmap +#' @return complexheatmap object +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified", +#' per_cell = FALSE +#' ) +#' +#' plot_cor_heatmap(res) +#' @export +plot_cor_heatmap <- function(cor_mat, + metadata = NULL, + cluster_col = NULL, + col = not_pretty_palette, + legend_title = NULL, + ...) { + cor_matrix <- cor_mat + ComplexHeatmap::Heatmap( + cor_matrix, + col = col, + heatmap_legend_param = list(title = legend_title), + ... + ) +} diff --git a/R/plot.R.REMOVED.git-id b/R/plot.R.REMOVED.git-id deleted file mode 100644 index 1ceedbc76..000000000 --- a/R/plot.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -f70b7643b940cce1061d7f6cc5a350019c25e29f \ No newline at end of file diff --git a/R/run_fgsea.R b/R/run_fgsea.R new file mode 100644 index 000000000..958935305 --- /dev/null +++ b/R/run_fgsea.R @@ -0,0 +1,99 @@ +#' Run GSEA to compare a gene list(s) to per cell or +#' per cluster expression data +#' @description Use fgsea algorithm to compute normalized enrichment +#' scores and pvalues for gene +#' set ovelap +#' @param expr_mat single-cell expression matrix or Seurat object +#' @param query_genes A vector or named list of vectors of genesets of interest +#' to compare via GSEA. If supplying a named list, then the gene set names +#' will appear in the output. +#' @param cluster_ids vector of cell cluster assignments, supplied as a +#' vector with order that +#' matches columns in `expr_mat`. Not required if running per cell. +#' @param n_perm Number of permutation for fgsea function. Defaults to 1000. +#' @param per_cell if true run per cell, otherwise per cluster. +#' @param scale convert expr_mat into zscores prior to running GSEA?, +#' default = FALSE +#' @param no_warnings suppress warnings from gsea ties +#' @return dataframe of gsea scores (pval, NES), with clusters as rownames +#' @examples +#' run_gsea( +#' expr_mat = pbmc_matrix_small, +#' query_genes = pbmc_vargenes[1:100], +#' n_perm = 10, +#' cluster_ids = pbmc_meta$classified, +#' no_warnings = TRUE +#' ) +#' @export +run_gsea <- function(expr_mat, + query_genes, + cluster_ids = NULL, + n_perm = 1000, + per_cell = FALSE, + scale = FALSE, + no_warnings = TRUE) { + if (!is.list(query_genes)) { + geneset_list <- list("query_genes" = query_genes) + } else { + geneset_list <- query_genes + } + + if (!per_cell & (ncol(expr_mat) != length(cluster_ids))) { + stop("cluster_ids do not match number of cells (columns) ", + "in expr_mat", + call. = FALSE + ) + } + + if (n_perm > 1e4 & per_cell) { + warning("run_gsea() take a long time if running many ", + "permutations and running per cell") + } + + if (scale) { + expr_mat <- t(scale(t(as.matrix(expr_mat)))) + } + + if (!per_cell) { + avg_mat <- average_clusters(expr_mat, + metadata = cluster_ids) + } else { + avg_mat <- expr_mat + } + + res <- list() + for (i in seq_along(colnames(avg_mat))) { + if (!(no_warnings)) { + gsea_res <- fgsea::fgsea( + geneset_list, + avg_mat[, i], + minSize = 1, + maxSize = max(vapply(geneset_list, + length, + FUN.VALUE = numeric(1))), + nproc = 1, + nperm = n_perm + ) + } else { + suppressWarnings( + gsea_res <- fgsea::fgsea( + geneset_list, + avg_mat[, i], + minSize = 1, + maxSize = max(vapply(geneset_list, + length, + FUN.VALUE = numeric(1))), + nproc = 1, + nperm = n_perm + ) + ) + } + res[[i]] <- gsea_res[, c("pathway", "pval", "NES")] + } + gsea_res <- dplyr::bind_rows(res) + gsea_res <- + as.data.frame(dplyr::mutate(gsea_res, cell = colnames(avg_mat))) + gsea_res <- tibble::column_to_rownames(gsea_res, "cell") + + gsea_res +} diff --git a/R/run_fgsea.R.REMOVED.git-id b/R/run_fgsea.R.REMOVED.git-id deleted file mode 100644 index d6123a7bc..000000000 --- a/R/run_fgsea.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -958935305564c0aa73c807a4ea416e44682bb971 \ No newline at end of file diff --git a/R/seurat_wrapper.R b/R/seurat_wrapper.R new file mode 100644 index 000000000..1a9c19e1b --- /dev/null +++ b/R/seurat_wrapper.R @@ -0,0 +1,245 @@ +#' Function to convert labelled seurat object to avg expression matrix +#' @return reference expression matrix, with genes as row names, +#' and cell types as column names +#' @export +seurat_ref <- function(seurat_object, ...) { + UseMethod("seurat_ref", seurat_object) +} + +#' @rdname seurat_ref +#' @param seurat_object seurat_object after tsne or umap projections +#' and clustering +#' @param cluster_col column name where classified cluster names +#' are stored in seurat meta data, cannot be "rn" +#' @param var_genes_only whether to keep only var_genes in the final +#' matrix output, could also look up genes used for PCA +#' @param assay_name any additional assay data, such as ADT, to include. +#' If more than 1, pass a vector of names +#' @param method whether to take mean (default) or median +#' @param subclusterpower whether to get multiple averages per +#' original cluster +#' @param if_log input data is natural log, +#' averaging will be done on unlogged data +#' @param ... additional arguments +#' @examples +#' ref <- seurat_ref( +#' seurat_object = s_small, +#' cluster_col = "res.1", +#' var_genes_only = TRUE +#' ) +#' ref[1:3, 1:3] +#' @export +seurat_ref.seurat <- function(seurat_object, + cluster_col = "classified", + var_genes_only = FALSE, + assay_name = NULL, + method = "mean", + subclusterpower = 0, + if_log = TRUE, + ...) { + temp_mat <- seurat_object@data + if (is.logical(var_genes_only) && var_genes_only) { + temp_mat <- temp_mat[seurat_object@var.genes, ] + } else if (var_genes_only == "PCA") { + temp_mat <- temp_mat[rownames(seurat_object@dr$pca@gene.loadings), ] + } + + if (!is.null(assay_name)) { + temp_mat <- temp_mat[0, ] + for (element in assay_name) { + temp_mat2 <- seurat_object@assay[[element]]@raw.data + temp_mat <- rbind(temp_mat, as.matrix(temp_mat2)) + } + } + + temp_res <- average_clusters( + temp_mat, + seurat_object@meta.data, + cluster_col = cluster_col, + method = method, + subclusterpower = subclusterpower, + if_log = if_log + ) + + temp_res +} + +#' @rdname seurat_ref +#' @export +seurat_ref.Seurat <- function(seurat_object, + cluster_col = "classified", + var_genes_only = FALSE, + assay_name = NULL, + method = "mean", + subclusterpower = 0, + if_log = TRUE, + ...) { + if (is(seurat_object, "Seurat")) { + temp_mat <- seurat_object@assays$RNA@data + + if (is.logical(var_genes_only) && var_genes_only) { + temp_mat <- temp_mat[seurat_object@assays$RNA@var.features, ] + } else if (var_genes_only == "PCA") { + temp_mat <- + temp_mat[rownames(seurat_object@reductions$pca@feature.loadings), + ] + } + + if (!is.null(assay_name)) { + temp_mat <- temp_mat[0, ] + for (element in assay_name) { + temp_mat2 <- seurat_object@assays[[element]]@counts + temp_mat <- rbind(temp_mat, as.matrix(temp_mat2)) + } + } + } else { + stop("warning, not seurat3 object") + } + + temp_res <- average_clusters( + temp_mat, + seurat_object@meta.data, + cluster_col = cluster_col, + method = method, + subclusterpower = subclusterpower, + if_log = if_log + ) + + temp_res +} + +#' Function to convert labelled seurat object to fully prepared metadata +#' @return dataframe of metadata, including dimension reduction plotting info +#' @examples +#' \dontrun{ +#' seurat_meta(s_small) +#' } +#' @export +seurat_meta <- function(seurat_object, ...) { + UseMethod("seurat_meta", seurat_object) +} + +#' @rdname seurat_meta +#' @param seurat_object seurat_object after tsne or +#' umap projections and clustering +#' @param dr dimension reduction method +#' @param ... additional arguments +#' @export +seurat_meta.seurat <- function(seurat_object, + dr = "umap", + ...) { + dr2 <- dr + temp_dr <- + tryCatch( + as.data.frame(seurat_object@dr[[dr2]]@cell.embeddings), + error = function(e) { + message("cannot find dr info") + return(NA) + } + ) + if (!is.data.frame(temp_dr)) { + return(seurat_object@meta.data) + } else { + temp_dr <- tibble::rownames_to_column(temp_dr, "rn") + temp_meta <- + tibble::rownames_to_column(seurat_object@meta.data, "rn") + temp <- dplyr::left_join(temp_meta, temp_dr, by = "rn") + return(tibble::column_to_rownames(temp, "rn")) + } +} + +#' @rdname seurat_meta +#' @export +seurat_meta.Seurat <- function(seurat_object, + dr = "umap", + ...) { + dr2 <- dr + + mdata <- seurat_object@meta.data + temp_col_id <- get_unique_column(mdata, "rn") + + temp_dr <- + tryCatch( + as.data.frame(seurat_object@reductions[[dr2]]@cell.embeddings), + error = function(e) { + message("cannot find dr info") + return(NA) + } + ) + if (!is.data.frame(temp_dr)) { + return(mdata) + } else { + temp_dr <- tibble::rownames_to_column(temp_dr, temp_col_id) + temp_meta <- tibble::rownames_to_column(mdata, temp_col_id) + temp <- dplyr::left_join(temp_meta, temp_dr, by = temp_col_id) + return(tibble::column_to_rownames(temp, temp_col_id)) + } +} + +#' Function to convert labelled object to avg expression matrix +#' +#' @param input object after tsne or umap projections and clustering +#' @param cluster_col column name where classified cluster names +#' are stored in seurat meta data, cannot be "rn" +#' @param var_genes_only whether to keep only var.genes in the +#' final matrix output, could also look up genes used for PCA +#' @param assay_name any additional assay data, such as ADT, to include. +#' If more than 1, pass a vector of names +#' @param method whether to take mean (default) or median +#' @param lookuptable if not supplied, will look +#' in built-in table for object parsing +#' @param if_log input data is natural log, +#' averaging will be done on unlogged data +#' @return reference expression matrix, with genes as row names, +#' and cell types as column names +#' @examples +#' object_ref( +#' s_small3, +#' cluster_col = "RNA_snn_res.1" +#' ) +#' @export +object_ref <- function(input, + cluster_col = NULL, + var_genes_only = FALSE, + assay_name = NULL, + method = "mean", + lookuptable = NULL, + if_log = TRUE) { + if (!is(input, "seurat")) { + input_original <- input + temp <- parse_loc_object( + input, + type = class(input), + expr_loc = NULL, + meta_loc = NULL, + var_loc = NULL, + cluster_col = cluster_col, + lookuptable = lookuptable + ) + if (!(is.null(temp[["expr"]]))) { + message(paste0("recognized object type - ", + class(input))) + } + input <- temp[["expr"]] + metadata <- temp[["meta"]] + query_genes <- temp[["var"]] + if (is.null(cluster_col)) { + cluster_col <- temp[["col"]] + } + } + + temp_mat <- input + if (is.logical(var_genes_only) && var_genes_only) { + temp_mat <- temp_mat[query_genes, ] + } + + temp_res <- average_clusters( + temp_mat, + metadata, + cluster_col = cluster_col, + method = method, + if_log = if_log + ) + + temp_res +} diff --git a/R/seurat_wrapper.R.REMOVED.git-id b/R/seurat_wrapper.R.REMOVED.git-id deleted file mode 100644 index 0671c7d75..000000000 --- a/R/seurat_wrapper.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -1a9c19e1b608a0f71b2ea5444116db62e12615bb \ No newline at end of file diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 000000000..88b7a426e --- /dev/null +++ b/R/utils.R @@ -0,0 +1,1820 @@ +#' Overcluster by kmeans per cluster +#' +#' @param mat expression matrix +#' @param cluster_id list of ids per cluster +#' @param power decides the number of clusters for kmeans +#' @return new cluster_id list of more clusters +#' @examples +#' res <- overcluster( +#' mat = pbmc_matrix_small, +#' cluster_id = split(colnames(pbmc_matrix_small), pbmc_meta$classified) +#' ) +#' length(res) +#' @export +overcluster <- function(mat, + cluster_id, + power = 0.15) { + mat <- as.matrix(mat) + new_ids <- list() + for (name in names(cluster_id)) { + ids <- cluster_id[[name]] + if (length(ids) > 1) { + new_clusters <- + stats::kmeans(t(mat[, ids]), + centers = as.integer(length(ids)^power)) + new_ids1 <- + split(names(new_clusters$cluster), + new_clusters$cluster) + names(new_ids1) <- + stringr::str_c(name, names(new_ids1), sep = "_") + new_ids <- append(new_ids, new_ids1) + } else { + new_ids <- append(new_ids, cluster_id[name]) + } + } + new_ids +} + +#' Average expression values per cluster +#' +#' @param mat expression matrix +#' @param metadata data.frame or vector containing cluster assignments per cell. +#' Order must match column order in supplied matrix. If a data.frame +#' provide the cluster_col parameters. +#' @param if_log input data is natural log, +#' averaging will be done on unlogged data +#' @param cluster_col column in metadata with cluster number +#' @param cell_col if provided, will reorder matrix first +#' @param low_threshold option to remove clusters with too few cells +#' @param method whether to take mean (default) or median +#' @param output_log whether to report log results +#' @param subclusterpower whether to get multiple averages per original cluster +#' @param cut_n set on a limit of genes as expressed, lower ranked genes +#' are set to 0, considered unexpressed +#' @return average expression matrix, with genes for row names, and clusters +#' for column names +#' @examples +#' mat <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' if_log = FALSE +#' ) +#' mat[1:3, 1:3] +#' @importFrom matrixStats rowMaxs rowMedians colRanks +#' @export +average_clusters <- function(mat, + metadata, + cluster_col = "cluster", + if_log = TRUE, + cell_col = NULL, + low_threshold = 0, + method = "mean", + output_log = TRUE, + subclusterpower = 0, + cut_n = NULL) { + cluster_info <- metadata + if (!(is.null(cell_col))) { + if (!(all(colnames(mat) == cluster_info[[cell_col]]))) { + mat <- mat[, cluster_info[[cell_col]]] + } + } + if (is.vector(cluster_info)) { + cluster_ids <- split(colnames(mat), cluster_info) + } else if (is.data.frame(cluster_info) & !is.null(cluster_col)) { + cluster_info_temp <- cluster_info[[cluster_col]] + if (is.factor(cluster_info_temp)) { + cluster_info_temp <- droplevels(cluster_info_temp) + } + cluster_ids <- split(colnames(mat), cluster_info_temp) + } else if (is.factor(cluster_info)) { + cluster_info <- as.character(cluster_info) + cluster_ids <- split(colnames(mat), cluster_info) + } else { + stop("metadata not formatted correctly, + supply either a vector or a dataframe", + call. = FALSE + ) + } + + if (subclusterpower > 0) { + cluster_ids <- + overcluster(mat, cluster_ids, power = subclusterpower) + } + + if (method == "mean") { + out <- lapply( + cluster_ids, + function(cell_ids) { + if (!all(cell_ids %in% colnames(mat))) { + stop("cell ids not found in input matrix", + call. = FALSE) + } + if (if_log) { + mat_data <- expm1(mat[, cell_ids, drop = FALSE]) + } else { + mat_data <- mat[, cell_ids, drop = FALSE] + } + res <- Matrix::rowMeans(mat_data, na.rm = TRUE) + if (output_log) { + res <- log1p(res) + } + res + } + ) + } else { + out <- lapply( + cluster_ids, + function(cell_ids) { + if (!all(cell_ids %in% colnames(mat))) { + stop("cell ids not found in input matrix", + call. = FALSE) + } + mat_data <- mat[, cell_ids, drop = FALSE] + mat_data[mat_data == 0] <- NA + res <- matrixStats::rowMedians(as.matrix(mat_data), + na.rm = TRUE) + res[is.na(res)] <- 0 + names(res) <- rownames(mat_data) + res + } + ) + } + + out <- do.call(cbind, out) + if (low_threshold > 0) { + fil <- vapply(cluster_ids, + FUN = length, + FUN.VALUE = numeric(1)) >= low_threshold + out <- out[, as.vector(fil)] + } + if (!(is.null(cut_n))) { + expr_mat <- out + expr_df <- as.matrix(expr_mat) + df_temp <- t(matrixStats::colRanks(-expr_df, + ties.method = "average")) + rownames(df_temp) <- rownames(expr_mat) + colnames(df_temp) <- colnames(expr_mat) + expr_mat[df_temp > cut_n] <- 0 + out <- expr_mat + } + + return(out) +} + +#' Percentage detected per cluster +#' +#' @param mat expression matrix +#' @param metadata data.frame with cells +#' @param cluster_col column in metadata with cluster number +#' @param cut_num binary cutoff for detection +#' @return matrix of numeric values, with genes for row names, +#' and clusters for column names +percent_clusters <- function(mat, + metadata, + cluster_col = "cluster", + cut_num = 0.5) { + cluster_info <- metadata + mat[mat >= cut_num] <- 1 + mat[mat <= cut_num] <- 0 + + average_clusters(mat, cluster_info, + if_log = FALSE, + metadata = cluster_col + ) +} + +#' Function to make best call from correlation matrix +#' +#' @param cor_mat correlation matrix +#' @return matrix of 1s and 0s +get_best_match_matrix <- function(cor_mat) { + cor_mat <- as.matrix(cor_mat) + best_mat <- + as.data.frame(cor_mat - matrixStats::rowMaxs(as.matrix(cor_mat))) + best_mat[best_mat == 0] <- "1" + best_mat[best_mat != "1"] <- "0" + + return(best_mat) +} + +#' Function to make call and attach score +#' +#' @param name name of row to query +#' @param best_mat binarized call matrix +#' @param cor_mat correlation matrix +#' @param carry_cor whether the correlation score gets reported +#' @return string with ident call and possibly cor value +get_best_str <- function(name, + best_mat, + cor_mat, + carry_cor = TRUE) { + if (sum(as.numeric(best_mat[name, ])) > 0) { + best.names <- colnames(best_mat)[which(best_mat[name, ] == 1)] + best.cor <- + round(cor_mat[name, which(best_mat[name, ] == 1)], 2) + for (i in seq_len(length(best.cor))) { + if (i == 1) { + str <- paste0(best.names[i], + " (", + best.cor[i], + ")") + } else { + str <- paste0(str, + "; ", + best.names[i], + " (", + best.cor[i], + ")") + } + } + } else { + str <- "?" + } + + if (carry_cor == FALSE) { + str <- gsub(" \\(.*\\)", "", str) + } + return(str) +} + +#' Find entries shared in all vectors +#' @description return entries found in all supplied vectors. +#' If the vector supplied is NULL or NA, then it will be excluded +#' from the comparision. +#' @param ... vectors +#' @return vector of shared elements +get_common_elements <- function(...) { + vecs <- list(...) + # drop NULL elements of list + vecs <- vecs[!vapply(vecs, is.null, FUN.VALUE = logical(1))] + # drop NA elements of list (NA values OK in a vector) + vecs <- vecs[!is.na(vecs)] + + Reduce(intersect, vecs) +} + +#' Convert expression matrix to GSEA pathway scores +#' (would take a similar place in workflow before average_clusters/binarize) +#' +#' @param mat expression matrix +#' @param pathway_list a list of vectors, each named for a specific pathway, +#' or dataframe +#' @param n_perm Number of permutation for fgsea function. Defaults to 1000. +#' @param scale convert expr_mat into zscores prior to running GSEA?, +#' default = FALSE +#' @param no_warnings suppress warnings from gsea ties +#' @return matrix of GSEA NES values, cell types as row names, +#' pathways as column names +#' @examples +#' gl <- list( +#' "n" = c("PPBP", "LYZ", "S100A9"), +#' "a" = c("IGLL5", "GNLY", "FTL") +#' ) +#' +#' pbmc_avg <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' +#' calculate_pathway_gsea( +#' mat = pbmc_avg, +#' pathway_list = gl +#' ) +#' @export +calculate_pathway_gsea <- function(mat, + pathway_list, + n_perm = 1000, + scale = TRUE, + no_warnings = TRUE) { + # pathway_list can be user defined or + #`my_pathways <- fgsea::reactomePathways(rownames(pbmc4k_matrix))` + out <- lapply( + names(pathway_list), + function(y) { + marker_list <- list() + marker_list[[1]] <- pathway_list[[y]] + names(marker_list) <- y + v1 <- marker_list + temp <- run_gsea( + mat, + v1, + n_perm = n_perm, + scale = scale, + per_cell = TRUE, + no_warnings = no_warnings + ) + temp <- temp[, 3, drop = FALSE] + } + ) + res <- do.call(cbind, out) + colnames(res) <- names(pathway_list) + res +} + +#' manually change idents as needed +#' +#' @param metadata column of ident +#' @param cluster_col column in metadata containing cluster info +#' @param ident_col column in metadata containing identity assignment +#' @param clusters names of clusters to change, string or +#' vector of strings +#' @param idents new idents to assign, must be length of 1 or +#' same as clusters +#' @return new dataframe of metadata +assign_ident <- function(metadata, + cluster_col = "cluster", + ident_col = "type", + clusters, + idents) { + if (!is.vector(clusters) | !is.vector(idents)) { + stop("unsupported clusters or idents", call. = FALSE) + } else { + if (length(idents) == 1) { + idents <- rep(idents, length(clusters)) + } else if (length(idents) != length(clusters)) { + stop("unsupported lengths pairs of clusters and idents", + call. = FALSE) + } + } + + for (n in seq_len(length(clusters))) { + mindex <- metadata[[cluster_col]] == clusters[n] + metadata[mindex, ident_col] <- idents[n] + } + metadata +} + +#' get top calls for each cluster +#' +#' @param cor_mat input similarity matrix +#' @param metadata input metadata with tsne or umap coordinates +#' and cluster ids +#' @param col metadata column, can be cluster or cellid +#' @param collapse_to_cluster if a column name is provided, +#' takes the most frequent call of entire cluster to color in plot +#' @param threshold minimum correlation coefficent cutoff for calling clusters +#' @param topn number of calls for each cluster +#' @return dataframe of cluster, new potential ident, and r info +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified" +#' ) +#' +#' cor_to_call_topn( +#' cor_mat = res, +#' metadata = pbmc_meta, +#' col = "classified", +#' collapse_to_cluster = FALSE, +#' threshold = 0.5 +#' ) +#' @export +cor_to_call_topn <- function(cor_mat, + metadata = NULL, + col = "cluster", + collapse_to_cluster = FALSE, + threshold = 0, + topn = 2) { + correlation_matrix <- cor_mat + df_temp <- tibble::as_tibble(correlation_matrix, rownames = col) + df_temp <- + tidyr::gather( + df_temp, + key = !!dplyr::sym("type"), + value = !!dplyr::sym("r"), + -!!col + ) + df_temp[["type"]][df_temp$r < threshold] <- + paste0("r<", threshold, ", unassigned") + df_temp <- + dplyr::top_n(dplyr::group_by_at(df_temp, 1), + topn, + !!dplyr::sym("r")) + df_temp_full <- df_temp + + if (collapse_to_cluster != FALSE) { + if (!(col %in% colnames(metadata))) { + metadata <- tibble::as_tibble(metadata, rownames = col) + } + df_temp_full <- + dplyr::left_join(df_temp_full, metadata, by = col) + df_temp_full[, "type2"] <- df_temp_full[[collapse_to_cluster]] + df_temp_full2 <- + dplyr::group_by( + df_temp_full, + !!dplyr::sym("type"), + !!dplyr::sym("type2") + ) + df_temp_full2 <- + dplyr::summarize(df_temp_full2, + sum = sum(!!dplyr::sym("r")), + n = n() + ) + df_temp_full2 <- + dplyr::group_by(df_temp_full2, !!dplyr::sym("type2")) + df_temp_full2 <- + dplyr::arrange(df_temp_full2, desc(n), desc(sum)) + df_temp_full2 <- + dplyr::filter( + df_temp_full2, + !!dplyr::sym("type") != paste0("r<", + threshold, + ", unassigned") + ) + df_temp_full2 <- dplyr::slice(df_temp_full2, seq_len(topn)) + df_temp_full2 <- + dplyr::right_join( + df_temp_full2, + dplyr::select(df_temp_full, -c( + !!dplyr::sym("type"), !!dplyr::sym("r") + )), + by = stats::setNames(collapse_to_cluster, "type2") + ) + df_temp_full <- + dplyr::mutate(df_temp_full2, type = tidyr::replace_na( + !!dplyr::sym("type"), + paste0("r<", threshold, ", unassigned") + )) + df_temp_full <- dplyr::group_by(df_temp_full, + !!dplyr::sym(col)) + df_temp_full <- + dplyr::distinct(df_temp_full, + !!dplyr::sym("type"), + !!dplyr::sym("type2"), + .keep_all = TRUE + ) + dplyr::arrange(df_temp_full, desc(n), desc(sum), + .by_group = TRUE) + } else { + df_temp_full <- dplyr::group_by(df_temp_full, + !!dplyr::sym(col)) + dplyr::arrange(df_temp_full, desc(!!dplyr::sym("r")), + .by_group = TRUE) + } +} + +#' pct of cells in each cluster that express genelist +#' +#' @param matrix expression matrix +#' @param genelist vector of marker genes for one identity +#' @param clusters vector of cluster identities +#' @param returning whether to return mean, min, +#' or max of the gene pct in the gene list +#' @return vector of numeric values +gene_pct <- function(matrix, + genelist, + clusters, + returning = "mean") { + genelist <- intersect(genelist, rownames(matrix)) + if (is.factor(clusters)) { + clusters <- + factor(clusters, levels = c(levels(clusters), "orig.NA")) + } + clusters[is.na(clusters)] <- "orig.NA" + unique_clusters <- unique(clusters) + + if (returning == "mean") { + vapply(unique_clusters, function(x) { + celllist <- clusters == x + tmp <- matrix[genelist, celllist, drop = FALSE] + tmp[tmp > 0] <- 1 + mean(Matrix::rowSums(tmp) / ncol(tmp)) + }, FUN.VALUE = numeric(1)) + } else if (returning == "min") { + vapply(unique_clusters, function(x) { + celllist <- clusters == x + tmp <- matrix[genelist, celllist, drop = FALSE] + tmp[tmp > 0] <- 1 + min(Matrix::rowSums(tmp) / ncol(tmp)) + }, FUN.VALUE = numeric(1)) + } else if (returning == "max") { + vapply(unique_clusters, function(x) { + celllist <- clusters == x + tmp <- matrix[genelist, celllist, drop = FALSE] + tmp[tmp > 0] <- 1 + max(Matrix::rowSums(tmp) / ncol(tmp)) + }, FUN.VALUE = numeric(1)) + } +} + +#' pct of cells in every cluster that express a series of genelists +#' +#' @param matrix expression matrix +#' @param marker_m matrixized markers +#' @param metadata data.frame or vector containing cluster +#' assignments per cell. +#' Order must match column order in supplied matrix. If a data.frame +#' provide the cluster_col parameters. +#' @param cluster_col column in metadata with cluster number +#' @param norm whether and how the results are normalized +#' @return matrix of numeric values, clusters from mat as row names, +#' cell types from marker_m as column names +#' @examples +#' gene_pct_markerm( +#' matrix = pbmc_matrix_small, +#' marker_m = cbmc_m, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' @export +gene_pct_markerm <- function(matrix, + marker_m, + metadata, + cluster_col = NULL, + norm = NULL) { + cluster_info <- metadata + if (is.vector(cluster_info)) { + + } else if (is.data.frame(cluster_info) & !is.null(cluster_col)) { + cluster_info <- cluster_info[[cluster_col]] + } else { + stop("metadata not formatted correctly, + supply either a vector or a dataframe", + call. = FALSE + ) + } + + # coerce factors in character + if (is.factor(cluster_info)) { + cluster_info <- as.character(cluster_info) + } + + if (!is.data.frame(marker_m)) { + marker_m <- as.data.frame(marker_m) + } + + out <- vapply(colnames(marker_m), function(x) { + gene_pct( + matrix, + marker_m[[x]], + cluster_info + ) + }, FUN.VALUE = numeric(length(unique(cluster_info)))) + + if (!(is.null(norm))) { + if (norm == "divide") { + out <- sweep(out, 2, apply(out, 2, max), "/") + } else if (norm == "diff") { + out <- sweep(out, 2, apply(out, 2, max), "-") + } else { + out <- sweep(out, 2, apply(out, 2, max) * norm) + out[out < 0] <- 0 + out[out > 0] <- 1 + } + } + + # edge cases where all markers can't be found for a cluster + out[is.na(out)] <- 0 + out +} + +#' Combined function to compare scRNA-seq data to +#' bulk RNA-seq data and marker list +#' +#' @examples +#' # Seurat2 +#' clustify_nudge( +#' input = s_small, +#' ref_mat = cbmc_ref, +#' marker = cbmc_m, +#' cluster_col = "res.1", +#' threshold = 0.8, +#' seurat_out = FALSE, +#' mode = "pct", +#' dr = "tsne" +#' ) +#' +#' # Seurat3 +#' clustify_nudge( +#' input = s_small3, +#' ref_mat = cbmc_ref, +#' marker = cbmc_m, +#' cluster_col = "RNA_snn_res.1", +#' threshold = 0.8, +#' seurat_out = FALSE, +#' mode = "pct", +#' dr = "tsne" +#' ) +#' +#' # Matrix +#' clustify_nudge( +#' input = pbmc_matrix_small, +#' ref_mat = cbmc_ref, +#' metadata = pbmc_meta, +#' marker = as.matrix(cbmc_m), +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified", +#' threshold = 0.8, +#' call = FALSE, +#' marker_inmatrix = FALSE, +#' mode = "pct" +#' ) +#' @export +clustify_nudge <- function(input, ...) { + UseMethod("clustify_nudge", input) +} + +#' @rdname clustify_nudge +#' @param input express matrix or object +#' @param ref_mat reference expression matrix +#' @param metadata cell cluster assignments, supplied as a vector +#' or data.frame. If +#' data.frame is supplied then `cluster_col` needs to be set. +#' @param marker matrix of markers +#' @param query_genes A vector of genes of interest to compare. +#' If NULL, then common genes between +#' the expr_mat and ref_mat will be used for comparision. +#' @param cluster_col column in metadata that contains cluster ids per cell. +#' Will default to first +#' column of metadata if not supplied. +#' Not required if running correlation per cell. +#' @param compute_method method(s) for computing similarity scores +#' @param weight relative weight for the gene list scores, +#' when added to correlation score +#' @param dr stored dimension reduction +#' @param ... passed to matrixize_markers +#' @param norm whether and how the results are normalized +#' @param call make call or just return score matrix +#' @param marker_inmatrix whether markers genes are already +#' in preprocessed matrix form +#' @param mode use marker expression pct or ranked cor score for nudging +#' @param obj_out whether to output object instead of cor matrix +#' @param seurat_out output cor matrix or called seurat object +#' @param rename_prefix prefix to add to type and r column names +#' @param lookuptable if not supplied, will look in built-in +#' table for object parsing +#' @param threshold identity calling minimum score threshold, +#' only used when obj_out = T + +#' @return single cell object, or matrix of numeric values, +#' clusters from input as row names, cell types from ref_mat as column names +#' @export +clustify_nudge.default <- function(input, + ref_mat, + marker, + metadata = NULL, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + call = TRUE, + marker_inmatrix = TRUE, + mode = "rank", + obj_out = FALSE, + rename_prefix = NULL, + lookuptable = NULL, + ...) { + if (marker_inmatrix != TRUE) { + marker <- matrixize_markers( + marker, + ... + ) + } + + if (!inherits(input, c("matrix", "Matrix", "data.frame"))) { + input_original <- input + temp <- parse_loc_object( + input, + type = class(input), + expr_loc = NULL, + meta_loc = NULL, + var_loc = NULL, + cluster_col = cluster_col, + lookuptable = lookuptable + ) + + if (!(is.null(temp[["expr"]]))) { + message(paste0("recognized object type - ", class(input))) + } + + input <- temp[["expr"]] + metadata <- temp[["meta"]] + if (is.null(query_genes)) { + query_genes <- temp[["var"]] + } + if (is.null(cluster_col)) { + cluster_col <- temp[["col"]] + } + } + + resa <- clustify( + input = input, + ref_mat = ref_mat, + metadata = metadata, + cluster_col = cluster_col, + query_genes = query_genes, + seurat_out = FALSE, + per_cell = FALSE + ) + + if (mode == "pct") { + resb <- gene_pct_markerm(input, + marker, + metadata, + cluster_col = cluster_col, + norm = norm + ) + } else if (mode == "rank") { + if (ncol(marker) > 1 && is.character(marker[1, 1])) { + marker <- pos_neg_marker(marker) + } + resb <- pos_neg_select(input, + marker, + metadata, + cluster_col = cluster_col, + cutoff_score = NULL + ) + empty_vec <- setdiff(colnames(resa), colnames(resb)) + empty_mat <- + matrix( + 0, + nrow = nrow(resb), + ncol = length(empty_vec), + dimnames = list(rownames(resb), empty_vec) + ) + resb <- cbind(resb, empty_mat) + } + + res <- resa[order(rownames(resa)), order(colnames(resa))] + + resb[order(rownames(resb)), order(colnames(resb))] * weight + + if ((obj_out || + seurat_out) && + !inherits(input_original, c("matrix", "Matrix", "data.frame"))) { + df_temp <- cor_to_call( + res, + metadata = metadata, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = metadata, + cluster_col = cluster_col, + per_cell = FALSE, + rename_prefix = rename_prefix + ) + + out <- insert_meta_object(input_original, + df_temp_full, + lookuptable = lookuptable + ) + + return(out) + } else { + if (call == TRUE) { + df_temp <- cor_to_call(res, + threshold = threshold + ) + colnames(df_temp) <- c(cluster_col, "type", "score") + return(df_temp) + } else { + return(res) + } + } +} + +#' @rdname clustify_nudge +#' @export +clustify_nudge.seurat <- function(input, + ref_mat, + marker, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = TRUE, + obj_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + marker_inmatrix = TRUE, + mode = "rank", + rename_prefix = NULL, + ...) { + if (marker_inmatrix != TRUE) { + marker <- matrixize_markers( + marker, + ... + ) + } + resa <- clustify( + input = input, + ref_mat = ref_mat, + cluster_col = cluster_col, + query_genes = query_genes, + seurat_out = FALSE, + per_cell = FALSE, + dr = dr + ) + + if (mode == "pct") { + resb <- gene_pct_markerm(input@data, + marker, + input@meta.data, + cluster_col = cluster_col, + norm = norm + ) + } else if (mode == "rank") { + if (ncol(marker) > 1 && is.character(marker[1, 1])) { + marker <- pos_neg_marker(marker) + } + resb <- pos_neg_select( + input@data, + marker, + metadata = input@meta.data, + cluster_col = cluster_col, + cutoff_score = NULL + ) + empty_vec <- setdiff(colnames(resa), colnames(resb)) + empty_mat <- + matrix( + 0, + nrow = nrow(resb), + ncol = length(empty_vec), + dimnames = list(rownames(resb), empty_vec) + ) + resb <- cbind(resb, empty_mat) + } + + res <- resa[order(rownames(resa)), order(colnames(resa))] + + resb[order(rownames(resb)), order(colnames(resb))] * weight + + if (!(seurat_out || obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = input@meta.data, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = input@meta.data, + cluster_col = cluster_col, + per_cell = FALSE, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + input@meta.data <- df_temp_full + return(input) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + input + } +} + +#' @rdname clustify_nudge +#' @export +clustify_nudge.Seurat <- function(input, + ref_mat, + marker, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = TRUE, + obj_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + marker_inmatrix = TRUE, + mode = "rank", + rename_prefix = NULL, + ...) { + if (marker_inmatrix != TRUE) { + marker <- matrixize_markers( + marker, + ... + ) + } + resa <- clustify( + input = input, + ref_mat = ref_mat, + cluster_col = cluster_col, + query_genes = query_genes, + seurat_out = FALSE, + per_cell = FALSE, + dr = dr + ) + + if (mode == "pct") { + resb <- gene_pct_markerm( + input@assays$RNA@data, + marker, + input@meta.data, + cluster_col = cluster_col, + norm = norm + ) + } else if (mode == "rank") { + if (ncol(marker) > 1 && is.character(marker[1, 1])) { + marker <- pos_neg_marker(marker) + } + resb <- pos_neg_select( + input@assays$RNA@data, + marker, + metadata = input@meta.data, + cluster_col = cluster_col, + cutoff_score = NULL + ) + empty_vec <- setdiff(colnames(resa), colnames(resb)) + empty_mat <- + matrix( + 0, + nrow = nrow(resb), + ncol = length(empty_vec), + dimnames = list(rownames(resb), empty_vec) + ) + resb <- cbind(resb, empty_mat) + } + + res <- resa[order(rownames(resa)), order(colnames(resa))] + + resb[order(rownames(resb)), order(colnames(resb))] * weight + + if (!(seurat_out || obj_out)) { + res + } else { + df_temp <- cor_to_call( + res, + metadata = input@meta.data, + cluster_col = cluster_col, + threshold = threshold + ) + + df_temp_full <- call_to_metadata( + df_temp, + metadata = input@meta.data, + cluster_col = cluster_col, + per_cell = FALSE, + rename_prefix = rename_prefix + ) + + if ("Seurat" %in% loadedNamespaces()) { + input@meta.data <- df_temp_full + return(input) + } else { + message("seurat not loaded, returning cor_mat instead") + return(res) + } + input + } +} + +#' more flexible parsing of single cell objects +#' +#' @param input input object +#' @param type look up predefined slots/loc +#' @param expr_loc expression matrix location +#' @param meta_loc metadata location +#' @param var_loc variable genes location +#' @param cluster_col column of clustering from metadata +#' @param lookuptable if not supplied, will look in built-in table for object parsing +#' @return list of expression, metadata, vargenes, cluster_col info from object +#' @examples +#' obj <- parse_loc_object(s_small3) +#' length(obj) +#' @export +parse_loc_object <- function(input, + type = class(input), + expr_loc = NULL, + meta_loc = NULL, + var_loc = NULL, + cluster_col = NULL, + lookuptable = NULL) { + if (is.null(lookuptable)) { + object_loc_lookup1 <- clustifyr::object_loc_lookup + } else { + object_loc_lookup1 <- lookuptable + } + + if (length(intersect(type, colnames(object_loc_lookup1))) > 0) { + type <- intersect(type, colnames(object_loc_lookup1))[1] + parsed <- list( + eval(parse(text = object_loc_lookup1[[type]][1])), + as.data.frame(eval(parse(text = object_loc_lookup1[[type]][2]))), + eval(parse(text = object_loc_lookup1[[type]][3])), + object_loc_lookup1[[type]][4] + ) + } else { + parsed <- list(NULL, NULL, NULL, NULL) + } + + names(parsed) <- c("expr", "meta", "var", "col") + + if (!(is.null(expr_loc))) { + parsed[["expr"]] <- eval(parse(text = paste0("input", expr_loc))) + } + + if (!(is.null(meta_loc))) { + parsed[["meta"]] <- + as.data.frame(eval(parse(text = paste0( + "input", meta_loc + )))) + } + + if (!(is.null(var_loc))) { + parsed[["var"]] <- eval(parse(text = paste0("input", var_loc))) + } + + if (!(is.null(cluster_col))) { + parsed[["col"]] <- cluster_col + } + + parsed +} + +#' more flexible metadata update of single cell objects +#' +#' @param input input object +#' @param new_meta new metadata table to insert back into object +#' @param type look up predefined slots/loc +#' @param meta_loc metadata location +#' @param lookuptable if not supplied, +#' will look in built-in table for object parsing +#' @return new object with new metadata inserted +#' @examples +#' \dontrun{ +#' insert_meta_object(s_small3, seurat_meta(s_small3, dr = "tsne")) +#' } +#' @export +insert_meta_object <- function(input, + new_meta, + type = class(input), + meta_loc = NULL, + lookuptable = NULL) { + if (is.null(lookuptable)) { + object_loc_lookup1 <- clustifyr::object_loc_lookup + } else { + object_loc_lookup1 <- lookuptable + } + + if (!type %in% colnames(object_loc_lookup1)) { + stop("unrecognized object type", call. = FALSE) + } else { + text1 <- paste0(object_loc_lookup1[[type]][2], " <- ", "new_meta") + eval(parse(text = text1)) + return(input) + } +} + +#' compare clustering parameters and classification outcomes +#' +#' @param expr expression matrix +#' @param metadata metadata including cluster info and +#' dimension reduction plotting +#' @param ref_mat reference matrix +#' @param cluster_col column of clustering from metadata +#' @param x_col column of metadata for x axis plotting +#' @param y_col column of metadata for y axis plotting +#' @param n expand n-fold for over/under clustering +#' @param ngenes number of genes to use for feature selection, +#' use all genes if NULL +#' @param query_genes vector, otherwise genes with be recalculated +#' @param do_label whether to label each cluster at median center +#' @param do_legend whether to draw legend +#' @param newclustering use kmeans if NULL on dr +#' or col name for second column of clustering +#' @param threshold type calling threshold +#' @return faceted ggplot object +#' @examples +#' set.seed(42) +#' overcluster_test( +#' expr = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' cluster_col = "classified", +#' x_col = "UMAP_1", +#' y_col = "UMAP_2" +#' ) +#' @export +overcluster_test <- function(expr, + metadata, + ref_mat, + cluster_col, + x_col = "UMAP_1", + y_col = "UMAP_2", + n = 5, + ngenes = NULL, + query_genes = NULL, + threshold = 0, + do_label = TRUE, + do_legend = FALSE, + newclustering = NULL) { + if (is.null(newclustering)) { + metadata$new_clusters <- + as.character(stats::kmeans(metadata[, c(x_col, y_col)], + centers = n * length(unique(metadata[[cluster_col]])) + )$clust) + } else { + metadata$new_clusters <- metadata[[newclustering]] + n <- length(unique(metadata[[newclustering]])) / + length(unique(metadata[[cluster_col]])) + } + + if (is.null(query_genes)) { + if (is.null(ngenes)) { + genes <- rownames(expr) + } else { + genes <- ref_feature_select(expr, ngenes) + } + } else { + genes <- query_genes + } + res1 <- clustify( + expr, + ref_mat, + metadata, + query_genes = genes, + cluster_col = cluster_col, + seurat_out = FALSE + ) + res2 <- clustify( + expr, + ref_mat, + metadata, + query_genes = genes, + cluster_col = "new_clusters", + seurat_out = FALSE + ) + o1 <- plot_dims( + metadata, + feature = cluster_col, + x = x_col, + y = y_col, + do_label = FALSE, + do_legend = FALSE + ) + o2 <- plot_dims( + metadata, + feature = "new_clusters", + x = x_col, + y = y_col, + do_label = FALSE, + do_legend = FALSE + ) + p1 <- plot_best_call( + res1, + metadata, + cluster_col, + threshold = threshold, + do_label = do_label, + do_legend = do_legend, + x = x_col, + y = y_col + ) + p2 <- plot_best_call( + res2, + metadata, + "new_clusters", + threshold = threshold, + do_label = do_label, + do_legend = do_legend, + x = x_col, + y = y_col + ) + g <- cowplot::plot_grid(o1, o2, p1, p2, + labels = c( + length(unique(metadata[[cluster_col]])), + n * length(unique(metadata[[cluster_col]])) + ) + ) + return(g) +} + +#' feature select from reference matrix +#' +#' @param mat reference matrix +#' @param n number of genes to return +#' @param mode the method of selecting features +#' @param rm.lowvar whether to remove lower variation genes first +#' @return vector of genes +#' @examples +#' pbmc_avg <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' +#' ref_feature_select( +#' mat = pbmc_avg[1:100, ], +#' n = 5 +#' ) +#' @export +ref_feature_select <- function(mat, + n = 3000, + mode = "var", + rm.lowvar = TRUE) { + if (rm.lowvar == TRUE) { + if (!(is.matrix(mat))) { + mat <- as.matrix(mat) + } + v <- matrixStats::rowVars(mat) + names(v) <- rownames(mat) + v2 <- v[order(-v)][seq_len(length(v) / 2)] + mat <- mat[names(v2)[!is.na(names(v2))], ] + } + + if (mode == "cor") { + cor_mat <- cor(t(as.matrix(mat)), method = "spearman") + diag(cor_mat) <- rep(0, times = nrow(cor_mat)) + cor_mat <- abs(cor_mat) + score <- matrixStats::rowMaxs(cor_mat, na.rm = TRUE) + names(score) <- rownames(cor_mat) + score <- score[order(-score)] + cor_genes <- names(score[seq_len(n)]) + } else if (mode == "var") { + cor_genes <- names(v2[seq_len(n)]) + } + cor_genes +} + +#' Returns a list of variable genes based on PCA +#' +#' @description Extract genes, i.e. "features", based on the top +#' loadings of principal components +#' formed from the bulk expression data set +#' +#' @param mat Expression matrix. Rownames are genes, +#' colnames are single cell cluster name, and +#' values are average single cell expression (log transformed). +#' @param pcs Precalculated pcs if available, will skip over processing on mat. +#' @param n_pcs Number of PCs to selected gene loadings from. +#' See the explore_PCA_corr.Rmd vignette for details. +#' @param percentile Select the percentile of absolute values of +#' PCA loadings to select genes from. E.g. 0.999 would select the +#' top point 1 percent of genes with the largest loadings. +#' @param if_log whether the data is already log transformed +#' @return vector of genes +#' @examples +#' feature_select_PCA( +#' cbmc_ref, +#' if_log = FALSE +#' ) +#' @export +feature_select_PCA <- function(mat = NULL, + pcs = NULL, + n_pcs = 10, + percentile = 0.99, + if_log = TRUE) { + if (if_log == FALSE) { + mat <- log(mat + 1) + } + + # Get the PCs + if (is.null(pcs)) { + pca <- prcomp(t(as.matrix(mat)))$rotation + } else { + pca <- pcs + } + + # For the given number PCs, select the genes with the largest loadings + genes <- c() + for (i in seq_len(n_pcs)) { + cutoff <- quantile(abs(pca[, i]), probs = percentile) + genes <- c(genes, rownames(pca[abs(pca[, i]) >= cutoff, ])) + } + + return(genes) +} + +#' convert gmt format of pathways to list of vectors +#' +#' @param path gmt file path +#' @param cutoff remove pathways with less genes than this cutoff +#' @param sep sep used in file to split path and genes +#' @return list of genes in each pathway +#' @examples +#' gmt_file <- system.file( +#' "extdata", +#' "c2.cp.reactome.v6.2.symbols.gmt.gz", +#' package = "clustifyr" +#' ) +#' +#' gl <- gmt_to_list(path = gmt_file) +#' length(gl) +#' @export +gmt_to_list <- function(path, + cutoff = 0, + sep = "\thttp://www.broadinstitute.org/gsea/msigdb/cards/.*?\t") { + df <- readr::read_csv(path, + col_names = FALSE + ) + df <- tidyr::separate(df, !!dplyr::sym("X1"), + sep = sep, + into = c("path", "genes") + ) + pathways <- stringr::str_split( + df$genes, + "\t" + ) + names(pathways) <- stringr::str_replace( + df$path, + "REACTOME_", + "" + ) + if (cutoff > 0) { + ids <- vapply(pathways, function(i) { + length(i) < cutoff + }, FUN.VALUE = logical(1)) + pathways <- pathways[!ids] + } + return(pathways) +} + +#' plot GSEA pathway scores as heatmap, +#' returns a list containing results and plot. +#' +#' @param mat expression matrix +#' @param pathway_list a list of vectors, each named for a specific pathway, +#' or dataframe +#' @param n_perm Number of permutation for fgsea function. Defaults to 1000. +#' @param scale convert expr_mat into zscores prior to running GSEA?, +#' default = TRUE +#' @param topn number of top pathways to plot +#' @param returning to return "both" list and plot, or either one +#' @return list of matrix and plot, or just plot, matrix of GSEA NES values, +#' cell types as row names, pathways as column names +#' @examples +#' gl <- list( +#' "n" = c("PPBP", "LYZ", "S100A9"), +#' "a" = c("IGLL5", "GNLY", "FTL") +#' ) +#' +#' pbmc_avg <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified" +#' ) +#' +#' plot_pathway_gsea( +#' pbmc_avg, +#' gl, +#' 5 +#' ) +#' @export +plot_pathway_gsea <- function(mat, + pathway_list, + n_perm = 1000, + scale = TRUE, + topn = 5, + returning = "both") { + res <- calculate_pathway_gsea(mat, + pathway_list, + n_perm, + scale = scale + ) + coltopn <- + unique(cor_to_call_topn(res, topn = topn, threshold = -Inf)$type) + res[is.na(res)] <- 0 + + g <- suppressWarnings(ComplexHeatmap::Heatmap(res[, coltopn], + column_names_gp = grid::gpar(fontsize = 6) + )) + + if (returning == "both") { + return(list(res, g)) + } else if (returning == "plot") { + return(g) + } else { + return(res) + } +} + +#' downsample matrix by cluster or completely random +#' +#' @param mat expression matrix +#' @param n number per cluster or fraction to keep +#' @param keep_cluster_proportions whether to subsample +#' @param metadata data.frame or +#' vector containing cluster assignments per cell. +#' Order must match column order in supplied matrix. If a data.frame +#' provide the cluster_col parameters. +#' @param cluster_col column in metadata with cluster number +#' @return new smaller mat with less cell_id columns +#' @examples +#' set.seed(42) +#' mat <- downsample_matrix( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta$classified, +#' n = 10, +#' keep_cluster_proportions = TRUE +#' ) +#' mat[1:3, 1:3] +#' @export +downsample_matrix <- function(mat, + n = 1, + keep_cluster_proportions = TRUE, + metadata = NULL, + cluster_col = "cluster") { + cluster_info <- metadata + if (keep_cluster_proportions == FALSE) { + cluster_ids <- colnames(mat) + if (n < 1) { + n <- as.integer(ncol(mat) * n) + } + cluster_ids_new <- sample(cluster_ids, n) + } else { + if (is.vector(cluster_info)) { + cluster_ids <- split(colnames(mat), cluster_info) + } else if (is.data.frame(cluster_info) & + !is.null(cluster_col)) { + cluster_ids <- split(colnames(mat), cluster_info[[cluster_col]]) + } else if (is.factor(cluster_info)) { + cluster_info <- as.character(cluster_info) + cluster_ids <- split(colnames(mat), cluster_info) + } else { + stop("metadata not formatted correctly, + supply either a vector or a dataframe", + call. = FALSE + ) + } + if (n < 1) { + n2 <- vapply(cluster_ids, function(x) { + as.integer(length(x) * n) + }, FUN.VALUE = numeric(1)) + n <- n2 + } + cluster_ids_new <- + mapply(sample, cluster_ids, n, SIMPLIFY = FALSE) + } + return(mat[, unlist(cluster_ids_new)]) +} + +#' marker selection from reference matrix +#' +#' @param mat reference matrix +#' @param cut an expression minimum cutoff +#' @param arrange whether to arrange (lower means better) +#' @param compto compare max expression to the value of next 1 or more +#' @return dataframe, with gene, cluster, ratio columns +#' @examples +#' ref_marker_select( +#' cbmc_ref, +#' cut = 2 +#' ) +#' @export +ref_marker_select <- + function(mat, + cut = 0.5, + arrange = TRUE, + compto = 1) { + mat <- mat[!is.na(rownames(mat)), ] + mat <- mat[Matrix::rowSums(mat) != 0, ] + ref_cols <- colnames(mat) + res <- + apply(mat, 1, marker_select, ref_cols, cut, compto = compto) + if (is.list(res)) { + res <- res[!vapply(res, is.null, FUN.VALUE = logical(1))] + } + resdf <- t(as.data.frame(res, stringsAsFactors = FALSE)) + resdf <-tibble::rownames_to_column(as.data.frame(resdf, + stringsAsFactors = FALSE), + "gene") + colnames(resdf) <- c("gene", "cluster", "ratio") + resdf <- + dplyr::mutate(resdf, + ratio = as.numeric(!!dplyr::sym("ratio"))) + if (arrange == TRUE) { + resdf <- dplyr::group_by(resdf, cluster) + resdf <- + dplyr::arrange(resdf, !!dplyr::sym("ratio"), + .by_group = TRUE) + resdf <- dplyr::ungroup(resdf) + } + resdf + } + +#' decide for one gene whether it is a marker for a certain cell type +#' @param row1 a numeric vector of expression values (row) +#' @param cols a vector of cell types (column) +#' @param cut an expression minimum cutoff +#' @param compto compare max expression to the value of next 1 or more +#' @return vector of cluster name and ratio value +#' @examples +#' pbmc_avg <- average_clusters( +#' mat = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' if_log = FALSE +#' ) +#' +#' marker_select( +#' row1 = pbmc_avg["PPBP", ], +#' cols = names(pbmc_avg["PPBP", ]) +#' ) +#' @export +marker_select <- function(row1, + cols, + cut = 1, + compto = 1) { + row_sorted <- sort(row1, decreasing = TRUE) + col_sorted <- names(row_sorted) + num_sorted <- unname(row_sorted) + if (num_sorted[1] >= cut) { + return(c(col_sorted[1], (num_sorted[1 + compto] / num_sorted[1]))) + } +} + +#' adapt clustify to tweak score for pos and neg markers +#' @param input single-cell expression matrix +#' @param metadata cell cluster assignments, +#' supplied as a vector or data.frame. If +#' data.frame is supplied then `cluster_col` needs to be set. +#' Not required if running correlation per cell. +#' @param ref_mat reference expression matrix with positive and +#' negative markers(set expression at 0) +#' @param cluster_col column in metadata that contains cluster ids per cell. +#' Will default to first +#' column of metadata if not supplied. +#' Not required if running correlation per cell. +#' @param cutoff_n expression cutoff where genes ranked below n are +#' considered non-expressing +#' @param cutoff_score positive score lower than this cutoff will be +#' considered as 0 to not influence scores +#' @param ... additional arguments to pass to compute_method function +#' @return matrix of numeric values, clusters from input as row names, +#' cell types from ref_mat as column names +#' @examples +#' pn_ref <- data.frame( +#' "Myeloid" = c(1, 0.01, 0), +#' row.names = c("CD74", "clustifyr0", "CD79A") +#' ) +#' +#' pos_neg_select( +#' input = pbmc_matrix_small, +#' ref_mat = pn_ref, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' cutoff_score = 0.8 +#' ) +#' @export +pos_neg_select <- function(input, + ref_mat, + metadata, + cluster_col = "cluster", + cutoff_n = 0, + cutoff_score = 0.5) { + suppressWarnings( + res <- clustify( + rbind(input, "clustifyr0" = 0.01), + ref_mat, + metadata, + cluster_col = cluster_col, + per_cell = TRUE, + verbose = TRUE, + query_genes = rownames(ref_mat) + ) + ) + res[is.na(res)] <- 0 + suppressWarnings( + res2 <- average_clusters( + t(res), + metadata, + cluster_col = cluster_col, + if_log = FALSE, + output_log = FALSE + ) + ) + res2 <- t(res2) + + if (!(is.null(cutoff_score))) { + res2 <- apply(res2, 2, function(x) { + maxr <- max(x) + if (maxr > 0.1) { + x[x > 0 & x < cutoff_score * maxr] <- 0 + } + x + }) + } + + res2 +} + +#' generate negative markers from a list of exclusive positive markers +#' @param mat matrix or dataframe of markers +#' @return matrix of gene names +#' @examples +#' reverse_marker_matrix(cbmc_m) +#' @export +reverse_marker_matrix <- function(mat) { + full_vec <- as.vector(t(mat)) + mat_rev <- apply(mat, 2, function(x) { + full_vec[!(full_vec %in% x)] + }) + as.data.frame(mat_rev) +} + +#' generate pos and negative marker expression matrix from a +#' list/dataframe of positive markers +#' @param mat matrix or dataframe of markers +#' @return matrix of gene expression +#' @examples +#' m1 <- pos_neg_marker(cbmc_m) +#' @export +pos_neg_marker <- function(mat) { + if (is.data.frame(mat)) { + mat <- as.list(mat) + } else if (is.matrix(mat)) { + mat <- as.list(as.data.frame(mat, + stringsAsFactors = FALSE)) + } else if (!is.list(mat)) { + stop("unsupported marker format, + must be dataframe, matrix, or list", + call. = FALSE + ) + } + genelist <- mat + typenames <- names(genelist) + + g2 <- lapply(genelist, function(x) { + data.frame(gene = x, stringsAsFactors = FALSE) + }) + + g2 <- dplyr::bind_rows(g2, .id = "type") + g2 <- dplyr::mutate(g2, expression = 1) + g2 <- tidyr::spread(g2, key = "type", value = "expression") + g2 <- tibble::column_to_rownames(g2, "gene") + g2[is.na(g2)] <- 0 + g2 +} +#' takes files with positive and negative markers, as described in garnett, +#' and returns list of markers +#' @param filename txt file to load +#' @return list of positive and negative gene markers +#' @examples +#' marker_file <- system.file( +#' "extdata", +#' "hsPBMC_markers.txt", +#' package = "clustifyr" +#' ) +#' +#' file_marker_parse(marker_file) +#' @export +file_marker_parse <- function(filename) { + lines <- readLines(filename) + count <- 0 + ident_names <- c() + ident_pos <- c() + ident_neg <- c() + for (line in lines) { + tag <- substr(line, 1, 1) + if (tag == ">") { + count <- count + 1 + ident_names[count] <- substr(line, 2, nchar(line)) + } else if (tag == "e") { + ident_pos[count] <- + strsplit(substr(line, 12, nchar(line)), split = ", ") + } else if (tag == "n") { + ident_neg[count] <- + strsplit(substr(line, 16, nchar(line)), split = ", ") + } + } + + if (!(is.null(ident_neg))) { + names(ident_neg) <- ident_names + } + if (!(is.null(ident_pos))) { + names(ident_pos) <- ident_names + } + list("pos" = ident_pos, "neg" = ident_neg) +} + +#' Generate a unique column id for a dataframe +#' @param df dataframe with column names +#' @param id desired id if unique +#' @return character +get_unique_column <- function(df, id = NULL) { + if (!is.null(id)) { + out_id <- id + } else { + out_id <- "x" + } + + res <- ifelse(out_id %in% colnames(df), + make.unique(c(colnames(df), + out_id))[length(c(colnames(df), + out_id))], + out_id + ) + + res +} + +#' Find rank bias +#' @param mat original query expression matrix +#' @param metadata metadata after calling types +#' @param type_col column name in metadata that contains ids +#' @param ref_mat reference expression matrix +#' @param query_genes original vector of genes used to clustify +#' @param filter_out whether to only report filtered results +#' @param expr_cut consider all lower expressing genes as off +#' @param threshold diff threshold +#' @param consensus_cut filter out if lower +#' han number of types show large diff +#' @return matrix of rank diff values +#' @examples +#' res <- clustify( +#' input = pbmc_matrix_small, +#' metadata = pbmc_meta, +#' ref_mat = cbmc_ref, +#' query_genes = pbmc_vargenes, +#' cluster_col = "classified" +#' ) +#' call1 <- cor_to_call( +#' res, +#' metadata = pbmc_meta, +#' cluster_col = "classified", +#' collapse_to_cluster = FALSE, +#' threshold = 0.8 +#' ) +#' pbmc_meta2 <- call_to_metadata( +#' call1, +#' pbmc_meta, +#' "classified" +#' ) +#' find_rank_bias( +#' pbmc_matrix_small, +#' pbmc_meta2, "type", +#' cbmc_ref, +#' query_genes = pbmc_vargenes +#' ) +#' @export +find_rank_bias <- function(mat, + metadata, + type_col, + ref_mat, + query_genes = NULL, + filter_out = TRUE, + threshold = 0.33, + expr_cut = 3000, + consensus_cut = 1) { + if (is.null(query_genes)) { + query_genes <- intersect( + rownames(mat), + rownames(ref_mat) + ) + } else { + query_genes <- intersect( + query_genes, + intersect( + rownames(mat), + rownames(ref_mat) + ) + ) + } + avg2 <- average_clusters( + mat[, rownames(metadata)], + metadata[[type_col]] + ) + r2 <- t(matrixStats::colRanks(-avg2[query_genes, ], + ties.method = "average")) + rownames(r2) <- query_genes + colnames(r2) <- colnames(avg2) + r2 <- r2[, colnames(r2)[!stringr::str_detect(colnames(r2), + "unassigned"), + drop = FALSE], + drop = FALSE] + + r1 <- t(matrixStats::colRanks(-ref_mat[query_genes, ], + ties.method = "average")) + rownames(r1) <- query_genes + colnames(r1) <- colnames(ref_mat) + r1 <- r1[, colnames(r2), drop = FALSE] + + if (!(is.null(expr_cut))) { + r1[r1 > expr_cut] <- expr_cut + r2[r2 > expr_cut] <- expr_cut + nthreshold <- expr_cut * threshold + } else { + nthreshold <- length(query_genes) * threshold + } + rdiff <- r1 - r2 + if (filter_out) { + rp <- rdiff > nthreshold | rdiff < -nthreshold + rp[r1 > 0.9 * expr_cut & r2 > 0.9 * expr_cut] <- NA + v <- rowMeans(rp, na.rm = TRUE) == 1 + v[is.na(v)] <- FALSE + + v2 <- Matrix::rowSums(rp, na.rm = TRUE) == 1 + prob <- rdiff[v & !v2, , drop = FALSE] + + return(prob) + } else { + return(rdiff) + } +} diff --git a/R/utils.R.REMOVED.git-id b/R/utils.R.REMOVED.git-id deleted file mode 100644 index a27dce281..000000000 --- a/R/utils.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -2c37236168b5691258c64d82367c72a85d96eae3 \ No newline at end of file diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 000000000..a03ba2f98 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,137 @@ +--- +output: github_document +--- + +```{r, echo = FALSE, message = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/" +) +``` + +# clustifyr + +[![Build Status](https://travis-ci.org/rnabioco/clustifyr.svg?branch=master)](https://travis-ci.org/rnabioco/clustifyr) +[![Coverage status](https://codecov.io/gh/rnabioco/clustifyr/branch/master/graph/badge.svg)](https://codecov.io/github/rnabioco/clustifyr?branch=master) + +clustifyr classifies cells and clusters in single-cell RNA sequencing experiments using reference bulk RNA-seq data sets, sorted microarray expression data, single-cell gene signatures, or lists of marker genes. + +Single cell transcriptomes are difficult to annotate without knowledge of the underlying biology. Even with this knowledge, accurate identification can be challenging due to the lack of detectable expression of common marker genes. clustifyr solves this problem by automatically annotating single cells or clusters of cells using single-cell RNA-seq, bulk RNA-seq data, microarray, or marker gene lists. Additional functions enable exploratory analysis of similarities between single cell RNA-seq datasets and reference data. + +## Installation + +``` r +# install.packages("devtools") +devtools::install_github("rnabioco/clustifyr") +``` + +## Example usage + +In this example we use the following built-in input data: + +- an expression matrix of single cell RNA-seq data (`pbmc_matrix_small`) +- a metadata data.frame (`pbmc_meta`), with cluster information stored (`"classified"`) +- a vector of variable genes (`pbmc_vargenes`) +- a matrix of mean normalized scRNA-seq UMI counts by cell type (`cbmc_ref`): + +We then calculate correlation coefficients and plot them on a pre-calculated projection (stored in `pbmc_meta`). + +```{r readme_example, warning=F, message=F} +library(clustifyr) + +# calculate correlation +res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta$classified, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes +) + +# print assignments +cor_to_call(res) + +# plot assignments on a projection +plot_best_call( + cor_mat = res, + metadata = pbmc_meta, + cluster_col = "classified" +) +``` + +`clustify()` can also take a clustered `SingleCellExperiment` or `seurat` object (both v2 and v3) and assign identities. + +```{r example_seurat, warning=F, message=F} +# for SingleCellExperiment +clustify( + input = sce_small, # an SCE object + ref_mat = cbmc_ref, # matrix of RNA-seq expression data for each cell type + cluster_col = "cell_type1", # name of column in meta.data containing cell clusters + obj_out = TRUE # output SCE object with cell type inserted as "type" column +) + +library(Seurat) + +# for seurat2 +clustify( + input = s_small, + cluster_col = "res.1", + ref_mat = cbmc_ref, + seurat_out = TRUE +) + +# for Seurat3 +clustify( + input = s_small3, + cluster_col = "RNA_snn_res.1", + ref_mat = cbmc_ref, + seurat_out = TRUE +) +``` + +New reference matrix can be made directly from `SingleCellExperiment` and `seurat` objects as well. Other scRNAseq experiment object types are supported as well. + +```{r example_ref_matrix} +# make reference from SingleCellExperiment objects +sce_ref <- object_ref( + input = sce_small, # SCE object + cluster_col = "cell_type1" # name of column in colData containing cell identities +) + +# make reference from seurat objects +s_ref <- seurat_ref( + seurat_object = s_small, + cluster_col = "res.1" +) + +head(s_ref) +``` + +`clustify_lists()` handles identity assignment of matrix or `SingleCellExperiment` and `seurat` objects based on marker gene lists. + +```{r example_seurat2, warning=F, message=F} +clustify_lists( + input = pbmc_matrix_small, + metadata = pbmc_meta, + cluster_col = "classified", + marker = pbmc_markers, + marker_inmatrix = FALSE +) + +clustify_lists( + input = s_small, + marker = pbmc_markers, + marker_inmatrix = FALSE, + cluster_col = "res.1", + seurat_out = TRUE +) +``` + +## Additional reference data + +More reference data (including tabula muris, immgen, etc) are available at https://github.com/rnabioco/clustifyrdata. + +Also see list for individual downloads at https://rnabioco.github.io/clustifyrdata/articles/download_refs.html + +Additional tutorials at +https://rnabioco.github.io/clustifyrdata/articles/otherformats.html diff --git a/README.Rmd.REMOVED.git-id b/README.Rmd.REMOVED.git-id deleted file mode 100644 index ae2d5ee54..000000000 --- a/README.Rmd.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -a03ba2f98d5a556e5f78fec6688ffb1b2e058b95 \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 000000000..064f5a329 --- /dev/null +++ b/README.md @@ -0,0 +1,209 @@ + +# clustifyr + +[![Build +Status](https://travis-ci.org/rnabioco/clustifyr.svg?branch=master)](https://travis-ci.org/rnabioco/clustifyr) +[![Coverage +status](https://codecov.io/gh/rnabioco/clustifyr/branch/master/graph/badge.svg)](https://codecov.io/github/rnabioco/clustifyr?branch=master) + +clustifyr classifies cells and clusters in single-cell RNA sequencing +experiments using reference bulk RNA-seq data sets, sorted microarray +expression data, single-cell gene signatures, or lists of marker genes. + +Single cell transcriptomes are difficult to annotate without knowledge +of the underlying biology. Even with this knowledge, accurate +identification can be challenging due to the lack of detectable +expression of common marker genes. clustifyr solves this problem by +automatically annotating single cells or clusters of cells using +single-cell RNA-seq, bulk RNA-seq data, microarray, or marker gene +lists. Additional functions enable exploratory analysis of similarities +between single cell RNA-seq datasets and reference data. + +## Installation + +``` r +# install.packages("devtools") +devtools::install_github("rnabioco/clustifyr") +``` + +## Example usage + +In this example we use the following built-in input data: + + - an expression matrix of single cell RNA-seq data + (`pbmc_matrix_small`) + - a metadata data.frame (`pbmc_meta`), with cluster information stored + (`"classified"`) + - a vector of variable genes (`pbmc_vargenes`) + - a matrix of mean normalized scRNA-seq UMI counts by cell type + (`cbmc_ref`): + +We then calculate correlation coefficients and plot them on a +pre-calculated projection (stored in `pbmc_meta`). + +``` r +library(clustifyr) + +# calculate correlation +res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta$classified, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes +) + +# print assignments +cor_to_call(res) +#> # A tibble: 9 x 3 +#> # Groups: cluster [9] +#> cluster type r +#> +#> 1 B B 0.891 +#> 2 CD14+ Mono CD14+ Mono 0.908 +#> 3 FCGR3A+ Mono CD16+ Mono 0.906 +#> 4 Memory CD4 T CD4 T 0.895 +#> 5 Naive CD4 T CD4 T 0.916 +#> 6 DC DC 0.833 +#> 7 Platelet Mk 0.630 +#> 8 CD8 T NK 0.866 +#> 9 NK NK 0.896 + +# plot assignments on a projection +plot_best_call( + cor_mat = res, + metadata = pbmc_meta, + cluster_col = "classified" +) +``` + +![](man/figures/readme_example-1.png) + +`clustify()` can also take a clustered `SingleCellExperiment` or +`seurat` object (both v2 and v3) and assign identities. + +``` r +# for SingleCellExperiment +clustify( + input = sce_small, # an SCE object + ref_mat = cbmc_ref, # matrix of RNA-seq expression data for each cell type + cluster_col = "cell_type1", # name of column in meta.data containing cell clusters + obj_out = TRUE # output SCE object with cell type inserted as "type" column +) +#> class: SingleCellExperiment +#> dim: 200 200 +#> metadata(0): +#> assays(2): counts logcounts +#> rownames(200): SGIP1 AZIN2 ... TAF12 SNHG3 +#> rowData names(10): feature_symbol is_feature_control ... total_counts +#> log10_total_counts +#> colnames(200): AZ_A1 AZ_A10 ... HP1502401_E18 HP1502401_E19 +#> colData names(35): cell_quality cell_type1 ... type r +#> reducedDimNames(0): +#> spikeNames(1): ERCC + +library(Seurat) + +# for seurat2 +clustify( + input = s_small, + cluster_col = "res.1", + ref_mat = cbmc_ref, + seurat_out = TRUE +) +#> An old seurat object +#> 230 genes across 80 samples + +# for Seurat3 +clustify( + input = s_small3, + cluster_col = "RNA_snn_res.1", + ref_mat = cbmc_ref, + seurat_out = TRUE +) +#> An object of class Seurat +#> 230 features across 80 samples within 1 assay +#> Active assay: RNA (230 features) +#> 2 dimensional reductions calculated: pca, tsne +``` + +New reference matrix can be made directly from `SingleCellExperiment` +and `seurat` objects as well. Other scRNAseq experiment object types are +supported as well. + +``` r +# make reference from SingleCellExperiment objects +sce_ref <- object_ref( + input = sce_small, # SCE object + cluster_col = "cell_type1" # name of column in colData containing cell identities +) +#> recognized object type - SingleCellExperiment + +# make reference from seurat objects +s_ref <- seurat_ref( + seurat_object = s_small, + cluster_col = "res.1" +) + +head(s_ref) +#> 0 1 2 3 +#> MS4A1 4.517255 3.204766 0.000000 0.000000 +#> CD79B 4.504191 3.549095 2.580662 0.000000 +#> CD79A 4.457349 4.199849 0.000000 0.000000 +#> HLA-DRA 6.211779 6.430463 3.659590 4.169965 +#> TCL1A 4.394310 2.837922 0.000000 0.000000 +#> HLA-DQB1 4.380289 4.325293 0.000000 1.666167 +``` + +`clustify_lists()` handles identity assignment of matrix or +`SingleCellExperiment` and `seurat` objects based on marker gene lists. + +``` r +clustify_lists( + input = pbmc_matrix_small, + metadata = pbmc_meta, + cluster_col = "classified", + marker = pbmc_markers, + marker_inmatrix = FALSE +) +#> 0 1 2 3 4 5 6 +#> Naive CD4 T 1.5639055 20.19469 31.77095 8.664074 23.844992 19.06931 19.06931 +#> Memory CD4 T 1.5639055 20.19469 31.77095 10.568007 23.844992 17.97875 19.06931 +#> CD14+ Mono 0.9575077 14.70716 76.21353 17.899569 11.687739 49.86699 16.83210 +#> B 0.6564777 12.70976 31.77095 26.422929 13.536295 20.19469 13.53630 +#> CD8 T 1.0785353 17.97875 31.82210 12.584823 31.822099 22.71234 40.45383 +#> FCGR3A+ Mono 0.6564777 13.63321 72.43684 17.899569 9.726346 56.48245 14.61025 +#> NK 0.6564777 14.61025 31.82210 7.757206 31.822099 22.71234 45.05072 +#> DC 0.6564777 15.80598 63.34978 19.069308 13.758144 40.56298 17.97875 +#> Platelet 0.7403358 13.71477 57.86117 13.714774 12.933078 47.83999 18.20599 +#> 7 8 +#> Naive CD4 T 6.165348 0.6055118 +#> Memory CD4 T 6.165348 0.9575077 +#> CD14+ Mono 25.181595 1.0785353 +#> B 17.899569 0.1401901 +#> CD8 T 7.882145 0.3309153 +#> FCGR3A+ Mono 21.409177 0.3309153 +#> NK 5.358651 0.3309153 +#> DC 45.101877 0.1401901 +#> Platelet 19.334716 64.9077516 + +clustify_lists( + input = s_small, + marker = pbmc_markers, + marker_inmatrix = FALSE, + cluster_col = "res.1", + seurat_out = TRUE +) +#> An old seurat object +#> 230 genes across 80 samples +``` + +## Additional reference data + +More reference data (including tabula muris, immgen, etc) are available +at . + +Also see list for individual downloads at + + +Additional tutorials at + diff --git a/README.md.REMOVED.git-id b/README.md.REMOVED.git-id deleted file mode 100644 index 33d4bc602..000000000 --- a/README.md.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -064f5a329a41e153d769153e5677b24a6d45212f \ No newline at end of file diff --git a/data-raw/cbmc_m.R b/data-raw/cbmc_m.R new file mode 100644 index 000000000..28b2be56c --- /dev/null +++ b/data-raw/cbmc_m.R @@ -0,0 +1,79 @@ +library(Seurat) +library(tidyverse) +library(clustifyr) + +# following seurat tutorial from https://satijalab.org/seurat/v3.0/multimodal_vignette.html#identify-differentially-expressed-proteins-between-clusters +cbmc.rna <- as.sparse(read.csv( + file = "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", sep = ",", + header = TRUE, row.names = 1 +)) +cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna) +cbmc.adt <- as.sparse(read.csv( + file = "GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", sep = ",", + header = TRUE, row.names = 1 +)) +cbmc.adt <- cbmc.adt[setdiff(rownames(x = cbmc.adt), c("CCR5", "CCR7", "CD10")), ] +cbmc <- CreateSeuratObject(counts = cbmc.rna) +cbmc <- NormalizeData(cbmc) +cbmc <- FindVariableFeatures(cbmc) +cbmc <- ScaleData(cbmc) +cbmc <- RunPCA(cbmc, verbose = FALSE) +cbmc <- FindNeighbors(cbmc, dims = 1:25) +cbmc <- FindClusters(cbmc, resolution = 0.8) +cbmc <- RunTSNE(cbmc, dims = 1:25, method = "FIt-SNE") +new.cluster.ids <- c( + "Memory CD4 T", "CD14+ Mono", "Naive CD4 T", "NK", "CD14+ Mono", "Mouse", "B", + "CD8 T", "CD16+ Mono", "T/Mono doublets", "NK", "CD34+", "Multiplets", "Mouse", "Eryth", "Mk", + "Mouse", "DC", "pDCs" +) +names(new.cluster.ids) <- levels(cbmc) +cbmc <- RenameIdents(cbmc, new.cluster.ids) +cbmc[["ADT"]] <- CreateAssayObject(counts = cbmc.adt) +cbmc <- NormalizeData(cbmc, assay = "ADT", normalization.method = "CLR") +cbmc <- ScaleData(cbmc, assay = "ADT") +cbmc <- subset(cbmc, idents = c("Multiplets", "Mouse"), invert = TRUE) +DefaultAssay(cbmc) <- "ADT" +cbmc <- RunPCA(cbmc, + features = rownames(cbmc), reduction.name = "pca_adt", reduction.key = "pca_adt_", + verbose = FALSE +) +adt.data <- GetAssayData(cbmc, slot = "data") +adt.dist <- dist(t(adt.data)) +cbmc[["rnaClusterID"]] <- Idents(cbmc) +cbmc[["tsne_adt"]] <- RunTSNE(adt.dist, assay = "ADT", reduction.key = "adtTSNE_") +cbmc[["adt_snn"]] <- FindNeighbors(adt.dist)$snn +cbmc <- FindClusters(cbmc, resolution = 0.2, graph.name = "adt_snn") +new.cluster.ids <- c( + "CD4 T", "CD14+ Mono", "NK", "B", "CD8 T", "NK", "CD34+", "T/Mono doublets", + "CD16+ Mono", "pDCs", "B" +) +names(new.cluster.ids) <- levels(cbmc) +cbmc <- RenameIdents(cbmc, new.cluster.ids) +cbmc[["citeID"]] <- Idents(cbmc) + +m <- cbmc@meta.data %>% + rownames_to_column("rn") %>% + mutate(ID = ifelse(citeID != "CD8 T" & citeID != "CD4 T", as.character(rnaClusterID), as.character(citeID))) %>% + mutate(ID = ifelse((rnaClusterID == "CD4 T" & citeID != "CD4 T") | (rnaClusterID == "CD8 T" & citeID != "CD8 T"), + "Unknown", + as.character(ID) + )) %>% + column_to_rownames("rn") +cbmc@meta.data <- m + +DefaultAssay(cbmc) <- "RNA" +Idents(cbmc) <- "ID" +m_cb <- FindAllMarkers(cbmc, only.pos = TRUE) + +cbmc_m <- matrixize_markers( + m_cb %>% filter( + pct.1 - pct.2 > 0.15, + cluster != "T/Mono doublets", + cluster != "Unknown" + ), + unique = TRUE, + remove_rp = TRUE, + n = 3 +) + +usethis::use_data(cbmc_m, compress = "xz", overwrite = TRUE) diff --git a/data-raw/cbmc_m.R.REMOVED.git-id b/data-raw/cbmc_m.R.REMOVED.git-id deleted file mode 100644 index 5af1c256e..000000000 --- a/data-raw/cbmc_m.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -28b2be56c164b33058c3a35fe02f0ab2403ea64e \ No newline at end of file diff --git a/data-raw/cbmc_ref.R b/data-raw/cbmc_ref.R new file mode 100644 index 000000000..9af98f330 --- /dev/null +++ b/data-raw/cbmc_ref.R @@ -0,0 +1,83 @@ +library(usethis) +library(Seurat) +library(tidyverse) +library(clustifyr) + +# following seurat tutorial from https://satijalab.org/seurat/v3.0/multimodal_vignette.html#identify-differentially-expressed-proteins-between-clusters +cbmc.rna <- as.sparse(read.csv( + file = "GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", sep = ",", + header = TRUE, row.names = 1 +)) +cbmc.rna <- CollapseSpeciesExpressionMatrix(cbmc.rna) +cbmc.adt <- as.sparse(read.csv( + file = "GSE100866_CBMC_8K_13AB_10X-ADT_umi.csv.gz", sep = ",", + header = TRUE, row.names = 1 +)) +cbmc.adt <- cbmc.adt[setdiff(rownames(x = cbmc.adt), c("CCR5", "CCR7", "CD10")), ] +cbmc <- CreateSeuratObject(counts = cbmc.rna) +cbmc <- NormalizeData(cbmc) +cbmc <- FindVariableFeatures(cbmc) +cbmc <- ScaleData(cbmc) +cbmc <- RunPCA(cbmc, verbose = FALSE) +cbmc <- FindNeighbors(cbmc, dims = 1:25) +cbmc <- FindClusters(cbmc, resolution = 0.8) +cbmc <- RunTSNE(cbmc, dims = 1:25, method = "FIt-SNE") +new.cluster.ids <- c( + "Memory CD4 T", "CD14+ Mono", "Naive CD4 T", "NK", "CD14+ Mono", "Mouse", "B", + "CD8 T", "CD16+ Mono", "T/Mono doublets", "NK", "CD34+", "Multiplets", "Mouse", "Eryth", "Mk", + "Mouse", "DC", "pDCs" +) +names(new.cluster.ids) <- levels(cbmc) +cbmc <- RenameIdents(cbmc, new.cluster.ids) +cbmc[["ADT"]] <- CreateAssayObject(counts = cbmc.adt) +cbmc <- NormalizeData(cbmc, assay = "ADT", normalization.method = "CLR") +cbmc <- ScaleData(cbmc, assay = "ADT") +cbmc <- subset(cbmc, idents = c("Multiplets", "Mouse"), invert = TRUE) +DefaultAssay(cbmc) <- "ADT" +cbmc <- RunPCA(cbmc, + features = rownames(cbmc), reduction.name = "pca_adt", reduction.key = "pca_adt_", + verbose = FALSE +) +adt.data <- GetAssayData(cbmc, slot = "data") +adt.dist <- dist(t(adt.data)) +cbmc[["rnaClusterID"]] <- Idents(cbmc) +cbmc[["tsne_adt"]] <- RunTSNE(adt.dist, assay = "ADT", reduction.key = "adtTSNE_") +cbmc[["adt_snn"]] <- FindNeighbors(adt.dist)$snn +cbmc <- FindClusters(cbmc, resolution = 0.2, graph.name = "adt_snn") +new.cluster.ids <- c( + "CD4 T", "CD14+ Mono", "NK", "B", "CD8 T", "NK", "CD34+", "T/Mono doublets", + "CD16+ Mono", "pDCs", "B" +) +names(new.cluster.ids) <- levels(cbmc) +cbmc <- RenameIdents(cbmc, new.cluster.ids) +cbmc[["citeID"]] <- Idents(cbmc) + +m <- cbmc@meta.data %>% + rownames_to_column("rn") %>% + mutate(ID = ifelse(citeID != "CD8 T" & citeID != "CD4 T", as.character(rnaClusterID), as.character(citeID))) %>% + mutate(ID = ifelse((rnaClusterID == "CD4 T" & citeID != "CD4 T") | (rnaClusterID == "CD8 T" & citeID != "CD8 T"), "Unknown", as.character(ID))) %>% + column_to_rownames("rn") +cbmc@meta.data <- m + +cbmc_refm <- use_seurat_comp( + cbmc, + cluster_col = "ID", + var_genes_only = FALSE, + assay_name = NULL +) + +cbmc_ref <- as.data.frame(cbmc_refm) %>% + as_tibble(rownames = "gene") %>% + select(-Unknown, -`T/Mono doublets`) %>% + filter(str_sub(gene, 1, 5) != "MOUSE" & str_sub(gene, 1, 5) != "ERCC_") %>% + column_to_rownames("gene") %>% + as.matrix() + +cbmc_ref <- cbmc_ref[Matrix::rowSums(cbmc_ref) != 0, ] + +var_genes <- ref_feature_select(cbmc_ref, n = 2000) +cbmc_ref <- cbmc_ref[var_genes, ] + +usethis::use_data(cbmc_ref, compress = "xz", overwrite = TRUE) + + diff --git a/data-raw/cbmc_ref.R.REMOVED.git-id b/data-raw/cbmc_ref.R.REMOVED.git-id deleted file mode 100644 index d58adf8d0..000000000 --- a/data-raw/cbmc_ref.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -9af98f33014ae9391171c07dce341a5d47744f6b \ No newline at end of file diff --git a/data-raw/downrefs.R b/data-raw/downrefs.R new file mode 100644 index 000000000..2bdfe90e7 --- /dev/null +++ b/data-raw/downrefs.R @@ -0,0 +1,87 @@ +library(tidyverse) + +d1 <- c( + paste0("[", "ref_MCA", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_MCA.rda", ")"), + "Mouse Cell Atlas", + dim(clustifyrdata::ref_MCA)[2], + dim(clustifyrdata::ref_MCA)[1], + "mouse", + paste0("[from](", "https://www.cell.com/cell/fulltext/S0092-8674(18)30116-8", ")") +) + +d2 <- c( + paste0("[", "ref_tabula_muris_drop", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_tabula_muris_drop.rda", ")"), + "Tabula Muris (10X)", + dim(clustifyrdata::ref_tabula_muris_drop)[2], + dim(clustifyrdata::ref_tabula_muris_drop)[1], + "mouse", + paste0("[from](", "https://www.nature.com/articles/s41586-018-0590-4", ")") +) + +d3 <- c( + paste0("[", "ref_tabula_muris_facs", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_tabula_muris_facs.rda", ")"), + "Tabula Muris (SmartSeq2)", + dim(clustifyrdata::ref_tabula_muris_facs)[2], + dim(clustifyrdata::ref_tabula_muris_facs)[1], + "mouse", + paste0("[from](", "https://www.nature.com/articles/s41586-018-0590-4", ")") +) + +d4 <- c( + paste0("[", "ref_mouse.rnaseq", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_mouse.rnaseq.rda", ")"), + "Mouse RNA-seq from 28 cell types", + dim(clustifyrdata::ref_mouse.rnaseq)[2], + dim(clustifyrdata::ref_mouse.rnaseq)[1], + "mouse", + paste0("[from](", "https://genome.cshlp.org/content/early/2019/03/11/gr.240093.118", ")") +) + +d5 <- c( + paste0("[", "ref_moca_main", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_moca_main.rda", ")"), + "Mouse Organogenesis Cell Atlas (main cell types)", + dim(clustifyrdata::ref_moca_main)[2], + dim(clustifyrdata::ref_moca_main)[1], + "mouse", + paste0("[from](", "https://www.nature.com/articles/s41586-019-0969-x", ")") +) + +d6 <- c( + paste0("[", "ref_hema_microarray", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_hema_microarray.rda", ")"), + "Human hematopoietic cell microarray", + dim(clustifyrdata::ref_hema_microarray)[2], + dim(clustifyrdata::ref_hema_microarray)[1], + "human", + paste0("[from](", "https://www.cell.com/fulltext/S0092-8674(11)00005-5", ")") +) + +d7 <- c( + paste0("[", "ref_cortex_dev", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_cortex_dev.rda", ")"), + "Human cortex development scRNA-seq", + dim(clustifyrdata::ref_cortex_dev)[2], + dim(clustifyrdata::ref_cortex_dev)[1], + "human", + paste0("[from](", "https://science.sciencemag.org/content/358/6368/1318.long", ")") +) + +d8 <- c( + paste0("[", "ref_pan_indrop", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_pan_indrop.rda", ")"), + "Human pancreatic cell scRNA-seq (inDrop)", + dim(clustifyrdata::ref_pan_indrop)[2], + dim(clustifyrdata::ref_pan_indrop)[1], + "human", + paste0("[from](", "https://www.cell.com/fulltext/S2405-4712(16)30266-6", ")") +) + +d9 <- c( + paste0("[", "ref_pan_smartseq2", "](", "https://github.com/rnabioco/clustifyrdata/raw/master/data/ref_pan_smartseq2.rda", ")"), + "Human pancreatic cell scRNA-seq (SmartSeq2)", + dim(clustifyrdata::ref_pan_smartseq2)[2], + dim(clustifyrdata::ref_pan_smartseq2)[1], + "human", + paste0("[from](", "https://www.sciencedirect.com/science/article/pii/S1550413116304363", ")") +) + +d <- data.frame(d1, d2, d3, d4, d5, d6, d7, d8, d9) %>% t() +colnames(d) <- c("name", "desc", "ntypes", "ngenes", "org", "from_pub") +downrefs <- d %>% as.tibble() +usethis::use_data(downrefs, compress = "xz", overwrite = TRUE) diff --git a/data-raw/downrefs.R.REMOVED.git-id b/data-raw/downrefs.R.REMOVED.git-id deleted file mode 100644 index e5c7712d2..000000000 --- a/data-raw/downrefs.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -2bdfe90e7fffb5ea624722c9eba556e88287ab22 \ No newline at end of file diff --git a/data-raw/pbmc_bulk_matrix.R b/data-raw/pbmc_bulk_matrix.R new file mode 100644 index 000000000..ea7511ab2 --- /dev/null +++ b/data-raw/pbmc_bulk_matrix.R @@ -0,0 +1,104 @@ +library(dplyr) +library(purrr) +library(tidyr) +library(stringr) +library(recount) + +dl_recount <- function(sra_id) { + download_study(sra_id) + load(file.path(sra_id, "rse_gene.Rdata")) + # no longer need to downloaded data + unlink(sra_id, recursive = TRUE) + + rse <- scale_counts(rse_gene) + read_counts <- assay(rse, "counts") + gene_ids <- rownames(read_counts) + # get gene symbols, which are stored in rowData + id2symbol <- data_frame( + ids = rowData(rse_gene)$gene_id, + symbols = rowData(rse_gene)$symbol@listData + ) %>% + mutate(symbols = map_chr(symbols, ~ .x[1])) + + # clean up metadata into a dataframe + mdata <- colData(rse) + mdata_cols <- lapply( + mdata$characteristics, + function(x) { + str_match(x, "^([^:]+):")[, 2] + } + ) %>% + unique() %>% + unlist() + + mdata <- data_frame( + run = mdata$run, + all_data = as.list(mdata$characteristics) + ) %>% + mutate(out = purrr::map_chr( + all_data, + ~ str_c(.x, collapse = "::") + )) %>% + tidyr::separate(out, + sep = "::", + into = mdata_cols + ) %>% + select(-all_data) %>% + mutate_at( + .vars = vars(-matches("run")), + .funs = function(x) str_match(x, ": (.+)")[, 2] + ) + + # convert ids to symbols + row_ids_to_symbols <- left_join(data_frame(ids = gene_ids), + id2symbol, + by = "ids" + ) + + if (length(gene_ids) != nrow(row_ids_to_symbols)) { + warning("gene id mapping to symbols produce more or less ids") + } + + row_ids_to_symbols <- filter(row_ids_to_symbols, !is.na(symbols)) + + out_df <- read_counts %>% + as.data.frame() %>% + tibble::rownames_to_column("gene_id") %>% + left_join(., row_ids_to_symbols, + by = c("gene_id" = "ids") + ) %>% + dplyr::select(-gene_id) %>% + dplyr::select(symbols, everything()) %>% + filter(!is.na(symbols)) + + out_matrix <- tidyr::gather(out_df, library, expr, -symbols) %>% + group_by(symbols, library) %>% + summarize(expr = sum(expr)) %>% + tidyr::spread(library, expr) %>% + as.data.frame() %>% + tibble::column_to_rownames("symbols") %>% + as.matrix() + + list( + read_counts = out_matrix, + meta_data = mdata + ) +} + +# download +pbmc_data <- dl_recount("SRP051688") +# filter +good_libs <- filter(pbmc_data$meta_data, str_detect(time, "0")) +pbmc_data <- pbmc_data$read_counts[, good_libs$run] +# rename +new_ids <- left_join(data_frame(run = colnames(pbmc_data)), + good_libs, + by = "run" +) %>% + group_by(`cell type`) %>% + mutate(cell_id = stringr::str_c(`cell type`, " rep ", row_number())) %>% + pull(cell_id) + +colnames(pbmc_data) <- new_ids +pbmc_bulk_matrix <- pbmc_data +usethis::use_data(pbmc_bulk_matrix, compress = "xz", overwrite = TRUE) diff --git a/data-raw/pbmc_bulk_matrix.R.REMOVED.git-id b/data-raw/pbmc_bulk_matrix.R.REMOVED.git-id deleted file mode 100644 index f6bc9ba95..000000000 --- a/data-raw/pbmc_bulk_matrix.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -ea7511ab25e937ec72ef5f330bacac6e6f3f7ce3 \ No newline at end of file diff --git a/data/cbmc_m.rda b/data/cbmc_m.rda new file mode 100644 index 000000000..fc512d194 Binary files /dev/null and b/data/cbmc_m.rda differ diff --git a/data/cbmc_ref.rda b/data/cbmc_ref.rda new file mode 100644 index 000000000..a1e05c602 Binary files /dev/null and b/data/cbmc_ref.rda differ diff --git a/data/cbmc_ref.rda.REMOVED.git-id b/data/cbmc_ref.rda.REMOVED.git-id deleted file mode 100644 index 429e05ac8..000000000 --- a/data/cbmc_ref.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -a1e05c6026776f831bf7888383413b12ea3fe07f \ No newline at end of file diff --git a/data/downrefs.rda b/data/downrefs.rda new file mode 100644 index 000000000..80db61ae1 Binary files /dev/null and b/data/downrefs.rda differ diff --git a/data/object_loc_lookup.rda b/data/object_loc_lookup.rda new file mode 100644 index 000000000..cb159deb5 Binary files /dev/null and b/data/object_loc_lookup.rda differ diff --git a/data/pbmc_markers.rda b/data/pbmc_markers.rda new file mode 100644 index 000000000..389989e85 Binary files /dev/null and b/data/pbmc_markers.rda differ diff --git a/data/pbmc_markers.rda.REMOVED.git-id b/data/pbmc_markers.rda.REMOVED.git-id deleted file mode 100644 index 01ccd93e6..000000000 --- a/data/pbmc_markers.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -389989e85bfe580a304db52d6400b70556093f89 \ No newline at end of file diff --git a/data/pbmc_markers_M3Drop.rda b/data/pbmc_markers_M3Drop.rda new file mode 100644 index 000000000..2c80be6b4 Binary files /dev/null and b/data/pbmc_markers_M3Drop.rda differ diff --git a/data/pbmc_markers_M3Drop.rda.REMOVED.git-id b/data/pbmc_markers_M3Drop.rda.REMOVED.git-id deleted file mode 100644 index c0309f015..000000000 --- a/data/pbmc_markers_M3Drop.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -2c80be6b44f364833477d7a2bcaa065edbebc304 \ No newline at end of file diff --git a/data/pbmc_matrix_small.rda b/data/pbmc_matrix_small.rda new file mode 100644 index 000000000..59d03d492 Binary files /dev/null and b/data/pbmc_matrix_small.rda differ diff --git a/data/pbmc_matrix_small.rda.REMOVED.git-id b/data/pbmc_matrix_small.rda.REMOVED.git-id deleted file mode 100644 index 909a69e14..000000000 --- a/data/pbmc_matrix_small.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -59d03d4920613bc8a75513afd7dd424164326094 \ No newline at end of file diff --git a/data/pbmc_meta.rda b/data/pbmc_meta.rda new file mode 100644 index 000000000..b01da843d Binary files /dev/null and b/data/pbmc_meta.rda differ diff --git a/data/pbmc_meta.rda.REMOVED.git-id b/data/pbmc_meta.rda.REMOVED.git-id deleted file mode 100644 index b75c52482..000000000 --- a/data/pbmc_meta.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -b01da843d6b354598f264165f144f35a6f4eadff \ No newline at end of file diff --git a/data/pbmc_vargenes.rda b/data/pbmc_vargenes.rda new file mode 100644 index 000000000..fdff02426 Binary files /dev/null and b/data/pbmc_vargenes.rda differ diff --git a/data/pbmc_vargenes.rda.REMOVED.git-id b/data/pbmc_vargenes.rda.REMOVED.git-id deleted file mode 100644 index a1d0a9a35..000000000 --- a/data/pbmc_vargenes.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -fdff02426120a40e3b76001a9d386142bb617ce4 \ No newline at end of file diff --git a/data/s_small.rda b/data/s_small.rda new file mode 100644 index 000000000..ad31a0dee Binary files /dev/null and b/data/s_small.rda differ diff --git a/data/s_small.rda.REMOVED.git-id b/data/s_small.rda.REMOVED.git-id deleted file mode 100644 index e3e3e2373..000000000 --- a/data/s_small.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -ad31a0dee06aa753930e37bffb80eabce1371e29 \ No newline at end of file diff --git a/data/s_small3.rda b/data/s_small3.rda new file mode 100644 index 000000000..02e0d4202 Binary files /dev/null and b/data/s_small3.rda differ diff --git a/data/s_small3.rda.REMOVED.git-id b/data/s_small3.rda.REMOVED.git-id deleted file mode 100644 index 4182ab7c2..000000000 --- a/data/s_small3.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -02e0d42028b143133e330b381bc2e16486ea91a1 \ No newline at end of file diff --git a/data/sce_small.rda b/data/sce_small.rda new file mode 100644 index 000000000..a1e545ef8 Binary files /dev/null and b/data/sce_small.rda differ diff --git a/data/sce_small.rda.REMOVED.git-id b/data/sce_small.rda.REMOVED.git-id deleted file mode 100644 index 4c5f8f41f..000000000 --- a/data/sce_small.rda.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -a1e545ef8062bb296fd625b7077e1019951bdae3 \ No newline at end of file diff --git a/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz b/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz new file mode 100644 index 000000000..9c39d9642 Binary files /dev/null and b/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz differ diff --git a/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz.REMOVED.git-id b/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz.REMOVED.git-id deleted file mode 100644 index a35d9d566..000000000 --- a/inst/extdata/c2.cp.reactome.v6.2.symbols.gmt.gz.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -9c39d9642c390435ba4f2cf78f44831a7eaf4137 \ No newline at end of file diff --git a/man/clustify.Rd b/man/clustify.Rd new file mode 100644 index 000000000..cca62c997 --- /dev/null +++ b/man/clustify.Rd @@ -0,0 +1,194 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main.R +\name{clustify} +\alias{clustify} +\alias{clustify.default} +\alias{clustify.seurat} +\alias{clustify.Seurat} +\alias{clustify.SingleCellExperiment} +\title{Compare scRNA-seq data to reference data.} +\usage{ +clustify(input, ...) + +\method{clustify}{default}( + input, + ref_mat, + metadata = NULL, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + verbose = FALSE, + lookuptable = NULL, + rm0 = FALSE, + obj_out = TRUE, + seurat_out = TRUE, + rename_prefix = NULL, + threshold = "auto", + low_threshold_cell = 10, + exclude_genes = c(), + ... +) + +\method{clustify}{seurat}( + input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ... +) + +\method{clustify}{Seurat}( + input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ... +) + +\method{clustify}{SingleCellExperiment}( + input, + ref_mat, + cluster_col = NULL, + query_genes = NULL, + per_cell = FALSE, + n_perm = 0, + compute_method = "spearman", + use_var_genes = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = "auto", + verbose = FALSE, + rm0 = FALSE, + rename_prefix = NULL, + exclude_genes = c(), + ... +) +} +\arguments{ +\item{input}{single-cell expression matrix or Seurat object} + +\item{...}{additional arguments to pass to compute_method function} + +\item{ref_mat}{reference expression matrix} + +\item{metadata}{cell cluster assignments, +supplied as a vector or data.frame. +If data.frame is supplied then \code{cluster_col} needs to be set. +Not required if running correlation per cell.} + +\item{cluster_col}{column in metadata that contains cluster ids per cell. +Will default to first column of metadata if not supplied. +Not required if running correlation per cell.} + +\item{query_genes}{A vector of genes of interest to compare. If NULL, then +common genes between the expr_mat and ref_mat +will be used for comparision.} + +\item{per_cell}{if true run per cell, otherwise per cluster.} + +\item{n_perm}{number of permutations, set to 0 by default} + +\item{compute_method}{method(s) for computing similarity scores} + +\item{verbose}{whether to report certain variables chosen} + +\item{lookuptable}{if not supplied, will look in built-in table +for object parsing} + +\item{rm0}{consider 0 as missing data, recommended for per_cell} + +\item{obj_out}{whether to output object instead of cor matrix} + +\item{seurat_out}{output cor matrix or called seurat object +(deprecated, use obj_out instead)} + +\item{rename_prefix}{prefix to add to type and r column names} + +\item{threshold}{identity calling minimum correlation score threshold, +only used when obj_out = TRUE} + +\item{low_threshold_cell}{option to remove clusters with too few cells} + +\item{exclude_genes}{a vector of gene names to throw out of query} + +\item{use_var_genes}{if providing a seurat object, use the variable genes +(stored in seurat_object@var.genes) as the query_genes.} + +\item{dr}{stored dimension reduction} +} +\value{ +single cell object with identity assigned in metadata, +or matrix of correlation values, clusters from input as row names, cell +types from ref_mat as column names +} +\description{ +Compare scRNA-seq data to reference data. +} +\examples{ +# Annotate a matrix and metadata +clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE +) + +# Annotate using a different method +clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + compute_method = "cosine" +) + +# Annotate a Seurat object +clustify( + s_small, + cbmc_ref, + cluster_col = "res.1", + obj_out = TRUE, + per_cell = FALSE, + dr = "tsne" +) + +# Annotate (and return) a Seurat object per-cell +clustify( + input = s_small, + ref_mat = cbmc_ref, + cluster_col = "res.1", + obj_out = TRUE, + per_cell = TRUE, + dr = "tsne" +) +} diff --git a/man/clustify.Rd.REMOVED.git-id b/man/clustify.Rd.REMOVED.git-id deleted file mode 100644 index ac095443a..000000000 --- a/man/clustify.Rd.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -cca62c9976734fafb7c73c282ecbfe6f13a3cb30 \ No newline at end of file diff --git a/man/clustify_lists.Rd b/man/clustify_lists.Rd new file mode 100644 index 000000000..83188b37f --- /dev/null +++ b/man/clustify_lists.Rd @@ -0,0 +1,174 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main.R +\name{clustify_lists} +\alias{clustify_lists} +\alias{clustify_lists.default} +\alias{clustify_lists.seurat} +\alias{clustify_lists.Seurat} +\alias{clustify_lists.SingleCellExperiment} +\title{Main function to compare scRNA-seq data to gene lists.} +\usage{ +clustify_lists(input, ...) + +\method{clustify_lists}{default}( + input, + marker, + marker_inmatrix = TRUE, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + lookuptable = NULL, + obj_out = TRUE, + seurat_out = TRUE, + rename_prefix = NULL, + threshold = 0, + low_threshold_cell = 10, + ... +) + +\method{clustify_lists}{seurat}( + input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ... +) + +\method{clustify_lists}{Seurat}( + input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ... +) + +\method{clustify_lists}{SingleCellExperiment}( + input, + metadata = NULL, + cluster_col = NULL, + if_log = TRUE, + per_cell = FALSE, + topn = 800, + cut = 0, + marker, + marker_inmatrix = TRUE, + genome_n = 30000, + metric = "hyper", + output_high = TRUE, + dr = "umap", + seurat_out = TRUE, + obj_out = TRUE, + threshold = 0, + rename_prefix = NULL, + ... +) +} +\arguments{ +\item{input}{single-cell expression matrix or Seurat object} + +\item{...}{passed to matrixize_markers} + +\item{marker}{matrix or dataframe of candidate genes for each cluster} + +\item{marker_inmatrix}{whether markers genes are already in preprocessed +matrix form} + +\item{metadata}{cell cluster assignments, +supplied as a vector or data.frame. +If data.frame is supplied then \code{cluster_col} needs to be set. +Not required if running correlation per cell.} + +\item{cluster_col}{column in metadata with cluster number} + +\item{if_log}{input data is natural log, averaging will be done on +unlogged data} + +\item{per_cell}{compare per cell or per cluster} + +\item{topn}{number of top expressing genes to keep from input matrix} + +\item{cut}{expression cut off from input matrix} + +\item{genome_n}{number of genes in the genome} + +\item{metric}{adjusted p-value for hypergeometric test, or jaccard index} + +\item{output_high}{if true (by default to fit with rest of package), +-log10 transform p-value} + +\item{lookuptable}{if not supplied, will look in built-in table +for object parsing} + +\item{obj_out}{whether to output object instead of cor matrix} + +\item{seurat_out}{output cor matrix or called seurat object +(deprecated, use obj_out instead)} + +\item{rename_prefix}{prefix to add to type and r column names} + +\item{threshold}{identity calling minimum correlation score threshold, +only used when obj_out = T} + +\item{low_threshold_cell}{option to remove clusters with too few cells} + +\item{dr}{stored dimension reduction} +} +\value{ +matrix of numeric values, clusters from input as row names, +cell types from marker_mat as column names +} +\description{ +Main function to compare scRNA-seq data to gene lists. +} +\examples{ +# Annotate a matrix and metadata +clustify_lists( + input = pbmc_matrix_small, + marker = cbmc_m, + metadata = pbmc_meta, + cluster_col = "classified", + verbose = TRUE +) + +# Annotate using a different method +clustify_lists( + input = pbmc_matrix_small, + marker = cbmc_m, + metadata = pbmc_meta, + cluster_col = "classified", + verbose = TRUE, + metric = "jaccard" +) +} diff --git a/man/clustify_lists.Rd.REMOVED.git-id b/man/clustify_lists.Rd.REMOVED.git-id deleted file mode 100644 index 1e55cb933..000000000 --- a/man/clustify_lists.Rd.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -83188b37f00678be4aec6caf84573bd538c08866 \ No newline at end of file diff --git a/man/clustify_nudge.Rd b/man/clustify_nudge.Rd new file mode 100644 index 000000000..f2b577850 --- /dev/null +++ b/man/clustify_nudge.Rd @@ -0,0 +1,169 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{clustify_nudge} +\alias{clustify_nudge} +\alias{clustify_nudge.default} +\alias{clustify_nudge.seurat} +\alias{clustify_nudge.Seurat} +\title{Combined function to compare scRNA-seq data to +bulk RNA-seq data and marker list} +\usage{ +clustify_nudge(input, ...) + +\method{clustify_nudge}{default}( + input, + ref_mat, + marker, + metadata = NULL, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + call = TRUE, + marker_inmatrix = TRUE, + mode = "rank", + obj_out = FALSE, + rename_prefix = NULL, + lookuptable = NULL, + ... +) + +\method{clustify_nudge}{seurat}( + input, + ref_mat, + marker, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = TRUE, + obj_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + marker_inmatrix = TRUE, + mode = "rank", + rename_prefix = NULL, + ... +) + +\method{clustify_nudge}{Seurat}( + input, + ref_mat, + marker, + cluster_col = NULL, + query_genes = NULL, + compute_method = "spearman", + weight = 1, + seurat_out = TRUE, + obj_out = FALSE, + threshold = -Inf, + dr = "umap", + norm = "diff", + marker_inmatrix = TRUE, + mode = "rank", + rename_prefix = NULL, + ... +) +} +\arguments{ +\item{input}{express matrix or object} + +\item{...}{passed to matrixize_markers} + +\item{ref_mat}{reference expression matrix} + +\item{marker}{matrix of markers} + +\item{metadata}{cell cluster assignments, supplied as a vector +or data.frame. If +data.frame is supplied then \code{cluster_col} needs to be set.} + +\item{cluster_col}{column in metadata that contains cluster ids per cell. +Will default to first +column of metadata if not supplied. +Not required if running correlation per cell.} + +\item{query_genes}{A vector of genes of interest to compare. +If NULL, then common genes between +the expr_mat and ref_mat will be used for comparision.} + +\item{compute_method}{method(s) for computing similarity scores} + +\item{weight}{relative weight for the gene list scores, +when added to correlation score} + +\item{seurat_out}{output cor matrix or called seurat object} + +\item{threshold}{identity calling minimum score threshold, +only used when obj_out = T} + +\item{dr}{stored dimension reduction} + +\item{norm}{whether and how the results are normalized} + +\item{call}{make call or just return score matrix} + +\item{marker_inmatrix}{whether markers genes are already +in preprocessed matrix form} + +\item{mode}{use marker expression pct or ranked cor score for nudging} + +\item{obj_out}{whether to output object instead of cor matrix} + +\item{rename_prefix}{prefix to add to type and r column names} + +\item{lookuptable}{if not supplied, will look in built-in +table for object parsing} +} +\value{ +single cell object, or matrix of numeric values, +clusters from input as row names, cell types from ref_mat as column names +} +\description{ +Combined function to compare scRNA-seq data to +bulk RNA-seq data and marker list +} +\examples{ +# Seurat2 +clustify_nudge( + input = s_small, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "res.1", + threshold = 0.8, + seurat_out = FALSE, + mode = "pct", + dr = "tsne" +) + +# Seurat3 +clustify_nudge( + input = s_small3, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + threshold = 0.8, + seurat_out = FALSE, + mode = "pct", + dr = "tsne" +) + +# Matrix +clustify_nudge( + input = pbmc_matrix_small, + ref_mat = cbmc_ref, + metadata = pbmc_meta, + marker = as.matrix(cbmc_m), + query_genes = pbmc_vargenes, + cluster_col = "classified", + threshold = 0.8, + call = FALSE, + marker_inmatrix = FALSE, + mode = "pct" +) +} diff --git a/man/clustify_nudge.Rd.REMOVED.git-id b/man/clustify_nudge.Rd.REMOVED.git-id deleted file mode 100644 index 72434220e..000000000 --- a/man/clustify_nudge.Rd.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -f2b5778500e0d8b8535cf7ffb9f9f54cbc803f6f \ No newline at end of file diff --git a/man/figures/README-example-1.png b/man/figures/README-example-1.png new file mode 100644 index 000000000..69d15b542 Binary files /dev/null and b/man/figures/README-example-1.png differ diff --git a/man/figures/README-example-1.png.REMOVED.git-id b/man/figures/README-example-1.png.REMOVED.git-id deleted file mode 100644 index 024f41499..000000000 --- a/man/figures/README-example-1.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -69d15b5421cb599baa387c9c3b079aa0fb18c433 \ No newline at end of file diff --git a/man/figures/README-example-2.png b/man/figures/README-example-2.png new file mode 100644 index 000000000..82f82331b Binary files /dev/null and b/man/figures/README-example-2.png differ diff --git a/man/figures/README-example-2.png.REMOVED.git-id b/man/figures/README-example-2.png.REMOVED.git-id deleted file mode 100644 index b457ef664..000000000 --- a/man/figures/README-example-2.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -82f82331b1af888989cd03e2f6f380af03b9c776 \ No newline at end of file diff --git a/man/figures/example-1.png b/man/figures/example-1.png new file mode 100644 index 000000000..69d15b542 Binary files /dev/null and b/man/figures/example-1.png differ diff --git a/man/figures/example-1.png.REMOVED.git-id b/man/figures/example-1.png.REMOVED.git-id deleted file mode 100644 index 024f41499..000000000 --- a/man/figures/example-1.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -69d15b5421cb599baa387c9c3b079aa0fb18c433 \ No newline at end of file diff --git a/man/figures/example-2.png b/man/figures/example-2.png new file mode 100644 index 000000000..82f82331b Binary files /dev/null and b/man/figures/example-2.png differ diff --git a/man/figures/example-2.png.REMOVED.git-id b/man/figures/example-2.png.REMOVED.git-id deleted file mode 100644 index b457ef664..000000000 --- a/man/figures/example-2.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -82f82331b1af888989cd03e2f6f380af03b9c776 \ No newline at end of file diff --git a/man/figures/logo.png b/man/figures/logo.png new file mode 100644 index 000000000..6b329d70a Binary files /dev/null and b/man/figures/logo.png differ diff --git a/man/figures/logo.png.REMOVED.git-id b/man/figures/logo.png.REMOVED.git-id deleted file mode 100644 index c58a1b646..000000000 --- a/man/figures/logo.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -6b329d70aeb5f973d7af5b5167cb98dcdc6a2e57 \ No newline at end of file diff --git a/man/figures/logo_old.png b/man/figures/logo_old.png new file mode 100644 index 000000000..ebc7b3bb8 Binary files /dev/null and b/man/figures/logo_old.png differ diff --git a/man/figures/logo_old.png.REMOVED.git-id b/man/figures/logo_old.png.REMOVED.git-id deleted file mode 100644 index 0347bd311..000000000 --- a/man/figures/logo_old.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -ebc7b3bb8c1e6a5acbc6f79d85788f11ecdd6be6 \ No newline at end of file diff --git a/man/figures/readme_example-1.png b/man/figures/readme_example-1.png new file mode 100644 index 000000000..97163e656 Binary files /dev/null and b/man/figures/readme_example-1.png differ diff --git a/man/figures/readme_example-1.png.REMOVED.git-id b/man/figures/readme_example-1.png.REMOVED.git-id deleted file mode 100644 index 5e38fc37c..000000000 --- a/man/figures/readme_example-1.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -97163e6560050ef68f668eb40afeb2c806e1416c \ No newline at end of file diff --git a/man/figures/readme_example-2.png b/man/figures/readme_example-2.png new file mode 100644 index 000000000..2f8cf8fc8 Binary files /dev/null and b/man/figures/readme_example-2.png differ diff --git a/man/figures/readme_example-2.png.REMOVED.git-id b/man/figures/readme_example-2.png.REMOVED.git-id deleted file mode 100644 index 707cfa38e..000000000 --- a/man/figures/readme_example-2.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -2f8cf8fc8c82b0ad915dbbe17689cdab0358df19 \ No newline at end of file diff --git a/man/figures/test.png b/man/figures/test.png new file mode 100644 index 000000000..a2a85ca00 Binary files /dev/null and b/man/figures/test.png differ diff --git a/man/figures/test.png.REMOVED.git-id b/man/figures/test.png.REMOVED.git-id deleted file mode 100644 index ee35465be..000000000 --- a/man/figures/test.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -a2a85ca00680e90347bd02613b46ae5b6208c6c0 \ No newline at end of file diff --git a/man/remove_background.Rd b/man/remove_background.Rd deleted file mode 100644 index 1540dc001..000000000 --- a/man/remove_background.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{remove_background} -\alias{remove_background} -\title{Remove high background expression genes from matrix} -\usage{ -remove_background(mat, background, n = 0) -} -\arguments{ -\item{mat}{expression matrix} - -\item{background}{vector or dataframe or matrix of high -expression genes in background} - -\item{n}{the number of top genes to exclude, 0 defaults to all} -} -\value{ -expression matrix with rows removed -} -\description{ -Remove high background expression genes from matrix -} -\examples{ -avg1 <- average_clusters_filter( - mat = pbmc_matrix_small, - metadata = pbmc_meta, - filter_on = "nFeature_RNA" -) - -mat <- remove_background(pbmc_matrix_small, avg1, 1) -mat[1:3, 1:3] -} diff --git a/pkgdown/favicon/apple-touch-icon-120x120.png b/pkgdown/favicon/apple-touch-icon-120x120.png new file mode 100644 index 000000000..971d359ba Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon-120x120.png differ diff --git a/pkgdown/favicon/apple-touch-icon-120x120.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon-120x120.png.REMOVED.git-id deleted file mode 100644 index a54df221f..000000000 --- a/pkgdown/favicon/apple-touch-icon-120x120.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -971d359ba27964c28530d60c71ba1810e9429be1 \ No newline at end of file diff --git a/pkgdown/favicon/apple-touch-icon-152x152.png b/pkgdown/favicon/apple-touch-icon-152x152.png new file mode 100644 index 000000000..85c7c8880 Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon-152x152.png differ diff --git a/pkgdown/favicon/apple-touch-icon-152x152.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon-152x152.png.REMOVED.git-id deleted file mode 100644 index 4de1ebb2e..000000000 --- a/pkgdown/favicon/apple-touch-icon-152x152.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -85c7c88805c262d386e6149c19b3dd4cf2013709 \ No newline at end of file diff --git a/pkgdown/favicon/apple-touch-icon-180x180.png b/pkgdown/favicon/apple-touch-icon-180x180.png new file mode 100644 index 000000000..20cd7cbad Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon-180x180.png differ diff --git a/pkgdown/favicon/apple-touch-icon-180x180.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon-180x180.png.REMOVED.git-id deleted file mode 100644 index a5fce19a7..000000000 --- a/pkgdown/favicon/apple-touch-icon-180x180.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -20cd7cbad1da26549a93a540f6cb229204532bf1 \ No newline at end of file diff --git a/pkgdown/favicon/apple-touch-icon-60x60.png b/pkgdown/favicon/apple-touch-icon-60x60.png new file mode 100644 index 000000000..33b3905af Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon-60x60.png differ diff --git a/pkgdown/favicon/apple-touch-icon-60x60.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon-60x60.png.REMOVED.git-id deleted file mode 100644 index f3ffbc1b3..000000000 --- a/pkgdown/favicon/apple-touch-icon-60x60.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -33b3905af51aa0f02a91adc1129ed9d091772189 \ No newline at end of file diff --git a/pkgdown/favicon/apple-touch-icon-76x76.png b/pkgdown/favicon/apple-touch-icon-76x76.png new file mode 100644 index 000000000..59a3f4eeb Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon-76x76.png differ diff --git a/pkgdown/favicon/apple-touch-icon-76x76.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon-76x76.png.REMOVED.git-id deleted file mode 100644 index 2b660d0c9..000000000 --- a/pkgdown/favicon/apple-touch-icon-76x76.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -59a3f4eeb17f51c79392da522b7c13787d5fe932 \ No newline at end of file diff --git a/pkgdown/favicon/apple-touch-icon.png b/pkgdown/favicon/apple-touch-icon.png new file mode 100644 index 000000000..4cc701222 Binary files /dev/null and b/pkgdown/favicon/apple-touch-icon.png differ diff --git a/pkgdown/favicon/apple-touch-icon.png.REMOVED.git-id b/pkgdown/favicon/apple-touch-icon.png.REMOVED.git-id deleted file mode 100644 index a2e9e2c4b..000000000 --- a/pkgdown/favicon/apple-touch-icon.png.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -4cc701222fde4c878268b42874a3ad95420a814a \ No newline at end of file diff --git a/pkgdown/favicon/favicon-16x16.png b/pkgdown/favicon/favicon-16x16.png new file mode 100644 index 000000000..b0829dec8 Binary files /dev/null and b/pkgdown/favicon/favicon-16x16.png differ diff --git a/pkgdown/favicon/favicon-32x32.png b/pkgdown/favicon/favicon-32x32.png new file mode 100644 index 000000000..0c34171c5 Binary files /dev/null and b/pkgdown/favicon/favicon-32x32.png differ diff --git a/pkgdown/favicon/favicon.ico b/pkgdown/favicon/favicon.ico new file mode 100644 index 000000000..482073e77 Binary files /dev/null and b/pkgdown/favicon/favicon.ico differ diff --git a/pkgdown/favicon/favicon.ico.REMOVED.git-id b/pkgdown/favicon/favicon.ico.REMOVED.git-id deleted file mode 100644 index 706b13df8..000000000 --- a/pkgdown/favicon/favicon.ico.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -482073e772b9b0081afbda446cedd2618193d4ca \ No newline at end of file diff --git a/tests/testthat/test_cor.R b/tests/testthat/test_cor.R new file mode 100644 index 000000000..571ca64a4 --- /dev/null +++ b/tests/testthat/test_cor.R @@ -0,0 +1,465 @@ +context("clustify") + +test_that("output is correctly formatted", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + ) + n_clusters <- length(unique(pbmc_meta$classified)) + n_ref_samples <- ncol(cbmc_ref) + + expect_equal(ncol(res), n_ref_samples) + expect_equal(n_clusters, nrow(res)) +}) + +test_that("clustify takes accidental dataframe as well", { + res <- clustify( + input = as.data.frame(as.matrix(pbmc_matrix_small)), + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + ) + n_clusters <- length(unique(pbmc_meta$classified)) + n_ref_samples <- ncol(cbmc_ref) + + expect_equal(ncol(res), n_ref_samples) + expect_equal(n_clusters, nrow(res)) +}) + +test_that("clustify takes factor for metadata", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta$classified, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + verbose = TRUE + ) + n_clusters <- length(unique(pbmc_meta$classified)) + n_ref_samples <- ncol(cbmc_ref) + + expect_equal(ncol(res), n_ref_samples) + expect_equal(n_clusters, nrow(res)) +}) + +test_that("run all correlation functions", { + results <- lapply( + clustifyr_methods, + function(x) { + clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + compute_method = x + ) + } + ) + nrows <- lapply(results, nrow) %>% unlist() + ncols <- lapply(results, ncol) %>% unlist() + + expect_equal(1, length(unique(nrows))) + expect_equal(1, length(unique(ncols))) +}) + +test_that("test bad inputs", { + expect_error(clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + compute_method = "foo" + )) +}) + +test_that("test per cell", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cell_col = "rn", + per_cell = TRUE + ) + + expect_equal(nrow(res), ncol(pbmc_matrix_small)) +}) + +test_that("test permutation", { + res1 <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + + res_full <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + n_perm = 2, + return_full = TRUE + ) + + expect_equal(res1, res_full$score) + expect_equal(length(res_full), 2) + expect_true(all(res_full$p_val >= 0 | res_full$p_val <= 0)) +}) + +test_that("seurat object clustifying", { + res <- clustify(s_small, + cbmc_ref, + cluster_col = "res.1", + dr = "tsne" + ) + + res <- clustify(s_small, + cbmc_ref, + cluster_col = "res.1", + seurat_out = FALSE, + per_cell = TRUE, + dr = "tsne" + ) + + res <- clustify(s_small, + cbmc_ref, + cluster_col = "res.1", + seurat_out = FALSE, + dr = "tsne" + ) + g <- plot_best_call( + res, + seurat_meta(s_small, + dr = "tsne" + ), + cluster_col = "res.1", + plot_r = TRUE, + x = "tSNE_1", + y = "tSNE_2" + ) + expect_true(ggplot2::is.ggplot(g[[1]])) +}) + +test_that("clustify reinserts seurat metadata correctly", { + res <- clustify(s_small, + cbmc_ref, + cluster_col = "res.1", + seurat_out = TRUE, + per_cell = TRUE, + dr = "tsne" + ) + res2 <- clustify(s_small, + cbmc_ref, + cluster_col = "res.1", + seurat_out = TRUE, + dr = "tsne" + ) + if ("Seurat" %in% loadedNamespaces()) { + expect_true(class(res) %in% c("seurat")) + } else { + expect_true(is.matrix(res)) + } +}) + +test_that("seurat3 object clustifying", { + res <- clustify(s_small3, + cbmc_ref, + cluster_col = "RNA_snn_res.1", + dr = "tsne" + ) + res <- clustify(s_small3, + cbmc_ref, + cluster_col = "RNA_snn_res.1", + seurat_out = FALSE, + per_cell = TRUE, + dr = "tsne" + ) + res <- clustify(s_small3, + cbmc_ref, + cluster_col = "RNA_snn_res.1", + seurat_out = FALSE, + dr = "tsne" + ) + g <- plot_best_call(res, + seurat_meta(s_small3, + dr = "tsne" + ), + cluster_col = "RNA_snn_res.1", + plot_r = TRUE + ) + expect_true(ggplot2::is.ggplot(g[[1]])) +}) + +test_that("clustify reinserts seurat3 metadata correctly", { + res <- clustify(s_small3, + cbmc_ref, + cluster_col = "RNA_snn_res.1", + seurat_out = TRUE, + per_cell = TRUE, + dr = "tsne" + ) + + res2 <- clustify(s_small3, + cbmc_ref, + cluster_col = "RNA_snn_res.1", + seurat_out = TRUE, + dr = "tsne" + ) + if ("Seurat" %in% loadedNamespaces()) { + expect_true(class(res) %in% c("Seurat")) + } else { + expect_true(is.matrix(res)) + } +}) + +test_that("get_similarity handles NA entries", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2[1, "classified"] <- NA + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta2, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + n_clusters <- length(unique(pbmc_meta$classified)) + n_ref_samples <- ncol(cbmc_ref) + + expect_equal(ncol(res), n_ref_samples) + expect_equal(n_clusters + 1, nrow(res)) +}) + +test_that("get_similarity handles NA entries", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2[1, "classified"] <- NA + res <- get_similarity( + pbmc_matrix_small[intersect(rownames(pbmc_matrix_small), rownames(cbmc_ref)), ], + ref_mat = cbmc_ref[intersect(rownames(pbmc_matrix_small), rownames(cbmc_ref)), ], + pbmc_meta2$classified, + compute_method = "spearman" + ) + n_clusters <- length(unique(pbmc_meta$classified)) + n_ref_samples <- ncol(cbmc_ref) + + expect_equal(ncol(res), n_ref_samples) + expect_equal(n_clusters + 1, nrow(res)) +}) + +test_that("get_similarity can exclude 0s as missing data", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE, + rm0 = TRUE + ) + + expect_equal(ncol(pbmc_matrix_small), nrow(res)) +}) + +test_that("permute_similarity runs per cell", { + res <- permute_similarity( + pbmc_matrix_small[c("PPBP", "LYZ", "S100A9"), c(7, 11)], + cbmc_ref[c("PPBP", "LYZ", "S100A9"), 1:3], + colnames(pbmc_matrix_small[c("PPBP", "LYZ", "S100A9"), c(7, 11)]), + n_perm = 2, + per_cell = TRUE, + compute_method = "spearman" + ) + expect_equal(length(res), 2) +}) + +test_that("error for unsupported method", { + expect_error( + res <- permute_similarity( + pbmc_matrix_small[c("RBM28", "CCDC136", "TNPO3"), c(7, 11)], + cbmc_ref[c("RBM28", "CCDC136", "TNPO3"), 1:3], + pbmc_meta$rn[c(7, 11)], + n_perm = 2, + per_cell = TRUE, + compute_method = "a" + ) + ) +}) + +test_that("cor throws readable error when mat has 0 rows", { + expect_error(res <- clustify( + input = pbmc_matrix_small[0, ], + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + )) +}) + +test_that("cor throws readable error when mat has wrong number of cols", { + expect_error(res <- clustify( + input = pbmc_matrix_small[, 1:2630], + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + )) +}) + +test_that("cor throws readable error when ref_mat has 0 rows", { + expect_error(res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref[0, ], + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + )) +}) + +test_that("cor throws readable error when ref_mat has 0 cols", { + expect_error(res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref[, 0], + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + )) +}) + +test_that("sparse matrix is accepted as input", { + res <- clustify( + input = s_small3@assays$RNA@counts, + metadata = s_small3@meta.data, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "letter.idents", + verbose = TRUE + ) + + expect_equal(2, nrow(res)) +}) + +test_that("correct error message is displayed for nonexistent cluster_col", { + expect_error(res <- clustify( + input = s_small3@assays$RNA@counts, + metadata = s_small3@meta.data, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "a", + verbose = TRUE + )) +}) + +test_that("input Seurat metadata columns are not changed (type, r, rn, etc). #259", { + skip_if_not_installed("Seurat") + tmp <- s_small3 + tmp@meta.data$type <- 0L + tmp@meta.data$rn <- 0L + tmp@meta.data$r <- 0L + + res <- clustify( + input = tmp, + ref_mat = cbmc_ref, + cluster_col = "RNA_snn_res.1", + dr = "tsne" + ) + + expect_true(all(c("type", "rn", "r") %in% colnames(res@meta.data))) + expect_true(all(res@meta.data$type == 0L)) + expect_true(all(res@meta.data$rn == 0L)) + expect_true(all(res@meta.data$r == 0L)) +}) + +test_that("clustify_lists works with pos_neg_select and Seurat3 object", { + res <- clustify_lists( + s_small3, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + dr = "tsne", + metric = "posneg", + seurat_out = FALSE + ) + expect_true(nrow(res) == 3) +}) + +test_that("clustify_lists works with pct and Seurat3 object", { + res <- clustify_lists( + s_small3, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + dr = "tsne", + metric = "pct", + seurat_out = FALSE + ) + expect_true(nrow(res) == 3) +}) + +test_that("clustify_lists gives correct error message upon unrecognized method", { + expect_error( + res <- clustify_lists( + s_small3, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + dr = "tsne", + metric = "ptc", + seurat_out = FALSE + ) + ) +}) + +test_that("clustify takes factor for metadata", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta$classified, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + verbose = TRUE + ) + + res2 <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta$classified, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + verbose = TRUE, + exclude_genes = c("CD27", "ZNF232", "ZYX") + ) + + expect_true(res[1, 1] != res2[1, 1]) +}) + +test_that("sce object clustifying", { + res <- clustify(sce_small, + cbmc_ref, + cluster_col = "cell_type1", + obj_out = FALSE + ) + expect_true(nrow(res) == 13) +}) + +test_that("sce object clustify_lists", { + other <- c("TAF12", "SNHG3") + delta <- c("PCSK1", "LEPR") + panm <- data.frame(other, delta) + + res <- clustify_lists( + sce_small, + marker = panm, + cluster_col = "cell_type1", + obj_out = FALSE, + n = 100, + metric = "pct" + ) + expect_true(nrow(res) == 13) +}) diff --git a/tests/testthat/test_cor.R.REMOVED.git-id b/tests/testthat/test_cor.R.REMOVED.git-id deleted file mode 100644 index 5ac0a2234..000000000 --- a/tests/testthat/test_cor.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -571ca64a4adaa564832d4887c56d207675b66694 \ No newline at end of file diff --git a/tests/testthat/test_gsea.R b/tests/testthat/test_gsea.R new file mode 100644 index 000000000..5cea6400c --- /dev/null +++ b/tests/testthat/test_gsea.R @@ -0,0 +1,95 @@ +context("run_gsea") + +test_that("output is correctly formatted", { + data("pbmc_vargenes") + res <- run_gsea( + pbmc_matrix_small, + query_genes = pbmc_vargenes[1:100], + n_perm = 10, + cluster_ids = pbmc_meta$classified, + no_warnings = TRUE + ) + + expect_equal(nrow(res), length(unique(pbmc_meta$classified))) + expect_true(all(res$pval >= 0 & res$pval <= 1)) +}) + +test_that("run_gsea checks for matching number of clusters", { + data("pbmc_vargenes") + expect_error( + res <- run_gsea( + pbmc_matrix_small, + query_genes = pbmc_vargenes[1:100], + n_perm = 10, + cluster_ids = pbmc_meta$classified[1:3], + no_warnings = TRUE + ) + ) +}) + +test_that("run_gsea warns slow runs", { + data("pbmc_vargenes") + + expect_warning(res <- run_gsea(pbmc_matrix_small[, 1:3], + query_genes = pbmc_vargenes[1:2], + n_perm = 10001, + per_cell = TRUE, + cluster_ids = pbmc_meta$classified, + no_warnings = TRUE + )) +}) + +test_that("run_gsea warning suppression", { + data("pbmc_vargenes") + expect_warning( + res <- run_gsea( + pbmc_matrix_small[, 1:3], + query_genes = pbmc_vargenes[1:2], + n_perm = 1, + per_cell = TRUE, + cluster_ids = pbmc_meta$classified, + no_warnings = FALSE + ) + ) +}) + +test_that("calculate_pathway_gsea gives appropriate output", { + gl <- list( + "n" = c("PPBP", "LYZ", "S100A9"), + "a" = c("IGLL5", "GNLY", "FTL") + ) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + res <- calculate_pathway_gsea(pbmc_avg, gl, scale = TRUE) + + expect_equal(nrow(res), length(unique(pbmc_meta$classified))) +}) + +test_that("plot_pathway_gsea gives appropriate output", { + gl <- list( + "n" = c("PPBP", "LYZ", "S100A9"), + "a" = c("IGLL5", "GNLY", "FTL") + ) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + g <- plot_pathway_gsea(pbmc_avg, gl, 5) + expect_equal(length(g), 2) +}) + +test_that("plot_pathway_gsea gives output depending on returning option", { + gl <- list( + "n" = c("PPBP", "LYZ", "S100A9"), + "a" = c("IGLL5", "GNLY", "FTL") + ) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + g <- plot_pathway_gsea(pbmc_avg, gl, 5, returning = "plot") + g2 <- plot_pathway_gsea(pbmc_avg, gl, 5, returning = "res") + expect_true(is(g, "Heatmap") & is.data.frame(g2)) +}) diff --git a/tests/testthat/test_gsea.R.REMOVED.git-id b/tests/testthat/test_gsea.R.REMOVED.git-id deleted file mode 100644 index 5bc28c2ee..000000000 --- a/tests/testthat/test_gsea.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -5cea6400cd70c8ff7d8211a11f08b565a068600b \ No newline at end of file diff --git a/tests/testthat/test_list.R b/tests/testthat/test_list.R new file mode 100644 index 000000000..1165ba8e4 --- /dev/null +++ b/tests/testthat/test_list.R @@ -0,0 +1,294 @@ +context("compare_list") + +test_that("warning if matrix is not binarized", { + pbmc_mm <- matrixize_markers(pbmc_markers) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + pbmc_avgb <- binarize_expr(pbmc_avg) + gene_list_methods <- c("hyper") + expect_warning(results <- lapply( + gene_list_methods, + function(x) { + compare_lists(pbmc_avg, + pbmc_mm, + metric = x + ) + } + )) +}) + + +test_that("run all gene list functions", { + pbmc_mm <- matrixize_markers(pbmc_markers) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + pbmc_avgb <- binarize_expr(pbmc_avg) + gene_list_methods <- c("spearman", "hyper", "jaccard", "gsea") + results <- lapply( + gene_list_methods, + function(x) { + compare_lists(pbmc_avgb, + pbmc_mm, + metric = x + ) + } + ) + + expect_equal(4, length(results)) +}) + +test_that("gene list function options", { + pbmc_mm <- matrixize_markers(pbmc_markers) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + pbmc_avgb <- binarize_expr(pbmc_avg) + expect_error(suppressWarnings( + res <- compare_lists( + pbmc_avgb, + pbmc_mm, + metric = "hyper", + output_high = FALSE, + n = 5 + ) + )) +}) + +test_that("run all gene list functions in clustify_lists", { + gene_list_methods <- c("spearman", "hyper", "jaccard", "gsea") + results <- lapply( + gene_list_methods, + function(x) { + clustify_lists( + pbmc_matrix_small, + per_cell = FALSE, + metadata = pbmc_meta, + cluster_col = "classified", + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = x + ) + } + ) + + expect_equal(4, length(results)) +}) + +test_that("gsea outputs in cor matrix format", { + res <- clustify_lists( + pbmc_matrix_small, + per_cell = FALSE, + metadata = pbmc_meta, + cluster_col = "classified", + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "gsea" + ) + res2 <- cor_to_call(res) + + expect_equal(9, nrow(res2)) +}) + +test_that("seurat object clustify_lists-ing", { + res <- clustify_lists( + s_small, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "res.1", + seurat_out = FALSE, + dr = "tsne" + ) + res <- clustify_lists( + s_small, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "res.1", + seurat_out = FALSE, + dr = "tsne" + ) + g <- plot_best_call( + res, + seurat_meta(s_small, + dr = "tsne" + ), + cluster_col = "res.1", + plot_r = TRUE, + x = "tSNE_1", + y = "tSNE_2" + ) + expect_true(ggplot2::is.ggplot(g[[1]])) +}) + +test_that("clustify_lists inserts seurat metadata correctly", { + res <- clustify_lists( + s_small, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "res.1", + seurat_out = TRUE, + dr = "tsne" + ) + res2 <- clustify_lists( + s_small, + per_cell = TRUE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "res.1", + seurat_out = TRUE, + dr = "tsne" + ) + if ("Seurat" %in% loadedNamespaces()) { + expect_true(class(res) %in% c("seurat")) + } else { + expect_true(is.matrix(res)) + } +}) + +test_that("seurat3 object clustify_lists-ing", { + res <- clustify_lists( + s_small3, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "RNA_snn_res.1", + seurat_out = TRUE, + dr = "tsne" + ) + res <- clustify_lists( + s_small3, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "RNA_snn_res.1", + seurat_out = FALSE, + dr = "tsne" + ) + g <- plot_best_call( + res, + seurat_meta(s_small3, + dr = "tsne" + ), + cluster_col = "RNA_snn_res.1", + plot_r = TRUE, + x = "tSNE_1", + y = "tSNE_2" + ) + expect_true(ggplot2::is.ggplot(g[[1]])) +}) + +test_that("clustify_lists inserts seurat3 metadata correctly", { + res <- clustify_lists( + s_small3, + per_cell = FALSE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "RNA_snn_res.1", + seurat_out = TRUE, + dr = "tsne" + ) + res2 <- clustify_lists( + s_small3, + per_cell = TRUE, + marker = pbmc_markers, + marker_inmatrix = FALSE, + metric = "jaccard", + cluster_col = "RNA_snn_res.1", + seurat_out = TRUE, + dr = "tsne" + ) + if ("Seurat" %in% loadedNamespaces()) { + expect_true(class(res) %in% c("Seurat")) + } else { + expect_true(is.matrix(res)) + } +}) + +test_that("run all gene list functions and then use consensus_call", { + pbmc_mm <- matrixize_markers(pbmc_markers) + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + pbmc_avgb <- binarize_expr(pbmc_avg) + gene_list_methods <- c("spearman", "hyper", "jaccard", "gsea") + results <- lapply( + gene_list_methods, + function(x) { + compare_lists(pbmc_avgb, + pbmc_mm, + metric = x + ) + } + ) + call_list <- lapply( + results, + cor_to_call_rank + ) + calls <- call_consensus(call_list) + expect_equal(4, length(results)) +}) + +test_that("run all gene list functions in clustify_lists", { + res <- clustify_lists( + pbmc_matrix_small, + cbmc_m, + metadata = pbmc_meta, + cluster_col = "classified", + metric = "consensus" + ) + expect_equal(9, nrow(res)) +}) + +test_that("run all gene list functions in clustify_lists and seurat object", { + res <- clustify_lists( + s_small3, + marker = cbmc_m, + dr = "tsne", + cluster_col = "RNA_snn_res.1", + metric = "consensus", + seurat_out = TRUE + ) + expect_true(is.data.frame(res) | "Seurat" %in% class(res)) +}) + +test_that("run all gene list functions in clustify_lists and seurat object", { + res <- clustify_lists( + s_small, + marker = cbmc_m, + dr = "tsne", + cluster_col = "res.1", + metric = "consensus", + seurat_out = TRUE + ) + expect_true(is.data.frame(res) | "seurat" %in% class(res)) +}) + +test_that("lists of genes will work with posneg", { + lst_of_markers <- split(pbmc_markers$gene, pbmc_markers$cluster) + res <- clustify_lists( + input = pbmc_matrix_small, + per_cell = FALSE, + cluster_col = "classified", + metadata = pbmc_meta, + marker = lst_of_markers, + marker_inmatrix = TRUE, + metric = "posneg", + seurat_out = FALSE + ) + expect_true(ncol(res) == length(lst_of_markers)) +}) diff --git a/tests/testthat/test_list.R.REMOVED.git-id b/tests/testthat/test_list.R.REMOVED.git-id deleted file mode 100644 index b9c618941..000000000 --- a/tests/testthat/test_list.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -1165ba8e42388f68281a736484a9179161c17ede \ No newline at end of file diff --git a/tests/testthat/test_plots.R b/tests/testthat/test_plots.R new file mode 100644 index 000000000..7c739658e --- /dev/null +++ b/tests/testthat/test_plots.R @@ -0,0 +1,230 @@ +context("plotting") + +res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" +) + +res2 <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE +) + +test_that("plots can be generated", { + plts <- plot_best_call(res, + pbmc_meta, + cluster_col = "classified" + ) + plts2 <- plot_dims(pbmc_meta) + expect_true(ggplot2::is.ggplot(plts)) +}) + +test_that("plot_best_call warns about colnames", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2$type <- 1 + expect_warning(plts <- plot_best_call(res, pbmc_meta2)) +}) + +test_that("call plots can be generated", { + plts <- plot_cor( + res, + pbmc_meta, + cluster_col = "classified" + ) + + expect_error( + plts <- plot_cor( + res, + pbmc_meta, + data_to_plot = "nonsense", + cluster_col = "classified" + ) + ) + + expect_true(is.list(plts)) + expect_true(ggplot2::is.ggplot(plts[[1]])) +}) + +test_that("plot_cor for all clusters by default", { + plts <- plot_cor(res, + pbmc_meta, + cluster_col = "classified", + x = "UMAP_1", + y = "UMAP_2" + ) + + plts2 <- plot_cor( + res2, + pbmc_meta %>% tibble::rownames_to_column("rn"), + cluster_col = "rn", + x = "UMAP_1", + y = "UMAP_2" + ) + + expect_true(length(plts) == ncol(cbmc_ref)) +}) + +test_that("plot_cor works with scale_legends option", { + plts <- plot_cor(res, + pbmc_meta, + cluster_col = "classified", + scale_legends = TRUE + ) + + plts2 <- plot_cor(res, + pbmc_meta, + cluster_col = "classified", + scale_legends = c(0, 1) + ) + expect_true(length(plts) == ncol(cbmc_ref)) +}) + +test_that("plot_gene can handle strange and normal genenames", { + genes <- c( + "RP11-314N13.3", + "ARF4" + ) + plts <- plot_gene( + pbmc_matrix_small, + pbmc_meta %>% tibble::rownames_to_column("rn"), + genes = genes, + cell_col = "rn" + ) + + expect_true(is.list(plts)) + expect_true(all(vapply(plts, ggplot2::is.ggplot, FUN.VALUE = logical(1)))) +}) + +test_that("plot_gene automatically plots all cells", { + genes <- c("ZYX") + expect_error( + plts <- plot_gene( + pbmc_matrix_small, + tibble::column_to_rownames(pbmc_meta, "rn"), + genes = genes, + cell_col = "nonsense" + ) + ) + + plts <- plot_gene(pbmc_matrix_small, + pbmc_meta, + genes = genes + ) + + expect_true(all(vapply(plts, ggplot2::is.ggplot, FUN.VALUE = logical(1)))) +}) + +test_that("plot_best_call threshold works as intended, on per cell and collapsing", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE + ) + call1 <- plot_best_call( + res, + metadata = pbmc_meta, + per_cell = TRUE, + collapse_to_cluster = "classified", + threshold = 0.3 + ) + + expect_true(ggplot2::is.ggplot(call1)) +}) + +test_that("plot_gene checks for presence of gene name", { + expect_warning(plot_gene(pbmc_matrix_small, + pbmc_meta %>% tibble::rownames_to_column("rn"), + c("INIP", "ZFP36L3"), + cell_col = "rn", + do_label = TRUE, + do_legend = FALSE, + x = "UMAP_1", + y = "UMAP_2" + )) + expect_error(expect_warning(plot_gene(pbmc_matrix_small, + pbmc_meta %>% tibble::rownames_to_column("rn"), + c("ZFP36L3"), + cell_col = "rn", + x = "UMAP_1", + y = "UMAP_2" + ))) +}) + +test_that("plot_cor_heatmap returns a ggplot object", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = FALSE + ) + g <- plot_cor_heatmap(res) + expect_true(is(g, "Heatmap")) +}) + +test_that("plot_call works on defaults", { + g <- plot_call(res, + pbmc_meta, + cluster_col = "classified" + ) + + expect_true(ggplot2::is.ggplot(g[[1]])) +}) + +test_that("plot_dims works with alpha_col", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2$al <- 0 + pbmc_meta2$al[1] <- 1 + g <- plot_dims( + pbmc_meta2, + feature = "classified", + alpha_col = "al", + do_legend = FALSE + ) + g2 <- plot_dims( + pbmc_meta2, + feature = "classified", + alpha_col = "al", + do_legend = FALSE, + do_repel = TRUE, + do_label = TRUE + ) + expect_true(ggplot2::is.ggplot(g)) +}) + +test_that("plot_dims works with group_col", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2$al <- 1 + pbmc_meta2$al[1:1500] <- 0 + pbmc_meta2$b <- pbmc_meta2$classified + g <- plot_dims( + pbmc_meta2, + feature = "classified", + group_col = "b", + do_legend = FALSE, + do_repel = TRUE, + do_label = FALSE + ) + + g2 <- plot_dims( + pbmc_meta2, + feature = "classified", + alpha_col = "al", + group_col = "b", + do_legend = FALSE, + do_repel = TRUE, + do_label = TRUE + ) + expect_true(ggplot2::is.ggplot(g2)) +}) diff --git a/tests/testthat/test_plots.R.REMOVED.git-id b/tests/testthat/test_plots.R.REMOVED.git-id deleted file mode 100644 index 865ea85d0..000000000 --- a/tests/testthat/test_plots.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -7c739658e323fd24491749f1f26dbcdc832b7075 \ No newline at end of file diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R new file mode 100644 index 000000000..5ec1b9ac2 --- /dev/null +++ b/tests/testthat/test_utils.R @@ -0,0 +1,1249 @@ +context("utils") + +test_that("get_vargenes works for both matrix and dataframe form", { + pbmc_mm <- matrixize_markers(pbmc_markers) + var1 <- get_vargenes(pbmc_mm) + var2 <- get_vargenes(pbmc_markers) + + expect_equal(var1[1], var2[1]) +}) + +test_that("matrixize_markers with remove_rp option", { + pbmc_mm <- matrixize_markers(pbmc_markers) + pbmc_mm2 <- matrixize_markers(pbmc_markers, + remove_rp = TRUE + ) + + expect_true(nrow(pbmc_mm) != nrow(pbmc_mm2)) +}) + +test_that("matrixize_markers to turn matrix into ranked list", { + pbmc_mm <- matrixize_markers(pbmc_markers, n = 50) + pbmc_mm2 <- + matrixize_markers(pbmc_mm, ranked = TRUE, unique = TRUE) + + expect_true(nrow(pbmc_mm) < nrow(pbmc_mm2)) +}) + +test_that("matrixize_markers uses supplied labels", { + pbmc_mm <- matrixize_markers( + pbmc_markers, + n = 50, + metadata = pbmc_meta %>% mutate(cluster = seurat_clusters), + cluster_col = "classified" + ) + pbmc_mm2 <- matrixize_markers( + pbmc_mm, + metadata = unique(as.character(pbmc_meta$classified)), + cluster_col = "classified", + ranked = TRUE + ) + + expect_true(nrow(pbmc_mm) < nrow(pbmc_mm2)) +}) + +test_that("average_clusters works as intended", { + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified", + if_log = FALSE + ) + expect_equal(nrow(pbmc_avg2), 2000) +}) + +test_that("average_clusters works with disordered data", { + pbmc_meta2 <- rbind(pbmc_meta[1320:2638, ], pbmc_meta[1:1319, ]) + pbmc_avg2 <- average_clusters( + pbmc_matrix_small, + pbmc_meta %>% tibble::rownames_to_column("rn"), + if_log = TRUE, + cell_col = "rn", + cluster_col = "classified" + ) + pbmc_avg3 <- average_clusters( + pbmc_matrix_small, + pbmc_meta2 %>% tibble::rownames_to_column("rn"), + if_log = TRUE, + cell_col = "rn", + cluster_col = "classified" + ) + expect_equal(pbmc_avg2, pbmc_avg3) +}) + + +test_that("average_clusters detects wrong cluster ident", { + expect_error( + pbmc_avg2 <- average_clusters( + pbmc_matrix_small, + matrix(5, 5), + if_log = FALSE, + cluster_col = "classified" + ) + ) +}) + +test_that("average_clusters able to coerce factors", { + col <- factor(pbmc_meta$classified) + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + col, + if_log = FALSE + ) + expect_equal(nrow(pbmc_avg2), 2000) +}) + +test_that("average_clusters works with median option", { + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta, + method = "median", + cluster_col = "classified" + ) + expect_equal(nrow(pbmc_avg2), 2000) +}) + +test_that("average_clusters works when one cluster contains only 1 cell", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2$classified <- as.character(pbmc_meta2$classified) + pbmc_meta2[1, "classified"] <- 15 + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta2, + cluster_col = "classified", + low_threshold = 0 + ) + expect_equal(ncol(pbmc_avg2), 9 + 1) +}) + +test_that("average_clusters works when low cell number clusters should be removed", { + pbmc_meta2 <- pbmc_meta %>% mutate(classified = as.character(classified)) + pbmc_meta2[1, "classified"] <- 15 + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta2, + low_threshold = 2, + cluster_col = "classified" + ) + expect_equal(ncol(pbmc_avg2), 9) +}) + +test_that("average_clusters works with cutoff gene number", { + pbmc_avg2 <- average_clusters( + pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified", + if_log = FALSE, + cut_n = 1 + ) + expect_true(sum(pbmc_avg2[, "B"]) < 10) +}) + +test_that("average_clusters works when cluster info contains NA", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2[1, "classified"] <- NA + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta2, + low_threshold = 2, + cluster_col = "classified" + ) + expect_equal(ncol(pbmc_avg2), 9) +}) + +test_that("average_clusters works when cluster info in factor form", { + pbmc_meta2 <- pbmc_meta + pbmc_meta2$classified <- as.factor(pbmc_meta2$classified) + pbmc_avg2 <- average_clusters(pbmc_matrix_small, + pbmc_meta2, + low_threshold = 2, + cluster_col = "classified" + ) + expect_equal(ncol(pbmc_avg2), 9) +}) + +test_that("cor_to_call threshold works as intended", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = 0.8, + carry_r = TRUE + ) + + expect_true("r<0.8, unassigned" %in% call1$type) +}) + +test_that("cor_to_call threshold works as intended, on per cell and collapsing", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta %>% tibble::rownames_to_column("rn"), + cluster_col = "rn", + collapse_to_cluster = "classified", + threshold = 0.1 + ) + + expect_true(!any(is.na(call1))) +}) + +test_that("assign_ident works with equal length vectors and just 1 ident", { + m1 <- assign_ident( + pbmc_meta, + ident_col = "classified", + clusters = c("1", "2"), + idents = c("whatever1", "whatever2") + ) + m2 <- assign_ident( + pbmc_meta, + ident_col = "classified", + clusters = c("1", "2"), + idents = "whatever1" + ) + expect_true(nrow(m1) == nrow(m2)) +}) + +test_that("cor_to_call_topn works as intended", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call_topn( + res, + metadata = pbmc_meta, + col = "classified", + collapse_to_cluster = FALSE, + threshold = 0.5 + ) + + expect_true(nrow(call1) == 2 * nrow(res)) +}) + +test_that("cor_to_call_topn works as intended on collapse to cluster option", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE + ) + call1 <- cor_to_call_topn( + res, + metadata = pbmc_meta %>% tibble::rownames_to_column("rn"), + col = "rn", + collapse_to_cluster = "classified", + threshold = 0 + ) + + expect_true(nrow(call1) == 2 * nrow(res)) +}) + +test_that("gene_pct and gene_pct_markerm work as intended", { + res <- gene_pct( + pbmc_matrix_small, + cbmc_m$B, + pbmc_meta$classified + ) + + res2 <- gene_pct_markerm(pbmc_matrix_small, + cbmc_m, + pbmc_meta, + cluster_col = "classified" + ) + expect_error( + res2 <- gene_pct_markerm(pbmc_matrix_small, + cbmc_m, + matrix(5, 5), + cluster_col = "classified" + ) + ) + expect_true(nrow(res2) == 9) +}) + +test_that("gene_pct can give min or max output", { + res <- gene_pct(pbmc_matrix_small, + cbmc_m$B, + pbmc_meta$classified, + returning = "min" + ) + res2 <- gene_pct(pbmc_matrix_small, + cbmc_m$B, + pbmc_meta$classified, + returning = "max" + ) + + expect_true(all(res2 >= res)) +}) + +test_that("gene_pct_markerm norm options work", { + res <- gene_pct_markerm(pbmc_matrix_small, + cbmc_m, + pbmc_meta, + cluster_col = "classified", + norm = NULL + ) + res2 <- gene_pct_markerm(pbmc_matrix_small, + cbmc_m, + pbmc_meta, + cluster_col = "classified", + norm = "divide" + ) + res3 <- gene_pct_markerm(pbmc_matrix_small, + cbmc_m, + pbmc_meta, + cluster_col = "classified", + norm = 0.3 + ) + + expect_true(nrow(res2) == 9) +}) + +test_that("clustify_nudge works with options and seruat2", { + res <- clustify_nudge( + input = s_small, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "res.1", + threshold = 0.8, + seurat_out = FALSE, + mode = "pct", + dr = "tsne" + ) + expect_true(nrow(res) == 4) +}) + +test_that("clustify_nudge works with seurat_out", { + res <- clustify_nudge( + input = s_small, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "res.1", + threshold = 0.8, + seurat_out = TRUE, + mode = "pct", + dr = "tsne" + ) + + res <- clustify_nudge( + input = s_small3, + ref_mat = cbmc_ref, + marker = cbmc_m, + threshold = 0.8, + seurat_out = TRUE, + cluster_col = "RNA_snn_res.1", + mode = "pct", + dr = "tsne" + ) + expect_true(3 == 3) +}) + +test_that("clustify_nudge works with rank/posneg option", { + res <- clustify_nudge( + input = s_small, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "res.1", + threshold = 0.8, + seurat_out = FALSE, + mode = "rank", + dr = "tsne" + ) + expect_true(nrow(res) == 4) +}) + +test_that("clustify_nudge works with options and seruat3", { + res <- clustify_nudge( + input = s_small3, + ref_mat = cbmc_ref, + marker = cbmc_m, + threshold = 0.8, + seurat_out = FALSE, + cluster_col = "RNA_snn_res.1", + mode = "pct", + dr = "tsne" + ) + expect_true(nrow(res) == 3) +}) + +test_that("clustify_nudge works with seurat_out option", { + res <- clustify_nudge( + input = s_small, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "res.1", + threshold = 0.8, + seurat_out = FALSE, + marker_inmatrix = FALSE, + mode = "pct", + dr = "tsne" + ) + expect_true(nrow(res) == 4) +}) + +test_that("clustify_nudge.Seurat works with seurat_out option", { + res <- clustify_nudge( + input = s_small3, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + threshold = 0.8, + seurat_out = TRUE, + marker_inmatrix = FALSE, + mode = "pct", + dr = "tsne" + ) + + res <- clustify_nudge( + input = s_small3, + ref_mat = cbmc_ref, + marker = cbmc_m, + cluster_col = "RNA_snn_res.1", + threshold = 0.8, + seurat_out = FALSE, + marker_inmatrix = FALSE, + mode = "pct", + dr = "tsne" + ) + expect_true(nrow(res) == 3) +}) + +test_that("clustify_nudge works with obj_out option", { + s3 <- s_small3 + setClass( + "ser3", + representation(meta.data = "data.frame") + ) + class(s3) <- "ser3" + object_loc_lookup2 <- data.frame( + ser3 = c( + expr = "input@assays$RNA@data", + meta = "input@meta.data", + var = "input@assays$RNA@var.features", + col = "RNA_snn_res.1" + ), + stringsAsFactors = FALSE + ) + + res <- clustify_nudge( + input = s3, + ref_mat = cbmc_ref, + marker = cbmc_m, + lookuptable = object_loc_lookup2, + cluster_col = "RNA_snn_res.1", + threshold = 0.8, + obj_out = TRUE, + marker_inmatrix = FALSE, + mode = "pct", + dr = "tsne" + ) + + res2 <- clustify_nudge( + input = s3, + ref_mat = cbmc_ref, + marker = cbmc_m, + lookuptable = object_loc_lookup2, + cluster_col = "RNA_snn_res.1", + threshold = 0.8, + obj_out = FALSE, + marker_inmatrix = FALSE, + mode = "pct", + dr = "tsne" + ) + expect_true(nrow(res2) == 3) +}) + +test_that("clustify_nudge works with list of markers", { + res <- clustify_nudge( + input = pbmc_matrix_small, + ref_mat = average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ), + metadata = pbmc_meta, + marker = pbmc_markers, + query_genes = pbmc_vargenes, + cluster_col = "classified", + threshold = 0.8, + call = FALSE, + marker_inmatrix = FALSE, + mode = "pct" + ) + expect_true(nrow(res) == 9) +}) + +test_that("clustify_nudge autoconverts when markers are in matrix", { + res <- clustify_nudge( + input = pbmc_matrix_small, + ref_mat = cbmc_ref, + metadata = pbmc_meta, + marker = as.matrix(cbmc_m), + query_genes = pbmc_vargenes, + cluster_col = "classified", + threshold = 0.8, + call = FALSE, + marker_inmatrix = FALSE, + mode = "pct" + ) + expect_true(nrow(res) == 9) +}) + +test_that("overcluster_test works with ngenes option", { + g <- overcluster_test( + pbmc_matrix_small[, 1:100], + pbmc_meta[1:100, ], + cbmc_ref, + cluster_col = "classified", + x_col = "UMAP_1", + y_col = "UMAP_2" + ) + g2 <- overcluster_test( + pbmc_matrix_small[, 1:100], + pbmc_meta[1:100, ], + cbmc_ref, + cluster_col = "classified", + ngenes = 100, + x_col = "UMAP_1", + y_col = "UMAP_2" + ) + expect_true(ggplot2::is.ggplot(g)) +}) + +test_that("overcluster_test works with defined other cluster column", { + g <- overcluster_test( + pbmc_matrix_small[, 1:100], + pbmc_meta[1:100, ], + cbmc_ref, + cluster_col = "seurat_clusters", + newclustering = "classified", + do_label = FALSE, + x_col = "UMAP_1", + y_col = "UMAP_2" + ) + expect_true(ggplot2::is.ggplot(g)) +}) + +test_that("ref_feature_select chooses the correct number of features", { + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + res <- ref_feature_select(pbmc_avg[1:100, ], 5) + expect_true(length(res) == 5) +}) + +test_that("ref_feature_select chooses the correct number of features with options", { + pbmc_avg <- average_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + res <- ref_feature_select(pbmc_avg[1:100, ], 5, mode = "cor") + expect_true(length(res) == 5) +}) + +test_that("feature_select_PCA will log transform", { + res <- feature_select_PCA(cbmc_ref, if_log = FALSE) + res2 <- feature_select_PCA(cbmc_ref, if_log = TRUE) + expect_true(length(res) > 0) +}) + +test_that("feature_select_PCA can handle precalculated PCA", { + pcs <- prcomp(t(as.matrix(cbmc_ref)))$rotation + res <- feature_select_PCA(cbmc_ref, if_log = TRUE) + res2 <- feature_select_PCA(pcs = pcs, if_log = TRUE) + expect_true(all.equal(rownames(res), rownames(res2))) +}) + + +test_that("downsample_matrix can select same number of cells per cluster", { + mat1 <- downsample_matrix( + pbmc_matrix_small, + metadata = pbmc_meta$classified, + n = 10, + keep_cluster_proportions = TRUE + ) + expect_true(all.equal(ncol(mat1), 10 * length(unique( + pbmc_meta$classified + )))) +}) + +test_that("percent_clusters works with defaults", { + res <- percent_clusters(pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified" + ) + expect_equal(nrow(res), nrow(pbmc_matrix_small)) +}) + +test_that("get_best_str finds correct values", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = FALSE + ) + + a <- get_best_str("DC", get_best_match_matrix(res), res) + a2 <- get_best_str("DC", get_best_match_matrix(res), res, carry_cor = FALSE) + + expect_equal(stringr::str_sub(a, 1, 2), stringr::str_sub(a2, 1, 2)) +}) + +test_that("seurat_ref gets correct averages", { + avg <- seurat_ref(s_small, + cluster_col = "res.1", + var_genes_only = TRUE + ) + avg3 <- seurat_ref(s_small, + cluster_col = "res.1", + var_genes_only = TRUE, + if_log = FALSE + ) + avg2 <- seurat_ref(s_small, + cluster_col = "res.1", + var_genes_only = "PCA" + ) + expect_true(ncol(avg) == 4) +}) + +test_that("object_ref with seurat3", { + s3 <- s_small3 + avg <- object_ref(s3, + var_genes_only = TRUE + ) + expect_true(ncol(avg) == 3) +}) + +test_that("object_ref with SingleCellExperiment", { + sce <- sce_small + avg <- object_ref(sce) + expect_equal(dim(avg), c(200, 13)) +}) + + +test_that("object_ref gets correct averages", { + s3 <- s_small3 + class(s3) <- "ser3" + object_loc_lookup2 <- data.frame( + ser3 = c( + expr = "input@assays$RNA@data", + meta = "input@meta.data", + var = "input@assays$RNA@var.features", + col = "RNA_snn_res.1" + ), + stringsAsFactors = FALSE + ) + avg <- object_ref(s3, + lookuptable = object_loc_lookup2, + var_genes_only = TRUE + ) + expect_true(ncol(avg) == 3) +}) + +test_that("seurat_ref gets other assay slots", { + avg <- seurat_ref( + s_small, + cluster_col = "res.1", + assay_name = "ADT", + var_genes_only = TRUE + ) + avg2 <- seurat_ref( + s_small, + cluster_col = "res.1", + assay_name = c("ADT", "ADT2"), + var_genes_only = TRUE + ) + expect_true(nrow(avg2) - nrow(avg) == 2) +}) + +test_that("seurat_ref gets correct averages with seurat3 object", { + avg <- seurat_ref( + s_small3, + cluster_col = "RNA_snn_res.1", + assay_name = c("ADT", "ADT2"), + var_genes_only = TRUE + ) + avg <- seurat_ref( + s_small3, + cluster_col = "RNA_snn_res.1", + assay_name = c("ADT"), + var_genes_only = TRUE + ) + avg2 <- seurat_ref( + s_small3, + cluster_col = "RNA_snn_res.1", + assay_name = c("ADT", "ADT2"), + var_genes_only = "PCA" + ) + expect_true(nrow(avg2) - nrow(avg) == 2) +}) + +test_that("object parsing works for custom object", { + s3 <- s_small3 + class(s3) <- "ser3" + object_loc_lookup2 <- data.frame( + ser3 = c( + expr = "input@assays$RNA@data", + meta = "input@meta.data", + var = "input@assays$RNA@var.features", + col = "RNA_snn_res.1" + ), + stringsAsFactors = FALSE + ) + + res2 <- clustify(s3, + cbmc_ref, + lookuptable = object_loc_lookup2, + obj_out = FALSE + ) + + res <- clustify_lists( + s3, + marker = pbmc_markers, + marker_inmatrix = FALSE, + lookuptable = object_loc_lookup2, + obj_out = FALSE + ) + + expect_true(nrow(res) == nrow(res2)) +}) + +test_that("object metadata assignment works for custom object", { + s3 <- s_small3 + setClass( + "ser3", + representation(meta.data = "data.frame") + ) + class(s3) <- "ser3" + object_loc_lookup2 <- data.frame( + ser3 = c( + expr = "input@assays$RNA@data", + meta = "input@meta.data", + var = "input@assays$RNA@var.features", + col = "RNA_snn_res.1" + ), + stringsAsFactors = FALSE + ) + + res2 <- clustify(s3, + cbmc_ref, + lookuptable = object_loc_lookup2, + obj_out = TRUE + ) + + res3 <- clustify_lists( + s3, + marker = pbmc_markers, + marker_inmatrix = FALSE, + lookuptable = object_loc_lookup2, + obj_out = TRUE, + rename_prefix = "A" + ) + + expect_true(is(res2, "ser3")) +}) + +test_that("cor_to_call renaming with suffix input works as intended, per_cell or otherwise", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = 0.5, + rename_prefix = "a" + ) + res2 <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE + ) + call2 <- cor_to_call( + res2, + metadata = tibble::rownames_to_column(pbmc_meta, "rn"), + cluster_col = "classified", + collapse_to_cluster = TRUE, + threshold = 0, + rename_prefix = "a" + ) + call3 <- cor_to_call( + res2, + pbmc_meta, + cluster_col = "rn", + collapse_to_cluster = TRUE, + threshold = 0, + rename_prefix = "a" + ) + expect_true("a_type" %in% colnames(call1) & + "a_type" %in% colnames(call2)) +}) + +test_that("renaming with suffix input works as intended with clusify wrapper", { + res <- clustify( + input = s_small, + ref_mat = cbmc_ref, + cluster_col = "res.1", + rename_suff = "a", + dr = "tsne" + ) + res2 <- clustify( + input = s_small3, + ref_mat = cbmc_ref, + cluster_col = "RNA_snn_res.1", + rename_suff = "a", + dr = "tsne" + ) + expect_true(!is.null(res)) +}) + +test_that("ref_marker_select works with cutoffs", { + res1 <- ref_marker_select(cbmc_ref, cut = 0) + mm <- + matrixize_markers(res1, + n = 5, + unique = TRUE, + remove_rp = TRUE + ) + res2 <- ref_marker_select(cbmc_ref, cut = 2) + expect_true(nrow(res1) != nrow(res2)) +}) + +test_that("pos_neg_select takes dataframe of 1 col or more", { + pn_ref <- data.frame( + "CD4" = c(1, 0, 0), + "CD8" = c(0, 0, 1), + row.names = c("CD4", "clustifyr0", "CD8B") + ) + pn_ref2 <- data.frame( + "CD8" = c(0, 0, 1), + row.names = c("CD4", "clustifyr0", "CD8B") + ) + res <- pos_neg_select( + pbmc_matrix_small, + pn_ref, + pbmc_meta, + "classified" + ) + res2 <- pos_neg_select( + pbmc_matrix_small, + pn_ref2, + pbmc_meta, + "classified" + ) + expect_identical(res[, 2], res2[, 1]) +}) + +test_that("pos_neg_select normalizes res", { + pn_ref2 <- data.frame( + "a" = c(1, 0.01, 0), + row.names = c("CD74", "clustifyr0", "CD79A") + ) + res <- pos_neg_select(pbmc_matrix_small, + pn_ref2, + pbmc_meta, + "classified", + cutoff_score = 0.8 + ) + res2 <- pos_neg_select(pbmc_matrix_small, + pn_ref2, + pbmc_meta, + "classified", + cutoff_score = NULL + ) + expect_true(res[1] != res2[1]) +}) + +test_that("clustify_nudge works with pos_neg_select", { + pn_ref2 <- data.frame( + "CD8 T" = c(0, 0, 1), + row.names = c("CD4", "clustifyr0", "CD8B"), + check.names = FALSE + ) + res <- clustify_nudge( + pbmc_matrix_small, + cbmc_ref, + pn_ref2, + metadata = pbmc_meta, + cluster_col = "classified", + norm = 0.5 + ) + expect_true(all(dim(res) == c(9, 3))) +}) + +test_that("reverse_marker_matrix takes matrix of markers input", { + m1 <- reverse_marker_matrix(cbmc_m) + m2 <- reverse_marker_matrix(as.matrix(cbmc_m)) + expect_identical(m1, m2) +}) + +test_that("more readable error message when cluster_col is not in metadata when joining", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + ) + + expect_error(plot_best_call( + res, + pbmc_meta, + "a" + )) +}) + +test_that("more readable error message when cluster_col is not the previous col from metadata when joining", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + ) + + res2 <- cor_to_call( + res, + pbmc_meta, + "classified" + ) + expect_error(call_to_metadata( + res2, + pbmc_meta, + "seurat_clusters" + )) +}) + +test_that("more readable error message when cluster_col exist but is wrong info", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + verbose = TRUE + ) + + expect_error(plot_best_call( + res, + pbmc_meta, + "seurat_clusters" + )) +}) + +marker_file <- system.file("extdata", + "hsPBMC_markers.txt", + package = "clustifyr" +) + +test_that("paring marker files works on included example", { + markers <- file_marker_parse(marker_file) + expect_true(length(markers) == 2) +}) + +gmt_file <- system.file("extdata", + "c2.cp.reactome.v6.2.symbols.gmt.gz", + package = "clustifyr" +) + +test_that("paring gmt files works on included example", { + gmt_list <- gmt_to_list(path = gmt_file) + expect_true(is.list(gmt_list)) +}) + +test_that("clustify_nudge works with pos_neg_select and seurat2 object", { + pn_ref2 <- data.frame( + "CD8 T" = c(0, 0, 1), + row.names = c("CD4", "clustifyr0", "CD8B"), + check.names = FALSE + ) + res <- clustify_nudge( + s_small, + cbmc_ref, + pn_ref2, + cluster_col = "res.1", + norm = 0.5, + dr = "tsne", + seurat_out = FALSE + ) + expect_true(nrow(res) == 4) +}) + +test_that("clustify_nudge works with pos_neg_select and Seurat3 object", { + pn_ref2 <- data.frame( + "CD8 T" = c(0, 0, 1), + row.names = c("CD4", "clustifyr0", "CD8B"), + check.names = FALSE + ) + res <- clustify_nudge( + s_small3, + cbmc_ref, + pn_ref2, + cluster_col = "RNA_snn_res.1", + norm = 0.5, + dr = "tsne", + seurat_out = FALSE + ) + expect_true(nrow(res) == 3) +}) + +test_that("pos_neg_marker takes list, matrix, and dataframe", { + res <- pos_neg_marker(cbmc_m) + res2 <- pos_neg_marker(as.matrix(cbmc_m)) + res3 <- pos_neg_marker(as.list(cbmc_m)) + expect_true(nrow(res) == nrow(res2)) +}) + +test_that("pos_neg_marker takes uneven list", { + genelist <- list( + precusor_oligodendrocyte = c("Olig1", "Olig2"), + mature_oligodendrocyte = c("Olig1", "Olig2", "Mbp"), + microglia = c("Aif1", "Tmem119"), + astrocyte = c("Gfap", "Aqp4") + ) + res <- pos_neg_marker(genelist) + expect_true(ncol(res) == length(genelist)) +}) + +test_that("average_clusters can generate subclusters", { + ref_mat <- average_clusters( + pbmc_matrix_small, + pbmc_meta, + cluster_col = "classified", + subclusterpower = 0.15 + ) + expect_true(ncol(ref_mat) > length(unique(pbmc_meta$classified))) +}) + +test_that("cor_to_call threshold works as intended", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref[, -10], + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = "auto", + carry_r = TRUE + ) + + expect_true(any(stringr::str_detect(call1$type, "unassigned"))) +}) + +test_that("cor_to_call_rank threshold works as intended", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call_rank( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = "auto", + rename_prefix = "a" + ) + + expect_true(max(call1$rank) == 100) +}) + +test_that("cor_to_call_rank options", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call_rank( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = "auto", + rename_prefix = "a", + top_n = 1 + ) + + expect_true(max(call1$rank) == 1) +}) + +test_that("call_consensus marks ties", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call_rank( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = "auto" + ) + res2 <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + cluster_col = "classified" + ) + call2 <- cor_to_call_rank( + res2, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = "auto" + ) + call_f <- call_consensus(list(call1, call2)) + expect_true(nrow(call_f) == length(unique(pbmc_meta$classified))) +}) + +test_that("cor_to_call can collapse_to_cluster", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified", + per_cell = TRUE + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = TRUE, + threshold = 0.1 + ) + expect_true(ncol(call1) == 4) +}) + +test_that("cor_to_call and collapse_to_cluster work on objects", { + res <- clustify( + input = s_small, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "res.1", + dr = "tsne", + per_cell = TRUE, + collapse_to_cluster = TRUE + ) + expect_true(is.matrix(res) | is(res, "seurat")) +}) + +test_that("seurat_meta warns about not finding dr", { + m <- seurat_meta(s_small, + dr = "tsne" + ) + m2 <- seurat_meta(s_small, + dr = "s" + ) + m3 <- seurat_meta(s_small3, + dr = "s" + ) + expect_true(all(rownames(m) == rownames(m2))) +}) + +test_that("find_rank_bias filters out unassigned", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = 0.8 + ) + pbmc_meta2 <- call_to_metadata( + call1, + pbmc_meta, + "classified" + ) + b <- find_rank_bias(pbmc_matrix_small, + pbmc_meta2, "type", + cbmc_ref, + query_genes = pbmc_vargenes + ) + expect_true(length(unique(pbmc_meta2$type)) > ncol(b)) +}) + +test_that("repeated insertionn of types into metdadata renames correctly", { + res <- clustify( + input = pbmc_matrix_small, + metadata = pbmc_meta, + ref_mat = cbmc_ref, + query_genes = pbmc_vargenes, + cluster_col = "classified" + ) + call1 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = 0.8 + ) + pbmc_meta2 <- call_to_metadata( + call1, + pbmc_meta, + "classified" + ) + call2 <- cor_to_call( + res, + metadata = pbmc_meta, + cluster_col = "classified", + collapse_to_cluster = FALSE, + threshold = 0 + ) + pbmc_meta2 <- call_to_metadata( + call2, + pbmc_meta2, + "classified", + rename_prefix = "a" + ) + expect_true(colnames(pbmc_meta2)[10] == "type") +}) + +test_that("object_ref with sce", { + avg <- object_ref(sce_small, + cluster_col = "cell_type1" + ) + expect_true(ncol(avg) == 13) +}) diff --git a/tests/testthat/test_utils.R.REMOVED.git-id b/tests/testthat/test_utils.R.REMOVED.git-id deleted file mode 100644 index 5dad3858f..000000000 --- a/tests/testthat/test_utils.R.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -5ec1b9ac2ffeccd1b5f3c7de7403047fd2b83e10 \ No newline at end of file diff --git a/vignettes/clustifyR.Rmd b/vignettes/clustifyR.Rmd new file mode 100644 index 000000000..9203f5eda --- /dev/null +++ b/vignettes/clustifyR.Rmd @@ -0,0 +1,307 @@ +--- +title: 'Introduction to clustifyr' +date: '`r Sys.Date()`' +package: clustifyr +author: + - name: Rui Fu + affiliation: RNA Bioscience Initative, University of Colorado School of Medicine + - name: Austin Gillen + affiliation: RNA Bioscience Initative, University of Colorado School of Medicine + - name: Ryan Sheridan + affiliation: RNA Bioscience Initative, University of Colorado School of Medicine + - name: Chengzhe Tian + affiliation: Department of Biochemistry, University of Colorado Boulder + - name: Michelle Daya + affiliation: Biomedical Informatics & Personalized Medicine, University of Colorado Anschutz Medical Campus + - name: Yue Hao + affiliation: Bioinformatics Research Center, North Carolina State University + - name: Jay Hesselberth + affiliation: RNA Bioscience Initative, University of Colorado School of Medicine + - name: Kent Riemondy + affiliation: RNA Bioscience Initative, University of Colorado School of Medicine +output: + BiocStyle::html_document: + toc_float: true + +vignette: > + %\VignetteIndexEntry{Introduction to clustifyr} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r "knitr options", echo = FALSE, message = FALSE, warning = FALSE} +knitr::opts_chunk$set( + message = FALSE, + warning = FALSE, + collapse = TRUE, + fig.align = "center", + comment = "#>" +) +``` + +## Introduction: Why use `clustifyr`? + +Single cell transcriptomes are difficult to annotate without extensive knowledge of the underlying biology of the system in question. Even with this knowledge, accurate identification can be challenging due to the lack of detectable expression of common marker genes defined by bulk RNA-seq, flow cytometry, other single cell RNA-seq platforms, etc. + +`clustifyr` solves this problem by providing functions to automatically annotate single cells or clusters using bulk RNA-seq data or marker gene lists (ranked or unranked). Additional functions allow for exploratory analysis of calculated similarities between single cell RNA-seq datasets and reference data. + +## Installation + +To install `clustifyr` BiocManager must be installed. + +```{r "Installation", eval = FALSE} +install.packages("BiocManager") + +BiocManager::install("clustifyr") +``` + +## A simple example: 10x Genomics PBMCs + +In this example, we take a 10x Genomics 3' scRNA-seq dataset from peripheral blood mononuclear cells (PBMCs) and annotate the cell clusters (identified using `Seurat`) using scRNA-seq cell clusters assigned from a CITE-seq experiment. + +```{r "Load data"} +library(clustifyr) +library(ggplot2) +library(cowplot) + +# Matrix of normalized single-cell RNA-seq counts +pbmc_matrix <- clustifyr::pbmc_matrix_small + +# meta.data table containing cluster assignments for each cell +# The table that we are using also contains the known cell identities in the "classified" column +pbmc_meta <- clustifyr::pbmc_meta +``` + +## Calculate correlation coefficients + +To identify cell types, the `clustifyr()` function requires several inputs: + +* `input`: an SingleCellExperiment or Seurat object or a matrix of normalized single-cell RNA-seq counts +* `metadata`: a meta.data table containing the cluster assignments for each cell (not required if a Seurat object is given) +* `ref_mat`: a reference matrix containing RNA-seq expression data for each cell type of interest +* `query_genes`: a list of genes to use for comparison (optional but recommended) + +When using a matrix of scRNA-seq counts `clustifyr()` will return a matrix of correlation coefficients for each cell type and cluster, with the rownames corresponding to the cluster number. + +```{r "Run clustifyr()"} +# Calculate correlation coefficients for each cluster (spearman by default) +vargenes <- pbmc_vargenes[1:500] + +res <- clustify( + input = pbmc_matrix, # matrix of normalized scRNA-seq counts (or SCE/Seurat object) + metadata = pbmc_meta, # meta.data table containing cell clusters + cluster_col = "seurat_clusters", # name of column in meta.data containing cell clusters + ref_mat = cbmc_ref, # matrix of RNA-seq expression data for each cell type + query_genes = vargenes # list of highly varible genes identified with Seurat +) + +# Peek at correlation matrix +res[1:5, 1:5] + +# Call cell types +res2 <- cor_to_call( + cor_mat = res, # matrix correlation coefficients + cluster_col = "seurat_clusters" # name of column in meta.data containing cell clusters +) +res2[1:5, ] + +# Insert into original metadata as "type" column +pbmc_meta2 <- call_to_metadata( + res = res2, # data.frame of called cell type for each cluster + metadata = pbmc_meta, # original meta.data table containing cell clusters + cluster_col = "seurat_clusters" # name of column in meta.data containing cell clusters +) +``` + +To visualize the `clustifyr()` results we can use the `plot_cor_heatmap()` function to plot the correlation coefficients for each cluster and each cell type. + +```{r "Create correlation heatmap", fig.height = 5, fig.width = 7} +# Create heatmap of correlation coefficients using clustifyr() output +plot_cor_heatmap(cor_mat = res) +``` + +## Plot cluster identities and correlation coefficients + +`clustifyr` also provides functions to overlay correlation coefficients on pre-calculated tSNE embeddings (or those from any other dimensionality reduction method). + +```{r "Overlay corr coefficients on UMAP", fig.height = 3.5, fig.width = 9} +# Overlay correlation coefficients on UMAPs for the first two cell types +corr_umaps <- plot_cor( + cor_mat = res, # matrix of correlation coefficients from clustifyr() + metadata = pbmc_meta, # meta.data table containing UMAP or tSNE data + data_to_plot = colnames(res)[1:2], # name of cell type(s) to plot correlation coefficients + cluster_col = "seurat_clusters" # name of column in meta.data containing cell clusters +) + +plot_grid( + plotlist = corr_umaps, + rel_widths = c(0.47, 0.53) +) +``` + +
+ +The `plot_best_call()` function can be used to label each cluster with the cell type that gives the highest corelation coefficient. Using the `plot_dims()` function, we can also plot the known identities of each cluster, which were stored in the "classified" column of the meta.data table. The plots below show that the highest correlations between the reference RNA-seq data and the 10x Genomics scRNA-seq dataset are restricted to the correct cell clusters. + +```{r "Label clusters", fig.height = 5.5, fig.width = 12} +# Label clusters with clustifyr cell identities +clustifyr_types <- plot_best_call( + cor_mat = res, # matrix of correlation coefficients from clustifyr() + metadata = pbmc_meta, # meta.data table containing UMAP or tSNE data + do_label = TRUE, # should the feature label be shown on each cluster? + do_legend = FALSE, # should the legend be shown? + cluster_col = "seurat_clusters" +) + + ggtitle("clustifyr cell types") + +# Compare clustifyr results with known cell identities +known_types <- plot_dims( + data = pbmc_meta, # meta.data table containing UMAP or tSNE data + feature = "classified", # name of column in meta.data to color clusters by + do_label = TRUE, # should the feature label be shown on each cluster? + do_legend = FALSE # should the legend be shown? +) + + ggtitle("Known cell types") + +plot_grid(known_types, clustifyr_types) +``` + +## Classify cells using known marker genes + +The `clustify_lists()` function allows cell types to be assigned based on known marker genes. This function requires a table containing markers for each cell type of interest. Cell types can be assigned using several statistical tests including, hypergeometric, Jaccard, Spearman, and GSEA. + +```{r "clustifyr with gene lists", fig.height = 4, fig.width = 6} +# Take a peek at marker gene table +cbmc_m + +# Available metrics include: "hyper", "jaccard", "spearman", "gsea" +list_res <- clustify_lists( + input = pbmc_matrix, # matrix of normalized single-cell RNA-seq counts + metadata = pbmc_meta, # meta.data table containing cell clusters + cluster_col = "seurat_clusters", # name of column in meta.data containing cell clusters + marker = cbmc_m, # list of known marker genes + metric = "pct" # test to use for assigning cell types +) + +# View as heatmap, or plot_best_call +plot_cor_heatmap( + cor_mat = list_res, # matrix of correlation coefficients from clustify_lists() + cluster_rows = FALSE, # cluster by row? + cluster_columns = FALSE, # cluster by column? + legend_title = "% expressed" # title of heatmap legend +) + +# Downstream functions same as clustify() +# Call cell types +list_res2 <- cor_to_call( + cor_mat = list_res, # matrix correlation coefficients + cluster_col = "seurat_clusters" # name of column in meta.data containing cell clusters +) + +# Insert into original metadata as "list_type" column +pbmc_meta3 <- call_to_metadata( + res = list_res2, # data.frame of called cell type for each cluster + metadata = pbmc_meta, # original meta.data table containing cell clusters + cluster_col = "seurat_clusters", # name of column in meta.data containing cell clusters + rename_prefix = "list_" # set a prefix for the new column +) +``` + +## Direct handling of `SingleCellExperiment` objects +`clustifyr` can also use a `SingleCellExperiment` object as input and return a new `SingleCellExperiment` object with the cell types added as a column in the colData. + +``` r +res <- clustify( + input = sce_small, # an SCE object + ref_mat = cbmc_ref, # matrix of RNA-seq expression data for each cell type + cluster_col = "cell_type1", # name of column in meta.data containing cell clusters + obj_out = TRUE # output SCE object with cell type inserted as "type" column +) + +SingleCellExperiment::colData(res)[1:10,c("type", "r")] +#> DataFrame with 10 rows and 2 columns +#> type r +#> +#> AZ_A1 CD34+ 0.557678024919381 +#> AZ_A10 CD34+ 0.624777701661225 +#> AZ_A11 CD4 T 0.695067885340303 +#> AZ_A12 CD34+ 0.624777701661225 +#> AZ_A2 CD4 T 0.602804908958642 +#> AZ_A3 CD34+ 0.557678024919381 +#> AZ_A4 CD34+ 0.557678024919381 +#> AZ_A5 CD34+ 0.645378073051508 +#> AZ_A6 CD4 T 0.695067885340303 +#> AZ_A7 CD34+ 0.671644883893203 +``` + +## Direct handling of `seurat` v2 and v3 objects + +`clustifyr` can also use a `Seurat` object as input and return a new `Seurat` object with the cell types added as a column in the meta.data. + +``` r +res <- clustify( + input = s_small, # a Seurat object + ref_mat = cbmc_ref, # matrix of RNA-seq expression data for each cell type + cluster_col = "res.1", # name of column in meta.data containing cell clusters + obj_out = TRUE # output Seurat object with cell type inserted as "type" column +) + +res@meta.data[1:10, ] +#> nGene nUMI orig.ident res.0.8 res.1 type r +#> ATGCCAGAACGACT 47 70 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> CATGGCCTGTGCAT 52 85 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> GAACCTGATGAACC 50 87 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> TGACTGGATTCTCA 56 127 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> AGTCAGACTGCACA 53 173 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> TCTGATACACGTGT 48 70 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> TGGTATCTAAACAG 36 64 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> GCAGCTCTGTTTCT 45 72 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> GATATAACACGCAT 36 52 SeuratProject 0 0 Memory CD4 T 0.7047302 +#> AATGTTGACAGTCA 41 100 SeuratProject 0 0 Memory CD4 T 0.7047302 +``` + +## Building reference matrix from single cell expression matrix + +In its simplest form, a reference matrix is built by averaging expression (also includes an option to take the median) of a single cell RNA-seq expression matrix by cluster. Both log transformed or raw count matrices are supported. + +```{r average} +new_ref_matrix <- average_clusters( + mat = pbmc_matrix, + metadata = pbmc_meta$classified, # or use metadata = pbmc_meta, cluster_col = "classified" + if_log = TRUE # whether the expression matrix is already log transformed +) + +head(new_ref_matrix) + +# For further convenience, a shortcut function for generating reference matrix from `SingleCellExperiment` or `seurat` object is used. +new_ref_matrix_sce <- object_ref( + input = sce_small, # SCE object + cluster_col = "cell_type1" # name of column in colData containing cell identities +) + +new_ref_matrix_v2 <- seurat_ref( + seurat_object = s_small, # seuratv2 object + cluster_col = "res.1" # name of column in meta.data containing cell identities +) + +new_ref_matrix_v3 <- seurat_ref( + seurat_object = s_small3, # SeuratV3 object + cluster_col = "RNA_snn_res.1" # name of column in meta.data containing cell identities +) + +tail(new_ref_matrix_v3) +``` + +More reference data, including tabula muris, and code used to generate them are available at https://github.com/rnabioco/clustifyrdata + +Also see list for individual downloads at https://rnabioco.github.io/clustifyr/articles/download_refs.html + +Additional tutorials at +https://rnabioco.github.io/clustifyrdata/articles/otherformats.html + +## Session info + +```{r} +sessionInfo() +``` + diff --git a/vignettes/clustifyR.Rmd.REMOVED.git-id b/vignettes/clustifyR.Rmd.REMOVED.git-id deleted file mode 100644 index b85a4f80a..000000000 --- a/vignettes/clustifyR.Rmd.REMOVED.git-id +++ /dev/null @@ -1 +0,0 @@ -9203f5eda29afba62993f36862b3d867a5c6427f \ No newline at end of file