diff --git a/.travis.yml b/.travis.yml index c494260..89dbf60 100644 --- a/.travis.yml +++ b/.travis.yml @@ -33,7 +33,7 @@ before_install: conda config --add channels defaults; conda config --add channels bioconda; conda config --add channels conda-forge; - conda create -q -y -n $TEST_ENV r-seurat=3.1.0 bioconductor-singlecellexperiment bioconductor-loomexperiment=1.2.0 r-reticulate r-devtools scipy=1.2.1 anndata=0.6.19 loompy=2 h5py=2.9.0; + conda create -q -y -n $TEST_ENV bioconductor-singlecellexperiment bioconductor-loomexperiment r-reticulate r-devtools scipy anndata loompy h5py "r-seurat=3" "r-spatstat=1.64_1"; fi install: @@ -50,9 +50,9 @@ script: -e 'sceasy::convertFormat(srt, from="seurat", to="anndata", outFile="data/pbmc3k-small_seurat3_anndata.h5ad")' -e 'sce <- sceasy::convertFormat(srt, from="seurat", to="sce", outFile="data/pbmc3k-small_seurat3_sce.rds")' -e 'sceasy::convertFormat(sce, from="sce", to="anndata", outFile="data/pbmc3k-small_seurat3_sce_anndata.h5ad")' - -e 'sceasy::convertFormat(sce, from="sce", to="loom", outFile="data/pbmc3k-small_seurat3_sce_loom.loom")' - -e 'sceasy::convertFormat("data/pbmc3k-small_seurat3_sce_loom.loom", from="loom", to="anndata", outFile="data/pbmc3k-small_seurat3_sce_loom_anndata.h5ad")' - -e 'sceasy::convertFormat("data/pbmc3k-small_seurat3_sce_loom.loom", from="loom", to="sce", outFile="data/pbmc3k-small_seurat3_sce_loom_sce.rds", main_layer_name="scaled")' +# -e 'sceasy::convertFormat(sce, from="sce", to="loom", outFile="data/pbmc3k-small_seurat3_sce_loom.loom")' +# -e 'sceasy::convertFormat("data/pbmc3k-small_seurat3_sce_loom.loom", from="loom", to="anndata", outFile="data/pbmc3k-small_seurat3_sce_loom_anndata.h5ad")' +# -e 'sceasy::convertFormat("data/pbmc3k-small_seurat3_sce_loom.loom", from="loom", to="sce", outFile="data/pbmc3k-small_seurat3_sce_loom_sce.rds", main_layer_name="scaled")' cache: directories: diff --git a/DESCRIPTION b/DESCRIPTION index 0b99038..af557a2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,10 +10,6 @@ Description: Convert Seurat, SingleCellExperiment, Loom object to AnnData object Depends: R (>= 3.5.1), reticulate -Imports: - Seurat (>= 3.0.1), - SingleCellExperiment (>= 1.4.0), - LoomExperiment (>= 1.1.5) SystemRequirements: anndata, loompy diff --git a/NAMESPACE b/NAMESPACE index d9f5d1d..d952e9a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,3 @@ export(convertFormat) +import(reticulate) +import(Matrix) diff --git a/R/functions.R b/R/functions.R index ce805de..1ef24e6 100644 --- a/R/functions.R +++ b/R/functions.R @@ -1,314 +1,479 @@ +# Functions for converting between different objects + + +#' Regularise dataframe +#' +#' This function checks if certain columns of a dataframe is of a single value +#' and drop them if required +#' +#' @param df Input data frame, usually cell metadata table (data.frame-like +#' object) +#' @param drop_single_values Drop columns with only a single value (logical) +#' +#' @return Dataframe .regularise_df <- function(df, drop_single_values = TRUE) { - if (ncol(df) == 0) df[['name']] <- rownames(df) - if (drop_single_values) { - k_singular <- sapply(df, function(x) length(unique(x)) == 1) - if (sum(k_singular) > 0) - warning(paste('Dropping single category variables:'), - paste(colnames(df)[k_singular], collapse=', ')) - df <- df[, !k_singular, drop=F] - if (ncol(df) == 0) df[['name']] <- rownames(df) + if (ncol(df) == 0) df[["name"]] <- rownames(df) + if (drop_single_values) { + k_singular <- sapply(df, function(x) length(unique(x)) == 1) + if (sum(k_singular) > 0) { + warning( + paste("Dropping single category variables:"), + paste(colnames(df)[k_singular], collapse = ", ") + ) } - return(df) + df <- df[, !k_singular, drop = F] + if (ncol(df) == 0) df[["name"]] <- rownames(df) + } + return(df) } -seurat2anndata <- function( - obj, outFile = NULL, assay = 'RNA', main_layer = 'data', transfer_layers = NULL, drop_single_values = TRUE -) { - main_layer <- match.arg(main_layer, c('data', 'counts', 'scale.data')) - transfer_layers <- transfer_layers[ - transfer_layers %in% c('data', 'counts', 'scale.data')] - transfer_layers <- transfer_layers[transfer_layers != main_layer] - - if (compareVersion(as.character(obj@version), '3.0.0') < 0) - obj <- Seurat::UpdateSeuratObject(object = obj) - - X <- Seurat::GetAssayData(object = obj, assay = assay, slot = main_layer) - - obs <- .regularise_df(obj@meta.data, drop_single_values = drop_single_values) - - var <- .regularise_df(Seurat::GetAssay(obj, assay = assay)@meta.features, drop_single_values = drop_single_values) - - obsm <- NULL - reductions <- names(obj@reductions) - if (length(reductions) > 0) { - obsm <- sapply( - reductions, - function(name) as.matrix(Seurat::Embeddings(obj, reduction=name)), - simplify = FALSE - ) - names(obsm) <- paste0('X_', tolower(names(obj@reductions))) - } - - layers <- list() - for (layer in transfer_layers) { - mat <- Seurat::GetAssayData(object = obj, assay = assay, slot = layer) - if (all(dim(mat) == dim(X))) layers[[layer]] <- Matrix::t(mat) - } - - anndata <- reticulate::import('anndata', convert = FALSE) - - adata <- anndata$AnnData( - X = Matrix::t(X), - obs = obs, - var = var, - obsm = obsm, - layers = layers +#' Convert Seurat object to AnnData +#' +#' This function converts a Seurat object to an Anndata object +#' +#' @param obj Input Seurat object +#' @param outFile Save output AnnData to this file if specified (str or NULL) +#' @param assay Assay to be converted, default "RNA" (str) +#' @param main_layer Slot in `assay` to be converted to AnnData.X, may be +#' "counts", "data", "scale.data", default "data" (str) +#' @param transfer_layers If specified, convert slots to AnnData.layers[], +#' (vector of str) +#' @param drop_single_values Drop single value columns in cell metadata table, +#' default TRUE (logical) +#' +#' @return AnnData object +#' +#' @import reticulate +#' @import Matrix +seurat2anndata <- function(obj, outFile = NULL, assay = "RNA", main_layer = "data", transfer_layers = NULL, drop_single_values = TRUE) { + if (!requireNamespace("Seurat")) { + stop("This function requires the 'Seurat' package.") + } + main_layer <- match.arg(main_layer, c("data", "counts", "scale.data")) + transfer_layers <- transfer_layers[ + transfer_layers %in% c("data", "counts", "scale.data") + ] + transfer_layers <- transfer_layers[transfer_layers != main_layer] + + if (compareVersion(as.character(obj@version), "3.0.0") < 0) { + obj <- Seurat::UpdateSeuratObject(object = obj) + } + + X <- Seurat::GetAssayData(object = obj, assay = assay, slot = main_layer) + + obs <- .regularise_df(obj@meta.data, drop_single_values = drop_single_values) + + var <- .regularise_df(Seurat::GetAssay(obj, assay = assay)@meta.features, drop_single_values = drop_single_values) + + obsm <- NULL + reductions <- names(obj@reductions) + if (length(reductions) > 0) { + obsm <- sapply( + reductions, + function(name) as.matrix(Seurat::Embeddings(obj, reduction = name)), + simplify = FALSE ) - - if (!is.null(outFile)) - adata$write(outFile, compression = 'gzip') - - adata + names(obsm) <- paste0("X_", tolower(names(obj@reductions))) + } + + layers <- list() + for (layer in transfer_layers) { + mat <- Seurat::GetAssayData(object = obj, assay = assay, slot = layer) + if (all(dim(mat) == dim(X))) layers[[layer]] <- Matrix::t(mat) + } + + anndata <- reticulate::import("anndata", convert = FALSE) + + adata <- anndata$AnnData( + X = Matrix::t(X), + obs = obs, + var = var, + obsm = obsm, + layers = layers + ) + + if (!is.null(outFile)) { + adata$write(outFile, compression = "gzip") + } + + adata } -sce2anndata <- function( - obj, outFile = NULL, main_layer = 'counts', transfer_layers = NULL, drop_single_values = TRUE -) { - #if (exists('updateObject', where=loadNamespace('SingleCellExperiment'), mode='function')) { - # obj <- SingleCellExperiment::updateObject(obj) - #} - assay_names <- SummarizedExperiment::assayNames(obj) - main_layer <- match.arg(main_layer, assay_names) - transfer_layers <- transfer_layers[transfer_layers %in% assay_names] - transfer_layers <- transfer_layers[transfer_layers != main_layer] - - X <- SummarizedExperiment::assay(obj, main_layer) - - obs <- .regularise_df(as.data.frame(SummarizedExperiment::colData(obj)), drop_single_values = drop_single_values) - - var <- .regularise_df(as.data.frame(SummarizedExperiment::rowData(obj)), drop_single_values = drop_single_values) - - obsm <- NULL - reductions <- SingleCellExperiment::reducedDimNames(obj) - if (length(reductions) > 0) { - obsm <- sapply( - reductions, - function(name) as.matrix( - SingleCellExperiment::reducedDim(obj, type=name)), - simplify = FALSE +#' Convert SingleCellExperiment object to AnnData +#' +#' This function converts a SingleCellExperiment object to an Anndata object +#' +#' @param obj Input SingleCellExperiment object +#' @param outFile Save output AnnData to this file if specified (str or NULL) +#' @param main_layer Assay to be converted to AnnData.X, default "counts" (str) +#' @param transfer_layers If specified, convert additional assays to +#' AnnData.layers[], (vector of str) +#' @param drop_single_values Drop single value columns in cell metadata table, +#' default TRUE (logical) +#' +#' @return AnnData object +#' +#' @import reticulate +#' @import Matrix +sce2anndata <- function(obj, outFile = NULL, main_layer = "counts", transfer_layers = NULL, drop_single_values = TRUE) { + if (!requireNamespace("SummarizedExperiment")) { + stop("This function requires the 'SummarizedExperiment' package.") + } + if (!requireNamespace("SingleCellExperiment")) { + stop("This function requires the 'SingleCellExperiment' package.") + } + assay_names <- SummarizedExperiment::assayNames(obj) + main_layer <- match.arg(main_layer, assay_names) + transfer_layers <- transfer_layers[transfer_layers %in% assay_names] + transfer_layers <- transfer_layers[transfer_layers != main_layer] + + X <- SummarizedExperiment::assay(obj, main_layer) + + obs <- .regularise_df(as.data.frame(SummarizedExperiment::colData(obj)), drop_single_values = drop_single_values) + + var <- .regularise_df(as.data.frame(SummarizedExperiment::rowData(obj)), drop_single_values = drop_single_values) + + obsm <- NULL + reductions <- SingleCellExperiment::reducedDimNames(obj) + if (length(reductions) > 0) { + obsm <- sapply( + reductions, + function(name) { + as.matrix( + SingleCellExperiment::reducedDim(obj, type = name) ) - names(obsm) <- paste0( - 'X_', tolower(SingleCellExperiment::reducedDimNames(obj))) - } - - layers <- list() - for (layer in transfer_layers) { - mat <- SummarizedExperiment::assay(obj, layer) - if (all(dim(mat) == dim(X))) layers[[layer]] <- Matrix::t(mat) - } - - anndata <- reticulate::import('anndata', convert = FALSE) - - adata <- anndata$AnnData( - X = Matrix::t(X), - obs = obs, - var = var, - obsm = obsm, - layers = layers + }, + simplify = FALSE ) + names(obsm) <- paste0( + "X_", tolower(SingleCellExperiment::reducedDimNames(obj)) + ) + } - if (!is.null(outFile)) - adata$write(outFile, compression = 'gzip') - - adata -} - -loom2anndata <- function( - inFile, outFile = NULL, main_layer = c('spliced', 'unspliced'), - obs_names = 'CellID', var_names = 'Gene' -) { - main_layer <- match.arg(main_layer) - - anndata <- reticulate::import('anndata', convert = FALSE) - - if (compareVersion(as.character(anndata[['__version__']]), '0.6.20') < 0) - message(paste( - "Warning: anndata <0.6.20 detected.", - "Upgrade to handle multi-dimensional embeddings." - )) + layers <- list() + for (layer in transfer_layers) { + mat <- SummarizedExperiment::assay(obj, layer) + if (all(dim(mat) == dim(X))) layers[[layer]] <- Matrix::t(mat) + } - adata <- anndata$read_loom( - inFile, sparse = TRUE, cleanup = TRUE, X_name = main_layer, - obs_names = obs_names, var_names = var_names - ) + anndata <- reticulate::import("anndata", convert = FALSE) - anndata$AnnData$obs_names_make_unique(adata) - anndata$AnnData$var_names_make_unique(adata) + adata <- anndata$AnnData( + X = Matrix::t(X), + obs = obs, + var = var, + obsm = obsm, + layers = layers + ) - if (!is.null(outFile)) - adata$write(outFile, compression = 'gzip') + if (!is.null(outFile)) { + adata$write(outFile, compression = "gzip") + } - adata + adata } -seurat2sce <- function(obj, outFile = NULL, main_layer=NULL, assay='RNA', ...) { - sce <- Seurat::as.SingleCellExperiment(obj, assay=assay, ...) - if (!is.null(outFile)) - saveRDS(sce, outFile) +#' Convert Loom object to AnnData +#' +#' This function converts a Loom object to an Anndata object +#' +#' @param inFile Path to the input Loom file on disk (str) +#' @param outFile Save output AnnData to this file if specified (str or NULL) +#' @param main_layer Assay to be converted to AnnData.X, can be "spliced" or +#' "unspliced", default "spliced" (str) +#' @param obs_names Column in cell metadata table that contains cell IDs (str) +#' @param var_names Column in gene metadata table that contains gene names +#' +#' @return AnnData object +#' +#' @import reticulate +loom2anndata <- function(inFile, outFile = NULL, main_layer = c("spliced", "unspliced"), + obs_names = "CellID", var_names = "Gene") { + main_layer <- match.arg(main_layer) + + anndata <- reticulate::import("anndata", convert = FALSE) + + if (compareVersion(as.character(anndata[["__version__"]]), "0.6.20") < 0) { + message(paste( + "Warning: anndata <0.6.20 detected.", + "Upgrade to handle multi-dimensional embeddings." + )) + } + + adata <- anndata$read_loom( + inFile, + sparse = TRUE, cleanup = TRUE, X_name = main_layer, + obs_names = obs_names, var_names = var_names + ) + + anndata$AnnData$obs_names_make_unique(adata) + anndata$AnnData$var_names_make_unique(adata) + + if (!is.null(outFile)) { + adata$write(outFile, compression = "gzip") + } + + adata +} - sce +#' Convert Seurat object to SingleCellExperiment +#' +#' This function converts a Seurat object to a SingleCellExperiment object +#' +#' @param obj Input Seurat object +#' @param outFile Save output SingleCellExperiment object to this file if +#' specified (str or NULL) +#' @param assay Assay to be converted to AnnData.X, default "RNA" (str) +#' +#' @return AnnData object +seurat2sce <- function(obj, outFile = NULL, main_layer = NULL, assay = "RNA", ...) { + if (!requireNamespace("Seurat")) { + stop("This function requires the 'Seurat' package.") + } + sce <- Seurat::as.SingleCellExperiment(obj, assay = assay, ...) + if (!is.null(outFile)) { + saveRDS(sce, outFile) + } + + sce } +#' Convert SingleCellExperiment object to Loom file +#' +#' This function converts a SingleCellExperiment object to a LoomExperiment +#' object +#' +#' @param obj Input SingleCellExperiment object +#' @param outFile Save output Loom to this file if specified (str or NULL) +#' @param drop_single_values Drop single value columns in cell metadata table, +#' default TRUE (logical) +#' +#' @return LoomExperiment object sce2loom <- function(obj, outFile, main_layer = NULL, drop_single_values = TRUE, ...) { - #if (exists('updateObject', where=loadNamespace('SingleCellExperiment'), mode='function')) { - # obj <- SingleCellExperiment::updateObject(obj) - #} - SummarizedExperiment::colData(obj) <- .regularise_df(SummarizedExperiment::colData(obj), drop_single_values = drop_single_values) - SummarizedExperiment::rowData(obj) <- .regularise_df(SummarizedExperiment::rowData(obj), drop_single_values = drop_single_values) - writeExchangeableLoom(obj, outFile, main_layer = main_layer, ...) + if (!requireNamespace("LoomExperiment")) { + stop("This function requires the 'LoomExperiment' package.") + } + scle <- LoomExperiment::SingleCellLoomExperiment(obj) + + if (!is.null(outFile)) { + saveRDS(scle, outFile) + } + + scle } +#' Read a Loom file into a SingleCellExperiment object +#' +#' This function reads a Loom file object into a SingleCellExperiment object +#' +#' @param inFile Path to input Loom file on disk (str) +#' @param outFile Save output SingleCellExperiment object to this file if +#' specified (str or NULL) +#' +#' @return LoomExperiment object loom2sce <- function(inFile, outFile = NULL, main_layer = NULL, main_layer_name = NULL, ...) { - sce <- readExchangeableLoom(inFile, backed = FALSE, main_layer_name = main_layer_name, ...) - if (!is.null(outFile)) - saveRDS(sce, outFile) - - sce + if (!requireNamespace("LoomExperiment")) { + stop("This function requires the 'LoomExperiment' package.") + } + if (!requireNamespace("SingleCellExperiment")) { + stop("This function requires the 'LoomExperiment' package.") + } + scle <- LoomExperiment::import(inFile) + sce <- as(scle, "SingleCellExperiment") + + if (!is.null(outFile)) { + saveRDS(sce, outFile) + } + + sce } -.obs2metadata <- function(obs_pd, assay='RNA') { - obs_df <- .regularise_df(reticulate::py_to_r(obs_pd), drop_single_values = FALSE) - attr(obs_df, 'pandas.index') <- NULL - colnames(obs_df) <- sub('n_counts', paste0('nCounts_', assay), colnames(obs_df)) - colnames(obs_df) <- sub('n_genes', paste0('nFeaturess_', assay), colnames(obs_df)) - return(obs_df) +#' Prepare cell metadata +#' +#' This function prepare cell metadata from AnnData.obs +#' +#' @param obs_pd Input AnnData.obs dataframe +#' @param assay Assay name, default "RNA" (str) +#' +#' @return AnnData object +#' +#' @import reticulate +.obs2metadata <- function(obs_pd, assay = "RNA") { + obs_df <- .regularise_df(reticulate::py_to_r(obs_pd), drop_single_values = FALSE) + attr(obs_df, "pandas.index") <- NULL + colnames(obs_df) <- sub("n_counts", paste0("nCounts_", assay), colnames(obs_df)) + colnames(obs_df) <- sub("n_genes", paste0("nFeaturess_", assay), colnames(obs_df)) + return(obs_df) } +#' Prepare feature metadata +#' +#' This function prepare feature metadata from AnnData.var +#' +#' @param var_pd Input AnnData.var dataframe +#' +#' @return AnnData object +#' +#' @import reticulate .var2feature_metadata <- function(var_pd) { - var_df <- .regularise_df(reticulate::py_to_r(var_pd), drop_single_values = FALSE) - attr(var_df, 'pandas.index') <- NULL - colnames(var_df) <- sub('dispersions_norm', 'mvp.dispersion.scaled', colnames(var_df)) - colnames(var_df) <- sub('dispersions', 'mvp.dispersion', colnames(var_df)) - colnames(var_df) <- sub('means', 'mvp.mean', colnames(var_df)) - colnames(var_df) <- sub('highly_variable', 'highly.variable', colnames(var_df)) - return(var_df) + var_df <- .regularise_df(reticulate::py_to_r(var_pd), drop_single_values = FALSE) + attr(var_df, "pandas.index") <- NULL + colnames(var_df) <- sub("dispersions_norm", "mvp.dispersion.scaled", colnames(var_df)) + colnames(var_df) <- sub("dispersions", "mvp.dispersion", colnames(var_df)) + colnames(var_df) <- sub("means", "mvp.mean", colnames(var_df)) + colnames(var_df) <- sub("highly_variable", "highly.variable", colnames(var_df)) + return(var_df) } -.uns2misc <- function(ad_pd, target_uns_keys = list()) { - uns_keys <- intersect(target_uns_keys, reticulate::py_to_r(ad_pd$uns_keys())) - misc <- sapply(uns_keys, function(x) reticulate::py_to_r(ad_pd$uns[x]), simplify = FALSE, USE.NAMES = TRUE) - return(misc) +.uns2misc <- function(ad, target_uns_keys = list()) { + uns_keys <- intersect(target_uns_keys, reticulate::py_to_r(ad$uns_keys())) + misc <- sapply(uns_keys, function(x) reticulate::py_to_r(ad$uns[x]), simplify = FALSE, USE.NAMES = TRUE) + return(misc) } -anndata2seurat <- function(inFile, outFile = NULL, main_layer = 'counts', assay = 'RNA', use_seurat = FALSE, lzf = FALSE, target_uns_keys = list()) { - main_layer <- match.arg(main_layer, c('counts', 'data', 'scale.data')) - inFile <- path.expand(inFile) - - anndata <- reticulate::import('anndata', convert = FALSE) - sp <- reticulate::import('scipy.sparse', convert = FALSE) - - if (use_seurat) { - if (lzf) { - tmpFile <- paste0(tools::file_path_sans_ext(inFile), '.decompressed.h5ad') - ad <- anndata$read_h5ad(inFile) - ad$write(tmpFile) - tryCatch({ - srt <- Seurat::ReadH5AD(tmpFile) - }, finally = { - file.remove(tmpFile) - }) - } else { - srt <- Seurat::ReadH5AD(inFile) +#' Convert AnnData object to Seurat object +#' +#' This function converts an AnnData object to a Seurat object +#' +#' @param inFile Path to an input AnnData object on disk (str) +#' @param outFile Save output Seurat to this file if specified (str or NULL) +#' @param assay Name of assay in Seurat object to store expression values, +#' default "RNA" (str) +#' @param main_layer Name of slot in `assay` to store AnnData.X, can be +#' "counts", "data", "scale.data", default "counts" (str) +#' @param use_seurat Use Seurat::ReadH5AD() to do the conversion, default FALSE (logical) +#' @param lzf Whether AnnData is compressed by `lzf`, default FALSE (logical) +#' +#' @return Seurat object +#' +#' @import reticulate +#' @import Matrix +anndata2seurat <- function(inFile, outFile = NULL, main_layer = "counts", assay = "RNA", use_seurat = FALSE, lzf = FALSE, target_uns_keys = list()) { + if (!requireNamespace("Seurat")) { + stop("This function requires the 'Seurat' package.") + } + main_layer <- match.arg(main_layer, c("counts", "data", "scale.data")) + inFile <- path.expand(inFile) + + anndata <- reticulate::import("anndata", convert = FALSE) + sp <- reticulate::import("scipy.sparse", convert = FALSE) + + if (use_seurat) { + if (lzf) { + tmpFile <- paste0(tools::file_path_sans_ext(inFile), ".decompressed.h5ad") + ad <- anndata$read_h5ad(inFile) + ad$write(tmpFile) + tryCatch( + { + srt <- Seurat::ReadH5AD(tmpFile) + }, + finally = { + file.remove(tmpFile) } + ) } else { - ad <- anndata$read_h5ad(inFile) + srt <- Seurat::ReadH5AD(inFile) + } + } else { + ad <- anndata$read_h5ad(inFile) - obs_df <- .obs2metadata(ad$obs) - var_df <- .var2feature_metadata(ad$var) + obs_df <- .obs2metadata(ad$obs) + var_df <- .var2feature_metadata(ad$var) - if (reticulate::py_to_r(sp$issparse(ad$X))) { - X <- Matrix::t(reticulate::py_to_r(sp$csc_matrix(ad$X))) - } else { - X <- t(reticulate::py_to_r(ad$X)) - } - colnames(X) <- rownames(obs_df) - rownames(X) <- rownames(var_df) - - if (!is.null(reticulate::py_to_r(ad$raw))) { - raw_var_df <- .var2feature_metadata(ad$raw$var) - raw_X <- Matrix::t(reticulate::py_to_r(sp$csc_matrix(ad$raw$X))) - colnames(raw_X) <- rownames(obs_df) - rownames(raw_X) <- rownames(raw_var_df) - } else { - raw_var_df <- NULL - raw_X <- NULL - } + if (reticulate::py_to_r(sp$issparse(ad$X))) { + X <- Matrix::t(reticulate::py_to_r(sp$csc_matrix(ad$X))) + } else { + X <- t(reticulate::py_to_r(ad$X)) + } + colnames(X) <- rownames(obs_df) + rownames(X) <- rownames(var_df) + + if (!is.null(reticulate::py_to_r(ad$raw))) { + raw_var_df <- .var2feature_metadata(ad$raw$var) + raw_X <- Matrix::t(reticulate::py_to_r(sp$csc_matrix(ad$raw$X))) + colnames(raw_X) <- rownames(obs_df) + rownames(raw_X) <- rownames(raw_var_df) + } else { + raw_var_df <- NULL + raw_X <- NULL + } - if (main_layer == 'scale.data' && !is.null(raw_X)) { - assays <- list(Seurat::CreateAssayObject(data = raw_X)) - assays[[1]] <- Seurat::SetAssayData(assays[[1]], slot = 'scale.data', new.data = X) - message('X -> scale.data; raw.X -> data') - } else if (main_layer == 'data' && !is.null(raw_X)) { - if (nrow(X) != nrow(raw_X)) { - message("Raw layer was found with different number of genes than main layer, resizing X and raw.X to match dimensions") - raw_X <- raw_X[rownames(raw_X) %in% rownames(X), , drop=F] - X <- X[rownames(raw_X), , drop=F] - } - assays <- list(Seurat::CreateAssayObject(counts = raw_X)) - assays[[1]] <- Seurat::SetAssayData(assays[[1]], slot = 'data', new.data = X) - message('X -> data; raw.X -> counts') - } else if (main_layer == 'counts') { - assays <- list(Seurat::CreateAssayObject(counts = X)) - message('X -> counts') - } else { - assays <- list(Seurat::CreateAssayObject(data = X)) - message('X -> data') - } - names(assays) <- assay + if (main_layer == "scale.data" && !is.null(raw_X)) { + assays <- list(Seurat::CreateAssayObject(data = raw_X)) + assays[[1]] <- Seurat::SetAssayData(assays[[1]], slot = "scale.data", new.data = X) + message("X -> scale.data; raw.X -> data") + } else if (main_layer == "data" && !is.null(raw_X)) { + if (nrow(X) != nrow(raw_X)) { + message("Raw layer was found with different number of genes than main layer, resizing X and raw.X to match dimensions") + raw_X <- raw_X[rownames(raw_X) %in% rownames(X), , drop = F] + X <- X[rownames(raw_X), , drop = F] + } + assays <- list(Seurat::CreateAssayObject(counts = raw_X)) + assays[[1]] <- Seurat::SetAssayData(assays[[1]], slot = "data", new.data = X) + message("X -> data; raw.X -> counts") + } else if (main_layer == "counts") { + assays <- list(Seurat::CreateAssayObject(counts = X)) + message("X -> counts") + } else { + assays <- list(Seurat::CreateAssayObject(data = X)) + message("X -> data") + } + names(assays) <- assay - if (main_layer == 'scale.data' && !is.null(raw_X)) { - assays[[assay]]@meta.features <- raw_var_df - } else { - assays[[assay]]@meta.features <- var_df - } + if (main_layer == "scale.data" && !is.null(raw_X)) { + assays[[assay]]@meta.features <- raw_var_df + } else { + assays[[assay]]@meta.features <- var_df + } - project_name <- sub('\\.h5ad$', '', basename(inFile)) - srt <- new('Seurat', assays = assays, project.name = project_name, version = packageVersion('Seurat')) - Seurat::DefaultAssay(srt) <- assay - Seurat::Idents(srt) <- project_name - - srt@meta.data <- obs_df - embed_names <- unlist(reticulate::py_to_r(ad$obsm_keys())) - if (length(embed_names) > 0) { - embeddings <- sapply(embed_names, function(x) reticulate::py_to_r(ad$obsm[x]), simplify = FALSE, USE.NAMES = TRUE) - names(embeddings) <- embed_names - for (name in embed_names) { - rownames(embeddings[[name]]) <- colnames(assays[[assay]]) - } - - dim.reducs <- vector(mode = 'list', length = length(embeddings)) - for (i in seq(length(embeddings))) { - name <- embed_names[i] - embed <- embeddings[[name]] - key <- switch( - name, - sub('_(.*)', '\\L\\1', sub('^X_', '', toupper(name)), perl=T), - 'X_pca' = 'PC', 'X_tsne' = 'tSNE', 'X_umap' = 'UMAP' - ) - colnames(embed) <- paste0(key, '_', seq(ncol(embed))) - dim.reducs[[i]] <- Seurat::CreateDimReducObject( - embeddings = embed, - loadings = new('matrix'), - assay = assay, - stdev = numeric(0L), - key = paste0(key, '_') - ) - } - names(dim.reducs) <- sub('X_', '', embed_names) - - for (name in names(dim.reducs)) { - srt[[name]] <- dim.reducs[[name]] - } - } + project_name <- sub("\\.h5ad$", "", basename(inFile)) + srt <- new("Seurat", assays = assays, project.name = project_name, version = packageVersion("Seurat")) + Seurat::DefaultAssay(srt) <- assay + Seurat::Idents(srt) <- project_name + + srt@meta.data <- obs_df + embed_names <- unlist(reticulate::py_to_r(ad$obsm_keys())) + if (length(embed_names) > 0) { + embeddings <- sapply(embed_names, function(x) reticulate::py_to_r(ad$obsm[x]), simplify = FALSE, USE.NAMES = TRUE) + names(embeddings) <- embed_names + for (name in embed_names) { + rownames(embeddings[[name]]) <- colnames(assays[[assay]]) + } + + dim.reducs <- vector(mode = "list", length = length(embeddings)) + for (i in seq(length(embeddings))) { + name <- embed_names[i] + embed <- embeddings[[name]] + key <- switch(name, + sub("_(.*)", "\\L\\1", sub("^X_", "", toupper(name)), perl = T), + "X_pca" = "PC", + "X_tsne" = "tSNE", + "X_umap" = "UMAP" + ) + colnames(embed) <- paste0(key, "_", seq(ncol(embed))) + dim.reducs[[i]] <- Seurat::CreateDimReducObject( + embeddings = embed, + loadings = new("matrix"), + assay = assay, + stdev = numeric(0L), + key = paste0(key, "_") + ) + } + names(dim.reducs) <- sub("X_", "", embed_names) + + for (name in names(dim.reducs)) { + srt[[name]] <- dim.reducs[[name]] + } } + } - srt@misc <- .uns2misc(ad, target_uns_keys = target_uns_keys) + srt@misc <- .uns2misc(ad, target_uns_keys = target_uns_keys) - if (!is.null(outFile)) saveRDS(object = srt, file = outFile) + if (!is.null(outFile)) saveRDS(object = srt, file = outFile) - srt + srt } anndata2cds <- function(inFile, outFile = NULL) { - message('not implemented yet') + message("not implemented yet") } diff --git a/R/loom_utils.R b/R/loom_utils.R index 418785a..e0aca9e 100644 --- a/R/loom_utils.R +++ b/R/loom_utils.R @@ -1,361 +1,384 @@ #!/usr/bin/env Rscript -exchangeable_loom_version <- '3.0.0' +exchangeable_loom_version <- "3.0.0" isExchangeableLoom <- function(h5f) { - attrs <- rhdf5::h5readAttributes(h5f, '/') - version <- attrs[['LOOM_SPEC_VERSION']] - return(!is.null(version) && version == exchangeable_loom_version) + attrs <- rhdf5::h5readAttributes(h5f, "/") + version <- attrs[["LOOM_SPEC_VERSION"]] + return(!is.null(version) && version == exchangeable_loom_version) } readDimNames <- function(h5f) { - cell_attr <- rhdf5::h5readAttributes(h5f, '/col_attrs')[['CellID']] - gene_attr <- rhdf5::h5readAttributes(h5f, '/row_attrs')[['Gene']] - source <- rhdf5::h5readAttributes(h5f, '/')[['created_from']] - if (!is.null(source) && source == 'anndata') { - if (is.null(cell_attr)) - cell_attr <- 'obs_names' - if (is.null(gene_attr)) - gene_attr <- 'var_names' + cell_attr <- rhdf5::h5readAttributes(h5f, "/col_attrs")[["CellID"]] + gene_attr <- rhdf5::h5readAttributes(h5f, "/row_attrs")[["Gene"]] + source <- rhdf5::h5readAttributes(h5f, "/")[["created_from"]] + if (!is.null(source) && source == "anndata") { + if (is.null(cell_attr)) { + cell_attr <- "obs_names" } - return(list(col=cell_attr, row=gene_attr)) + if (is.null(gene_attr)) { + gene_attr <- "var_names" + } + } + return(list(col = cell_attr, row = gene_attr)) } readManifest <- function(h5f) { - tryCatch({ - manifest <- data.frame(t(rhdf5::h5read(h5f, '/attr/manifest')), stringsAsFactors=FALSE) - colnames(manifest) <- c('loom_path', 'dtype', 'anndata_path', 'sce_path') - return(manifest) + tryCatch( + { + manifest <- data.frame(t(rhdf5::h5read(h5f, "/attr/manifest")), stringsAsFactors = FALSE) + colnames(manifest) <- c("loom_path", "dtype", "anndata_path", "sce_path") + return(manifest) }, - error=function(e) { - manifest <- NULL - return(manifest) - }) + error = function(e) { + manifest <- NULL + return(manifest) + } + ) } nestedEnvAsList <- function(x) { - out <- as.list(x) - lapply(out, function(e) if (is.environment(e)) nestedEnvAsList(e) else e) + out <- as.list(x) + lapply(out, function(e) if (is.environment(e)) nestedEnvAsList(e) else e) } nestedEnv <- function(root_env, paths, v) { - n <- length(paths) - var <- paths[1] - if (n == 1) { - root_env[[var]] <- v - } else { - if (is.null(root_env[[var]])) - root_env[[var]] <- new.env(parent=emptyenv()) - nestedEnv(root_env[[var]], paths[2:n], v) + n <- length(paths) + var <- paths[1] + if (n == 1) { + root_env[[var]] <- v + } else { + if (is.null(root_env[[var]])) { + root_env[[var]] <- new.env(parent = emptyenv()) } - invisible() -} - -flattenNestedListToEnv <- function(x, e, prefix=NULL) { - entry_names <- names(x) - if (is.null(entry_names)) entry_names <- seq(length(x)) - sapply(entry_names, function(name) { - if (is.null(prefix)) { - full_name <- name - } else { - full_name <- paste(prefix, name, sep='__') - } - if (is.list(x[[name]])) { - flattenNestedListToEnv(x[[name]], e, full_name) - } else { - e[[full_name]] <- x[[name]] - } - }) - invisible() + nestedEnv(root_env[[var]], paths[2:n], v) + } + invisible() } -makeManifest <- function(entries, dtype='array', loom_prefix='/attr/', anndata_prefix='/uns/', - sce_prefix='@metadata$') { - n <- length(entries) - if (is.list(entries)) { - entry_names <- names(entries) - is_scalar <- sapply(entries, function(x) is.vector(x) && length(x) == 1) - dtypes <- ifelse(is_scalar, 'scalar', 'array') +flattenNestedListToEnv <- function(x, e, prefix = NULL) { + entry_names <- names(x) + if (is.null(entry_names)) entry_names <- seq(length(x)) + sapply(entry_names, function(name) { + if (is.null(prefix)) { + full_name <- name } else { - entry_names <- entries - dtypes <- rep(dtype, n) + full_name <- paste(prefix, name, sep = "__") } - loom_paths <- paste0(loom_prefix, entry_names) - if (endsWith(loom_prefix, '[')) - loom_paths <- paste0(loom_paths, ']') - if (is.null(anndata_prefix)) { - anndata_paths <- rep('', n) + if (is.list(x[[name]])) { + flattenNestedListToEnv(x[[name]], e, full_name) } else { - if (anndata_prefix == '/obsm/X_') { - ad_names <- tolower(entry_names) - } else { - ad_names <- gsub('__', '/', entry_names) - } - anndata_paths <- paste0(anndata_prefix, ad_names) + e[[full_name]] <- x[[name]] } - if (startsWith(sce_prefix, '@metadata$')) - entry_names <- gsub('__', '$', entry_names) - sce_paths <- paste0(sce_prefix, entry_names) - return(data.frame(loom_path=loom_paths, dtype=dtypes, anndata_path=anndata_paths, - sce_path=sce_paths, stringsAsFactors=FALSE)) + }) + invisible() } -readExchangeableLoom <- function(filename, backed=TRUE, main_layer_name=NULL) { - stopifnot(file.exists(filename), rhdf5::H5Fis_hdf5(filename)) - h5f <- rhdf5::H5Fopen(filename, flag='H5F_ACC_RDONLY') - if (!isExchangeableLoom(h5f)) { - rhdf5::H5Fclose(h5f) - return(LoomExperiment::import(filename, type='SingleCellLoomExperiment')) - } - dim_names <- readDimNames(h5f) - manifest <- readManifest(h5f) - rhdf5::h5closeAll() - - suppressWarnings(scle <- LoomExperiment::import( - filename, - type='SingleCellLoomExperiment', - rownames_attr=dim_names$row, - colnames_attr=dim_names$col)) - if (!backed) { - for (i in seq_along(SummarizedExperiment::assays(scle))) { - SummarizedExperiment::assays(scle)[[i]] <- as(SummarizedExperiment::assays(scle)[[i]], 'dgCMatrix') - } - } - - h5f <- rhdf5::H5Fopen(filename, flag='H5F_ACC_RDONLY') - - # Add appropriate assay name - mx_attrs <- rhdf5::h5readAttributes(h5f, '/matrix') - if (!is.null(main_layer_name)) { - names(SummarizedExperiment::assays(scle))[1] <- main_layer_name - } else if ('assay' %in% names(mx_attrs)) { - names(SummarizedExperiment::assays(scle))[1] <- mx_attrs['assay'] +makeManifest <- function(entries, dtype = "array", loom_prefix = "/attr/", anndata_prefix = "/uns/", + sce_prefix = "@metadata$") { + n <- length(entries) + if (is.list(entries)) { + entry_names <- names(entries) + is_scalar <- sapply(entries, function(x) is.vector(x) && length(x) == 1) + dtypes <- ifelse(is_scalar, "scalar", "array") + } else { + entry_names <- entries + dtypes <- rep(dtype, n) + } + loom_paths <- paste0(loom_prefix, entry_names) + if (endsWith(loom_prefix, "[")) { + loom_paths <- paste0(loom_paths, "]") + } + if (is.null(anndata_prefix)) { + anndata_paths <- rep("", n) + } else { + if (anndata_prefix == "/obsm/X_") { + ad_names <- tolower(entry_names) } else { - names(SummarizedExperiment::assays(scle))[1] <- 'counts' + ad_names <- gsub("__", "/", entry_names) } - - if (!is.null(manifest)) { - - # Graphs are already handled by import(), just record entries - is_graph <- (startsWith(manifest$loom_path, '/col_graphs/') | - startsWith(manifest$loom_path, '/row_graphs/')) - - # Handle reducedDims - is_rd <- startsWith(manifest$loom_path, '/attr/reducedDims__') - rd_paths <- manifest$loom_path[is_rd] - names(rd_paths) <- sub('^@reducedDims@listData[$]', '', manifest$sce_path[is_rd]) - SingleCellExperiment::reducedDims(scle) <- S4Vectors::SimpleList(lapply(rd_paths, function(path) { - mat <- t(rhdf5::h5read(h5f, path)) - rownames(mat) <- colnames(scle) - mat - })) - - # Handle global attributes - is_attr <- startsWith(manifest$loom_path, '/.attrs[') - src_paths <- manifest$loom_path[is_attr] - tgt_paths <- manifest$sce_path[is_attr] - attr_names <- substr(src_paths, 9, nchar(src_paths)-1) - global_attrs <- rhdf5::h5readAttributes(h5f, '/') - mtdt <- new.env(parent=emptyenv(), size=length(tgt_paths)) - for (i in seq_along(attr_names)) { - v <- global_attrs[[ attr_names[i] ]] - paths <- unlist(strsplit(sub('^@metadata[$]', '', tgt_paths[i]), '$', fixed=TRUE)) - nestedEnv(mtdt, paths, v) - scle@metadata[[ attr_names[i] ]] <- NULL - } - - # Handle global datasets - is_ds <- (!(is_graph | is_rd | is_attr) & manifest$sce_path != '') - src_paths <- manifest$loom_path[is_ds] - tgt_paths <- manifest$sce_path[is_ds] - for (i in seq_along(tgt_paths)) { - v <- rhdf5::h5read(h5f, src_paths[i]) - paths <- unlist(strsplit(sub('^@metadata[$]', '', tgt_paths[i]), '$', fixed=TRUE)) - nestedEnv(mtdt, paths, v) - } - mtdt <- nestedEnvAsList(mtdt) - for (name in names(mtdt)) { - scle@metadata[[name]] <- mtdt[[name]] - } - } - rhdf5::h5closeAll() - - return(as(scle, 'SingleCellExperiment')) + anndata_paths <- paste0(anndata_prefix, ad_names) + } + if (startsWith(sce_prefix, "@metadata$")) { + entry_names <- gsub("__", "$", entry_names) + } + sce_paths <- paste0(sce_prefix, entry_names) + return(data.frame( + loom_path = loom_paths, dtype = dtypes, anndata_path = anndata_paths, + sce_path = sce_paths, stringsAsFactors = FALSE + )) } -writeExchangeableLoom <- function(sce, filename, main_layer=NULL, return_manifest=FALSE) { - scle <- LoomExperiment::SingleCellLoomExperiment(sce) - - # Clean rowData and colData - row_fct_idx <- sapply(SummarizedExperiment::rowData(scle), is.factor) - SummarizedExperiment::rowData(scle)[row_fct_idx] <- lapply( - SummarizedExperiment::rowData(scle)[row_fct_idx], - function(x) type.convert(as.character(x), as.is=TRUE)) - col_fct_idx <- sapply(SummarizedExperiment::colData(scle), is.factor) - SummarizedExperiment::colData(scle)[col_fct_idx] <- lapply( - SummarizedExperiment::colData(scle)[col_fct_idx], - function(x) type.convert(as.character(x), as.is=TRUE)) - - # Handle reducedDims. Move embeddings out of reducedDims so they don't get - # written to unwanted location by export(). - rdims <- SingleCellExperiment::reducedDims(scle) - SingleCellExperiment::reducedDims(scle) <- S4Vectors::SimpleList() - if (!S4Vectors::isEmpty(rdims)) { - rdim_manifest <- makeManifest( - names(rdims), - dtype='array', - loom_prefix='/attr/reducedDims__', - anndata_prefix='/obsm/X_', - sce_prefix='@reducedDims@listData$') - } else { - rdim_manifest <- NULL - } - - # Handle graphs. They get written by export() but we still need to record the paths. - if (!S4Vectors::isEmpty(LoomExperiment::colGraphs(scle))) { - colgraph_manifest <- makeManifest( - names(LoomExperiment::colGraphs(scle)), - dtype='graph', - loom_prefix='/col_graphs/', - anndata_prefix='/uns/', - sce_prefix='@colGraphs$') - } else { - colgraph_manifest <- NULL +readExchangeableLoom <- function(filename, backed = TRUE, main_layer_name = NULL) { + stopifnot(file.exists(filename), rhdf5::H5Fis_hdf5(filename)) + h5f <- rhdf5::H5Fopen(filename, flag = "H5F_ACC_RDONLY") + if (!isExchangeableLoom(h5f)) { + rhdf5::H5Fclose(h5f) + return(LoomExperiment::import(filename, type = "SingleCellLoomExperiment")) + } + dim_names <- readDimNames(h5f) + manifest <- readManifest(h5f) + rhdf5::h5closeAll() + + suppressWarnings(scle <- LoomExperiment::import( + filename, + type = "SingleCellLoomExperiment", + rownames_attr = dim_names$row, + colnames_attr = dim_names$col + )) + if (!backed) { + for (i in seq_along(SummarizedExperiment::assays(scle))) { + SummarizedExperiment::assays(scle)[[i]] <- as(SummarizedExperiment::assays(scle)[[i]], "dgCMatrix") } - if (!S4Vectors::isEmpty(LoomExperiment::rowGraphs(scle))) { - rowgraph_manifest <- makeManifest( - names(LoomExperiment::rowGraphs(scle)), - dtype='graph', - loom_prefix='/row_graphs/', - anndata_prefix=NULL, - sce_prefix='@rowGraphs$') - } else { - rowgraph_manifest <- NULL + } + + h5f <- rhdf5::H5Fopen(filename, flag = "H5F_ACC_RDONLY") + + # Add appropriate assay name + mx_attrs <- rhdf5::h5readAttributes(h5f, "/matrix") + if (!is.null(main_layer_name)) { + names(SummarizedExperiment::assays(scle))[1] <- main_layer_name + } else if ("assay" %in% names(mx_attrs)) { + names(SummarizedExperiment::assays(scle))[1] <- mx_attrs["assay"] + } else { + names(SummarizedExperiment::assays(scle))[1] <- "counts" + } + + if (!is.null(manifest)) { + + # Graphs are already handled by import(), just record entries + is_graph <- (startsWith(manifest$loom_path, "/col_graphs/") | + startsWith(manifest$loom_path, "/row_graphs/")) + + # Handle reducedDims + is_rd <- startsWith(manifest$loom_path, "/attr/reducedDims__") + rd_paths <- manifest$loom_path[is_rd] + names(rd_paths) <- sub("^@reducedDims@listData[$]", "", manifest$sce_path[is_rd]) + SingleCellExperiment::reducedDims(scle) <- S4Vectors::SimpleList(lapply(rd_paths, function(path) { + mat <- t(rhdf5::h5read(h5f, path)) + rownames(mat) <- colnames(scle) + mat + })) + + # Handle global attributes + is_attr <- startsWith(manifest$loom_path, "/.attrs[") + src_paths <- manifest$loom_path[is_attr] + tgt_paths <- manifest$sce_path[is_attr] + attr_names <- substr(src_paths, 9, nchar(src_paths) - 1) + global_attrs <- rhdf5::h5readAttributes(h5f, "/") + mtdt <- new.env(parent = emptyenv(), size = length(tgt_paths)) + for (i in seq_along(attr_names)) { + v <- global_attrs[[attr_names[i]]] + paths <- unlist(strsplit(sub("^@metadata[$]", "", tgt_paths[i]), "$", fixed = TRUE)) + nestedEnv(mtdt, paths, v) + scle@metadata[[attr_names[i]]] <- NULL } - # Handle metadata. Flatten nested lists to make export() happy. Scalars go - # to /.attrs, others go to /attr - if (length(S4Vectors::metadata(scle)) > 0) { - mtdt <- new.env(parent=emptyenv()) - flattenNestedListToEnv(scle@metadata, mtdt) - mtdt <- lapply(mtdt, function(x) { - if (!is.numeric(x) && !is.character(x) && !is.logical(x)) - x <- type.convert(as.character(x), as.is=TRUE) - if ('class' %in% names(attributes(x))) - attributes(x)$class <- NULL - x - }) - is_attr <- rep(FALSE, length(mtdt)) - names(is_attr) <- names(mtdt) - for (i in seq_along(mtdt)) { - x <- mtdt[[i]] - if ((is.vector(x) || is.array(x)) && (length(x) == 1)) { - is_attr[i] <- TRUE - mtdt[[i]] <- x[1] - } - } - is_attr <- is_attr & !grepl('__', names(mtdt)) - - # Let export handle attributes - S4Vectors::metadata(scle) <- mtdt[is_attr] - - excluded_from_manifest <- c('LOOM_SPEC_VERSION', 'CreationDate', 'last_modified', - 'CreatedWith', 'LoomExperiment-class', 'created_from', - 'last_modified_by') - attr_names <- names(S4Vectors::metadata(scle)) - attr_names <- attr_names[!attr_names %in% excluded_from_manifest] - if (length(attr_names) > 0) { - attr_manifest <- makeManifest( - attr_names, - dtype='scalar', - loom_prefix='/.attrs[', - anndata_prefix='/uns/', - sce_prefix='@metadata$') - } else { - attr_manifest <- NULL - } - - datasets <- mtdt[!is_attr] - if (length(datasets) > 0) { - dts_manifest <- makeManifest( - datasets, - dtype=NULL, - loom_prefix='/attr/', - anndata_prefix='/uns/', - sce_prefix='@metadata$') - } else { - dts_manifest <- NULL - } - } else { - attr_manifest <- NULL - dts_manifest <- NULL - datasets <- list() + # Handle global datasets + is_ds <- (!(is_graph | is_rd | is_attr) & manifest$sce_path != "") + src_paths <- manifest$loom_path[is_ds] + tgt_paths <- manifest$sce_path[is_ds] + for (i in seq_along(tgt_paths)) { + v <- rhdf5::h5read(h5f, src_paths[i]) + paths <- unlist(strsplit(sub("^@metadata[$]", "", tgt_paths[i]), "$", fixed = TRUE)) + nestedEnv(mtdt, paths, v) } - - manifest <- rbind(attr_manifest, dts_manifest, rdim_manifest, colgraph_manifest, rowgraph_manifest) - - # LoomExperiment currently handles sparse matrix incorrectly, so convert to dense for now - for (assay_name in SummarizedExperiment::assayNames(scle)) { - if (class(SummarizedExperiment::assays(scle)[[assay_name]]) == 'dgCMatrix') - SummarizedExperiment::assays(scle)[[assay_name]] <- as.matrix(SummarizedExperiment::assays(scle)[[assay_name]]) - } - # Write to loom by LoomExperiment::export - if (file.exists(filename)) file.remove(filename) - suppressWarnings(LoomExperiment::export( - scle, - filename, - matrix=ifelse(!is.null(main_layer) && main_layer %in% SummarizedExperiment::assayNames(scle), - main_layer, SummarizedExperiment::assayNames(scle)[1]), - colnames_attr='obs_names', - rownames_attr='var_names')) - rhdf5::h5closeAll() - - # Write extra bits - h5f <- rhdf5::H5Fopen(filename) - - # Write extra global attributes - rhdf5::h5writeAttribute(exchangeable_loom_version, h5f, 'LOOM_SPEC_VERSION') - rhdf5::h5writeAttribute('sce', h5f, 'created_from') - - # Write column names of 'CellID' and 'Gene' as attributes of '/col_attrs' and '/row_attrs' - h5g_ca <- rhdf5::H5Gopen(h5f, '/col_attrs') - rhdf5::h5writeAttribute('obs_names', h5g_ca, 'CellID') - h5g_ra <- rhdf5::H5Gopen(h5f, '/row_attrs') - rhdf5::h5writeAttribute('var_names', h5g_ra, 'Gene') - - # Write primary asssay name as attribute of '/matrix' - h5d_mx <- rhdf5::H5Dopen(h5f, '/matrix') - rhdf5::h5writeAttribute('assay', h5d_mx, names(SummarizedExperiment::assays(scle))[1]) - - # Write manifest - rhdf5::h5createGroup(h5f, '/attr') - if (!is.null(manifest)) { - manifest <- manifest[order(manifest$dtype, manifest$loom_path), ] - rhdf5::h5write(t(manifest), h5f, '/attr/manifest') + mtdt <- nestedEnvAsList(mtdt) + for (name in names(mtdt)) { + scle@metadata[[name]] <- mtdt[[name]] } + } + rhdf5::h5closeAll() - # Write reducedDims - for (i in seq_along(rdims)) { - rdim <- rdims[[i]] - loom_path <- rdim_manifest$loom_path[i] - rhdf5::h5write(t(rdim), h5f, loom_path) - } + return(as(scle, "SingleCellExperiment")) +} - # Write extra global datasets - for (i in seq_along(datasets)) { - dts <- datasets[[i]] - loom_path <- dts_manifest$loom_path[i] - rhdf5::h5write(dts, h5f, loom_path) +writeExchangeableLoom <- function(sce, filename, main_layer = NULL, return_manifest = FALSE) { + scle <- LoomExperiment::SingleCellLoomExperiment(sce) + + # Clean rowData and colData + row_fct_idx <- sapply(SummarizedExperiment::rowData(scle), is.factor) + SummarizedExperiment::rowData(scle)[row_fct_idx] <- lapply( + SummarizedExperiment::rowData(scle)[row_fct_idx], + function(x) type.convert(as.character(x), as.is = TRUE) + ) + col_fct_idx <- sapply(SummarizedExperiment::colData(scle), is.factor) + SummarizedExperiment::colData(scle)[col_fct_idx] <- lapply( + SummarizedExperiment::colData(scle)[col_fct_idx], + function(x) type.convert(as.character(x), as.is = TRUE) + ) + + # Handle reducedDims. Move embeddings out of reducedDims so they don't get + # written to unwanted location by export(). + rdims <- SingleCellExperiment::reducedDims(scle) + SingleCellExperiment::reducedDims(scle) <- S4Vectors::SimpleList() + if (!S4Vectors::isEmpty(rdims)) { + rdim_manifest <- makeManifest( + names(rdims), + dtype = "array", + loom_prefix = "/attr/reducedDims__", + anndata_prefix = "/obsm/X_", + sce_prefix = "@reducedDims@listData$" + ) + } else { + rdim_manifest <- NULL + } + + # Handle graphs. They get written by export() but we still need to record the paths. + if (!S4Vectors::isEmpty(LoomExperiment::colGraphs(scle))) { + colgraph_manifest <- makeManifest( + names(LoomExperiment::colGraphs(scle)), + dtype = "graph", + loom_prefix = "/col_graphs/", + anndata_prefix = "/uns/", + sce_prefix = "@colGraphs$" + ) + } else { + colgraph_manifest <- NULL + } + if (!S4Vectors::isEmpty(LoomExperiment::rowGraphs(scle))) { + rowgraph_manifest <- makeManifest( + names(LoomExperiment::rowGraphs(scle)), + dtype = "graph", + loom_prefix = "/row_graphs/", + anndata_prefix = NULL, + sce_prefix = "@rowGraphs$" + ) + } else { + rowgraph_manifest <- NULL + } + + # Handle metadata. Flatten nested lists to make export() happy. Scalars go + # to /.attrs, others go to /attr + if (length(S4Vectors::metadata(scle)) > 0) { + mtdt <- new.env(parent = emptyenv()) + flattenNestedListToEnv(scle@metadata, mtdt) + mtdt <- lapply(mtdt, function(x) { + if (!is.numeric(x) && !is.character(x) && !is.logical(x)) { + x <- type.convert(as.character(x), as.is = TRUE) + } + if ("class" %in% names(attributes(x))) { + attributes(x)$class <- NULL + } + x + }) + is_attr <- rep(FALSE, length(mtdt)) + names(is_attr) <- names(mtdt) + for (i in seq_along(mtdt)) { + x <- mtdt[[i]] + if ((is.vector(x) || is.array(x)) && (length(x) == 1)) { + is_attr[i] <- TRUE + mtdt[[i]] <- x[1] + } + } + is_attr <- is_attr & !grepl("__", names(mtdt)) + + # Let export handle attributes + S4Vectors::metadata(scle) <- mtdt[is_attr] + + excluded_from_manifest <- c( + "LOOM_SPEC_VERSION", "CreationDate", "last_modified", + "CreatedWith", "LoomExperiment-class", "created_from", + "last_modified_by" + ) + attr_names <- names(S4Vectors::metadata(scle)) + attr_names <- attr_names[!attr_names %in% excluded_from_manifest] + if (length(attr_names) > 0) { + attr_manifest <- makeManifest( + attr_names, + dtype = "scalar", + loom_prefix = "/.attrs[", + anndata_prefix = "/uns/", + sce_prefix = "@metadata$" + ) + } else { + attr_manifest <- NULL } - # Remove '/col_attrs/reducedDims' to make anndata happy - rhdf5::h5delete(h5f, '/col_attrs/reducedDims') - rhdf5::h5closeAll() - - if (return_manifest) { - return(manifest) + datasets <- mtdt[!is_attr] + if (length(datasets) > 0) { + dts_manifest <- makeManifest( + datasets, + dtype = NULL, + loom_prefix = "/attr/", + anndata_prefix = "/uns/", + sce_prefix = "@metadata$" + ) } else { - invisible() + dts_manifest <- NULL + } + } else { + attr_manifest <- NULL + dts_manifest <- NULL + datasets <- list() + } + + manifest <- rbind(attr_manifest, dts_manifest, rdim_manifest, colgraph_manifest, rowgraph_manifest) + + # LoomExperiment currently handles sparse matrix incorrectly, so convert to dense for now + for (assay_name in SummarizedExperiment::assayNames(scle)) { + if (class(SummarizedExperiment::assays(scle)[[assay_name]]) == "dgCMatrix") { + SummarizedExperiment::assays(scle)[[assay_name]] <- as.matrix(SummarizedExperiment::assays(scle)[[assay_name]]) } + } + # Write to loom by LoomExperiment::export + if (file.exists(filename)) file.remove(filename) + suppressWarnings(LoomExperiment::export( + scle, + filename, + matrix = ifelse(!is.null(main_layer) && main_layer %in% SummarizedExperiment::assayNames(scle), + main_layer, SummarizedExperiment::assayNames(scle)[1] + ), + colnames_attr = "obs_names", + rownames_attr = "var_names" + )) + rhdf5::h5closeAll() + + # Write extra bits + h5f <- rhdf5::H5Fopen(filename) + + # Write extra global attributes + rhdf5::h5writeAttribute(exchangeable_loom_version, h5f, "LOOM_SPEC_VERSION") + rhdf5::h5writeAttribute("sce", h5f, "created_from") + + # Write column names of 'CellID' and 'Gene' as attributes of '/col_attrs' and '/row_attrs' + h5g_ca <- rhdf5::H5Gopen(h5f, "/col_attrs") + rhdf5::h5writeAttribute("obs_names", h5g_ca, "CellID") + h5g_ra <- rhdf5::H5Gopen(h5f, "/row_attrs") + rhdf5::h5writeAttribute("var_names", h5g_ra, "Gene") + + # Write primary asssay name as attribute of '/matrix' + h5d_mx <- rhdf5::H5Dopen(h5f, "/matrix") + rhdf5::h5writeAttribute("assay", h5d_mx, names(SummarizedExperiment::assays(scle))[1]) + + # Write manifest + rhdf5::h5createGroup(h5f, "/attr") + if (!is.null(manifest)) { + manifest <- manifest[order(manifest$dtype, manifest$loom_path), ] + rhdf5::h5write(t(manifest), h5f, "/attr/manifest") + } + + # Write reducedDims + for (i in seq_along(rdims)) { + rdim <- rdims[[i]] + loom_path <- rdim_manifest$loom_path[i] + rhdf5::h5write(t(rdim), h5f, loom_path) + } + + # Write extra global datasets + for (i in seq_along(datasets)) { + dts <- datasets[[i]] + loom_path <- dts_manifest$loom_path[i] + rhdf5::h5write(dts, h5f, loom_path) + } + + # Remove '/col_attrs/reducedDims' to make anndata happy + rhdf5::h5delete(h5f, "/col_attrs/reducedDims") + rhdf5::h5closeAll() + + if (return_manifest) { + return(manifest) + } else { + invisible() + } } - diff --git a/R/methods.R b/R/methods.R index 4c87647..dce63a6 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,15 +1,30 @@ -convertFormat <- function( - obj, from = c('anndata', 'seurat', 'sce', 'loom'), to = c('anndata', 'loom', 'sce', 'seurat'), outFile = NULL, - main_layer = NULL, ... -) { - from <- match.arg(from) - to <- match.arg(to) +#' Convert between data objects +#' +#' This function converts between data format frequently used to hold single +#' cell data +#' +#' @param obj Input Seurat object +#' @param from Format of input object, e.g. "anndata", "seurat", "sce", "loom", +#' etc (str) +#' @param to Format of output object (str) +#' @param main_layer Required by some formats, may be "counts", "data", +#' "scale.data", etc (str) +#' +#' @return Output object +convertFormat <- function(obj, from = c("anndata", "seurat", "sce", "loom"), to = c("anndata", "loom", "sce", "seurat"), outFile = NULL, + main_layer = NULL, ...) { + from <- match.arg(from) + to <- match.arg(to) - tryCatch({ - func <- eval(parse(text = paste(from, to, sep='2'))) - }, error = function(e) { - stop(paste0('Unsupported conversion from "', from, '" to "', to, '"'), call. = FALSE) - }, finally = {}) + tryCatch( + { + func <- eval(parse(text = paste(from, to, sep = "2"))) + }, + error = function(e) { + stop(paste0('Unsupported conversion from "', from, '" to "', to, '"'), call. = FALSE) + }, + finally = {} + ) - return(func(obj, outFile = outFile, main_layer = main_layer, ...)) + return(func(obj, outFile = outFile, main_layer = main_layer, ...)) }