Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare Monocle 3.0 for Cell Ranger 3.0 #267

Open
wants to merge 3 commits into
base: monocle3_alpha
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ export(landmark_selection)
export(learnGraph)
export(load_HSMM)
export(load_HSMM_markers)
export(load_cellranger_data)
export(load_lung)
export(louvain_R)
export(markerDiffTable)
Expand Down Expand Up @@ -128,6 +129,7 @@ importFrom(L1Graph,get_knn)
importFrom(L1Graph,get_mst_with_shortcuts)
importFrom(L1Graph,principal_graph)
importFrom(Matrix,as.matrix)
importFrom(Matrix,readMM)
importFrom(RANN,nn2)
importFrom(Rcpp,compileAttributes)
importFrom(VGAM,binomialff)
Expand Down
135 changes: 135 additions & 0 deletions R/data_io.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#' Get a genome from Cell Ranger output
#'
#' @param matrix_path Path to a matrices directory produced by the Cell Ranger pipeline
#' @param genome Genome to specifically check for, otherwise will check for whatever genome(s) exist there
#' @return A string representing the genome found
get_genome_in_matrix_path <- function(matrix_path, genome=NULL) {
genomes <- dir(matrix_path)
if (is.null(genome)) {
if (length(genomes) == 1) {
genome <- genomes[1]
} else {
stop(sprintf("Multiple genomes found; please specify one. \n Genomes present: %s",paste(genomes, collapse=", ")))
}
} else if (!(genome %in% genomes)) {
stop(sprintf("Could not find specified genome: '%s'. Genomes present: %s",
genome,paste(genomes, collapse=", ")))
}
return(genome)
}

#' Load data from the 10x Genomics Cell Ranger pipeline
#'
#' Loads cellranger data into a CellDataSet object. Note that if your dataset
#' is from version 3.0 and contains non-Gene-Expression data (e.g. Antibodies or
#' CRISPR features), only the Gene Expression data is returned.
#'
#' @param pipestance_path Path to the output directory produced by Cell Ranger
#' @param genome The desired genome (e.g., 'hg19' or 'mm10')
#' @param barcode_filtered Load only the cell-containing barcodes
#' @param lowerDetectionLimit the minimum expression level that consistitutes true expression (passed to newCellDataSet)
#' @param expressionFamily the VGAM family function to be used for expression response variables (passed to newCellDataSet)
#' @return a new CellDataSet object
#' @export
#' @importFrom Matrix readMM
#' @examples
#' \dontrun{
#' # Load from a Cell Ranger output directory
#' gene_bc_matrix <- load_cellranger_data("/home/user/cellranger_output")
#' }
load_cellranger_data <- function(pipestance_path=NULL, genome=NULL, barcode_filtered=TRUE, lowerDetectionLimit=0.5, expressionFamily=negbinomial.size()) {
# check for correct directory structure
if (!dir.exists(pipestance_path))
stop("Could not find the pipestance path: '", pipestance_path,"'. Please double-check if the directory exists.\n")
od = file.path(pipestance_path, "outs")
if (!dir.exists(od))
stop("Could not find the pipestance output directory: '", file.path(pipestance_path,'outs'),"'. Please double-check if the directory exists.\n")

v3p = file.path(od, "filtered_feature_bc_matrix")
v2p = file.path(od, "filtered_gene_bc_matrices")
v3d = dir.exists(v3p)
if(barcode_filtered) {
matrix_dir = ifelse(v3d, v3p, v2p)
} else {
matrix_dir = ifelse(v3d, file.path(od, "raw_feature_bc_matrix"), file.path(od, "raw_gene_bc_matrices"))
}
if(!dir.exists(matrix_dir))
stop("Could not find directory: ", matrix_dir)

if(v3d) {
features.loc <- file.path(matrix_dir, "features.tsv.gz")
barcode.loc <- file.path(matrix_dir, "barcodes.tsv.gz")
matrix.loc <- file.path(matrix_dir, "matrix.mtx.gz")
summary.loc <- file.path(od, "metrics_summary_csv.csv")
} else {
genome = get_genome_in_matrix_path(matrix_dir, genome)
barcode.loc <- file.path(matrix_dir, genome, "barcodes.tsv")
features.loc <- file.path(matrix_dir, genome, "genes.tsv")
matrix.loc <- file.path(matrix_dir, genome, "matrix.mtx")
summary.loc <- file.path(od, "metrics_summary.csv")
}
if (!file.exists(barcode.loc)){
stop("Barcode file missing")
}
if (!file.exists(features.loc)){
stop("Gene name or features file missing")
}
if (!file.exists(matrix.loc)){
stop("Expression matrix file missing")
}
# Not importing for now.
#if(!file.exists(summary.loc)) {
# stop("Metrics summary file missing")
#}
data <- readMM(matrix.loc)

feature.names = read.delim(features.loc,
header = FALSE,
stringsAsFactors = FALSE)
feature.names$V1 = make.unique(feature.names$V1) # Duplicate row names not allowed
if(dim(data)[1] != length(feature.names[,1])) {
stop(sprintf("Mismatch dimension between gene file: \n\t %s\n and matrix file: \n\t %s\n",
features.loc,
matrix.loc))
}
if(v3d) {
# We will only load GEX data for the relevant genome
data_types = factor(feature.names$V3)
allowed = data_types == "Gene Expression"
if(!is.null(genome)) {
# If not multigenome, no prefix will be added and we won't filter out the one genome
gfilter = grepl(genome, feature.names$V1)
if(any(gfilter)) {
allowed = allowed & grepl(genome, feature.names$V1)
} else {
message("Data does not appear to be from a multi-genome sample, simply returning all gene feature data without filtering by genome.")
}

}
data = data[allowed, ]
feature.names = feature.names[allowed,1:2]
}
colnames(feature.names) = c("id", "gene_short_name")
rownames(data) = feature.names[,"id"]
rownames(feature.names) = feature.names[,"id"]

barcodes <- read.delim(barcode.loc, stringsAsFactors=FALSE, header=FALSE)
if (dim(data)[2] != length(barcodes[,1])) {
stop(sprintf("Mismatch dimension between barcode file: \n\t %s\n and matrix file: \n\t %s\n", barcode.loc,matrix.loc))
}
barcodes$V1 = make.unique(barcodes$V1)
colnames(data) = barcodes[,1]
pd = data.frame(barcode=barcodes[,1], row.names=barcodes[,1])
#The expression value matrix \emph{must} have the same number of columns as the \Robject{phenoData} has rows,
#and it must have the same number of rows as the \Robject{featureData} data frame has rows.
# Row names of the \Robject{phenoData} object should match the column names of the expression matrix. Row names of
# the \Robject{featureData} object should match row names of the expression matrix. Also, one of the columns of the
#\Robject{featureData} must be named "gene\_short\_name".

gbm <- newCellDataSet(data,
phenoData = new("AnnotatedDataFrame", pd),
featureData = new("AnnotatedDataFrame", feature.names),
lowerDetectionLimit=lowerDetectionLimit,
expressionFamily=expressionFamily)
return(gbm)
}
9 changes: 2 additions & 7 deletions actual_vignette_holder/monocle-vignette-original.Rnw
Original file line number Diff line number Diff line change
Expand Up @@ -441,17 +441,12 @@ To work with your data in a sparse format, simply provide it to Monocle as a spa
expressionFamily=negbinomial.size())
@ %def

\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX). If so, you should just pass it directly to newCellDataSet without first converting it to a dense matrix (via \Rfunction(as.matrix)), because that may exceed your available memeory. If you have 10X Genomics data and are using \Rpackage{cellrangerRkit}, you can use it to load your data and then pass that to Monocle as follows:
\textbf{NOTE:} If you have 10X Genomics data, you can load it directly for Monocle as follows:

<<chromium_load, eval=FALSE>>=
cellranger_pipestance_path <- "/path/to/your/pipeline/output/directory"
gbm <- load_cellranger_matrix(cellranger_pipestance_path)
gbm_cds <- load_cellranger_data(cellranger_pipestance_path)

gbm_cds <- newCellDataSet(exprs(gbm),
phenoData = new("AnnotatedDataFrame", pData(gbm)),
featureData = new("AnnotatedDataFrame", fData(gbm)),
lowerDetectionLimit=0.5,
expressionFamily=negbinomial.size())
@ %de

Monocle's sparse matrix support is provided by the \Rpackage{Matrix} package. Other sparse matrix packages, such as \Rpackage{slam} or \Rpackage{SparseM} are not supported.
Expand Down
10 changes: 2 additions & 8 deletions actual_vignette_holder/monocle-vignette-original.tex
Original file line number Diff line number Diff line change
Expand Up @@ -588,19 +588,13 @@ \subsection{Choosing a distribution for expression data}
\end{kframe}
\end{knitrout}

\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX). If so, you should just pass it directly to newCellDataSet without first converting it to a dense matrix (via \Rfunction(as.matrix)), because that may exceed your available memeory. If you have 10X Genomics data and are using \Rpackage{cellrangerRkit}, you can use it to load your data and then pass that to Monocle as follows:
\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX). You can load this data into a \Rfunction{newCellDataSet} using the function \Rfunction(load_cellranger_data) as follows:

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{cellranger_pipestance_path} \hlkwb{<-} \hlstr{"/path/to/your/pipeline/output/directory"}
\hlstd{gbm} \hlkwb{<-} \hlkwd{load_cellranger_matrix}\hlstd{(cellranger_pipestance_path)}

\hlstd{gbm_cds} \hlkwb{<-} \hlkwd{newCellDataSet}\hlstd{(}\hlkwd{exprs}\hlstd{(gbm),}
\hlkwc{phenoData} \hlstd{=} \hlkwd{new}\hlstd{(}\hlstr{"AnnotatedDataFrame"}\hlstd{,} \hlkwd{pData}\hlstd{(gbm)),}
\hlkwc{featureData} \hlstd{=} \hlkwd{new}\hlstd{(}\hlstr{"AnnotatedDataFrame"}\hlstd{,} \hlkwd{fData}\hlstd{(gbm)),}
\hlkwc{lowerDetectionLimit}\hlstd{=}\hlnum{0.5}\hlstd{,}
\hlkwc{expressionFamily}\hlstd{=}\hlkwd{negbinomial.size}\hlstd{())}
\hlstd{gbm} \hlkwb{<-} \hlkwd{load_cellranger_data}\hlstd{(cellranger_pipestance_path)}
\end{alltt}
\end{kframe}
\end{knitrout}
Expand Down
19 changes: 19 additions & 0 deletions man/get_genome_in_matrix_path.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 35 additions & 0 deletions man/load_cellranger_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
ATGCCAGAACGACT-1
CATGGCCTGTGCAT-1
GAACCTGATGAACC-1
TGACTGGATTCTCA-1
AGTCAGACTGCACA-1
TCTGATACACGTGT-1
TGGTATCTAAACAG-1
GCAGCTCTGTTTCT-1
GATATAACACGCAT-1
AATGTTGACAGTCA-1
AGGTCATGAGTGTC-1
AGAGATGATCTCGC-1
GGGTAACTCTAGTG-1
CATGAGACACGGGA-1
TACGCCACTCCGAA-1
CTAAACCTGTGCAT-1
GTAAGCACTCATTC-1
TTGGTACTGAATCC-1
CATCATACGGAGCA-1
TACATCACGCTAAC-1
TTACCATGAATCGC-1
ATAGGAGAAACAGA-1
GCGCACGACTTTAC-1
ACTCGCACGAAAGT-1
ATTACCTGCCTTAT-1
CCCAACTGCAATCG-1
AAATTCGAATCACG-1
CCATCCGATTCGCC-1
TCCACTCTGAGCTT-1
CATCAGGATGCACA-1
CTAAACCTCTGACA-1
GATAGAGAAGGGTG-1
CTAACGGAACCGAT-1
AGATATACCCGTAA-1
TACTCTGAATCGAC-1
GCGCATCTTGCTCC-1
GTTGACGATATCGG-1
ACAGGTACTGGTGT-1
GGCATATGCTTATC-1
CATTACACCAACTG-1
TAGGGACTGAACTC-1
GCTCCATGAGAAGT-1
TACAATGATGCTAG-1
CTTCATGACCGAAT-1
CTGCCAACAGGAGC-1
TTGCATTGAGCTAC-1
AAGCAAGAGCTTAG-1
CGGCACGAACTCAG-1
GGTGGAGATTACTC-1
GGCCGATGTACTCT-1
CGTAGCCTGTATGC-1
TGAGCTGAATGCTG-1
CCTATAACGAGACG-1
ATAAGTTGGTACGT-1
AAGCGACTTTGACG-1
ACCAGTGAATACCG-1
ATTGCACTTGCTTT-1
CTAGGTGATGGTTG-1
GCACTAGACCTTTA-1
CATGCGCTAGTCAC-1
TTGAGGACTACGCA-1
ATACCACTCTAAGC-1
CATATAGACTAAGC-1
TTTAGCTGTACTCT-1
GACATTCTCCACCT-1
ACGTGATGCCATGA-1
ATTGTAGATTCCCG-1
GATAGAGATCACGA-1
AATGCGTGGACGGA-1
GCGTAAACACGGTT-1
ATTCAGCTCATTGG-1
GGCATATGGGGAGT-1
ATCATCTGACACCA-1
GTCATACTTCGCCT-1
TTACGTACGTTCAG-1
GAGTTGTGGTAGCT-1
GACGCTCTCTCTCG-1
AGTCTTACTTCGGA-1
GGAACACTTCAGAC-1
CTTGATTGATCTTC-1
Loading