cole-trapnell-lab · evolvedmicrobe · Dec 19, 2018 · Feb 26, 2019 · Feb 26, 2019
diff --git a/NAMESPACE b/NAMESPACE
@@ -37,6 +37,7 @@ export(landmark_selection)
 export(learnGraph)
 export(load_HSMM)
 export(load_HSMM_markers)
+export(load_cellranger_data)
 export(load_lung)
 export(louvain_R)
 export(markerDiffTable)
@@ -128,6 +129,7 @@ importFrom(L1Graph,get_knn)
 importFrom(L1Graph,get_mst_with_shortcuts)
 importFrom(L1Graph,principal_graph)
 importFrom(Matrix,as.matrix)
+importFrom(Matrix,readMM)
 importFrom(RANN,nn2)
 importFrom(Rcpp,compileAttributes)
 importFrom(VGAM,binomialff)

diff --git a/R/data_io.R b/R/data_io.R
@@ -0,0 +1,135 @@
+#' Get a genome from Cell Ranger output
+#'
+#' @param matrix_path Path to a matrices directory produced by the Cell Ranger pipeline
+#' @param genome Genome to specifically check for, otherwise will check for whatever genome(s) exist there
+#' @return A string representing the genome found
+get_genome_in_matrix_path <- function(matrix_path, genome=NULL) {
+  genomes <- dir(matrix_path)
+  if (is.null(genome)) {
+    if (length(genomes) == 1) {
+      genome <- genomes[1]
+    } else {
+      stop(sprintf("Multiple genomes found; please specify one. \n Genomes present: %s",paste(genomes, collapse=", ")))
+    }
+  } else if (!(genome %in% genomes)) {
+    stop(sprintf("Could not find specified genome: '%s'. Genomes present: %s",
+                 genome,paste(genomes, collapse=", ")))
+  }
+  return(genome)
+}
+
+#' Load data from the 10x Genomics Cell Ranger pipeline
+#' 
+#' Loads cellranger data into a CellDataSet object.  Note that if your dataset
+#' is from version 3.0 and contains non-Gene-Expression data (e.g. Antibodies or
+#' CRISPR features), only the Gene Expression data is returned.
+#'
+#' @param pipestance_path Path to the output directory produced by Cell Ranger
+#' @param genome The desired genome (e.g., 'hg19' or 'mm10')
+#' @param barcode_filtered Load only the cell-containing barcodes
+#' @param lowerDetectionLimit the minimum expression level that consistitutes true expression (passed to newCellDataSet)
+#' @param expressionFamily the VGAM family function to be used for expression response variables (passed to newCellDataSet)
+#' @return a new CellDataSet object
+#' @export
+#' @importFrom Matrix readMM
+#' @examples
+#' \dontrun{
+#' # Load from a Cell Ranger output directory
+#' gene_bc_matrix <- load_cellranger_data("/home/user/cellranger_output")
+#' }
+load_cellranger_data <- function(pipestance_path=NULL, genome=NULL, barcode_filtered=TRUE, lowerDetectionLimit=0.5, expressionFamily=negbinomial.size()) {
+  # check for correct directory structure
+  if (!dir.exists(pipestance_path))
+    stop("Could not find the pipestance path: '", pipestance_path,"'. Please double-check if the directory exists.\n")
+  od = file.path(pipestance_path, "outs")
+  if (!dir.exists(od))
+    stop("Could not find the pipestance output directory: '", file.path(pipestance_path,'outs'),"'. Please double-check if the directory exists.\n")
+
+  v3p = file.path(od, "filtered_feature_bc_matrix")
+  v2p = file.path(od, "filtered_gene_bc_matrices")
+  v3d = dir.exists(v3p)
+  if(barcode_filtered) {
+    matrix_dir = ifelse(v3d, v3p, v2p)
+  } else {
+    matrix_dir = ifelse(v3d, file.path(od, "raw_feature_bc_matrix"), file.path(od, "raw_gene_bc_matrices"))
+  }
+  if(!dir.exists(matrix_dir))
+    stop("Could not find directory: ", matrix_dir)
+
+  if(v3d) {
+    features.loc <- file.path(matrix_dir, "features.tsv.gz")
+    barcode.loc <- file.path(matrix_dir, "barcodes.tsv.gz")
+    matrix.loc <- file.path(matrix_dir, "matrix.mtx.gz")
+    summary.loc <- file.path(od, "metrics_summary_csv.csv")
+  } else {
+    genome = get_genome_in_matrix_path(matrix_dir, genome)
+    barcode.loc <- file.path(matrix_dir, genome, "barcodes.tsv")
+    features.loc <- file.path(matrix_dir, genome, "genes.tsv")
+    matrix.loc <- file.path(matrix_dir, genome, "matrix.mtx")
+    summary.loc <- file.path(od, "metrics_summary.csv")
+  }
+  if (!file.exists(barcode.loc)){
+    stop("Barcode file missing")
+  }
+  if (!file.exists(features.loc)){
+    stop("Gene name or features file missing")
+  }
+  if (!file.exists(matrix.loc)){
+    stop("Expression matrix file missing")
+  }
+  # Not importing for now.
+  #if(!file.exists(summary.loc)) {
+  #  stop("Metrics summary file missing")
+  #}
+  data <- readMM(matrix.loc)
+
+  feature.names = read.delim(features.loc, 
+                             header = FALSE,
+                             stringsAsFactors = FALSE)
+  feature.names$V1 = make.unique(feature.names$V1) # Duplicate row names not allowed
+  if(dim(data)[1] != length(feature.names[,1])) {
+    stop(sprintf("Mismatch dimension between gene file: \n\t %s\n and matrix file: \n\t %s\n",
+                 features.loc,
+                 matrix.loc))
+  }
+  if(v3d) {
+    # We will only load GEX data for the relevant genome
+    data_types = factor(feature.names$V3)
+    allowed = data_types == "Gene Expression"
+    if(!is.null(genome)) {
+      # If not multigenome, no prefix will be added and we won't filter out the one genome
+      gfilter = grepl(genome, feature.names$V1)
+      if(any(gfilter)) {
+        allowed = allowed & grepl(genome, feature.names$V1)
+      } else {
+        message("Data does not appear to be from a multi-genome sample, simply returning all gene feature data without filtering by genome.")
+      }
+
+    }
+    data = data[allowed, ]
+    feature.names = feature.names[allowed,1:2]
+  }
+  colnames(feature.names) = c("id", "gene_short_name")
+  rownames(data) = feature.names[,"id"]
+  rownames(feature.names) = feature.names[,"id"]
+
+  barcodes <- read.delim(barcode.loc, stringsAsFactors=FALSE, header=FALSE)
+  if (dim(data)[2] != length(barcodes[,1])) {
+    stop(sprintf("Mismatch dimension between barcode file: \n\t %s\n and matrix file: \n\t %s\n", barcode.loc,matrix.loc))
+  }
+  barcodes$V1 = make.unique(barcodes$V1)
+  colnames(data) = barcodes[,1]
+  pd = data.frame(barcode=barcodes[,1], row.names=barcodes[,1])
+  #The expression value matrix \emph{must} have the same number of columns as the \Robject{phenoData} has rows, 
+  #and it must have the same number of rows as the \Robject{featureData} data frame has rows. 
+  # Row names of the \Robject{phenoData} object should match the column names of the expression matrix. Row names of 
+  # the \Robject{featureData} object should match row names of the expression matrix. Also, one of the columns of the 
+  #\Robject{featureData} must be named "gene\_short\_name".
+
+  gbm <- newCellDataSet(data,
+                            phenoData = new("AnnotatedDataFrame", pd), 
+                            featureData = new("AnnotatedDataFrame", feature.names),
+                            lowerDetectionLimit=lowerDetectionLimit,
+                            expressionFamily=expressionFamily)
+  return(gbm)
+}
diff --git a/actual_vignette_holder/monocle-vignette-original.Rnw b/actual_vignette_holder/monocle-vignette-original.Rnw
@@ -441,17 +441,12 @@ To work with your data in a sparse format, simply provide it to Monocle as a spa
                         expressionFamily=negbinomial.size())
  @ %def
 
-\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX). If so, you should just pass it directly to newCellDataSet without first converting it to a dense matrix (via \Rfunction(as.matrix)), because that may exceed your available memeory. If you have 10X Genomics data and are using \Rpackage{cellrangerRkit}, you can use it to load your data and then pass that to Monocle as follows:
+\textbf{NOTE:} If you have 10X Genomics data, you can load it directly for Monocle as follows:
 
  <<chromium_load, eval=FALSE>>=
 cellranger_pipestance_path <- "/path/to/your/pipeline/output/directory"
-gbm <- load_cellranger_matrix(cellranger_pipestance_path)
+gbm_cds <- load_cellranger_data(cellranger_pipestance_path)
 
-gbm_cds <- newCellDataSet(exprs(gbm),
-                          phenoData = new("AnnotatedDataFrame", pData(gbm)), 
-                          featureData = new("AnnotatedDataFrame", fData(gbm)),
-                          lowerDetectionLimit=0.5,
-                          expressionFamily=negbinomial.size())
  @ %de
 
 Monocle's sparse matrix support is provided by the \Rpackage{Matrix} package. Other sparse matrix packages, such as \Rpackage{slam} or \Rpackage{SparseM} are not supported.

diff --git a/actual_vignette_holder/monocle-vignette-original.tex b/actual_vignette_holder/monocle-vignette-original.tex
@@ -588,19 +588,13 @@ \subsection{Choosing a distribution for expression data}
 \end{kframe}
 \end{knitrout}
 
-\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX). If so, you should just pass it directly to newCellDataSet without first converting it to a dense matrix (via \Rfunction(as.matrix)), because that may exceed your available memeory. If you have 10X Genomics data and are using \Rpackage{cellrangerRkit}, you can use it to load your data and then pass that to Monocle as follows:
+\textbf{NOTE:} The output from a number of RNA-Seq pipelines, including CellRanger, is already in a sparseMatrix format (e.g. MTX).  You can load this data into a \Rfunction{newCellDataSet} using the function \Rfunction(load_cellranger_data) as follows:
 
 \begin{knitrout}
 \definecolor{shadecolor}{rgb}{0.969, 0.969, 0.969}\color{fgcolor}\begin{kframe}
 \begin{alltt}
 \hlstd{cellranger_pipestance_path} \hlkwb{<-} \hlstr{"/path/to/your/pipeline/output/directory"}
-\hlstd{gbm} \hlkwb{<-} \hlkwd{load_cellranger_matrix}\hlstd{(cellranger_pipestance_path)}
-
-\hlstd{gbm_cds} \hlkwb{<-} \hlkwd{newCellDataSet}\hlstd{(}\hlkwd{exprs}\hlstd{(gbm),}
-                         \hlkwc{phenoData} \hlstd{=} \hlkwd{new}\hlstd{(}\hlstr{"AnnotatedDataFrame"}\hlstd{,} \hlkwd{pData}\hlstd{(gbm)),}
-                         \hlkwc{featureData} \hlstd{=} \hlkwd{new}\hlstd{(}\hlstr{"AnnotatedDataFrame"}\hlstd{,} \hlkwd{fData}\hlstd{(gbm)),}
-                         \hlkwc{lowerDetectionLimit}\hlstd{=}\hlnum{0.5}\hlstd{,}
-                         \hlkwc{expressionFamily}\hlstd{=}\hlkwd{negbinomial.size}\hlstd{())}
+\hlstd{gbm} \hlkwb{<-} \hlkwd{load_cellranger_data}\hlstd{(cellranger_pipestance_path)}
 \end{alltt}
 \end{kframe}
 \end{knitrout}

diff --git a/man/get_genome_in_matrix_path.Rd b/man/get_genome_in_matrix_path.Rd
diff --git a/man/load_cellranger_data.Rd b/man/load_cellranger_data.Rd
diff --git a/tests/testdata/cr2.0/outs/filtered_gene_bc_matrices/hg19/barcodes.tsv b/tests/testdata/cr2.0/outs/filtered_gene_bc_matrices/hg19/barcodes.tsv
@@ -0,0 +1,80 @@
+ATGCCAGAACGACT-1
+CATGGCCTGTGCAT-1
+GAACCTGATGAACC-1
+TGACTGGATTCTCA-1
+AGTCAGACTGCACA-1
+TCTGATACACGTGT-1
+TGGTATCTAAACAG-1
+GCAGCTCTGTTTCT-1
+GATATAACACGCAT-1
+AATGTTGACAGTCA-1
+AGGTCATGAGTGTC-1
+AGAGATGATCTCGC-1
+GGGTAACTCTAGTG-1
+CATGAGACACGGGA-1
+TACGCCACTCCGAA-1
+CTAAACCTGTGCAT-1
+GTAAGCACTCATTC-1
+TTGGTACTGAATCC-1
+CATCATACGGAGCA-1
+TACATCACGCTAAC-1
+TTACCATGAATCGC-1
+ATAGGAGAAACAGA-1
+GCGCACGACTTTAC-1
+ACTCGCACGAAAGT-1
+ATTACCTGCCTTAT-1
+CCCAACTGCAATCG-1
+AAATTCGAATCACG-1
+CCATCCGATTCGCC-1
+TCCACTCTGAGCTT-1
+CATCAGGATGCACA-1
+CTAAACCTCTGACA-1
+GATAGAGAAGGGTG-1
+CTAACGGAACCGAT-1
+AGATATACCCGTAA-1
+TACTCTGAATCGAC-1
+GCGCATCTTGCTCC-1
+GTTGACGATATCGG-1
+ACAGGTACTGGTGT-1
+GGCATATGCTTATC-1
+CATTACACCAACTG-1
+TAGGGACTGAACTC-1
+GCTCCATGAGAAGT-1
+TACAATGATGCTAG-1
+CTTCATGACCGAAT-1
+CTGCCAACAGGAGC-1
+TTGCATTGAGCTAC-1
+AAGCAAGAGCTTAG-1
+CGGCACGAACTCAG-1
+GGTGGAGATTACTC-1
+GGCCGATGTACTCT-1
+CGTAGCCTGTATGC-1
+TGAGCTGAATGCTG-1
+CCTATAACGAGACG-1
+ATAAGTTGGTACGT-1
+AAGCGACTTTGACG-1
+ACCAGTGAATACCG-1
+ATTGCACTTGCTTT-1
+CTAGGTGATGGTTG-1
+GCACTAGACCTTTA-1
+CATGCGCTAGTCAC-1
+TTGAGGACTACGCA-1
+ATACCACTCTAAGC-1
+CATATAGACTAAGC-1
+TTTAGCTGTACTCT-1
+GACATTCTCCACCT-1
+ACGTGATGCCATGA-1
+ATTGTAGATTCCCG-1
+GATAGAGATCACGA-1
+AATGCGTGGACGGA-1
+GCGTAAACACGGTT-1
+ATTCAGCTCATTGG-1
+GGCATATGGGGAGT-1
+ATCATCTGACACCA-1
+GTCATACTTCGCCT-1
+TTACGTACGTTCAG-1
+GAGTTGTGGTAGCT-1
+GACGCTCTCTCTCG-1
+AGTCTTACTTCGGA-1
+GGAACACTTCAGAC-1
+CTTGATTGATCTTC-1