zUMIs2.9.0: speed improvements

sdparekh · Jul 5, 2020 · c06bcaa · c06bcaa
1 parent 991165d
commit c06bcaa
Show file tree

Hide file tree

Showing 5 changed files with 130 additions and 128 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ git clone https://github.com/sdparekh/zUMIs.git`
 ```
 Start anaylysing your data:
 ```
-zUMIs/zUMIs-master.sh -c -y my_config.yaml
+zUMIs/zUMIs.sh -c -y my_config.yaml
 ```
 zUMIs now comes with its own [miniconda](https://docs.conda.io/en/latest/miniconda.html) environment, so you do not need to deal with dependencies or installations (use the -c flag to use conda).
 If you wish to use your own dependencies, head to the [zUMIs wiki](https://github.com/sdparekh/zUMIs/wiki/Installation#dependencies) to see what is required.
@@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
 zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
 
 ## Changelog
+05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.7.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices. 
+
 23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe. 
 
 18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files

diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R
@@ -56,96 +56,55 @@ ham_mat <- function(umistrings) {
   nrow(X) - H
 }
 
-prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
-  print("Extracting reads from bam file(s)...")
-
-  nchunks <- length(unique(bccount$chunkID))
-  all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")
-
-
-  for(i in unique(bccount$chunkID)){
-    rgfile <- all_rgfiles[i]
-    chunks <- bccount[chunkID==i]$XC
-    write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
-  }
 
+reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigned = FALSE){
+  chunk_bcs <- bccount[chunkID==chunk]$XC
+  idxstats <- Rsamtools::idxstatsBam(featfile)
+  taglist <- c("BC", "UB","GE")
   if(inex){
-    headerXX <- paste( c(paste0("V",1:4)) ,collapse="\t")
-  }else{
-    headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
+    taglist <- c(taglist, "GI")
   }
-  write(headerXX,"freadHeader")
-
-  headercommand <- "cat freadHeader > "
-  layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
-  samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x ES -x EN -x IS -x IN", layoutflag, "-@")
-
-  outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
-  system(paste(headercommand,outfiles,collapse = "; "))
-
-  cpusperchunk <- round(cores/nchunks,0)
-
-  if(inex == FALSE){
-    grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/GE:Z://' | sed 's/UB:Z://' | grep -F -f "
-    ex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")
-
-    system(paste(ex_cmd,"wait"))
+
+  rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
+    if(opt$read_layout == "PE"){
+      parms <- ScanBamParam(tag=taglist, 
+                            what="pos", 
+                            flag = scanBamFlag(isFirstMateRead = TRUE),
+                            tagFilter = list(BC = chunk_bcs),
+                            which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
+    }else{
+      parms <- ScanBamParam(tag=taglist, 
+                            what="pos", 
+                            tagFilter = list(BC = chunk_bcs),
+                            which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
+    }
+
+    dat <- scanBam(file = featfile, param = parms)
+    if(inex){
+      dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
+    }else{
+      dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE)
+    }
+    return(dt)
+  }, mc.cores = cores)
+  rsamtools_reads <- rbindlist(rsamtools_reads, fill = TRUE, use.names = TRUE)
+  if(inex){
+    rsamtools_reads[ , ftype :="NA"][
+       is.na(GEin)==F, ftype :="intron"][
+       is.na(GE)==F  , ftype:="exon"][
+       is.na(GE)     , GE:=GEin][ 
+                     ,GEin:=NULL ]
   }else{
-    grepcommand <- " | cut -f12,13,14,15 | sed 's/BC:Z://' | sed 's/Z://g' | grep -F -f "
-    inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")
-
-    system(paste(inex_cmd,"wait"))
+    rsamtools_reads[, ftype :="NA"][
+        is.na(GE)==F, ftype :="exon"]
   }
-  system("rm freadHeader")
-  system(paste("rm",all_rgfiles))
-
-  return(outfiles)
-}
-
-reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){
-
-  samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")
-
-   if(inex==FALSE){
-     reads<-data.table::fread(samfile, na.strings=c(""),
-                              select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","GE","UB") )[
-                              ,"ftype":="NA"
-                              ][is.na(GE)==F,  ftype:="exon"]
-  }else{
-    reads<-data.table::fread(samfile, na.strings=c(""),
-                             select=c(1,2,3,4),header=T,fill=T,colClasses = "character" , col.names = c("RG","V2","V3","V4") )[
-                                     , V2_id := substr(V2,1,2)
-                                  ][ , V3_id := substr(V3,1,2)
-                                  ][ , V4_id := substr(V4,1,2)
-                                  ][ , V2 := substr(V2,4,nchar(V2))
-                                  ][ , V3 := substr(V3,4,nchar(V3))
-                                  ][ , V4 := substr(V4,4,nchar(V4))
-                                  ][ V2_id == "GE", GE := V2
-                                  ][ V2_id == "GI", GEin := V2
-                                  ][ V2_id == "UB", UB := V2
-                                  ][ V3_id == "GE", GE := V3
-                                  ][ V3_id == "GI", GEin := V3
-                                  ][ V3_id == "UB", UB := V3
-                                  ][ V4_id == "UB", UB := V4
-                                  ][ V4_id == "GI", GEin := V4
-                                  ][ ,c("V2_id","V3_id","V4_id","V2","V3","V4") := NULL
-                                  ][ ,"ftype":="NA"
-                                  ][is.na(GEin)==F,ftype:="intron"
-                                  ][is.na(GE)==F,  ftype:="exon"
-                                  ][is.na(GE),GE:=GEin
-                                  ][ ,GEin:=NULL ]
-
-  }
-  system(paste("rm",samfile))
-
-  setkey(reads,RG)
-
+  setkey(rsamtools_reads,RG)
+
   if(keepUnassigned){
-    return( reads )
+    return( rsamtools_reads )
   }else{
-    return( reads[GE!="NA"] )
+    return( rsamtools_reads[GE!="NA"] )
   }
-
 }
 
 hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
@@ -322,19 +281,19 @@ check_nonUMIcollapse <- function(seqfiles){
 collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
   subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
   umiFUN<-"umiCollapseID"
-  lapply(mapList,function(tt){
+  parallel::mclapply(mapList,function(tt){
     ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
                                 bccount=bccount,
                                 ftype=tt),
-              downsampling=lapply( 1:nrow(subsample.splits) , function(i){
+              downsampling=parallel::mclapply( 1:nrow(subsample.splits) , function(i){
                 umiFUNs[[umiFUN]](reads,bccount,
                                   nmin=subsample.splits[i,1],
                                   nmax=subsample.splits[i,2],
-                                  ftype=tt)} )
+                                  ftype=tt)}, mc.cores = floor(opt$num_threads/length(mapList)) )
     )
     names(ll$downsampling)<-subNames
     ll
-  })
+  }, mc.cores = length(mapList))
 
 }
 
@@ -349,16 +308,18 @@ bindList<-function(alldt,newdt){
 }
 
 convert2countM<-function(alldt,what){
-  fmat<-alldt
+  fmat<-copy(alldt)
   for( i in 1:length(alldt)){
     fmat[[i]][[1]]<-.makewide(alldt[[i]][[1]],what)
-    for(j in names(alldt[[i]][[2]])){
-      fmat[[i]][[2]][[j]]<-.makewide(alldt[[i]][[2]][[j]],what)
-    }
+    fmat[[i]][[2]] <- fmat[[i]][[2]][sapply(fmat[[i]][[2]], function(x) nrow(x)>0)]
+    downsamp_names <- names(fmat[[i]][[2]])
+    fmat[[i]][[2]] <- parallel::mclapply(downsamp_names, function(x){
+      .makewide(alldt[[i]][[2]][[x]],what)
+    }, mc.cores = opt$num_threads)
+    names(fmat[[i]][[2]]) <- downsamp_names
   }
   return(fmat)
 }
-
 write_molecule_mapping <- function(mm){
   mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
   bcs <- unique(mm$BC)

diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
@@ -3,6 +3,7 @@ library(methods)
 library(data.table)
 library(yaml)
 library(ggplot2)
+suppressPackageStartupMessages(library(Rsamtools))
 
 ##########################
 myYaml <- commandArgs(trailingOnly = T)
@@ -126,15 +127,22 @@ if(opt$counting_opts$Ham_Dist == 0){
     dir.create( paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/") )
   }
 
-  samouts <- prep_samtools(featfile  = outbamfile,
-                           bccount   = bccount,
-                           inex      = opt$counting_opts$introns,
-                           cores     = opt$num_threads,
-                           samtoolsexc=samtoolsexc)
+  tmpbamfile <- outbamfile
+  outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
+  print("Coordinate sorting intermediate bam file...")
+  sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
+  system(sort_cmd)
+  index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
+  system(index_cmd)
+  system(paste0("rm ",tmpbamfile))
+
   for(i in unique(bccount$chunkID)){
     print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
-    reads<-reads2genes( inex = opt$counting_opts$introns,
-                        chunkID  = i )
+    reads <- reads2genes_new(featfile = outbamfile,
+                             bccount  = bccount,
+                             inex     = opt$counting_opts$introns,
+                             chunk    = i,
+                             cores    = opt$num_threads)
     reads <- reads[!UB==""] #make sure only UMI-containing reads go further
     u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
   }
@@ -144,7 +152,7 @@ if(opt$counting_opts$Ham_Dist == 0){
   outbamfile <- correct_UB_tags(bccount, samtoolsexc)
   sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
 }
-paste("Coordinate sorting final bam file...")
+print("Coordinate sorting final bam file...")
 sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
 system(sort_cmd)
 index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
@@ -177,18 +185,14 @@ if( opt$counting_opts$introns ){
 
 ########################## assign reads to UB & GENE
 
-samouts <- prep_samtools(featfile  = sortbamfile,
-                         bccount   = bccount,
-                         inex      = opt$counting_opts$introns,
-                         cores     = opt$num_threads,
-                         samtoolsexc=samtoolsexc)
-
 for(i in unique(bccount$chunkID)){
      print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
      print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
-     reads<-reads2genes( inex = opt$counting_opts$introns,
-                          chunkID  = i )
-
+     reads <- reads2genes_new(featfile = sortbamfile,
+                              bccount  = bccount,
+                              inex     = opt$counting_opts$introns,
+                              chunk    = i,
+                              cores    = opt$num_threads)
 
      tmp<-collectCounts(  reads =reads,
                           bccount=bccount[chunkID==i],

diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
@@ -10,6 +10,10 @@ additional_fq <- inp$reference$additional_files
 samtools <- inp$samtools_exec
 STAR_exec <- inp$STAR_exec
 
+if(is.null(inp$mem_limit) | inp$mem_limit == 0){
+  inp$mem_limit <- 100
+}
+
 # collect filtered bam files ----------------------------------------------
 tmpfolder <- paste(inp$out_dir,"/zUMIs_output/.tmpMerge/",sep="")
 if(inp$which_Stage == "Filtering"){
@@ -21,6 +25,12 @@ if(inp$which_Stage == "Filtering"){
 }
 
 
+# check if multiple STAR instances can be run -----------------------------
+
+genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
+genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
+num_star_instances <- floor(inp$mem_limit/genome_size)
+
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
 
@@ -77,24 +87,18 @@ cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
 avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
-#if(avail_cores < 3){
-#  cores_samtools <- 1
-#  cores_star <- 2
-#}else{
-#  cores_samtools <- floor(avail_cores/3)
-#  cores_star <- avail_cores - cores_samtools
-#}
+if(inp$which_Stage == "Filtering"){
+  avail_cores <- floor(avail_cores / num_star_instances)
+}
+
 if(avail_cores < 2){
   avail_cores = 1
 }
 
-#param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype SAM --outStd SAM")
-param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted")
+param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM")
 param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
-                    "--readFilesIn",paste0(filtered_bams,collapse=","),
-                    "--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep=""),
                     "--sjdbOverhang", cDNA_read_length-1,
                     "--readFilesType SAM",inp$read_layout)
 
@@ -103,14 +107,45 @@ if(inp$counting_opts$twoPass==TRUE){
   STAR_command <- paste(STAR_command,"--twopassMode Basic")
 }
 
-#samtools_output <- paste0(" | ",samtools," view -@ ",cores_samtools," -o ",inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam")
-#STAR_command <- paste(STAR_command,samtools_output)
-
 #finally, run STAR
-if(inp$which_Stage == "Filtering"){
+if(num_star_instances>1 & inp$which_Stage == "Filtering"){
+  map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
+  dir.create(path = map_tmp_dir)
+  input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
+  input_split <- sapply(input_split, paste0, collapse = ",")
+  STAR_preset <- STAR_command
+  STAR_command <- lapply(seq(num_star_instances), function(x){
+    paste(STAR_preset,
+      "--readFilesIn",input_split[x],
+      "--outFileNamePrefix",paste0(map_tmp_dir,"/tmp.",inp$project,".",x,"."))
+  })
+  STAR_command <- paste(unlist(STAR_command), collapse = " & ")
   system(paste(STAR_command,"&",sammerge_command,"& wait"))
-  system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
+
+  #after parallel instance STAR, collect output data in the usual file places
+  out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
+  merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
+  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
+  merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
+  out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
+  merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
+  system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
+  system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
 }else{
-  system(STAR_command)
+  STAR_command <- paste(STAR_command,
+    "--readFilesIn",paste0(filtered_bams,collapse=","),
+    "--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep="")
+  )
+  if(inp$which_Stage == "Filtering"){
+    system(paste(STAR_command,"&",sammerge_command,"& wait"))
+  }else{
+    system(STAR_command)
+  }
+}
+
+
+#clean up chunked bam files
+if(inp$which_Stage == "Filtering"){
+  system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
 }
 q()
diff --git a/zUMIs-master.sh → zUMIs.sh b/zUMIs-master.sh → zUMIs.sh
@@ -3,7 +3,7 @@
 # Pipeline to run UMI-seq analysis from fastq to read count tables.
 # Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
 # Contact: [email protected] or [email protected]
-vers=2.8.3
+vers=2.9.0
 currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh | grep '^vers=' | cut -f2 -d "=")
 if [ "$currentv" != "$vers" ] ; then
     echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
@@ -75,7 +75,7 @@ check_opts "${yaml}" "YAML" "-y"
 
 # create temporary YAML file for corrected options
 yaml_orig=${yaml}
-yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).corrected.yaml
+yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).run.yaml
 cp ${yaml_orig} ${yaml}
 
 #now get some variables from YAML