Skip to content

Commit

Permalink
zUMIs2.9.0: speed improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
cziegenhain committed Jul 5, 2020
1 parent 991165d commit c06bcaa
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 128 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ git clone https://github.com/sdparekh/zUMIs.git`
```
Start anaylysing your data:
```
zUMIs/zUMIs-master.sh -c -y my_config.yaml
zUMIs/zUMIs.sh -c -y my_config.yaml
```
zUMIs now comes with its own [miniconda](https://docs.conda.io/en/latest/miniconda.html) environment, so you do not need to deal with dependencies or installations (use the -c flag to use conda).
If you wish to use your own dependencies, head to the [zUMIs wiki](https://github.com/sdparekh/zUMIs/wiki/Installation#dependencies) to see what is required.
Expand All @@ -29,6 +29,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
05 July 2020: [zUMIs2.9.0 released](https://github.com/sdparekh/zUMIs/releases/tag/2.7.0): Speed up STAR read mapping by parallel instances if enough resources are available. Change the main zUMIs script to `zUMIs.sh`. Speed up and reduce clutter by loading reads from bam files using parallelised Rsamtools calls instead of printing temporary text files. Speed up counting by parallelising exon / intron / exon+intron counting as well as downsamplings. Speed up by parallelising creation of wide format count matrices.

23 June 2020: zUMIs2.8.3: Merged code contribution from @gringer: prevent errors by emitting SAM headers in chunked unmapped .bam file output of fqfilter. Changed call to STAR to prevent stalling of samtools pipe.

18 May 2020: zUMIs2.8.2: Added `merge_demultiplexed_fastq.R` to concatenate previously demultiplexed fastq files. For usage details see here: https://github.com/sdparekh/zUMIs/wiki/Starting-from-demultiplexed-fastq-files
Expand Down
143 changes: 52 additions & 91 deletions UMIstuffFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,96 +56,55 @@ ham_mat <- function(umistrings) {
nrow(X) - H
}

prep_samtools <- function(featfile,bccount,inex,cores,samtoolsexc){
print("Extracting reads from bam file(s)...")

nchunks <- length(unique(bccount$chunkID))
all_rgfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".RGgroup.",1:nchunks,".txt")


for(i in unique(bccount$chunkID)){
rgfile <- all_rgfiles[i]
chunks <- bccount[chunkID==i]$XC
write.table(file=rgfile,chunks,col.names = F,quote = F,row.names = F)
}

reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigned = FALSE){
chunk_bcs <- bccount[chunkID==chunk]$XC
idxstats <- Rsamtools::idxstatsBam(featfile)
taglist <- c("BC", "UB","GE")
if(inex){
headerXX <- paste( c(paste0("V",1:4)) ,collapse="\t")
}else{
headerXX <- paste( c(paste0("V",1:3)) ,collapse="\t")
taglist <- c(taglist, "GI")
}
write(headerXX,"freadHeader")

headercommand <- "cat freadHeader > "
layoutflag <- ifelse(opt$read_layout == "PE", "-f 0x0040", "")
samcommand <- paste(samtoolsexc," view -x QB -x QU -x BX -x NH -x AS -x nM -x HI -x IH -x NM -x uT -x MD -x jM -x jI -x XN -x XS -x UX -x ES -x EN -x IS -x IN", layoutflag, "-@")

outfiles <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",1:nchunks,".txt")
system(paste(headercommand,outfiles,collapse = "; "))

cpusperchunk <- round(cores/nchunks,0)

if(inex == FALSE){
grepcommand <- " | cut -f12,13,14 | sed 's/BC:Z://' | sed 's/GE:Z://' | sed 's/UB:Z://' | grep -F -f "
ex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

system(paste(ex_cmd,"wait"))

rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
if(opt$read_layout == "PE"){
parms <- ScanBamParam(tag=taglist,
what="pos",
flag = scanBamFlag(isFirstMateRead = TRUE),
tagFilter = list(BC = chunk_bcs),
which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
}else{
parms <- ScanBamParam(tag=taglist,
what="pos",
tagFilter = list(BC = chunk_bcs),
which = GRanges(seqnames = idxstats[x,"seqnames"], ranges = IRanges(1,idxstats[x,"seqlength"])))
}

dat <- scanBam(file = featfile, param = parms)
if(inex){
dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
}else{
dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE)
}
return(dt)
}, mc.cores = cores)
rsamtools_reads <- rbindlist(rsamtools_reads, fill = TRUE, use.names = TRUE)
if(inex){
rsamtools_reads[ , ftype :="NA"][
is.na(GEin)==F, ftype :="intron"][
is.na(GE)==F , ftype:="exon"][
is.na(GE) , GE:=GEin][
,GEin:=NULL ]
}else{
grepcommand <- " | cut -f12,13,14,15 | sed 's/BC:Z://' | sed 's/Z://g' | grep -F -f "
inex_cmd <- paste(samcommand,cpusperchunk,featfile,grepcommand,all_rgfiles,">>",outfiles," & ",collapse = " ")

system(paste(inex_cmd,"wait"))
rsamtools_reads[, ftype :="NA"][
is.na(GE)==F, ftype :="exon"]
}
system("rm freadHeader")
system(paste("rm",all_rgfiles))

return(outfiles)
}

reads2genes <- function(inex,chunkID, keepUnassigned = FALSE){

samfile <- paste0(opt$out_dir,"/zUMIs_output/.",opt$project,".tmp.",chunkID,".txt")

if(inex==FALSE){
reads<-data.table::fread(samfile, na.strings=c(""),
select=c(1,2,3),header=T,fill=T,colClasses = "character" , col.names = c("RG","GE","UB") )[
,"ftype":="NA"
][is.na(GE)==F, ftype:="exon"]
}else{
reads<-data.table::fread(samfile, na.strings=c(""),
select=c(1,2,3,4),header=T,fill=T,colClasses = "character" , col.names = c("RG","V2","V3","V4") )[
, V2_id := substr(V2,1,2)
][ , V3_id := substr(V3,1,2)
][ , V4_id := substr(V4,1,2)
][ , V2 := substr(V2,4,nchar(V2))
][ , V3 := substr(V3,4,nchar(V3))
][ , V4 := substr(V4,4,nchar(V4))
][ V2_id == "GE", GE := V2
][ V2_id == "GI", GEin := V2
][ V2_id == "UB", UB := V2
][ V3_id == "GE", GE := V3
][ V3_id == "GI", GEin := V3
][ V3_id == "UB", UB := V3
][ V4_id == "UB", UB := V4
][ V4_id == "GI", GEin := V4
][ ,c("V2_id","V3_id","V4_id","V2","V3","V4") := NULL
][ ,"ftype":="NA"
][is.na(GEin)==F,ftype:="intron"
][is.na(GE)==F, ftype:="exon"
][is.na(GE),GE:=GEin
][ ,GEin:=NULL ]

}
system(paste("rm",samfile))

setkey(reads,RG)

setkey(rsamtools_reads,RG)

if(keepUnassigned){
return( reads )
return( rsamtools_reads )
}else{
return( reads[GE!="NA"] )
return( rsamtools_reads[GE!="NA"] )
}

}

hammingFilter<-function(umiseq, edit=1, gbcid=NULL){
Expand Down Expand Up @@ -322,19 +281,19 @@ check_nonUMIcollapse <- function(seqfiles){
collectCounts<-function(reads,bccount,subsample.splits, mapList, ...){
subNames<-paste("downsampled",rownames(subsample.splits),sep="_")
umiFUN<-"umiCollapseID"
lapply(mapList,function(tt){
parallel::mclapply(mapList,function(tt){
ll<-list( all=umiFUNs[[umiFUN]](reads=reads,
bccount=bccount,
ftype=tt),
downsampling=lapply( 1:nrow(subsample.splits) , function(i){
downsampling=parallel::mclapply( 1:nrow(subsample.splits) , function(i){
umiFUNs[[umiFUN]](reads,bccount,
nmin=subsample.splits[i,1],
nmax=subsample.splits[i,2],
ftype=tt)} )
ftype=tt)}, mc.cores = floor(opt$num_threads/length(mapList)) )
)
names(ll$downsampling)<-subNames
ll
})
}, mc.cores = length(mapList))

}

Expand All @@ -349,16 +308,18 @@ bindList<-function(alldt,newdt){
}

convert2countM<-function(alldt,what){
fmat<-alldt
fmat<-copy(alldt)
for( i in 1:length(alldt)){
fmat[[i]][[1]]<-.makewide(alldt[[i]][[1]],what)
for(j in names(alldt[[i]][[2]])){
fmat[[i]][[2]][[j]]<-.makewide(alldt[[i]][[2]][[j]],what)
}
fmat[[i]][[2]] <- fmat[[i]][[2]][sapply(fmat[[i]][[2]], function(x) nrow(x)>0)]
downsamp_names <- names(fmat[[i]][[2]])
fmat[[i]][[2]] <- parallel::mclapply(downsamp_names, function(x){
.makewide(alldt[[i]][[2]][[x]],what)
}, mc.cores = opt$num_threads)
names(fmat[[i]][[2]]) <- downsamp_names
}
return(fmat)
}

write_molecule_mapping <- function(mm){
mm_path <- paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/")
bcs <- unique(mm$BC)
Expand Down
38 changes: 21 additions & 17 deletions zUMIs-dge2.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ library(methods)
library(data.table)
library(yaml)
library(ggplot2)
suppressPackageStartupMessages(library(Rsamtools))

##########################
myYaml <- commandArgs(trailingOnly = T)
Expand Down Expand Up @@ -126,15 +127,22 @@ if(opt$counting_opts$Ham_Dist == 0){
dir.create( paste0(opt$out_dir,"/zUMIs_output/molecule_mapping/") )
}

samouts <- prep_samtools(featfile = outbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
cores = opt$num_threads,
samtoolsexc=samtoolsexc)
tmpbamfile <- outbamfile
outbamfile <- paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.sorted.bam")
print("Coordinate sorting intermediate bam file...")
sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",outbamfile," ",tmpbamfile)
system(sort_cmd)
index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,outbamfile)
system(index_cmd)
system(paste0("rm ",tmpbamfile))

for(i in unique(bccount$chunkID)){
print( paste( "Hamming distance collapse in barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
reads<-reads2genes( inex = opt$counting_opts$introns,
chunkID = i )
reads <- reads2genes_new(featfile = outbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
chunk = i,
cores = opt$num_threads)
reads <- reads[!UB==""] #make sure only UMI-containing reads go further
u <- umiCollapseHam(reads,bccount, HamDist=opt$counting_opts$Ham_Dist)
}
Expand All @@ -144,7 +152,7 @@ if(opt$counting_opts$Ham_Dist == 0){
outbamfile <- correct_UB_tags(bccount, samtoolsexc)
sortbamfile <-paste0(opt$out_dir,"/",opt$project,".filtered.Aligned.GeneTagged.UBcorrected.sorted.bam")
}
paste("Coordinate sorting final bam file...")
print("Coordinate sorting final bam file...")
sort_cmd <- paste0(samtoolsexc," sort -O 'BAM' -@ ",opt$num_threads," -m ",mempercpu,"G -o ",sortbamfile," ",outbamfile)
system(sort_cmd)
index_cmd <- paste(samtoolsexc,"index -@",opt$num_threads,sortbamfile)
Expand Down Expand Up @@ -177,18 +185,14 @@ if( opt$counting_opts$introns ){

########################## assign reads to UB & GENE

samouts <- prep_samtools(featfile = sortbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
cores = opt$num_threads,
samtoolsexc=samtoolsexc)

for(i in unique(bccount$chunkID)){
print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
reads<-reads2genes( inex = opt$counting_opts$introns,
chunkID = i )

reads <- reads2genes_new(featfile = sortbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
chunk = i,
cores = opt$num_threads)

tmp<-collectCounts( reads =reads,
bccount=bccount[chunkID==i],
Expand Down
69 changes: 52 additions & 17 deletions zUMIs-mapping.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ additional_fq <- inp$reference$additional_files
samtools <- inp$samtools_exec
STAR_exec <- inp$STAR_exec

if(is.null(inp$mem_limit) | inp$mem_limit == 0){
inp$mem_limit <- 100
}

# collect filtered bam files ----------------------------------------------
tmpfolder <- paste(inp$out_dir,"/zUMIs_output/.tmpMerge/",sep="")
if(inp$which_Stage == "Filtering"){
Expand All @@ -21,6 +25,12 @@ if(inp$which_Stage == "Filtering"){
}


# check if multiple STAR instances can be run -----------------------------

genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
num_star_instances <- floor(inp$mem_limit/genome_size)

# GTF file setup ----------------------------------------------------------
#in case of additional sequences, we need to create a custom GTF

Expand Down Expand Up @@ -77,24 +87,18 @@ cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
# Setup STAR mapping ------------------------------------------------------
samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
#if(avail_cores < 3){
# cores_samtools <- 1
# cores_star <- 2
#}else{
# cores_samtools <- floor(avail_cores/3)
# cores_star <- avail_cores - cores_samtools
#}
if(inp$which_Stage == "Filtering"){
avail_cores <- floor(avail_cores / num_star_instances)
}

if(avail_cores < 2){
avail_cores = 1
}

#param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype SAM --outStd SAM")
param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted")
param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_cores," --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --outSAMtype BAM Unsorted --quantMode TranscriptomeSAM")
param_misc <- paste("--genomeDir",inp$reference$STAR_index,
"--sjdbGTFfile",gtf_to_use,
"--runThreadN",avail_cores,
"--readFilesIn",paste0(filtered_bams,collapse=","),
"--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep=""),
"--sjdbOverhang", cDNA_read_length-1,
"--readFilesType SAM",inp$read_layout)

Expand All @@ -103,14 +107,45 @@ if(inp$counting_opts$twoPass==TRUE){
STAR_command <- paste(STAR_command,"--twopassMode Basic")
}

#samtools_output <- paste0(" | ",samtools," view -@ ",cores_samtools," -o ",inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam")
#STAR_command <- paste(STAR_command,samtools_output)

#finally, run STAR
if(inp$which_Stage == "Filtering"){
if(num_star_instances>1 & inp$which_Stage == "Filtering"){
map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
dir.create(path = map_tmp_dir)
input_split <- split(filtered_bams, ceiling(seq_along(filtered_bams) / ceiling(length(filtered_bams) / num_star_instances)))
input_split <- sapply(input_split, paste0, collapse = ",")
STAR_preset <- STAR_command
STAR_command <- lapply(seq(num_star_instances), function(x){
paste(STAR_preset,
"--readFilesIn",input_split[x],
"--outFileNamePrefix",paste0(map_tmp_dir,"/tmp.",inp$project,".",x,"."))
})
STAR_command <- paste(unlist(STAR_command), collapse = " & ")
system(paste(STAR_command,"&",sammerge_command,"& wait"))
system(paste0("rm ",tmpfolder,"/",inp$project,".*"))

#after parallel instance STAR, collect output data in the usual file places
out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
system(paste(merge_logs,"&",merge_bams,"&",merge_txbams,"& wait"))
system(paste0("rm ", map_tmp_dir, "tmp.", inp$project, ".*"))
}else{
system(STAR_command)
STAR_command <- paste(STAR_command,
"--readFilesIn",paste0(filtered_bams,collapse=","),
"--outFileNamePrefix",paste(inp$out_dir,"/",inp$project,".filtered.tagged.",sep="")
)
if(inp$which_Stage == "Filtering"){
system(paste(STAR_command,"&",sammerge_command,"& wait"))
}else{
system(STAR_command)
}
}


#clean up chunked bam files
if(inp$which_Stage == "Filtering"){
system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
}
q()
4 changes: 2 additions & 2 deletions zUMIs-master.sh → zUMIs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
# Contact: [email protected] or [email protected]
vers=2.8.3
vers=2.9.0
currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/master/zUMIs-master.sh | grep '^vers=' | cut -f2 -d "=")
if [ "$currentv" != "$vers" ] ; then
echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";
Expand Down Expand Up @@ -75,7 +75,7 @@ check_opts "${yaml}" "YAML" "-y"

# create temporary YAML file for corrected options
yaml_orig=${yaml}
yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).corrected.yaml
yaml=$(dirname ${yaml})/$(basename ${yaml} .yaml).run.yaml
cp ${yaml_orig} ${yaml}

#now get some variables from YAML
Expand Down

0 comments on commit c06bcaa

Please sign in to comment.