Skip to content

Commit

Permalink
zUMIs0.0.4 with plate BC support
Browse files Browse the repository at this point in the history
  • Loading branch information
cziegenhain committed Feb 23, 2018
1 parent fc0a1a3 commit 9067aef
Show file tree
Hide file tree
Showing 6 changed files with 129 additions and 28 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ You can read more about zUMIs in our [biorxiv preprint](https://www.biorxiv.org/

You can glance through zUMIs in [zUMIs poster](https://github.com/sdparekh/zUMIs/blob/master/zUMIs_GI2017_poster.pdf)!

## Releases
## Releases/Changelog
23 Feb 2018: [zUMIs.0.0.3 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.4).
Added support for plate barcodes with input of an additional barcode fastq file (eg. Illumina i7 index read). Addition of version number in zUMIs-master. Parameters are printed in a .zUMIs_run.txt file for each call.

18 Feb 2018: [zUMIs.0.0.3 released](https://github.com/sdparekh/zUMIs/releases/tag/zUMIs.0.0.3).
Switched support to the new Rsubread version and data format. Furthermore to compensate sequencing/PCR errors, zUMIs now features UMI correction using Hamming distance and binning of adjacent cell barcodes.

You can find the older version of zUMIs [here](https://github.com/sdparekh/zUMIs/releases/).
You can find the older versions of zUMIs [here](https://github.com/sdparekh/zUMIs/releases/).

## Installation and Usage

Expand Down
67 changes: 53 additions & 14 deletions fqfilter.pl
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
# Author: Swati Parekh
# Contact: [email protected] or [email protected] or [email protected]

if(@ARGV != 12)
if(@ARGV != 14)
{
print
"\n#####################################################################################
Usage: perl $0 <barcode-Read.fq.gz> <cDNA-Read.fq.gz> <cellbc_threshold> <Cellbc_Qual_threshold> <umi_threshold> <UMIbc_Qual_threshold> <Cellbc_range> <UMI_range> <Threads> <StudyName> <Outdir> <pigz-executable> \n
Usage: perl $0 <barcode-Read.fq.gz> <cDNA-Read.fq.gz> <plateBC-read.fq.gz> <cellbc_threshold> <Cellbc_Qual_threshold> <umi_threshold> <UMIbc_Qual_threshold> <Cellbc_range> <PlateBC_range> <UMI_range> <Threads> <StudyName> <Outdir> <pigz-executable> \n
Explanation of parameter:
barcode-Read.fq.gz - Input barcode reads fastq file name.
Expand All @@ -30,40 +30,56 @@

$bcread=$ARGV[0];
$cdnaread=$ARGV[1];
$bnbases=$ARGV[2];
$bqualthreshold=$ARGV[3];
$mnbases=$ARGV[4];
$mqualthreshold=$ARGV[5];
$bcrange=$ARGV[6];
$mcrange=$ARGV[7];
$threads=$ARGV[8];
$study=$ARGV[9];
$outdir=$ARGV[10];
$pigz=$ARGV[11];
$pbcread=$ARGV[2];
$bnbases=$ARGV[3];
$bqualthreshold=$ARGV[4];
$mnbases=$ARGV[5];
$mqualthreshold=$ARGV[6];
$bcrange=$ARGV[7];
$pbcrange=$ARGV[8];
$mcrange=$ARGV[9];
$threads=$ARGV[10];
$study=$ARGV[11];
$outdir=$ARGV[12];
$pigz=$ARGV[13];

@b = split("-",$bcrange);
@m = split("-",$mcrange);
$bs = $b[0] - 1;
$ms = $m[0] - 1;
$bl = $b[1]-$b[0]+1;
$ml = $m[1]-$m[0]+1;
if($pbcread ne "NA") {
@p = split("-",$pbcrange);
$ps = $p[0] - 1;
$pl = $p[1]-$p[0]+1;
}



$bcreadout = $outdir."/".$study.".barcodelist.filtered.sam";
$bcreadoutfull = $outdir."/".$study.".barcoderead.filtered.fastq";
if($pbcread ne "NA") {$pbcreadoutfull = $outdir."/".$study.".platebarcoderead.filtered.fastq";}
$cdnareadout = $outdir."/".$study.".cdnaread.filtered.fastq";

if ($bcread =~ /\.gz$/) {
open BCF, '-|', $pigz, '-dc', $bcread || die "Couldn't open file $bcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
if($pbcread ne "NA") {open PBCF, '-|', $pigz, '-dc', $pbcread || die "Couldn't open file $pbcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";}
open CDF, '-|', $pigz, '-dc', $cdnaread || die "Couldn't open file $cdnaread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
}
else {
open BCF, "<", $bcread || die "Couldn't open file $bcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
if($pbcread ne "NA") {open PBCF, "<", $pbcread || die "Couldn't open file $pbcread. Check permissions!\n Check if it is differently zipped then .gz\n\n";}
open CDF, "<", $cdnaread || die "Couldn't open file $cdnaread. Check permissions!\n Check if it is differently zipped then .gz\n\n";
}

open BCOUT, ">", $bcreadout || die "Couldn't open file $bcreadout to write\n\n";;
open CDOUT, ">", $cdnareadout || die "Couldn't open file $cdnareadout to write\n\n";;
open BCOUTFULL, ">", $bcreadoutfull || die "Couldn't open file $bcreadoutfull to write\n\n";;
if($pbcread ne "NA") {open PBCOUTFULL, ">", $pbcreadoutfull || die "Couldn't open file $pbcreadoutfull to write\n\n";;}




$count=0;
$total=0;
Expand All @@ -75,7 +91,13 @@
$brseq=<BCF>;
$bqid=<BCF>;
$bqseq=<BCF>;

if($pbcread ne "NA"){
$pbrid=<PBCF>;
$pbrseq=<PBCF>;
$pbqid=<PBCF>;
$pbqseq=<PBCF>;
$pbcqual = substr($pbqseq,$ps,$pl);
}
if($count==0){
$count=1;
@quals = map {$_} unpack "C*", $bqseq;
Expand All @@ -95,14 +117,26 @@

if($c[0] eq $b[0]){
@bquals = map {$_ - $offset} unpack "C*", $bcqual;
#@pbquals = map {$_ - $offset} unpack "C*", $pbcqual;
@mquals = map {$_ - $offset} unpack "C*", $mcqual;
$btmp = grep {$_ < $bqualthreshold} @bquals;
#$pbtmp = grep {$_ < $bqualthreshold} @pbquals;
$mtmp = grep {$_ < $mqualthreshold} @mquals;

if(($btmp < $bnbases) && ($mtmp < $mnbases)){
$filtered++;
$bcseq = substr($brseq,$bs,$bl);
if($pbcread ne "NA") {$pbcseq = substr($pbrseq,$ps,$pl);}
$mcseq = substr($brseq,$ms,$ml);

$brid =~ m/^@(.*)\s/; chomp($brseq);
print BCOUT $1,"\t4\t*\t0\t0\t*\t*\t0\t0\t$brseq\t$bqseq";
if($pbcread ne "NA") {
print BCOUT $1,"\t4\t*\t0\t0\t*\t*\t0\t0\t",$pbcseq,$bcseq,$mcseq,"\t",$pbcqual,$bcqual,$mcqual,"\n";
print PBCOUTFULL $pbrid,$pbrseq,"\n",$pbqid,$pbqseq;
}
else {
print BCOUT $1,"\t4\t*\t0\t0\t*\t*\t0\t0\t$brseq\t$bqseq";
}
print BCOUTFULL $brid,$brseq,"\n",$bqid,$bqseq;
print CDOUT $crid,$crseq,$cqid,$cqseq;
}
Expand All @@ -118,6 +152,11 @@
close BCOUT;
close CDOUT;
close BCOUTFULL;
if($pbcread ne "NA") {
close PBCF;
close PBCOUTFULL;
`$pigz -f -p $threads $pbcreadoutfull`;
}

print "Raw reads: $total \nFiltered reads: $filtered \n\n";

Expand Down
2 changes: 1 addition & 1 deletion zUMIs-dge.R
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ makeGEprofile <- function(abamfile,ubamfile,bcfile,safannot,ncores,stra,bcstart,
umiseq <- sort(umiseq)
uc <- data.frame(us = umiseq) %>% dplyr::count(us) # normal UMI counts

if(length(uc$us)>1 && length(uc$us)<10000){ #prevent use of > 100Gb RAM
if(length(uc$us)>1 && length(uc$us)<100000){ #prevent use of > 100Gb RAM
umi <- stringdist::stringdistmatrix(uc$us,method="hamming",useNames = "strings",nthread=1) %>% #only 1 core for each multidplyr worker
broom::tidy() %>%
dplyr::filter( distance <= edit ) %>% # only remove below chosen dist
Expand Down
4 changes: 3 additions & 1 deletion zUMIs-filtering.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ cq="${10}"
t="${11}"
d="${12}"
pigz="${13}"
pbcfastq="${14}"
pbcrange="${15}"

# MAKING THE HEADER
echo '#!/bin/bash' >$o/$sn.prep.sh
Expand All @@ -23,6 +25,6 @@ echo '#SBATCH --cpus-per-task='$t >>$o/$sn.prep.sh
echo '#SBATCH --workdir='$o >>$o/$sn.prep.sh
echo '#SBATCH --mem=1000' >>$o/$sn.prep.sh

echo "srun perl $d/fqfilter.pl $f1 $f2 $cq $cbq $mq $mbq $xc $xm $t $sn $o $pigz" >>$o/$sn.prep.sh
echo "srun perl $d/fqfilter.pl $f1 $f2 $pbcfastq $cq $cbq $mq $mbq $xc $pbcrange $xm $t $sn $o $pigz" >>$o/$sn.prep.sh

sbatch $o/$sn.prep.sh > $o/$sn.preparejobid.txt
52 changes: 43 additions & 9 deletions zUMIs-master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Pipeline to run UMI-seq analysis from fastq to read count tables.
# Authors: Swati Parekh & Christoph Ziegenhain
# Contact: [email protected] or [email protected] or [email protected]

vers=0.0.4
function check_opts() {
value=$1
name=$2
Expand Down Expand Up @@ -64,6 +64,8 @@ Make sure you have 3-4 times more disk space to your input fastq files.
This pipeline works based on one hit per read. Therefore, please do not report more multimapping hits. Default: "".
-H <HammingDistance> : Hamming distance collapsing of UMI sequences. Default: 0.
-B <BarcodeBinning> : Hamming distance binning of close cell barcode sequences. Default: 0.
-T <PlateBC fastq> : Fastq file for plate barcode read. Default: NA
-U <PlateBC range> : Barcode range for plate barcode read (e.g. 1-8). Default: NA
## Program paths ##
-o <outputdir> : Where to write output bam. Default: working directory.
Expand Down Expand Up @@ -97,6 +99,8 @@ Make sure you have 3-4 times more disk space to your input fastq files.
-F <gel-barcode2 fastq> : Provide the second half of gel barcode + UMI read <-F> here. Default: NA.
-L <library barcode fastq> : Provide the library barcode read here. Default: NA
zUMIs version $vers
EOF
}

Expand Down Expand Up @@ -130,8 +134,10 @@ xcrange2=0-0
nreads=100
ham=0
XCbin=0
pbcfastq=NA
pbcrange=NA

while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:L:P:V:H:B:h" options; do #Putting <:> between keys implies that they can not be called without an argument.
while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:L:P:V:H:B:U:T:h" options; do #Putting <:> between keys implies that they can not be called without an argument.
case $options in
R ) isslurm=$OPTARG;;
S ) isStats=$OPTARG;;
Expand Down Expand Up @@ -170,6 +176,8 @@ while getopts ":R:S:f:r:g:o:a:t:s:c:m:l:b:n:N:q:Q:z:u:x:e:p:i:d:X:A:w:j:F:C:y:Y:
V ) Rexc=$OPTARG;;
H ) ham=$OPTARG;;
B ) XCbin=$OPTARG;;
T ) pbcfastq=$OPTARG;;
U ) pbcrange=$OPTARG;;
h ) usage
exit 1;;
\? ) echo -e "\n This key is not available! Please check the usage again: -$OPTARG"
Expand Down Expand Up @@ -250,6 +258,8 @@ echo -e "\n\n You provided these parameters:
# bases below phred in UMI: $molbcbase
Hamming Distance (UMI): $ham
Hamming Distance (CellBC): $XCbin
Plate Barcode Read: $pbcfastq
Plate Barcode range: $pbcrange
Barcodes: $barcodes
zUMIs directory: $zumisdir
STAR executable $starexc
Expand All @@ -263,7 +273,8 @@ echo -e "\n\n You provided these parameters:
Barcode read2(STRT-seq): $bcread2
Barcode read2 range(STRT-seq): $xcrange2
Bases(G) to trim(STRT-seq): $BaseTrim
Subsampling reads: $subsampling \n\n"
Subsampling reads: $subsampling \n\n
zUMIs version $vers \n\n" | tee "$sname.zUMIs_run.txt"

#create output folders
[ -d $outdir/zUMIs_output/ ] || mkdir $outdir/zUMIs_output/
Expand All @@ -279,18 +290,41 @@ if [[ "$isCustomFASTQ" != "no" ]] ; then
fi
#Submit all the jobs
if [[ "$isslurm" == "yes" ]] ; then
if [[ "$pbcfastq" != "NA" ]] ; then
tmpa=`echo $pbcrange | cut -f1 -d '-'`
tmpb=`echo $pbcrange | cut -f2 -d '-'`
pbcl=`expr $tmpa + $tmpb - 1`
tmpa=`echo $xcrange | cut -f1 -d '-'`
tmpb=`echo $xcrange | cut -f2 -d '-'`
bcl=`expr $tmpa + $tmpb - 1`
l=`expr $bcl + $pbcl`
xc=1-"$l"
xcst=1
xcend=$l
xmst=`expr $l + 1`
tmpa=`echo $xmrange | cut -f1 -d '-'`
tmpb=`echo $xmrange | cut -f2 -d '-'`
ml=`expr $tmpb - $tmpa`
xmend=`expr $xmst + $ml`
xmr="$xmst"-"$xmend"
xcr="$xcst"-"$xcend"
else
xmr=$xmrange
xcr=$xcrange
fi

case "$whichStage" in
"filtering")
if [[ "$isstrt" == "yes" ]] ; then
bash $zumisdir/zUMIs-filtering-strt.sh $cdnaread $bcread $sname $outdir $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $bcread2 $BaseTrim $pigzexc
elif [[ "$isindrops" == "yes" ]] ; then
bash $zumisdir/zUMIs-filtering-inDrops.sh $cdnaread $bcread $libread $bcread2 $sname $outdir $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc
else
bash $zumisdir/zUMIs-filtering.sh $bcread $cdnaread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc
bash $zumisdir/zUMIs-filtering.sh $bcread $cdnaread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $zumisdir $pigzexc $pbcfastq $pbcrange
fi
bash $zumisdir/zUMIs-mapping.sh $sname $outdir $genomedir $gtf $threads $readlen "$starparams" $starexc $samtoolsexc $xmrange $BaseTrim $isstrt
bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
;;
"mapping")
Expand All @@ -309,7 +343,7 @@ if [[ "$isslurm" == "yes" ]] ; then
fi
bash $zumisdir/zUMIs-mapping.sh $sname $outdir $genomedir $gtf $threads $readlen "$starparams" $starexc $samtoolsexc $xmrange $BaseTrim $isstrt
bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
;;
"counting")
Expand All @@ -331,14 +365,14 @@ if [[ "$isslurm" == "yes" ]] ; then
bash $zumisdir/zUMIs-prepCounting.sh $sname $outdir $threads $samtoolsexc
fi

bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
;;
"summarising")
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcrange $xmrange $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-counting.sh $sname $outdir $barcodes $threads $gtf $strandedness $xcr $xmr $subsampling $zumisdir $isStats $whichStage $isstrt $bcread2 $xcrange2 $nreads $Rexc $ham $XCbin
bash $zumisdir/zUMIs-cleaning.sh $sname $outdir
;;
esac
else
bash $zumisdir/zUMIs-noslurm.sh $cdnaread $bcread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $genomedir $gtf $readlen "$starparams" $starexc $barcodes $strandedness $subsampling $zumisdir $samtoolsexc $isStats $whichStage $bcread2 $BaseTrim $isstrt $xcrange2 $CustomMappedBAM $isCustomFASTQ $nreads $isindrops $libread $pigzexc $Rexc $ham $XCbin
bash $zumisdir/zUMIs-noslurm.sh $cdnaread $bcread $sname $outdir $xcrange $xmrange $cbasequal $mbasequal $molbcbase $cellbcbase $threads $genomedir $gtf $readlen "$starparams" $starexc $barcodes $strandedness $subsampling $zumisdir $samtoolsexc $isStats $whichStage $bcread2 $BaseTrim $isstrt $xcrange2 $CustomMappedBAM $isCustomFASTQ $nreads $isindrops $libread $pigzexc $Rexc $ham $XCbin $pbcfastq $pbcrange
fi
26 changes: 25 additions & 1 deletion zUMIs-noslurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ pigzexc="${33}"
Rexc="${34}"
ham="${35}"
XCbin="${36}"
pbcfastq="${37}"
pbcrange="${38}"


if [[ "$isstrt" == "no" ]] ; then
rl=`expr $r - 1`
Expand Down Expand Up @@ -69,7 +72,25 @@ re='^[0-9]+$'
elif [[ "$isindrops" == "yes" ]] ; then
perl $zumisdir/fqfilter-inDrops.pl $f1 $f2 $libread $f3 $cq $cbq $mq $mbq $xm $t $sn $o $pigzexc
else
perl $zumisdir/fqfilter.pl $f2 $f1 $cq $cbq $mq $mbq $xc $xm $t $sn $o $pigzexc
perl $zumisdir/fqfilter.pl $f2 $f1 $pbcfastq $cq $cbq $mq $mbq $xc $pbcrange $xm $t $sn $o $pigzexc
if [[ "$pbcfastq" != "NA" ]] ; then
tmpa=`echo $pbcrange | cut -f1 -d '-'`
tmpb=`echo $pbcrange | cut -f2 -d '-'`
pbcl=`expr $tmpa + $tmpb - 1`
tmpa=`echo $xc | cut -f1 -d '-'`
tmpb=`echo $xc | cut -f2 -d '-'`
bcl=`expr $tmpa + $tmpb - 1`
l=`expr $bcl + $pbcl`
xc=1-"$l"
xcst=1
xcend=$l
xmst=`expr $l + 1`
tmpa=`echo $xm | cut -f1 -d '-'`
tmpb=`echo $xm | cut -f2 -d '-'`
ml=`expr $tmpb - $tmpa`
xmend=`expr $xmst + $ml`
xm="$xmst"-"$xmend"
fi
fi

$starexc --genomeDir $g --runThreadN $t --readFilesCommand zcat --sjdbGTFfile $gtf --outFileNamePrefix $o/$sn. --outSAMtype BAM Unsorted --outSAMmultNmax 1 --outFilterMultimapNmax 50 --outSAMunmapped Within --sjdbOverhang $rl --twopassMode Basic --readFilesIn $o/$sn.cdnaread.filtered.fastq.gz $x
Expand Down Expand Up @@ -169,3 +190,6 @@ re='^[0-9]+$'
rm $o/$sn.Aligned.out.bam $o/$sn.aligned.sorted.bam.in $o/$sn.aligned.sorted.bam.ex $o/$sn.barcodelist.filtered.sam
mv $o/$sn.barcoderead.filtered.fastq.gz $o/zUMIs_output/filtered_fastq/
mv $o/$sn.cdnaread.filtered.fastq.gz $o/zUMIs_output/filtered_fastq/
if [[ "$pbcfastq" != "NA" ]] ; then
mv $o/$sn.platebarcoderead.filtered.fastq.gz $o/zUMIs_output/filtered_fastq/
fi

0 comments on commit 9067aef

Please sign in to comment.