-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathalleledb-newpipeline
executable file
·171 lines (132 loc) · 9.43 KB
/
alleledb-newpipeline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
### allelicbias-check-cmd-loop <indiv_category> <pgenome> <basename/fastq.gz> <path pgenome> <path snp.calls.bed for het snps> <path fastq.gz> <asb/ase> <flag>
### e.g. allelicbias-check kilpinen-pooled-rnaseq-na12878 newSV kilpinen_NA12878_ERR356372_pooled.fastq.gz /gpfs/scratch/fas/gerstein/jc2296/personal_genomes/NA12878_pgenome_hg19/1kgp3-svs-pass_NA12878_hg19_150201_w_transcriptome_newSV /gpfs/scratch/fas/gerstein/jc2296/personal_genomes/NA12878_pgenome_hg19/1kgp3-svs-pass_NA12878_hg19_150201_w_transcriptome_newSV/snp.calls.bed /gpfs/scratch/fas/gerstein/jc2296/alleledb/allelicbias/rnaseq 1
### note for paths no end '/' pls
## requires the following scripts in environment: ThunderByRob.jar, bsub-make-plus.sh, map.back.ref.wrapper.sh.ori, multis-and-unaligneds-wrapper.sh
## option 8 uses 'small' fastqi for realignment, meaning these are only the reads that had aligned previously.
## 1 alignment
echo "######################"
echo "## 1-alignment #######"
echo "######################"
mkdir 1-alignment-$1
cd 1-alignment-$1
mkdir trash
ln -s $6/$3
~/software/AlleleSeq_pipeline_v1.2.rob.new/filter_input.sh ~/software/AlleleSeq_pipeline_v1.2.rob.new $3 | bowtie --best --strata -v 2 -m 1 -f $4/AltRefMother/AltRefMother - > $3.mat.bowtie; echo $3 ; zcat $3 | wc -l ; wc -l $3.mat.bowtie
~/software/AlleleSeq_pipeline_v1.2.rob.new/filter_input.sh ~/software/AlleleSeq_pipeline_v1.2.rob.new $3 | bowtie --best --strata -v 2 -m 1 -f $4/AltRefFather/AltRefFather - > $3.pat.bowtie; wc -l $3.pat.bowtie
cd ..
## 2 map to ref
echo "#######################"
echo "## 2-map to ref #######"
echo "#######################"
mkdir 2-map.back.ref-$1
cd 2-map.back.ref-$1
mkdir trash
ln -s $4/mat2ref.chain
ln -s $4/pat2ref.chain
cp ~/jmtools/map.back.ref.wrapper.sh.ori map.back.ref.wrapper.sh
for i in ../1-alignment-$1/*.bowtie
do
ln -s $i
done
./map.back.ref.wrapper.sh $3.mat.bowtie maternal MAT mat2ref.chain; awk '{OFS=\"\t\"}{FS=\"\t\"}{print \"chr\"\$0}' $3.mat.bowtie.maternal.map2ref.bed > $3.mat.bowtie.maternal.map2ref.bed_ ; mv $3.mat.bowtie.maternal.map2ref.bed trash ; mv $3.mat.bowtie.maternal.map2ref.bed_ $3.mat.bowtie.maternal.map2ref.bed ; wc -l *.maternal.*.bed
./map.back.ref.wrapper.sh $3.pat.bowtie paternal PAT pat2ref.chain; awk '{OFS=\"\t\"}{FS=\"\t\"}{print \"chr\"\$0}' $3.pat.bowtie.paternal.map2ref.bed > $3.pat.bowtie.paternal.map2ref.bed_ ; mv $3.pat.bowtie.paternal.map2ref.bed trash ; mv $3.pat.bowtie.paternal.map2ref.bed_ $3.pat.bowtie.paternal.map2ref.bed ; wc -l *.paternal.*.bed
cd ..
## 3 intersectBed
echo "##########################"
echo "## 3-intersectBed #######"
echo "##########################"
mkdir 3-intersectBed-$1
cd 3-intersectBed-$1
mkdir trash
ln -s $5
for i in ../2-map.back.ref-$1/*.map2ref.bed
do
ln -s $i
done
intersectBed -a $3.mat.bowtie.maternal.map2ref.bed -b snp.calls.bed -wa -wb > intersect.$3.mat.snp.calls.txt ; wc -l intersect.$3.mat.snp.calls.txt
intersectBed -a $3.pat.bowtie.paternal.map2ref.bed -b snp.calls.bed -wa -wb > intersect.$3.pat.snp.calls.txt ; wc -l intersect.$3.pat.snp.calls.txt
cd ..
## 4 flip the reads
echo "############################"
echo "## 4-flip the reads #######"
echo "############################"
mkdir 4-flip-$1
cd 4-flip-$1
mkdir trash
for i in ../3-intersectBed-$1/intersect.*.snp.calls.txt
do
ln -s $i
done
flipread2fastq -s 1 intersect.$3.mat.snp.calls.txt > intersect.$3.mat.flipread.fastq; wc -l intersect.*mat.*
flipread2fastq -s 1 intersect.$3.pat.snp.calls.txt > intersect.$3.pat.flipread.fastq; wc -l intersect.*pat.*
cd ..
## 5 alignment2
echo "#####################################"
echo "## 5-alignment2 flipped reads #######"
echo "#####################################"
mkdir 5-alignment2-$1
cd 5-alignment2-$1
mkdir trash
for i in ../4-flip-$1/*.fastq
do
ln -s $i
done
bowtie --un intersect.$3.matflip2mat.flipread.unaligned --max intersect.$3.matflip2mat.flipread.multi --best --strata -v 2 -m 1 -q $4/AltRefMother/AltRefMother intersect.$3.mat.flipread.fastq > intersect.$3.matflip2mat.flipread.bowtie; wc -l intersect.$3.matflip2mat.flipread.*
bowtie --un intersect.$3.matflip2pat.flipread.unaligned --max intersect.$3.matflip2pat.flipread.multi --best --strata -v 2 -m 1 -q $4/AltRefFather/AltRefFather intersect.$3.mat.flipread.fastq > intersect.$3.matflip2pat.flipread.bowtie; wc -l intersect.$3.matflip2pat.flipread.*
bowtie --un intersect.$3.patflip2mat.flipread.unaligned --max intersect.$3.patflip2mat.flipread.multi --best --strata -v 2 -m 1 -q $4/AltRefMother/AltRefMother intersect.$3.pat.flipread.fastq > intersect.$3.patflip2mat.flipread.bowtie; wc -l intersect.$3.patflip2mat.flipread.*
bowtie --un intersect.$3.patflip2pat.flipread.unaligned --max intersect.$3.patflip2pat.flipread.multi --best --strata -v 2 -m 1 -q $4/AltRefFather/AltRefFather intersect.$3.pat.flipread.fastq > intersect.$3.patflip2pat.flipread.bowtie; wc -l intersect.$3.patflip2pat.flipread.*
bowtie --un intersect.$3.matflip2ref.flipread.unaligned --max intersect.$3.matflip2ref.flipread.multi --best --strata -v 2 -m 1 -q /gpfs/scratch/fas/gerstein/jc2296/reference_genomes/fasta/b37_g1k_phase2/Refhs37d5ss intersect.$3.mat.flipread.fastq > intersect.$3.matflip2ref.flipread.bowtie; wc -l intersect.$3.matflip2ref.flipread.*
bowtie --un intersect.$3.patflip2ref.flipread.unaligned --max intersect.$3.patflip2ref.flipread.multi --best --strata -v 2 -m 1 -q /gpfs/scratch/fas/gerstein/jc2296/reference_genomes/fasta/b37_g1k_phase2/Refhs37d5ss intersect.$3.pat.flipread.fastq > intersect.$3.patflip2ref.flipread.bowtie; wc -l intersect.$3.patflip2ref.flipread.*
cd ..
## 6 unaligned
echo "###########################################"
echo "## 6-check unaligned flipped reads #######"
echo "###########################################"
mkdir 6-unaligned-$1
cd 6-unaligned-$1
mkdir trash
ln -s ../5-alignment2-$1/intersect.$3.matflip2pat.flipread.unaligned
ln -s ../5-alignment2-$1/intersect.$3.patflip2mat.flipread.unaligned
### original mat and pat
ln -s ../3-intersectBed-$1/intersect.$3.mat.snp.calls.txt
ln -s ../3-intersectBed-$1/intersect.$3.pat.snp.calls.txt
multis-and-unaligneds-wrapper.sh intersect.$3.matflip2pat.flipread.unaligned intersect.$3.mat.snp.calls.txt mat
multis-and-unaligneds-wrapper.sh intersect.$3.patflip2mat.flipread.unaligned intersect.$3.pat.snp.calls.txt pat
cd ..
## 7 multi
echo "#######################################"
echo "## 7-check multi flipped reads #######"
echo "#######################################"
mkdir 7-multi-$1
cd 7-multi-$1
mkdir trash
ln -s ../5-alignment2-$1/intersect.$3.matflip2pat.flipread.multi
ln -s ../5-alignment2-$1/intersect.$3.patflip2mat.flipread.multi
### original mat and pat
ln -s ../3-intersectBed-$1/intersect.$3.mat.snp.calls.txt
ln -s ../3-intersectBed-$1/intersect.$3.pat.snp.calls.txt
multis-and-unaligneds-wrapper.sh intersect.$3.matflip2pat.flipread.multi intersect.$3.mat.snp.calls.txt mat
multis-and-unaligneds-wrapper.sh intersect.$3.patflip2mat.flipread.multi intersect.$3.pat.snp.calls.txt pat
cd ..
## 8 fsieve original multi reads from original fastq
## then run alleleseq again on this filtered fastqs
echo "##############################################"
echo " 8 rerun alleleseq on bias filtered reads ####"
echo "##############################################"
mkdir 8-rerun-alleleseq-$1
cd 8-rerun-alleleseq-$1
mkdir trash
ln -s ../7-multi-$1/originalmatreads.intersect.$3.matflip2pat.flipread.multi.bed
ln -s ../7-multi-$1/originalpatreads.intersect.$3.patflip2mat.flipread.multi.bed
ln -s $6/$3
cat originalmatreads.intersect.$3.matflip2pat.flipread.multi.bed originalpatreads.intersect.$3.patflip2mat.flipread.multi.bed | sed 's/\#\*o\*\#/\t/g' | cut -f4 | sort | uniq > originalmatpatreads.multi.ids
## Thunder can only do this on cluster since it is Java
## 1) bed2fastq for mapped reads from folder 2 (smallfastq) for this we use all the original reads; concantenating mat and pat contains redundant entries - bedbowtie2fastq gives unique read info
## 2) cut for BED and then sort and uniq to obtain ids; Thunder doesnt require them to be in order
## 3) Thunder to obtain biasfiltered fastq
## largefastq
zcat $3 | java -Xmx2G -jar ~/jmtools/ThunderByRob.jar FilterFastxByIDList -b -IDs ./originalmatpatreads.multi.ids - | gzip -c > biasfiltered.$3 ; echo $3 ; zcat $3 | wc -l ; echo biasfiltered.$3 ; zcat biasfiltered.$3 | wc -l ; mkdir src ; mv $3 originalmatreads.intersect.$3.matflip2pat.flipread.multi.bed originalpatreads.intersect.$3.patflip2mat.flipread.multi.bed originalmatpatreads.multi.ids src ; make -f $4/PIPELINE.mk ; echo \"folder=\\\"$(pwd)/\\\"; setwd(folder)\" | cat - ~/jmtools/allele_readdepth_table_beta\&binomial_distribution_gradient.R > allele_readdepth_table_beta\&binomial_distribution_gradient.R ; alleleseqOutput2betabinomFormat.sh $1 $7 counts ; R CMD BATCH allele_readdepth_table_beta\&binomial_distribution_gradient.R ; echo \"folder=\\\"$(pwd)/\\\"; setwd(folder)\" | cat - ~/jmtools/alleleseq-betabinomial.R > alleleseq-betabinomial.R ; R CMD BATCH alleleseq-betabinomial.R ; alleleseqOutput2betabinomFormat.sh $1 $7 interestingHets ; alleleseqOutput2betabinomFormat.sh $1 $7 interestingHets.betabinom
sed 's/ ; /\n\n/g' bsub-script-rdy-$2-$1-fastqfilter-thunder.sh > bsub-script-rdy-$2-$1-fastqfilter-thunder.sh_
mv bsub-script-rdy-$2-$1-fastqfilter-thunder.sh_ bsub-script-rdy-$2-$1-fastqfilter-thunder.sh
cd ..
fi