-
Notifications
You must be signed in to change notification settings - Fork 1
/
script_bam.sh
executable file
·221 lines (178 loc) · 7.38 KB
/
script_bam.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#! /bin/bash
workdir=/home/output/workdir
script20thPerc=/home/src/script_20th_perc.R
finalpredictdir=/home/output
script_equaR=/home/src/script_equa.R
script_calcR=/home/src/script_predict_bam.R
figuresdir=${workdir}/figures
tmp=/home/output/tmp
# ./script_bam.sh $species $processors $bam $chromsize
processors=$1
bam=$2
chromsizes=$3
if [ ! -d $workdir ]; then
mkdir $workdir
fi
cd $workdir
for d in $finalpredictdir $figuresdir $tmp; do
if [ ! -d $d ]; then
mkdir $d
fi
done
#################################################################################
## Calculating the coefficiants of the equation to calculate the resolution
>&2 echo "Calculating the resolution equation"
# Starting from the unique filtered mapped read pairs
>&2 echo -e "\tExtracting positions from bam file"
samtools view $bam | awk '//{if($7=="="){if($4<$8){print $3,$4,$3,$8} else {print $3,$8,$3,$4}} else {if($3<$7){print $3,$4,$7,$8} else print $7,$8,$3,$4}}' OFS='+' >> tmp.txt
if [ ! -f tmp.txt ]; then
>&2 echo "Error: samtools failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
>&2 echo -e "\tSorting read pairs"
sort -k1,1 -k2,2n -k3,3 -k4,4n --parallel=$processors tmp.txt > tmp_srt.txt
if [ ! -f tmp_srt.txt ]; then
>&2 echo "Error: sorting failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
rm tmp.txt
cat tmp_srt.txt | awk 'BEGIN{line="";m=0} //{if(line!=$0){if(NR>1){print line > "tmp_pair1.txt"; m=m+1}; line=$0}} END{print m > "all_count.txt"}'
rm tmp_srt.txt
# Checking if there are enough valid read pairs
n=`cat all_count.txt`
if (( $n<30000000 )); then
>&2 echo "Error, too low number of mapped reads in the library."
rm -r $workdir
rm -r $tmp
exit 1
fi
# Splitting the read file to maximise cpus use
>&2 echo -e "\tSubsampling read pairs"
split -a 5 -l 10000000 tmp_pair1.txt tmp_pair1_split_
# Generating subsamples tag files for 20M (low) valid read pairs, intermediate (mid) (mean between 20M and the total) and total mapped pairs (all)
mid=`echo "(${n}+20000000)/2" | bc`
s1=`echo "scale=2; 20000000/${n}" | bc`
s2=`echo "scale= 2; ${mid}/${n}" | bc`
for file in tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]; do
echo "cat $file | awk -v name=${file} -v s1=$s1 -v s2=$s2 'BEGIN{FS=\"+\"; l=0; m=0; srand()} //{print \$1\"\t\"\$2\"\t\"\$2+100 >> (name \"_all.bed\"); print \$3\"\t\"\$4\"\t\"\$4+100 >> (name \"_all.bed\"); if(rand()<s1){print \$1\"\t\"\$2\"\t\"\$2+100 >> (name \"_low.bed\"); print \$3\"\t\"\$4\"\t\"\$4+100 >> (name \"_low.bed\"); l=l+1}; if(rand()<s2){print \$1\"\t\"\$2\"\t\"\$2+100 > (name \"_mid.bed\"); print \$3\"\t\"\$4\"\t\"\$4+100 >> (name \"_mid.bed\"); m=m+1}} END{print l > (name \"_low_count.txt\"); print m > (name \"_mid_count.txt\")}'" >> parallel.sh
done
cat parallel.sh | xargs -P $processors -L 1 -0 bash -c
rm parallel.sh
# Merging the sub-sampled splitted files
for file in tmp_all.bed tmp_low.bed tmp_mid.bed; do
if [ -f $file ]; then
rm $file
fi
done
cat tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_all.bed >> tmp_all.bed
cat tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_low.bed >> tmp_low.bed
cat tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_mid.bed >> tmp_mid.bed
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_all.bed
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_low.bed
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_mid.bed
if [ ! -f tmp_all.bed ] || [ ! -f tmp_low.bed ] || [ ! -f tmp_mid.bed ]; then
>&2 echo "Error while generating tags files"
rm -r $workdir
rm -r $tmp
exit 1
fi
# Sorting the tag files to use them with bedtools
>&2 echo -e "\tSorting reads"
sort -T $tmp --parallel=$processors -k1,1 -k2,2n --output=tmp_all_srt.bed tmp_all.bed
sort -T $tmp --parallel=$processors -k1,1 -k2,2n --output=tmp_low_srt.bed tmp_low.bed
sort -T $tmp --parallel=$processors -k1,1 -k2,2n --output=tmp_mid_srt.bed tmp_mid.bed
if [ ! -f tmp_all_srt.bed ] || [ ! -f tmp_low_srt.bed ] || [ ! -f tmp_mid_srt.bed ]; then
>&2 echo "Error: sorting failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
# Extracting the real read count for each subsample
## NB: the subsampling is based on a random function and output an approximate number of reads.
cat tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_low_count.txt | awk 'BEGIN{n=0} //{n=n+$1} END{print n > "low_count.txt"}'
cat tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_mid_count.txt | awk 'BEGIN{n=0} //{n=n+$1} END{print n > "mid_count.txt"}'
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_low_count.txt
rm tmp_pair1_split_[a-z][a-z][a-z][a-z][a-z]_mid_count.txt
# Using bedtools to assess the coverage of the subsamples on 20kb, 50kb and 100kb windows
>&2 echo -e "\tCalculating coverage for 20kb, 50kb and 100kb windows"
sort --parallel=$processors -T $tmp -k1,1 -k2,2n $chromsizes > genome_sorted.chrom.sizes
if [ ! -f genome_sorted.chrom.sizes ]; then
>&2 echo "Error: sorting failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
echo -e "SizeRaw\tSizeU\t\tWindow\tPerc_20th" > ${figuresdir}/res_data.txt
for i in 20 50 100; do
j=`echo ${i}*1000 | bc`
bedtools makewindows -w $j -g genome_sorted.chrom.sizes > genome_windows_${i}kb.bed
if [ ! -f genome_windows_${i}kb.bed ]; then
>&2 echo "Error: bedtools failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
done
for i in 20 50 100; do
for name in low mid all; do
file=tmp_${name}_srt.bed
echo "bedtools intersect -sorted -c -b $file -a genome_windows_${i}kb.bed | awk '{print \$1,\$2,\$3,\$4}' OFS='\t' > ${file%_srt.bed}_${i}kb.bedGraph" >> parallel.sh
done
done
cat parallel.sh | xargs -P $processors -L 1 -0 bash -c
rm parallel.sh
# Calulating the 20th percentile for each subsample - window size pair
>&2 echo -e "\tCalculating the 20th percentile of coverage for each subsample / window size"
for i in 20 50 100; do
for name in low mid all; do
file=tmp_${name}_srt.bed
if [ ! -f ${file%_srt.bed}_${i}kb.bedGraph ]; then
>&2 echo "Error: bedtools failed"
rm -r $workdir
rm -r $tmp
exit 1
fi
sizeU=`cat ${name}_count.txt`
perc=`Rscript $script20thPerc ${file%_srt.bed}_${i}kb.bedGraph`
echo -e "NA\t"$sizeU"\t"$((i*1000))"\t"$perc >> ${figuresdir}/res_data.txt
done
done
# Using the 6 20th percentiles calculated to check the linearity and solve the resolution equation
>&2 echo -e "\tExtracting the coefficiants of the resolution equation"
Rscript $script_equaR ${figuresdir}/res_data.txt ${finalpredictdir}/equation.txt
#################################################################################
## Prediction table generation
cd $finalpredictdir
>&2 echo -e "Prediction table generation"
touch ${finalpredictdir}/equation.txt
n=`wc -l equation.txt | cut -d" " -f 1`
if (( $n==5 )); then
i=0
while read line; do
if [[ $i > 0 ]]; then
declare "coef_$i"=`echo $line | cut -d" " -f 3`
fi
i=$((i+1))
done < equation.txt
s=`cat ${workdir}/all_count.txt`
res=`echo "( 1000 - $coef_4 - ( $coef_3 * $s ) ) / ( ( $coef_1 * $s ) - $coef_2 )" | bc`
echo "************************************************************"
echo ""
echo "The resolution of your libary is "$res" bp (using all interactions)."
echo ""
Rscript $script_calcR $coef_1 $coef_2 $coef_3 $coef_4 ${finalpredictdir}/prediction.txt
else
>&2 echo "Error, no prediction computed."
rm -r $workdir
rm -r $tmp
exit 1
fi
#################################################################################
## Removing temporary files
rm -r $workdir
rm -r $tmp