-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmake_clusters.sh
executable file
·84 lines (69 loc) · 2.57 KB
/
make_clusters.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
g=/filer/misko/mini_chr/git/minichr/
p=/filer/misko/picard/picard-tools-1.56/
s=/home/misko/apps/bin/samtools
j=/usr/bin/java
b=/filer/misko/bedtools-2.17.0/
c=/data/misko/2013.04.12/cs2-4.3/cs2.exe
#using hg19
#ref=/filer/hg19/hg19.fa
#or hg18
#ref=/filer/hg18/hg18.fa
if [ $# -ne 2 ]; then
echo $0 bam clustermapq
exit
fi
cluster_mapq=$6
bam=$1
if [ ! -f $bam ]; then
echo Cannot find bam file!
#exit
fi
if [ ! -f $bam.bai ]; then
echo Bam file not indexed!
#exit
fi
clustermapq=$2
cluster=$g/clustering/cluster
echo `date` $0 $@ " making clusters clustermapq:" ${clustermapq} >> ${bam}_command_line
function mcluster {
msfile=`echo $bam | sed 's/.bam/_mean_and_std.txt/g'`
if [ ! -f $msfile -o `cat $msfile | awk 'END {print NR}'` -lt 1 ]; then
echo failed to find mean and std file!
$j -jar $p/CollectInsertSizeMetrics.jar VALIDATION_STRINGENCY=SILENT H=${bam}_histo I=${bam} O=${bam}_stats AS=true
cat ${bam}_stats | awk '{print $5,$6}' | grep -A 1 DEVI | tail -n 1 > ${msfile}
fi
if [ ! -f $msfile ] ; then
echo failed to make insert size metrics
exit
fi
local mean=`cat $msfile | awk '{print int($1)}'`
local stddev=`cat $msfile | awk '{print int($2)}'`
rfn=`echo $bam | sed 's/.bam//g'`
outd=${rfn}_clusters
mkdir -p $outd
outa=${rfn}_clusters/q${clustermapq}.txt.gz
if [ -e $outa ]; then
sz=`/usr/bin/du -s $outa | awk '{print $1}'`
else
sz=0
fi
if [ ! -e $outa -o $sz -lt 30 ]; then
rm $outa
echo Generating base clusters
$s view -q ${clustermapq} ${bam} | python $g/filter_bwa_mq.py ${clustermapq} | $cluster $mean $stddev | gzip > ${outa}_wchrM
zgrep -v chrM ${outa}_wchrM | gzip > $outa
else
echo Skipping gen
fi
covs="10 5 3 2 0"
for cov in $covs; do
#move over the clusters, filter for same strand (type 0 or type 1) with size less then 2000
out=${rfn}_clusters/q${clustermapq}_cov${cov}.txt.gz
zcat $outa | awk -v c=$cov '{if ($4>c) {print $0}}' | sed 's/\([0-9]\)\t\([0-9]*\)[:]\([0-9]*\)\t\([0-9]*\)[:]\([0-9]*\)\t\([0-9]*\)\t\(.*\)\t\([0-9][0-9]*[.]*[0-9]*\)\t\([0-9][0-9]*[.]*[0-9]*\)/\2\t\3\t\4\t\5\t\1\t\6\t\8\t\9/g' | awk '{if ($1==23) {$1="X"}; if ($1==24) {$1="Y"}; if ($3==23) {$3="X"}; if ($3==24) {$3="Y"}; a="+"; b="+"; if ($5==1) {a="-"; b="-"}; if ($5==2) {a="+"; b="-"}; if ($5==3) {a="-"; b="+"}; print "chr"$1"\t"$2"\t"a"\tchr"$3"\t"$4"\t"b"\t"$6"\t"$7"\t"$8}' | awk '{d=$2-$5; if (d<0) {d=-d}; if ($3==$6 && $1==$4 && d<2000) { } else {print $0}}' | gzip > $out
zcat $out | awk '{if ($8<200 && $9<200) {print $0}}' | gzip > ${out}_200.gz
done
}
##generate the rough clusters
echo current dir is `pwd`
mcluster