-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathannotate
executable file
·178 lines (159 loc) · 5.23 KB
/
annotate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/bin/bash
SOURCEDIR=$(dirname $(readlink -f $0));
bedTools="bedtools"
annDir=$SOURCEDIR"/annotations";
inputFlag=0;
SHORTOPTS="i:o:"
LONGOPTS="hg18,hg19,genome:,annDir:,help"
PROGNAME="annotate"
ARGS=$(getopt -s bash --options $SHORTOPTS --longoptions $LONGOPTS --name $PROGNAME -- "$@" )
eval set -- "$ARGS"
# argument 1 - annotation file
# argument 2 - delta
# argument 3 - column number
function annotate {
additional_parameters="$4"
echo "Additional Parameters:" $additional_parameters
awk -v DELTA=$2 -v OFS='\t' '{print $1,$2-DELTA,$3+DELTA}' $annFile > $annFile"_temp.bed";
numLines=$(wc -l $annFile"_temp.bed"| awk '{print $1}');
for i in `seq 1 $numLines`
do
sed -n $i","$i"p" $annFile"_temp.bed" > $oneLineFile;
$bedTools intersect -a $oneLineFile -b $1 -loj -sorted -g $chrInfoFile $additional_parameters >$tempFile
$bedTools merge -i $tempFile -c $3 -o distinct >> $newFile
done
paste $newFile $annFile | awk -F'\t' -v OFS='\t' -v ORS='\t' '{for(i=5;i<=NF;i++)print $i;print $4;printf "\n";}' > $annFile"_temp.bed";
mv $annFile"_temp.bed" $annFile
tr -d '\b\r' < $annFile > ${annFile}_temp
mv ${annFile}_temp $annFile
rm -rf $oneLineFile $newFile $tempFile
}
while true; do
case "$1" in
-i) inputFile="$2"; inputFlag=1; shift 2;;
-o) outputPrefix="$2"; outputFlag=1; shift 2;;
--annDir) geneFile="$2"; shift 2;;
--hg18) genome="hg18"; genomeFlag=1; shift ;;
--hg19) genome="hg19"; genomeFlag=1; shift ;;
--genome) genome="$2"; genomeFlag=1; shift 2 ;;
--help) echo "Usage: annotate -i <input bed> -o <output prefix> <genome flag>"
echo "Please go through ReadMe for futher help"; exit 1 ;;
--) shift; break;;
*) echo "Internal Error!"; exit 1;;
esac
done
if [[ $inputFlag -eq 0 || $genomeFlag -eq 0 || $outputFlag -eq 0 ]]; then
echo "One or more required paramters is/are missing">&2;
exit 1;
elif [[ ! -f $inputFile ]]; then
echo "Error: $inputFile not found!" >&2;
exit 1;
fi
COMMENT_LINE="1i#"
NumColumns=$(head -1 $inputFile | awk '{printf("%d",NF)}')
echo "NUMCOLUMNS: $NumColumns"
for i in `seq 1 $NumColumns`
do
COMMENT_LINE=$COMMENT_LINE"c${i} "
done
# ANNOTATION FILE LIST
chrInfoFile=${annDir}/${genome}ChrInfo.txt
geneList=${annDir}/${genome}.bed
locFile=${annDir}/${genome}LocInfo.bed
dgvFile=${annDir}/${genome}DGV.bed
enhancerFile=${annDir}/${genome}_enhancer
lncRNAFile=${annDir}/${genome}_lincRNA
miRNAFile=${annDir}/${genome}_miRNA_sites.bed
sdFile=${annDir}/${genome}_SD
clinVarFile=${annDir}/${genome}_clinvar
decipherFile=${annDir}/${genome}_DECIPHER
exacFile=${annDir}/${genome}_ExAC
OMIM=${annDir}/${genome}_OMIM
gene_elements=${annDir}/${genome}_ensembl_3
IR=${annDir}/${genome}_IR
TR=${annDir}/${genome}_TR
tempFile=$inputFile"_temp"
oneLineFile=$inputFile"_1.bed"
newFile=$inputFile"_new"
annFile=$outputPrefix"_annotated.bed"
rm -rf $newFile $tempFile $annFile
cp $inputFile $annFile
# check if files are avialable and run annotation function
COMMENT_LINE=$COMMENT_LINE" GENE"
echo "Annotation with genes"
annotate $geneList 0 8
if [[ -f $locFile ]]; then
echo "Annotating with location"
annotate $locFile 0 7
COMMENT_LINE=$COMMENT_LINE" LOCATION"
fi
if [[ -f $gene_elements ]]; then
echo "Annotating with gene structural elements"
annotate $gene_elements 0 7
COMMENT_LINE=$COMMENT_LINE" Gene_Structural_Elements"
fi
if [[ -f $dgvFile ]]; then
echo "Annotating with dgv annotations"
annotate $dgvFile 0 7 "-f 0.5 -F 0.5 -e"
COMMENT_LINE=$COMMENT_LINE" DGV_accessions"
fi
if [[ -f $enhancerFile ]]; then
echo "Annotating with enhancers"
annotate $enhancerFile 0 7
COMMENT_LINE=$COMMENT_LINE" Enhancers"
fi
if [[ -f $lncRNAFile ]]; then
echo "Annotating with lncRNAs"
annotate $lncRNAFile 0 7
COMMENT_LINE=$COMMENT_LINE" LNCRNA"
fi
if [[ -f $miRNAFile ]]; then
echo "Annotating with miRNAs target sites"
annotate $miRNAFile 0 7
COMMENT_LINE=$COMMENT_LINE" miRNA_target_sites"
fi
if [[ -f $sdFile ]]; then
echo "Annotating with segmental duplications"
annotate $sdFile 0 7 "-f 0.9"
COMMENT_LINE=$COMMENT_LINE" Segmental_Duplications"
fi
if [[ -f $IR ]]; then
echo "Annotating with Interspersed repeats"
annotate $IR 0 7 "-f 0.9"
COMMENT_LINE=$COMMENT_LINE" Interspersed_repeats"
fi
if [[ -f $TR ]]; then
echo "Annotating with Tandem repeats"
annotate $TR 0 7 "-f 0.9"
COMMENT_LINE=$COMMENT_LINE" Tandem_repeats"
fi
if [[ -f $clinVarFile ]]; then
echo "Annotating with clinical var pathogenicity"
annotate $clinVarFile 0 7 "-f 0.5"
COMMENT_LINE=$COMMENT_LINE" ClinVar_pathogenicity"
fi
if [[ -f $clinVarFile ]]; then
echo "Annotating with clinical var"
annotate $clinVarFile 0 9 "-f 0.5"
COMMENT_LINE=$COMMENT_LINE" ClinVar_Phenotype"
fi
if [[ -f $OMIM ]]; then
echo "Annotating with OMIM phenotypes"
annotate $OMIM 0 7 "-F 0.5"
COMMENT_LINE=$COMMENT_LINE" OMIM_PHENOTYPE"
fi
if [[ -f $decipherFile ]]; then
echo "Annotating with decipher"
annotate $decipherFile 0 7
COMMENT_LINE=$COMMENT_LINE" DECIPHER"
fi
if [[ -f $exacFile ]]; then
echo "Annotating with exAC"
annotate $exacFile 0 7
COMMENT_LINE=$COMMENT_LINE" EXAC"
fi
sed -i "$COMMENT_LINE" $annFile
sed -i -e 's/\t\t/\t/g' $annFile
tr -d '\b\r' < $annFile > ${annFile}_temp
mv ${annFile}_temp $annFile
Rscript ${SOURCEDIR}/source/R/calculatePriority.R ${annFile} "${SOURCEDIR}/source/R/"