-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_polca.sh
executable file
·162 lines (148 loc) · 4.64 KB
/
run_polca.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/bin/bash
# Copyright (C) 2022-2023 Yu Wan <[email protected]>
# Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
# First version: 1 May 2022; latest update: 17 November 2024
# Set default values ###############
SCRIPT_VERSION=2.0
i='isolate'
n="$i"
outdir='polca_output'
t=1
# Function definitions ###############
# Run './run_polca.sh' to display the information below.
display_parameters() {
echo "
run_polca.sh v$SCRIPT_VERSION
This script polishes an input genome assembly with POLCA and paired-end short reads.
Prerequisites: please ensure bwa aligner is accessible in PATH.
Parameters (seven in total):
-a=*: absolute path and filename of the input assembly in FASTA format (mandatory)
-r=*: path to the directory of paired-end short reads (without the end forward
slash), where read files' names follow format [sample name]_[1,2].fastq.gz (mandatory)
-i=*: isolate name (default: ${i})
-s=*: suffix of input FASTQ filename: [isolate name]_[suffix]_[1,2].fastq.gz (default: none)
-n=*: prefix of output filenames (default: ${n})
-o=*: path to output directory (default: ${outdir})
-p=*: MaSuRCA/bin directory (without the end forward slash), where polca.sh is
stored (mandatory)
-t=*: number of threads (default: ${t})
Example command:
~/bin/Assembly_toolkit/run_polca.sh -a=\$PWD/1_isolate1_polypolish.fasta -r=reads/illumina \\
-i=isolate1 -n=3_isolate1 -o="\$PWD" -p=\$HOME/bin/MaSuRCA-4.0.5/bin -t=8 > 3_isolate1_polca.log
Output: a polished assembly [o]/[n]_polca.fna in FASTA format
"
}
MYDIR="$(dirname "$(readlink -f "$0")")" # Directory of this script
source $MYDIR/modules.sh # Function print_failure_message
# Print parameter information ###############
if [ -z "$1" ]
then
display_parameters
exit
fi
# Read parameters ###############
for arg in "$@"
do
case $arg in
-a=*)
fasta_in="${arg#*=}"
;;
-r=*)
read_dir="${arg#*=}"
;;
-i=*)
i="${arg#*=}"
;;
-s=*)
s="${arg#*=}"
;;
-n=*)
n="${arg#*=}"
;;
-o=*)
outdir="${arg#*=}"
;;
-p=*)
polca_dir="${arg#*=}"
;;
-t=*)
t="${arg#*=}"
;;
*)
;;
esac
done
# Run POLCA for the input genome assembly ###############
if [ -z "$(which bwa)" ]
then
echo "Error: bwa was not accessible"
print_failure_message "$i"
exit
fi
if [ -e "$fasta_in" ]
then
fasta_name=`basename $fasta_in`
echo "Base name of the input FASTA file: $fasta_name"
else
echo "Error: assembly $fasta_in was not found."
exit
fi
if [ -d "$read_dir" ]
then
if [ -z "$s" ]
then
r1=${read_dir}/${i}_1.fastq.gz
r2=${read_dir}/${i}_2.fastq.gz
else
r1="${read_dir}/${i}_${s}_1.fastq.gz"
r2="${read_dir}/${i}_${s}_2.fastq.gz"
fi
if [ ! -f "$r1" ] || [ ! -f "$r2" ] # '-f' works for symbolic links
then
echo "Error: read file $r1 and/or $r2 were not found."
print_failure_message "$i"
exit
fi
else
echo "Error: read directory $read_dir was not found."
print_failure_message "$i"
exit
fi
if [ -d "$polca_dir" ]
then
polca="$polca_dir/polca.sh"
if [ -f "$polca" ]
then
echo "run_polca.sh v$SCRIPT_VERSION"
if [ ! -d "$outdir" ]
then
echo "Create output directory $outdir"
mkdir $outdir
fi
tm="polca_tmp_$i"
mkdir $tm # Create a temporary directory in the current working directory
cd $tm
echo "[$(date)] Start to polish $fasta_in of isolates $i with reads $r1 and $r2 and with $t threads"
$polca -a $fasta_in -r "$r1 $r2" -t "$t" -m 8G
polca_out="${fasta_name}.PolcaCorrected.fa"
fasta_out="${outdir}/${n}_polca.fna"
if [ -f "$polca_out" ]
then
cd ..
echo "Saving $tm/$polca_out to $fasta_out"
mv "$tm/$polca_out" "$fasta_out" # Output FASTA file and its name
rm -r $tm # Delete the temporary directory and its content
rm "$fasta_in".fai
echo "[$(date)] Success: polished assembly of isolate $i was saved as ${fasta_out}."
else
echo "Error: output file $polca_out was not found. Please check files in $tm for details."
print_failure_message "$i"
fi
else
echo "Error: script $polca was not found."
print_failure_message "$i"
fi
else
echo "Error: POLCA directory $polca_dir was not found."
print_failure_message "$i"
fi