-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBEAF.sh
3509 lines (3391 loc) · 188 KB
/
BEAF.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
address=$(cd "$(dirname "")" && pwd)/$(basename "")
ConfigFile=$(cd "$(dirname "config.file")" && pwd)/$(basename "config.file")
spades="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/spades/SPAdes-3.11.0/bin/spades.py"
# metaspades="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/spades/SPAdes-3.11.0/bin/metaspades.py"
# rnaspades="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/spades/SPAdes-3.11.0/bin/rnaspades.py"
quast="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/quast/metaquast.py"
# bbtools="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/bbmap"
usearch="/bin/usearch"
# usearch64="/media/coppini/SDATA2/Bioinfo/TCC/miTAGs/miTAGs_SILVA/usearch9.2.64_i86linux64"
magicblast="/bin/magicblast"
makemagicblastdb="/bin/makemagicblastdb"
diamond="/bin/diamond"
fastqc="/usr/local/bin/fastqc"
cdhit="/usr/lib/cd-hit/cd-hit"
cutadapt="/usr/local/bin/cutadapt"
pigz="/usr/local/bin/pigz"
LIB="/media/coppini/SDATA2/Bioinfo/MAGs/Lib/"
orffinder="$LIB/bb.orffinder.pl"
extpy="$LIB/ext.py"
SeqLength="$LIB/SeqLength.py"
Splitter="$LIB/Splitter.py"
PCAmaker="$LIB/PCA_maker.py"
BucketsFolder="Buckets"
ReferencesFolder="$address/Reference_seqs"
SearchMode="Full"
TrimFiles="True"
adapter="AGATCGGAAGAGC"
GenID=0.95
GenID1=0.85
GenID2=0.95
ProtID1=25
ProtID2=90
TaxonID1=0.90
TaxonID2=0.97
GENevalue="1e-20"
GENevalue1="1e-10"
GENevalue2="1e-20"
PROTevalue1="1e-5"
PROTevalue2="1e-5"
Taxonevalue1="1e-20"
Taxonevalue2="1e-20"
PROTqcov1=0.5
PROTqcovHits=50
PROTqcovORFs=50
Taxonqcov1=0.1
Taxonqcov2=90
maxaccepts1=1
maxrejects1=32
maxaccepts2=5000
maxrejects2=5000
genmaxrejects2=64
rDNA16Scutoff=50
rDNAminCntgLength=500
strand="both"
GenomeCoverage=5
GenomeCoverage1=0
GenomeCoverage2=5
GenomeFragLength=100
ProtFragLength=100
ProtOverlap=0
ProtCoverage=0
threads=4
if [[ $(nproc) -gt 2 ]]
then
threads=`echo "$(nproc) * 9 / 10" | bc` # If more than 2 threads in the computer, use 90% of threads
elif [[ $(nproc) -eq 2 ]] || [[ $(nproc) -eq 1 ]]
then
threads=1 # If only 1 or 2 threads in the computer, use only 1 thread
else
echo "Couldn't determine the number of threads in the computer. Please, specify threads manually." # If the system can't determine the number of threads as higher than 2, 2 nor 1, then ask for the threads to be determined manually, and, until it is, use the default of 4.
fi
CFLR="U"
ver="BEAF (full)"
KeepBlastDBs="no" #If you're sure you want to keep databases files from one loop to the next, change this to 'Y'. If you're sure you're not reusing blast databases, change to 'N'
KeepGenomeUDBs="no"
AlwaysKeepBuckets="no"
StopAfterMakingBuckets="no"
show_help ()
{
echo "
#### BEAF - version 1.0.0 (Built on 22/05/2018) ####
#### Avaiable at https://github.com/celiosantosjr/BEAF ####
Usage: BEAF.sh [options]
Here's a guide for avaiable options. Defaults for each option are showed between brackets: [default].
N=Integer (eg. 100)
n=Integer or float (eg. 90 or 0.90)
e=float or exponential (eg. 0.001 or 1e-3)
Tutorial options:
-h, --help Show this helpful help page
--config_help Show a help page in configuring your config.file
Basic options:
--config [file] Config file to be used [config.file]
--output [folder] Folder where output is sent [current folder]
-t, --threads [N] Number of threads [90% of avaiable threads]
--reference [folder] Folder where your reference files are located [Reference_seqs]
Advanced options for your run:
--force_continue BEAF continues a previous run
--force_restart BEAF deletes files from a previous run, starting anew.
--disable_ref_folder Use full paths for references in the config.file, disable the references folder (same as doing --reference "")
--AlwaysKeepBuckets Don't delete buckets after runs. Only use this if you are using the same sequencing file for all your runs.
--DontTrim Skip trimming of reads.
--OnlyMakeBuckets Stop after making buckets. Don't run the BEAF pipeline.
--KeepBlastDBs Don't delete blast databases after using them - for proteins and genes.
--Quick Use MagicBlast and Diamond instead of USEARCH as your search engine. This may speed up runs, but may also show fewer results.
Homology search options
--Affecting all modules
--maxaccepts [N] Stop USEARCH search (2nd filter) after finding N results that match filter criteria [5000]
--maxrejects [N] Stop USEARCH search (apply to both filters) after finding N results that do not match filter criteria
--maxrejects1 [N] Stop USEARCH search (1st filter) after finding N results that do not match filter criteria [32]
--maxrejects2 [N] Stop USEARCH search (2nd filter) after finding N results that do not match filter criteria for protein/gene and taxonomy modes [5000]
--genmaxrejects2 [N] Stop USEARCH search (2nd filter) after finding N results that do not match filter criteria for genome mode [64]
--Genome
--gen_id [n] Minimum identity for genome analyses [0.95]
--gen_id1 [n] Minimum identity only for the 1st filter of genome analyses [0.85]
--gen_id2 [n] Minimum identity only for the 2nd filter of genome analyses (subrefs) [0.95]
--gen_evalue1 [e] Minimum e-value for the 1st filter of genome analyses
--gen_evalue2 [e] Minimum e-value for the 2nd filter of genome analyses
--GenCoverage [n] Genome coverage for the genome database used in the homology searches (if using subrefs, applies to both the 1st and 2nd filter)
--GenCoverage1 [n] Genome coverage for the genome database used in the 1st homology search (if using subrefs)
--GenCoverage2 [n] Genome coverage for the genome database used in the 2nd homology search (if using subrefs)
--GenFragLength [N] Length of fragments generated for the genome database used in the homology searches
--Protein/Gene
--prot_id1 [n] Minimum identity for the 1st filter of protein/gene analyses (general filter)
--prot_id1 [n] Minimum identity for the 2nd filter of protein/gene analyses (subref filter)
--prot_evalue1 [e] Minimum e-value for the 1st filter of protein/gene analyses
--prot_evalue2 [e] Minimum e-value for the 2nd filter of protein/gene analyses
--prot_qcov1 [n] Minimum query-coverage for the 1st filter of protein/gene analyses
--prot_qcov2 [n] Minimum query-coverage for the 2nd filter of protein/gene analyses
--Taxonomy
--16S_id1 [N] Minimum identity for the 1st filter of taxonomic analyses (general filter)
--16S_id2 [N] Minimum identity for the 2nd filter of taxonomic analyses (specific/OTU filter)
--16S_evalue1 [e] Minimum e-value for the 1st filter of taxonomic analyses
--16S_evalue2 [e] Minimum e-value for the 2nd filter of taxonomic analyses
--16S_qcov [n] Minimum query-coverage filter only for taxonomic analyses
--16S_cutoff [N] Minimum number of counts for an OTU to be accepted
"
}
show_config_help ()
{
echo "-Setting up config.file
Make a config.file file containing 7 columns, respectively: T1, T2, R1, R2, Ref, Subref and Out.
On the first column (T1), use 'G' to analyse genomes, '16S' to analyse taxonomy, 'P' to analyse proteins and 'N' with nucleotide sequences.
On the second column (T2), depending on the format of the metagnomic data, you may use 'R', for pair-end fastq files; 'I', for an interleaved fastq file; or 'F', for an interleaved fasta file. Regardless of the format you're using, all files must be compressed with gzip (.gz).
On the third column (R1), you must give the complete path to your file (R1 file in the case of pair-end, interleaved in the other two cases), from your main folder to the file itself (example: home/usr/Desktop/BEAF-master/R1_trimmed.fastq.gz).
On the fourth column (R2), in the case of paired end files, you must give the complete path to the second file (R2), as explained above. For interleaved files, use 'NA' instead.
On the fifth column (Ref), use the name of the reference genome file (not compressed and in fasta format) or protein/gene databank file in the Reference folder (this first database should be less specific and be composed by a large number of sequences, helping program to identify besides highly homologous reads, less homologous ones too). If there will be a Reference for protein/gene databanks, you can use an UDB file instead of fasta. If running taxonomy, you may leave this as NA and use the default database. For genomes, you may leave the Referene as NA and use multiple subreferences instead (see below).
On the sixth column (SubRef), use the path to the folder containing subreferences for protein/gene families, if any. If using genome, you may use files of several strains or species in the subreference folder, with no Reference. If you do this, BEAF will first do a broader search against all subreferences, finding sequences related to any of the subreferences provided, and will then match against each subreference individually. If running taxonomy, you may leave this as NA and use the default database.
On the seventh column (Out), use the desired name for the output folder generated from that file (example: Metaspot1_Ecoli). Don't use spaces nor slashes in the name.
Example:
N I /home/usr/Desktop/BEAF_master/Test_sample/Alistipes_putredinis_DSM_17216.fna.fastq.gz NA
DNA_pol.fasta DNA_pol 1D
16S R /home/usr/Desktop/BEAF_master/Test_sample/Metagenome_R1.fastq.gz /home/usr/Desktop/BEAF_master/Test_sample/Metagenome_R2.fastq.gz NA NA Taxonomy
G R /home/usr/Desktop/BEAF_master/Test_sample/Metagenome_R1.fastq.gz /home/usr/Desktop/BEAF_master/Test_sample/Metagenome_R2.fastq.gz Alistipes_putredinis_DSM_17216.fna NA Alistipes
On the example above, three runs would be made:
-prospecting for sequences related to the reference DNA_pol.fasta, subreferences in the folder DNA_pol, in the file Alistipes_putredinis_DSM_17216.fna.fastq.gz;
-taxonomy analyses on the metagenome with paired-ends Metagenome_R1.fastq.gz and Metagenome_R2.fastq.gz, using the default taxonomy databases;
-genome recoverage with the Alistipes_putredinis_DSM_17216.fna genome as reference, on the metagenome with paired-ends Metagenome_R1.fastq.gz and Metagenome_R2.fastq.gz.
After the three runs are over, their output would be found, respectively, in the folders '1D', 'Taxonomy' and 'Alistipes'"
}
while [[ $# -gt 0 ]]
do
case $1 in
-h|--help|-help|\?|-\?)
show_help
exit
;;
--config_help|--help_config)
show_config_help
exit
;;
--DontTrim|donttrim|Donttrim|dontTrim|DONTTRIM|DONTtrim|dontTRIM|noTrim|NoTrim|NOtrim|noTRIM|Notrim|NOTRIM|--no-trim|--NO-TRIM|--No-Trim|--No-trim)
TrimFiles="False"
;;
--config|--Config|--CONFIG|-config|-Config|--CONFIG)
if [[ -s ${2} ]]
then
ConfigFile=$(cd "$(dirname "${2}")" && pwd)/$(basename "${2}")
else
echo "Couldn't find the designated config.file '${2}'"
exit
fi
shift
;;
--config=*|--Config=*|--CONFIG=*|-config=*|-Config=*|--CONFIG=*)
if [[ -s ${1#*=} ]]
then
ConfigFile=$(cd "$(dirname "${1#*=}")" && pwd)/$(basename "${1#*=}")
else
echo "Couldn't find the designated config.file '${1#*=}'"
exit
fi
;;
--output|--Output|--OUTPUT|-output|-Output|-OUTPUT|-O|-o|-Out|-OUT|-out|--out|--Out|--OUT|--o|--O|--address|--Address|--ADDRESS)
if [[ -d "${2}" ]]
then
touch ${2}
else
mkdir ${2}
fi
address=$(cd "$(dirname "${2}")" && pwd)/$(basename "${2}")
shift
;;
--output=*|--Output=*|--OUTPUT=*|-output=*|-Output=*|-OUTPUT=*|-O=*|-o=*|-Out=*|-OUT=*|-out=*|--out=*|--Out=*|--OUT=*|--o=*|--O=*|--address=*|--Address=*|--ADDRESS=*)
if [[ -d "${1#*=}" ]]
then
touch ${1#*=}
else
mkdir ${1#*=}
fi
address=$(cd "$(dirname "${1#*=}")" && pwd)/$(basename "${1#*=}")
;;
--force_continue|--Force_Continue|--Force_continue|--force_Continue|--FORCE_CONTINUE|--fc|--FC|--Fc|--fC|-fc|-FC|-Fc|-fC)
CFLR="Y"
;;
--force_restart|--Force_Restart|--Force_restart|--force_Restart|--FORCE_RESTART|--forcerestart|--Forcerestart|--ForceRestart|--FORCERESTART|--fr|--FR|--Fr|--fR|-fr|-FR|-Fr|-fR)
CFLR="N"
;;
-t|-T|--threads|--Threads|--THREADS|--t|--T)
threads=${2}
shift
;;
-t=*|-T=*|--threads=*|--Threads=*|--THREADS=*|--t=*|--T=*)
threads=${1#*=}
;;
--reference|--Reference|--REFERENCE|--references|--References|--REFERENCES|--ref|--Ref|--REF|--refs|--Refs|--REFS|-ref|-Ref|-REF|-reference|-references)
if [[ -d "${2}" ]]
then
ReferencesFolder=$(cd "$(dirname "${2}")" && pwd)/$(basename "${2}")
else
ReferencesFolder="${2}"
fi
shift
;;
--reference=*|--Reference=*|--REFERENCE=*|--references=*|--References=*|--REFERENCES=*|--ref=*|--Ref=*|--REF=*|--refs=*|--Refs=*|--REFS=*|-ref=*|-Ref=*|-REF=*|-reference=*|-references=*)
if [[ -d "${1#*=}" ]]
then
ReferencesFolder=$(cd "$(dirname "${1#*=}")" && pwd)/$(basename "${1#*=}")
else
ReferencesFolder="${1#*=}"
fi
;;
--disable_ref_folder|--disable_ref)
ReferencesFolder=""
;;
--gen_id)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for genome filter. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# GenID1="1"
# GenID2="1"
else
# GenID1="0.${2/./}"
GenID1="${2}"
# GenID2="0.${2/./}"
GenID2="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# GenID1="1"
GenID1="100"
# GenID2="1"
GenID2="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# GenID1=0.${2#*.}
GenID1=$(echo "${2} * 100" | bc -l)
# GenID2=0.${2#*.}
GenID2=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for genome filter. Input was gen_id=${2}. Ignoring this input."
fi
else
echo "Invalid identity for genome filter. Input was gen_id=${2}. Ignoring this input."
fi
shift
;;
--gen_id1)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for genome filter1. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# GenID1="1"
else
# GenID1="0.${2/./}"
GenID1="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# GenID1="1"
GenID1="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# GenID1=0.${2#*.}
GenID1=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for genome filter1. Input was gen_id1=${2}. Ignoring this input."
fi
else
echo "Invalid identity for genome filter1. Input was gen_id1=${2}. Ignoring this input."
fi
shift
;;
--gen_id2)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for genome filter2. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# GenID2="1"
else
# GenID2="0.${2/./}"
GenID2="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# GenID2="1"
GenID2="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# GenID2=0.${2#*.}
GenID2=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for genome filter2. Input was gen_id2=${2}. Ignoring this input."
fi
else
echo "Invalid identity for genome filter2. Input was gen_id2=${2}. Ignoring this input."
fi
shift
;;
--prot_id1)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for protein filter1. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# ProtID1="1"
else
# ProtID1="0.${2/./}"
ProtID1="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# ProtID1="1"
ProtID1="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# ProtID1=0.${2#*.}
ProtID1=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for protein filter1. Input was prot_id1=${2}. Ignoring this input."
fi
else
echo "Invalid identity for protein filter1. Input was prot_id1=${2}. Ignoring this input."
fi
shift
;;
--prot_id2)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for protein filter2. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# ProtID2="1"
else
# ProtID2="0.${2/./}"
ProtID2="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# ProtID2="1"
ProtID2="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# ProtID2=0.${2#*.}
ProtID2=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for protein filter2. Input was prot_id2=${2}. Ignoring this input."
fi
else
echo "Invalid identity for protein filter2. Input was prot_id2=${2}. Ignoring this input."
fi
shift
;;
--taxon_id1|--taxonomy_id1|--tax_id1|--16S_id1)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for taxonomy filter1. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# TaxonID1="1"
else
# TaxonID1="0.${2/./}"
TaxonID1="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# TaxonID1="1"
TaxonID1="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# TaxonID1=0.${2#*.}
TaxonID1=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for taxonomy filter1. Input was taxon_id1=${2}. Ignoring this input."
fi
else
echo "Invalid identity for taxonomy filter1. Input was taxon_id1=${2}. Ignoring this input."
fi
shift
;;
--taxon_id2|--taxonomy_id2|--tax_id2|--16S_id2)
if [[ $(echo "${2} > 0" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 1" | bc -l) -eq 1 ]]
then
if [[ $(echo "${2} > 100" | bc -l) -eq 1 ]]
then
echo "Invalid identity for taxonomy filter2. Input was over 100 (gen_id=${2}). Ignoring this input."
# elif [[ ${2} -eq 100 ]]
# then
# TaxonID2="1"
else
# TaxonID2="0.${2/./}"
TaxonID2="${2}"
fi
elif [[ $(echo "${2} == 1" | bc -l) -eq 1 ]]
then
# TaxonID2="1"
TaxonID2="100"
elif [[ $(echo "${2} < 1" | bc -l) -eq 1 ]]
then
# TaxonID2=0.${2#*.}
TaxonID2=$(echo "${2} * 100" | bc -l)
else
echo "Invalid identity for taxonomy filter2. Input was taxon_id2=${2}. Ignoring this input."
fi
else
echo "Invalid identity for taxonomy filter2. Input was taxon_id2=${2}. Ignoring this input."
fi
shift
;;
--usearch64)
usearch=${usearch64}
;;
--prot_evalue1)
PROTevalue1=$2
shift
;;
--prot_evalue2)
PROTevalue2=$2
shift
;;
--gen_evalue1)
GENevalue1=$2
shift
;;
--gen_evalue2)
GENevalue2=$2
shift
;;
--16S_evalue1)
Taxonevalue1=$2
shift
;;
--16S_evalue2)
Taxonevalue2=$2
shift
;;
--prot_qcov)
if [[ "$2" -gt "1" ]]
then
PROTqcovORFs=$2
PROTqcovHits=$2
elif [[ "$2" -gt "0" ]]
then
PROTqcovORFs=$(echo "$2 * 100" | bc -l)
PROTqcovHits=$(echo "$2 * 100" | bc -l)
else
echo "Invalid QCoverage. Input given was --prot_qcov $2. QCov must be greater than 0."
fi
shift
;;
--prot_qcov1)
if [[ "$2" -gt "1" ]]
then
PROTqcovHits=$2
elif [[ "$2" -gt "0" ]]
then
PROTqcovHits=$(echo "$2 * 100" | bc -l)
else
echo "Invalid QCoverage. Input given was --prot_qcov1 $2. QCov must be greater than 0."
fi
shift
;;
--prot_qcov2)
if [[ "$2" -gt "10" ]]
then
PROTqcovORFs="0$2"
elif [[ "$2" -gt "1" ]]
then
PROTqcovORFs=$2
elif [[ "$2" -gt "0" ]]
then
PROTqcovORFs=$(echo "$2 * 100" | bc -l)
else
echo "Invalid QCoverage. Input given was --prot_qcov2 $2. QCov must be greater than 0."
fi
shift
;;
--16S_qcov)
if [[ "$2" -gt "10" ]]
then
Taxonqcov2="0$2"
elif [[ "$2" -gt "1" ]]
then
Taxonqcov2=$2
elif [[ "$2" -gt "0" ]]
then
Taxonqcov2=$(echo "$2 * 100" | bc -l)
else
echo "Invalid QCoverage. Input given was --16S_qcov $2. QCov must be greater than 0."
fi
shift
;;
--16S_cutoff)
rDNA16Scutoff=$2
shift
;;
--16S_min_cntg_length)
rDNAminCntgLength=$2
shift
;;
# --maxaccepts1)
# maxaccepts1=$2
# shift
# ;;
--maxaccepts2|--maxaccepts)
maxaccepts2=$2
shift
;;
--maxrejects1)
maxrejects1=$2
shift
;;
--maxrejects2)
maxrejects2=$2
shift
;;
--allmaxrejects2)
maxrejects2=$2
genmaxrejects2=$2
shift
;;
--genmaxrejects2)
genmaxrejects2=$2
shift
;;
--maxrejects)
maxrejects1=$2
maxrejects2=$2
shift
;;
--allmaxrejects)
genmaxrejects2=$2
maxrejects1=$2
maxrejects2=$2
shift
;;
--GenCoverage)
GenomeCoverage=$2
GenomeCoverage1=$2
GenomeCoverage2=$2
shift
;;
--GenCoverage1)
GenomeCoverage1=$2
shift
;;
--GenCoverage2)
GenomeCoverage2=$2
shift
;;
--GenFragLength)
GenomeFragLength=$2
shift
;;
--ProtFragLength)
ProtFragLength=$2
shift
;;
--ProtOverlap)
ProtOverlap=$2
shift
;;
--ProtCoverage)
ProtCoverage=$2
shift
;;
--KeepBlastDBs|--keepblastdbs|--keepbdb|--keepbdbs|--Keepbdb|--Keepbdbs)
KeepBlastDBs="yes"
;;
--KeepGenomeUDBs|--keepgenomeudbs|--keepudb|--keepudbs|--Keepudb|--Keepudbs)
KeepGenomeUDBs="yes"
;;
--OnlyMakeBuckets)
StopAfterMakingBuckets="Y"
;;
--AlwaysKeepBuckets|--KeepBuckets|--AKB)
AlwaysKeepBuckets="Y"
;;
--Quick|--quick|--QUICK|--MagicBlast|--Magicblast|--magicblast|--MAGICBLAST)
SearchMode="quick"
;;
--use_usearch)
SearchMode="usearch"
;;
*)
echo "Couldn't recognize command '${1}'. Ignoring it."
sleep 1
;;
esac
shift
done
echo "Starting program in folder ${address}, at $(date "+%X, %e/%m/%Y")"
# ======================================================================================================================================================================================== #
# =====================================================================================BEAF MODULES======================================================================================= #
# ======================================================================================================================================================================================== #
# BEAF works in a modular fashion, so that the main pipeline will call different functions (modules) as it runs. Each step works in a separate function so that the program can continue from a specific function if ended abruptly (see "Continue function" in README.md)
make_kp () # This function reorders the config.file in order to keep buckets in case they could be used more than once. This will prevent the program from copying, trimming and assessing the quality of trimmage more than once for the same data, reusing files.
{
if [[ "$CFLR" == "Y" ]]; then echo "******Reusing previous settings and files"; else
rm -rf $address/*.kp; rm -rf $address/config.tmp; rm -rf $address/config.file1
cat $ConfigFile | awk NF > $address/config.file1
case $AlwaysKeepBuckets in
Y|y|Yes|yes|YES)
echo "# Buckets will be stored whenever possible."
cat config.file1 | awk '{print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7 "\t" "Y"}' > $address/config.kp
rm -rf config.file1
;;
*)
echo "# Checking if any buckets must be stored..."
echo "890_abc.123_XYZ" > $address/LastR1.kp
sort -S 50% --parallel=${threads} -k3,3 $address/config.file1 > $address/doconfig.kp
while read T1 T2 R1 R2 Ref SubRef Out; do
LastR1=`cat $address/LastR1.kp`
if [[ "$R1" == "$LastR1" ]]
then
echo "Y" >> $address/Keep_config.kp
else
echo "N" >> $address/Keep_config.kp
fi
echo "$R1" > $address/LastR1.kp
done < $address/doconfig.kp
echo "N" >> $address/Keep_config.kp
sed -i -e 1,1d $address/Keep_config.kp
paste $address/doconfig.kp $address/Keep_config.kp > $address/config.tmp
rm -rf $address/*.kp; rm -rf $address/config.file1
mv $address/config.tmp $address/config.kp
;;
esac
echo "1" > $address/CR.step; CFLR="N"
fi
}
Check () # This function checks each line in config.file, checking for possible format errors.
{
if [[ "$CFLR" == "Y" && `cat $address/CR.step` != "1" ]]; then echo "******Jumping autocheking system"; else
rm -rf $address/*.check
echo "# Checking your config file..."
sort -S 50% --parallel=${threads} -k7,7 $address/config.kp > $address/config.check
touch $address/LastOut.check
while read T1 T2 R1 R2 Ref SubRef Out Keep; do
case $T1 in
G|g|Genome|genome)
;;
P|p|Prot|prot|Protein|protein|N|n|Nucl|nucl|Nucleotide|nucleotide)
;;
16S|16s|16|T|t|Taxonomy|taxonomy)
;;
*)
echo "# You're using a wrong config.file format. On the first column (T1), where you're currently using '$T1', use only G (for genome analysis), P (for protein analysis) or N (for protein nucleotide sequences analysis)"
rm -rf $address/config.kp $address/config.check
exit
;;
esac
case $T2 in
R|r)
case $R2 in
*.gz)
if ! [ -s $R2 ];
then
echo "# Check your R2 file in '$R2'. The program either couldn't find the file or the file is empty."
rm -rf $address/config.kp $address/config.check
exit
fi
;;
*)
echo "# You're using a wrong config format. On the fourth column (R2), where you're currently using '$R2', you must use a gzipped file instead."
rm -rf $address/config.kp $address/config.check
exit
;;
esac
;;
I|i|F|f)
touch $address/config.kp
;;
*)
echo "# You're using a wrong config.file On the second column (T2), where you're currently using '$T2', use only R (for paired end fastq files), I (for interleaved fastq file) or F (for interleaved fasta file)"
rm -rf $address/config.kp $address/config.check
exit
;;
esac
case $R1 in
*.gz)
if ! [ -s $R1 ];
then
echo "# Check your R1 file in '$R1'. The program either couldn't find the file or the file is empty."
rm -rf $address/config.kp $address/config.check
exit
fi
;;
*.fasta|*.fa|*.fna|*.fsa|*.FASTA|*.FA|*.FNA|*.FSA|*.Fasta|*.Fa|*.Fsa|*.Fna)
case $T2 in
F|f)
;;
*)
echo "# You're using a wrong config.file. You're using a fasta file ($R1), which means you must use the fasta option (F) instead of $T2."
rm -rf $address/config.kp $address/config.check
exit
;;
esac
;;
*)
echo "# You're using a wrong config.file On the third column (R1), where you're currently using '$R1', you must use a gzipped file instead."
rm -rf $address/config.kp $address/config.check
exit
;;
esac
LastOut=`cat $address/LastOut.check`
if [[ "${Out}" == "$LastOut" ]]
then
echo "# You're using a wrong config.file On your seventh column (Out), you've used the same name for your output folder more than once, repeating '${Out}'"
rm -rf $address/config.kp $address/config.check
exit
fi
echo "${Out}" > $address/LastOut.check
done < $address/config.check
rm -rf $address/*.check
if [[ ! -s $address/$(basename $ConfigFile) ]]
then
cp ${ConfigFile} ${address}/$(basename "${ConfigFile}")
if [[ ! -s $address/$(basename $ConfigFile) ]]
then
ConfigFile="${address}/$(basename ${ConfigFile})"
fi
fi
echo "2" > $address/CR.step; CFLR="N"
fi
}
TimeHeader () # Generates the header for the Log.tsv file (full version).
{
if [[ "$CFLR" == "Y" && `cat $address/CR.step` != "2" ]]; then echo "******In this module, time is not measured"; else
echo "______|________|_____|_____|_________|______|____|_____|_______|____|_______|___________|_____________|__________|___________|____|__________|____________|_________|__________
Output|Sequence|Type1|Type2|Reference|Subref|Time|Reads|Buckets|ppm1|contigs|AvgSizeCntg|TotalSizeCntg|StdDevCntg|MaxCntgSize|ORFs|AvgSizeORF|TotalSizeORF|StdDevORF|MaxORFSize" >> $address/Log.tsv
echo "3" > $address/CR.step; CFLR="N"
fi
}
Trim () # Trims adapters from Illumina data and merges sequences R1 and R2 into one file, when using pair-end (full version).
{
if [[ "$CFLR" == "Y" && `cat $address/CR.step` != "3" && `cat $address/CR.step` != "24" ]]; then echo "******Skipping reads trimming opperations"; else
rm -rf $address/${BucketsFolder}
case $T1 in
16S|16s|16|T|t|Taxonomy|taxonomy)
BucketsFolder="Bucket_Taxonomy_R1_$(basename $R1 | sed 's/.gz//' | sed 's/\..*//')"
;;
*)
case $T2 in
R|r)
BucketsFolder="Bucket_R1_$(basename $R1 | sed 's/.gz//' | sed 's/\..*//')__R2_$(basename $R2 | sed 's/.gz//' | sed 's/\..*//')"
;;
I|i)
BucketsFolder="Bucket_Interleaved_$(basename $R1 | sed 's/.gz//' | sed 's/\..*//')"
;;
F|f)
BucketsFolder="Bucket_Fasta_$(basename $R1 | sed 's/.gz//' | sed 's/\..*//')"
;;
*)
BucketsFolder="Bucket_unknown_$(basename $R1 | sed 's/.gz//' | sed 's/\..*//')_$(basename $R2 | sed 's/.gz//' | sed 's/\..*//')"
;;
esac
;;
esac
mkdir $address/${BucketsFolder}
date -u +%s > $address/${BucketsFolder}/datestarttrimming.tmp
case $TrimFiles in
False|false|FALSE)
case $T2 in
R|r|I|i)
cp $R1 $address/${BucketsFolder}/
cp $R2 $address/${BucketsFolder}/
${pigz} -p ${threads} -d -c $address/${BucketsFolder}/*.gz | ${pigz} -p ${threads} -c > $address/${BucketsFolder}/FastaQ-full.gz
;;
F|f)
;;
esac
;;
# BBDuk)
# ${bbtools}/bbduk.sh
# ;;
*)
case $T2 in
R|r)
case $T1 in
16S|16s|16|T|t|Taxonomy|taxonomy)
echo "# We're now trimming your files. Only R1 will be used, as interleaved files may result in wrong abundance values..."
${cutadapt} --cores=${threads} --quiet --minimum-length 80 --max-n 0.008 --trim-n -a $adapter -e 0.1 -O 5 -q 24,24 -o $address/${BucketsFolder}/FastaQ-full.gz $R1 > $address/${BucketsFolder}/cutadaptlog.txt # Parameters of reads trimming should be specified here. Trimming universal Illumina adapter
;;
*)
echo "# Trimming and merging your files..."
${cutadapt} --cores=${threads} --quiet --interleaved --minimum-length 80 --max-n 0.008 --trim-n -a $adapter -A $adapter -e 0.1 -O 5 -q 24,24 -o $address/${BucketsFolder}/FastaQ-full.gz $R1 $R2 > $address/${BucketsFolder}/cutadaptlog.txt # Parameters of reads trimming should be specified here. Trimming universal Illumina adapter
;;
esac
;;
I|i)
echo "# We are now trimming your files..."
${cutadapt} --cores=${threads} --quiet --minimum-length 80 --max-n 0.008 --trim-n -a $adapter -e 0.1 -O 5 -q 24,24 -o $address/${BucketsFolder}/FastaQ-full.gz $R1 > $address/${BucketsFolder}/cutadaptlog.txt # Parameters of reads trimming should be specified here. Trimming universal Illumina adapter
;;
F|f)
case $R1 in
*.gz)
${pigz} -p $threads -d -c $R1 > $address/${BucketsFolder}/FastaQ-full.fa
;;
*.fasta|*.fa|*.fna|*.fsa|*.FASTA|*.FA|*.FNA|*.FSA|*.Fasta|*.Fa|*.Fsa|*.Fna)
cp $R1 $address/${BucketsFolder}/FastaQ-full.fa
;;
esac
;;
esac
;;
esac
echo "$(date -u +%s) - $(cat $address/${BucketsFolder}/datestarttrimming.tmp)" | bc -l > $address/${BucketsFolder}/trimmingtime.nmb
rm -rf $address/${BucketsFolder}/cutadaptlog.txt
echo "4" > $address/CR.step; CFLR="N"
fi
}
QAnConversion () # Makes assessment of trimmage and converts file from fastq to fasta (full version).
{
if [[ "$CFLR" == "Y" && `cat $address/CR.step` != "4" ]]; then echo "******Skipping FASTQ handling / FASTQ conversion"; else
date -u +%s > $address/${BucketsFolder}/datestartfastqc.tmp
case $T2 in
R|r|I|i)
echo "# Starting quality assessment of trimming..."
rm -rf $address/${BucketsFolder}/FASTQCresults; rm -rf $address/${BucketsFolder}/FastaQ-full.fa
mkdir $address/${BucketsFolder}/FASTQCresults
${fastqc} --quiet --threads $threads -f fastq -o $address/${BucketsFolder}/FASTQCresults $address/${BucketsFolder}/FastaQ-full.gz > $address/${BucketsFolder}/fastqclog.txt # FASTQ assessment is done here
rm -rf $address/${BucketsFolder}/FASTQCresults/*_fastqc ; rm -rf $address/${BucketsFolder}/fastqclog.txt
echo "# We will convert merged file to fasta format."
${pigz} -p ${threads} -d -c $address/${BucketsFolder}/FastaQ-full.gz | awk '{if(NR%4==1) {printf(">%s\n",substr($0,2));} else if(NR%4==2) print;}' > $address/${BucketsFolder}/FastaQ-full.fa
echo "# Removing unwanted files..."
rm -rf $address/${BucketsFolder}/*.gz
;;
F|f)
rm -rf $address/${BucketsFolder}/FASTQCresults
echo "# FASTA file type identified. Since FASTA does not have PHRED values, It wont be assessed by FASTQC algorithm."
;;
esac
echo "$(date -u +%s) - $(cat $address/${BucketsFolder}/datestartfastqc.tmp)" | bc -l > $address/${BucketsFolder}/fastqctime.nmb
echo "5" > $address/CR.step; CFLR="N"
fi
}
BucketEngine () # Breaks the data into separate buckets to speed up the process
{
if [[ "$CFLR" == "Y" && `cat $address/CR.step` != "5" && `cat $address/CR.step` != "24" ]]; then echo "******Skipping Buckets System"; else
if [[ -s $address/${BucketsFolder}/bk_list.txt ]]
then
reads=`cat $address/${BucketsFolder}/reads.nmb`
# rm -rf $address/${BucketsFolder}/*.bk
elif [[ -s $address/${BucketsFolder}/buckets_list.txt ]];
then
buckets=`ls $address/${BucketsFolder}/*.bk | wc -l`
reads=`cat $address/${BucketsFolder}/reads.nmb`
echo "$buckets buckets were found from the previous run. We'll be using those for this run as well."
echo "(The $buckets buckets had already been generated in a previous run. This is the time it took in that run for them to be generated, not in the current run)" > $address/${BucketsFolder}/bucketpreviouslygeneratedmessage.tmp
else
if [[ -s $address/${BucketsFolder}/reads.nmb ]]
then
touch $address/${BucketsFolder}/reads.nmb
else
grep ">" $address/${BucketsFolder}/FastaQ-full.fa | wc -l > $address/${BucketsFolder}/reads.nmb
reads=`cat $address/${BucketsFolder}/reads.nmb`
fi
original_size=$(wc -c $address/${BucketsFolder}/FastaQ-full.fa | sed 's/ .*//') # ; echo "original size $original_size"
# sizeinKb=`expr $original_size / 1024`
# sizeinMb=`expr $sizeinKb / 1024` # `expr $original_size / 1048576`
buckets=`expr $original_size / 268435456 + 1` # ; echo "buckets $buckets" # (256Mb buckets)
bucketsize=`expr $original_size / $buckets + 1024` # ; echo "bucketsize $bucketsize" # (+1Kb)
date -u +%s > $address/${BucketsFolder}/datestartbucketengine.tmp
if [ "$buckets" -eq "1" ];
then
if [[ $original_size -ge 1048576 ]]
then
echo "# We identified $reads reads (file size is $original_size bytes, or $(expr $original_size / 1048576)Mb). It would take no buckets, avoiding this step."
elif [[ $original_size -ge 1024 ]]
then
echo "# We identified $reads reads (file size is $original_size bytes, or $(expr $original_size / 1024)Kb). It would take no buckets, avoiding this step."
else
echo "# We identified $reads reads (file size is only $original_size bytes). It would take no buckets, avoiding this step."
fi
mv $address/${BucketsFolder}/FastaQ-full.fa $address/${BucketsFolder}/1.bk
else