-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeconseq_16cores.pl
604 lines (504 loc) · 20.7 KB
/
deconseq_16cores.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
#!/usr/bin/perl
#===============================================================================
# Author: Robert SCHMIEDER, Computational Science Research Center @ SDSU, CA
#
# File: deconseq
# Date: 2013-05-05
# Version: 0.4.3
#
# Usage:
# deconseq [options] -f <file> -dbs <list> -dbs_retain <list> ...
#
# Try 'deconseq -h' for more information.
#
# Purpose: DeconSeq will help you remove unwanted sequences (contaminations)
# from your sequence data sets by using the BWA-SW algorithm.
#
#===============================================================================
use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin";
use DeconSeqConfig;
use Data::Dumper;
use Getopt::Long;
use Pod::Usage;
use File::Path qw(make_path); #requires version 2.07 or newer
use Cwd;
$| = 1; # Do not buffer output
my $man = 0;
my $help = 0;
my %params = ('help' => \$help, 'h' => \$help, 'man' => \$man);
GetOptions(\%params, 'help|h', 'man', 'no_seq_out', 'keep_tmp_files', 'dbs=s', 'dbs_retain=s', 'f=s', 'out_dir=s', 'i=i', 'c=i', 'group=i', 'id=s', 'version' => sub { print VERSION_INFO."\n"; exit; }, 'show_dbs' => sub { print $_." - ".DBS->{$_}->{name}."\n" foreach(sort keys %{(DBS)}); exit; }, 'S=i', 'z=i', 'T=i') or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
=head1 NAME
DeconSeq - DECONtamination of SEQuence data
=head1 VERSION
0.4.3
=head1 SYNOPSIS
deconseq [options] -f <file> -dbs <list> -dbs_retain <list> ...
=head1 DESCRIPTION
DeconSeq will help you remove unwanted sequences (contaminations) from your sequence datasets by using the altered BWA-SW algorithm.
=head1 OPTIONS
=over 8
=item B<-help> | B<-h>
Prints the help message and exists.
=item B<-man>
Prints the full documentation.
=item B<-version>
Prints the version of the program.
=item B<-show_dbs>
Prints a list of available databases.
=item B<-f> <file>
Input file in FASTA or FASTQ format that contains the query sequences.
=item B<-dbs> <list>
Name of database(s) to use (default: hsref). Names are according to their definition in the config file. Separate multiple database names by comma without spaces.
Example: -dbs hs1,hs2,hsref
=item B<-dbs_retain> <list>
Name of database(s) to use for cross-check. Query sequences with hit against any dbs will be compared to these databases. Databases have to be different from names in dbs. Names are according to their definition in the config file. Separate multiple database names by comma without spaces.
Example: -dbs_retain bact,vir
=item B<-out_dir> <dir>
Directory where the results should be written (default: .). If the directory does not exist, it will be created.
=item B<-i> <integer>
Alignment identity threshold in percentage (integer from 1-100 without %) used to define matching sequences as similar. The identity is calculated for the part of the query sequence that is aligned to a reference sequence. For example, a query sequence of 100 bp that aligns to a reference sequence over the first 50 bp with 40 matching positions has an identity value of 80%.
=item B<-c> <integer>
Alignment coverage threshold in percentage (integer from 1-100 without %) used to define matching sequences as similar. The coverage is calculated for the part of the query sequence that is aligned to a reference sequence. For example, a query sequence of 100 bp that aligns to a reference sequence over the first 50 bp with 40 matching positions has an coverage value of 50%.
=item B<-group> <integer>
If dbs_retain is set, then this option can be used to group the sequences similar to dbs and dbs_retain databases with either the clean or the contamination output file. If group is not set and dbs_retain is set, then three separate files will be generated.
Use B<-group 1> for grouping "Clean + Both" and use B<-group 2> for grouping "Contamination + Both".
=item B<-no_seq_out>
Prevents the generation of the fasta/fastq output file for the given coverage and identity thresholds. This feature is e.g. useful for the web-version since -i and -c are set interactively and not yet defined at the data processing step.
=item B<-keep_tmp_files>
Prevents from unlinking the generated tmp files. These usually include the id file and the .tsv file(s). This feature is e.g. useful for the web-version since .tsv files are used to dynamically generate the output files.
=item B<-id> <string>
Optional parameter. If not set, ID will be automatically generated to prevent from overwriting previous results. This option is useful if integrated into other tools and the output filenames need to be known.
=item B<-S> <integer>
Chunk size of reads in bp as used by BWA-SW (default: 10000000).
=item B<-z> <integer>
Z-best value as used by BWA-SW (default: 1).
=item B<-T> <integer>
Alignment score threshold as used by BWA-SW (default: 30).
=back
=head1 AUTHOR
Robert SCHMIEDER, C<< <rschmieder_at_gmail_dot_com> >>
=head1 BUGS
If you find a bug please email me at C<< <rschmieder_at_gmail_dot_com> >> so that I can make DeconSeq better.
=head1 COPYRIGHT
Copyright (C) 2010-2011 Robert SCHMIEDER
=head1 LICENSE
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
#
################################################################################
## DATA AND PARAMETER CHECKING
################################################################################
#
#Check if input file exists and estimate file format
my $format = '';
if(exists $params{f}) {
if(-e $params{f}) {
#check for file format
$format = &checkFileFormat($params{f});
unless($format eq 'fasta' || $format eq 'fastq') {
&printError('input file for -f is in '.uc($format).' format not in FASTA or FASTQ format');
}
} else {
&printError("could not find input file \"".$params{f}."\"");
}
} else {
&printError("you did not specify an input file containing the query sequences");
}
#Check if output dir is defined and if not, if it can be created
if(exists $params{out_dir}) {
make_path($params{out_dir}, { verbose => 0, mode => 0711, error => \my $err });
if(@$err) {
&printError("problems while trying to create directory \"".$params{out_dir}."\"");
}
} else {
$params{out_dir} = cwd();
}
$params{out_dir} .= '/' unless($params{out_dir} =~ /\/$/);
#Check if databases are defined, if their names are valid, and if they exist
if(exists $params{dbs}) {
my @tmp = split(/\,/,$params{dbs});
foreach my $db (@tmp) {
unless(exists DBS->{$db}) {
&printError("database \"".$db."\" does not exist in config file");
}
unless(&checkForDbFiles($db)) {
&printError("cannot find all database files for database \"".$db."\" in dir \"".DB_DIR."\"");
}
}
} else {
$params{dbs} = DB_DEFAULT;
unless(&checkForDbFiles(DB_DEFAULT)) {
&printError("cannot find all database files for database \"".DB_DEFAULT."\" in dir \"".DB_DIR."\"");
}
}
#Check if keep databases are defined, if their names are valid, and if they exist
if(exists $params{dbs_retain}) {
my @tmp = split(/\,/,$params{dbs_retain});
foreach my $db (@tmp) {
unless(exists DBS->{$db}) {
&printError("database \"".$db."\" does not exist in config file");
}
unless(&checkForDbFiles($db)) {
&printError("cannot find all database files for database \"".$db."\" in dir \"".DB_DIR."\"");
}
}
}
#Check if databases and keep databases are not the same
if(exists $params{dbs_retain}) {
my @tmp = split(/\,/,$params{dbs});
my @tmp_keep = split(/\,/,$params{dbs_retain});
foreach my $keep (@tmp_keep) {
if(grep {/^$keep$/} @tmp) {
&printError("cannot use the same database(s) for dbs AND dbs_retain");
}
}
}
#Check if thresholds are defined and valid
if(exists $params{i}) {
unless($params{i} > 0 && $params{i} <= 100) {
&printError("invalid value for identity threshold");
}
} else {
$params{i} = 1;
}
if(exists $params{c}) {
unless($params{c} > 0 && $params{c} <= 100) {
&printError("invalid value for coverage threshold");
}
} else {
$params{c} = 1;
}
#Check if group is set correctly
if(exists $params{group}) {
unless($params{group} == 1 || $params{group} == 2) {
&printError("invalid group option");
}
unless(exists $params{dbs_retain}) {
&printError("group can only be used when dbs_retain is set");
}
}
#
################################################################################
## DATA PROCESSING
################################################################################
#
## Get new id for data
unless(exists $params{id}) {
$params{id} = &generateNewIdFile($params{out_dir});
}
## Run modified BWA-SW for input file on all selected databases
my @dbs = split(/\,/,$params{dbs});
my @dbs_retain;
@dbs_retain = split(/\,/,$params{dbs_retain}) if(exists $params{dbs_retain});
my %data;
my ($time,$seqnum1,$seqnum2,$seqnum3);
foreach my $db (@dbs) {
my @refdbs = split(/\,/,DBS->{$db}->{db});
foreach my $refdb (@refdbs) {
my $tsvfile = $params{out_dir}.$params{id}.'_'.$db.'_'.$refdb.'.tsv';
my $cmd = PROG_DIR.PROG_NAME.' bwasw -t 16 -A -f '.$tsvfile.(exists $params{S} ? ' -S '.$params{S} : '').(exists $params{z} ? ' -z '.$params{z} : '').(exists $params{T} ? ' -T '.$params{T} : '').' '.DB_DIR.$refdb.' '.$params{f};
unless(-e $tsvfile) {
open(TSV, ">$tsvfile") or &printError("Could not create tsv file $tsvfile: $!");
close(TSV);
}
$time = &runCmd($cmd);
print STDERR "[deconseq] CMD finished in $time seconds\n" if(DEBUG);
$data{$db.$refdb}->{tsv} = $tsvfile;
}
}
## If dbs_retain is defined
if(@dbs_retain) {
# Parse TSV file(s) to get unique query ids
my %ids;
foreach my $db (@dbs) {
my @refdbs = split(/\,/,DBS->{$db}->{db});
foreach my $refdb (@refdbs) {
&parseTsvFiles($data{$db.$refdb}->{tsv},\%ids);
}
}
if(scalar(keys %ids)) {
#generate FASTA file for query ids
my $fasta = $params{out_dir}.$params{id}.'_tmp.fa';
($time,$seqnum1) = &generateFileFromIds(\%ids,$params{f},'fasta',$fasta);
print STDERR "[deconseq] Generated file ".$fasta." with $seqnum1 sequences in $time seconds\n" if(DEBUG);
# Run BWA-SW for input file on all selected databases
foreach my $db (@dbs_retain) {
my @refdbs = split(/\,/,DBS->{$db}->{db});
foreach my $refdb (@refdbs) {
my $tsvfile = $params{out_dir}.$params{id}.'_'.$db.'_'.$refdb.'.tsv';
my $cmd = PROG_DIR.PROG_NAME.' bwasw -t16 -A -f '.$tsvfile.(exists $params{S} ? ' -S '.$params{S} : '').(exists $params{z} ? ' -z '.$params{z} : '').(exists $params{T} ? ' -T '.$params{T} : '').' '.DB_DIR.$refdb.' '.$params{f};
unless(-e $tsvfile) {
open(TSV, ">$tsvfile") or &printError("Could not create tsv file $tsvfile: $!");
close(TSV);
}
$time = &runCmd($cmd);
print STDERR "[deconseq] CMD finished in $time seconds\n" if(DEBUG);
$data{$db.$refdb}->{tsv} = $tsvfile;
}
}
unlink($fasta);
} else {
print STDERR "[deconseq] No sequences for dbs_retain runs\n" if(DEBUG);
}
}
## Generate output files
if(exists $params{no_seq_out}) {
print STDERR "[deconseq] Did not generate sequence output files, since no_seq_out was set as parameter.\n" if(DEBUG);
} else {
## Parse TSV files and combine results
my %ids;
foreach my $db (@dbs) {
my @refdbs = split(/\,/,DBS->{$db}->{db});
foreach my $refdb (@refdbs) {
&parseTsvFiles($data{$db.$refdb}->{tsv},\%ids);
}
}
my %ids_keep;
foreach my $db (@dbs_retain) {
my @refdbs = split(/\,/,DBS->{$db}->{db});
foreach my $refdb (@refdbs) {
&parseTsvFiles($data{$db.$refdb}->{tsv},\%ids_keep) if(exists $data{$db.$refdb}->{tsv});
}
}
my $filterids = &combineResults(\%ids,\%ids_keep,$params{c},$params{i},$params{id});
## Generate output files with query sequence subsets
($time,$seqnum1,$seqnum2,$seqnum3) = &generateFileFromIds($filterids,$params{f},$format,$params{out_dir}.$params{id}.'_cont.f'.($format eq 'fasta' ? 'a' : 'q'),$params{out_dir}.$params{id}.'_clean.f'.($format eq 'fasta' ? 'a' : 'q'),$params{out_dir}.$params{id}.'_both.f'.($format eq 'fasta' ? 'a' : 'q'));
print STDERR "[deconseq] Generated files ".$params{out_dir}.$params{id}.'_cont.f'.($format eq 'fasta' ? 'a' : 'q').", ".$params{out_dir}.$params{id}.'_clean.f'.($format eq 'fasta' ? 'a' : 'q').", ".$params{out_dir}.$params{id}.'_both.f'.($format eq 'fasta' ? 'a' : 'q')." with $seqnum1, $seqnum2, and $seqnum3 sequences in $time seconds\n" if(DEBUG);
}
## Clean up temp files generated
#tsv files and id file
if(exists $params{keep_tmp_files}) {
print STDERR "[deconseq] Did not unlink files, since keep_tmp_files was set as parameter.\n" if(DEBUG);
} else {
print STDERR "[deconseq] Unlink ".$params{out_dir}.$params{id}."\n" if(DEBUG);
unlink($params{out_dir}.$params{id});
foreach my $db (keys %data) {
print STDERR "[deconseq] Unlink ".$data{$db}->{tsv}."\n" if(DEBUG);
unlink($data{$db}->{tsv});
}
}
#TODOs
#add timing to all parts for later benchmarks and analytics
#
################################################################################
## MISC FUNCTIONS
################################################################################
#
sub printError {
my $msg = shift;
print STDERR "ERROR: ".$msg.".\n\nTry \'deconseq -h\' for more information.\nExit program.\n";
exit(0);
}
sub checkForDbFiles {
my $db = shift;
my @exts = qw(amb ann bwt pac rbwt rpac rsa sa);
my $status = 1;
my @tmp = split(/\,/,DBS->{$db}->{db});
foreach my $tmpdb (@tmp) {
foreach my $ext (@exts) {
unless(-e DB_DIR.$tmpdb.'.'.$ext) {
$status = 0;
last;
}
}
}
return $status;
}
sub generateNewIdFile {
my $dir = shift;
my $id;
$id = time();
while(-e $dir.$id) {
$id = time();
}
`touch $dir$id`;
return $id;
}
sub runCmd {
my $cmd = shift;
my $time = time();
print STDERR "[deconseq] Run CMD: $cmd\n" if (DEBUG);
system($cmd) == 0 or &printError("system call \"$cmd\" failed: $?");
return (time()-$time);
}
sub parseTsvFiles {
my ($file,$ids) = @_;
my (@lineargs);
open(IN,"<$file") or die "ERROR: could not open file $file: $! \n";
while(<IN>) {
next if(/^\#/);
chomp();
@lineargs = split(/\t/);
#query id, dbid, db start, db aligned, coverage, identity
next if($lineargs[4] < 1 || $lineargs[5] < 1); #problems with hits from bwasw (hit over more than 2 reference sequences);
next if(exists $ids->{$lineargs[0]} && exists $ids->{$lineargs[0]}->{$lineargs[4]} && $ids->{$lineargs[0]}->{$lineargs[4]} >= $lineargs[5]);
$ids->{$lineargs[0]}->{$lineargs[4]} = $lineargs[5];
}
close(IN);
return $ids;
}
sub generateFileFromIds {
my ($ids,$in,$format,$out1,$out2,$out3) = @_;
my $time = time();
my $group;
if(exists $params{dbs_retain}) {
if(exists $params{group}) {
$group = $params{group};
} else {
$group = 0;
}
} else {
$group = -1;
}
my ($id,$seqnum1,$seqnum2,$seqnum3,$type);
$seqnum1 = $seqnum2 = $seqnum3 = $id = $type = 0;
open(IN,"<$in") or die "ERROR: could not open file $in: $! \n";
open(OUT1,">$out1") or die "ERROR: could not write to file $out1: $! \n"; #cont
open(OUT2,">$out2") or die "ERROR: could not write to file $out2: $! \n" if(defined $out2); #clean
open(OUT3,">$out3") or die "ERROR: could not write to file $out3: $! \n" if($group == 0 && defined $out3); #both
if($format eq 'fastq') { #fastq input
my $count = 0;
while (<IN>) {
if($count == 0 && /^\@(\S+)/) {
$id = $1;
if(exists $ids->{$id}) {
if($ids->{$id} == 1) { #cont
$seqnum1++;
$type = 1;
} elsif($group == 0) { #both
$seqnum3++;
$type = 3;
} elsif($group == 1) { #clean+both
$seqnum2++;
$type = 2;
} elsif($group == 2) { #cont+both
$seqnum1++;
$type = 1;
}
} else { #clean
$type = 2;
$seqnum2++;
}
} elsif($count == 3) {
$count = -1;
}
$count++;
if($type == 1) {
print OUT1 $_;
} elsif($type == 2) {
print OUT2 $_ if(defined $out2);
} elsif($type == 3) {
print OUT3 $_ if(defined $out3);
}
}
} elsif($format eq 'fasta') {
while(<IN>) {
if(/^>(\S+)/) {
$id = $1;
if(exists $ids->{$id}) {
if($ids->{$id} == 1) { #cont
$seqnum1++;
$type = 1;
} elsif($group == 0) { #both
$seqnum3++;
$type = 3;
} elsif($group == 1) { #clean+both
$seqnum2++;
$type = 2;
} elsif($group == 2) { #cont+both
$seqnum1++;
$type = 1;
}
} else { #clean
$type = 2;
$seqnum2++;
}
}
if($type == 1) {
print OUT1 $_;
} elsif($type == 2) {
print OUT2 $_ if(defined $out2);
} elsif($type == 3) {
print OUT3 $_ if(defined $out3);
}
}
} else {
die "ERROR: unknow file format: $format\n";
}
close(OUT1);
close(OUT2) if(defined $out2);
close(OUT3) if($group == 0 && defined $out3);
close(IN);
return (time()-$time,$seqnum1,$seqnum2,$seqnum3);
}
sub combineResults {
my ($ids,$ids_keep,$c,$i) = @_;
my (%filterids,$filter);
foreach my $qid (keys %$ids) {
$filter = 0;
while(my ($cov,$ident) = each(%{$ids->{$qid}}) ) {
if($cov >= $c && $ident >= $i) {
$filter = 1;
#check if in $ids_keep with same or better alignment values
if(exists $ids_keep->{$qid}) {
while(my ($cov_keep,$ident_keep) = each(%{$ids_keep->{$qid}}) ) {
if($cov_keep >= $cov && $ident_keep >= $ident) {
$filter = 2;
last;
}
}
}
last;
}
}
if($filter) {
$filterids{$qid} = $filter;
}
}
return \%filterids;
}
sub checkFileFormat {
my $file = shift;
my ($format,$count,$id,$fasta,$fastq,$qual);
$count = 3;
$fasta = $fastq = $qual = 0;
$format = 'unknown';
open(FILE,"perl -p -e 's/\r/\n/g;s/\n\n/\n/g' < $file |") or die "ERROR: Could not open file $file: $! \n";
while (<FILE>) {
chomp();
next unless(length($_));
if($count-- == 0) {
last;
} elsif(!$fasta && /^\>\S+\s*/) {
$fasta = 1;
$qual = 1;
} elsif($fasta == 1 && /^[ACGTURYKMSWBDHVNXacgturykmswbdhvnx-]+/) {
$fasta = 2;
} elsif($qual == 1 && /^\s*\d+/) {
$qual = 2;
} elsif(!$fastq && /^\@(\S+)\s*/) {
$id = $1;
$fastq = 1;
} elsif($fastq == 1 && /^[ACGTURYKMSWBDHVNXacgturykmswbdhvnx-]+/) {
$fastq = 2;
} elsif($fastq == 2 && /^\+(\S*)\s*/) {
$fastq = 3 if($id eq $1 || /^\+\s*$/);
}
}
close(FILE);
if($fasta == 2) {
$format = 'fasta';
} elsif($qual == 2) {
$format = 'qual';
} elsif($fastq == 3) {
$format = 'fastq';
}
return $format;
}