forked from jaybake5/Oral_Microbiome_Chapter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bwa_reference_summary_frag.pl
129 lines (110 loc) · 2.87 KB
/
bwa_reference_summary_frag.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Getopt::Long qw(GetOptionsFromArray);
my @files;
my $rate_flag;
my $verbose;
my $help;
my $status = GetOptionsFromArray(\@ARGV,
"files=s{,}" => \@files,
#"sam" => \$sam_flag,
"rate" => \$rate_flag,
"verbose" => \$verbose,
"help" => \$help);
my $usage =
"This program takes BWA mapping files (SAM/BAM) and counts the number of mapped reads for each reference sequence.\n\n" .
"Usage:$0\n" .
"\t-f,--file\trequired\tstring(s)\tmapping-file1,...\n" .
"\t-r,--rate\toptional\tflag\tratio if mulitple best hit?\t(default:off)\n" .
"\t-v,--verbose\toptional\tflag\tverbose\t(default:off)\n" .
"\t-h,--help\toptional\tflag\tthis message\n" .
" e.g.:$0 -f test.bam -r\n";
if ( $help || @files == 0 ) {
print $usage;
exit;
}
my %Counts;
my %Sbjcts; ## subject matches for each query sequence
my $bwa_mem = 0;
my $first_read = 1;
foreach my $file ( @files ) {
print STDERR "Loading $file\n";
my $fh;
## Ignore unmapped reads (0x4) and supplementray alignments (0x800)
my $exe = "samtools view -h -F 0x4 $file | samtools view -h -F 0x800 - |";
open($fh, $exe) || die("Open error:$file");
my $line = 0;
my ( $prev, $curr );
while ( <$fh> ) {
## Get reference ids
if ( /^\@SQ\s+SN:([^\s]+)/ ) {
$Counts{$1} = 0; ## Initialize all references match count to be 0
} elsif ( /\@PG/ ) {
$bwa_mem = 1 if /mem/;
}
next if /^\@/;
$line++;
parseLine( $_ );
}
print STDERR "#line:$line\n" ;
}
print STDERR "#read:" . scalar keys %Sbjcts, "\n";
## For each query sequence, increment match count to reference sequences or
## evenly distribute to reference sequences (1/N).
foreach my $q ( keys %Sbjcts ) {
my @sbj = @{$Sbjcts{$q}};
foreach my $s ( @sbj ) {
if ( ! defined $rate_flag ) {
$Counts{$s}++;
} else {
$Counts{$s} += 1/scalar(@sbj);
}
}
}
foreach my $r ( sort keys %Counts ) {
print join("\t", $r, $Counts{$r}), "\n";
}
sub getScore
{
my ( $column ) = @_;
$column =~ /NM:i:(\d+)/;
return $1;
}
sub getOtherSbjcts
{
my ( $edit, $Col) = @_;
my @Sbj;
##AFNN01000024,-14652,101M,0
if ( $Col->[$#$Col] =~ /XA:Z:(.+);$/ ) {
my @others = split /;/, $1;
foreach my $o ( @others ) {
my @f = split /,/, $o;
next if $f[$#f] > $edit;
push @Sbj, $f[0];
}
}
return @Sbj;
}
sub parseLine
{
my ($line) = @_;
my @c = split /\s+/, $line;
my $qid = $c[0];
my $ref = $c[2];
my $ecol = $bwa_mem ? 11 : 12;
if ( $first_read ) {
checkFormat($c[$ecol]);
$first_read = 0;
}
my $edit = getScore($c[$ecol]);
push @{$Sbjcts{$qid}}, $ref;
push @{$Sbjcts{$qid}}, getOtherSbjcts( $edit, \@c );
}
sub checkFormat
{
my ( $column ) = @_;
$column =~ /NM:i:(\d+)/;
die "Edit distance column error ($column)" unless $column =~ /^NM/;
}