-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcheck-barcodes.pl
executable file
·151 lines (111 loc) · 4.09 KB
/
check-barcodes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env perl
# See usage
use strict;
use Getopt::Std;
use FileHandle;
use Algorithm::Combinatorics;
use mismatch;
use vars qw($opt_h $opt_m $opt_o);
my $version=mismatch::getversion($0);
my @args=@ARGV;
my $Usage="Usage:
$0 -m NMISMATCHES [-o uniqified-barcodes.txt ] barcodes.txt
Given a barcodefile (format: id \\t barcode \\n), find all the barcodes
that have ambiguous potential barcode misreads. The output (to stdout) is
MISREAD : BARCODE1 BARCODE2 [ BARCODE3 etc.]
where BARCODE1 etc. are the real barcodes, with letters mismatching the
MISREAD 'highlighted' in lowercase. Statistics are written to stderr.
When the -o FILE option is used, it will calculate new barcodes that are
written to FILE. These new barcodes are unambiguous while allowing for
mismatches. These barcodes have lowercase letters in places where
mistmatches cannot be tolerated. This output can be used by the
demultiplex-{sam,fastq}.pl scripts (they use the same convention:
mismatches are never allowed on lowercase letters). Note that running the
current script on this new file will therefore not find any ambiguity
anymore. For a testcase, see the file testdata/mm1-ambiguous-codes.txt
written by <plijnzaad\@gmail.com>
";
if ( !getopts("m:o:h") || $opt_h || @ARGV!= 1 ) {
die $Usage;
}
die "-m option missing " unless defined($opt_m);
warn "Running $0, version $version, with args @args\n";
my $allowed_mismatches = $opt_m;
my $barcodefile=$ARGV[0];
my $barcodes_mixedcase = mismatch::readbarcodes($barcodefile); ## eg. $h->{'AGCGtT') => 'M3'
my $barcodes = mismatch::mixedcase2upper($barcodes_mixedcase); ## e.g. $h->{'AGCGTT') => 'M3'
my $mismatch_REs = mismatch::convert2mismatchREs(barcodes=>$barcodes_mixedcase,
allowed_mismatches =>$allowed_mismatches);# eg. $h->{'AGCGTT') => REGEXP(0x25a7788)
# $barcodes_mixedcase=undef;
my $id2code={};
## invert the table for lookup later on
for my $code (keys %$barcodes_mixedcase) {
$id2code->{ $barcodes_mixedcase->{$code} }=$code;
}
my $len=length( (keys %$barcodes)[0] );
warn "Will now check all possible 4 ^ $len i.e. " . 4 ** $len . " barcodes for ambiguous misreads,
allowing for $allowed_mismatches mismatches\n";
my $iter= Algorithm::Combinatorics::tuples_with_repetition([ qw(A C G T) ] , $len);
my $nunknown=0;
my $nexact=0;
my $nunique=0;
my $nambiguous=0;
my $badcodes={};
sub merge_codes {
## do something like qw(AcGTT ACgTT ACgTt ) => "AcgTt"
my (@mm)=@_;
return $mm[0] if(@mm==1);
my @w= map { [ split('', $_) ] } @mm;
my $len=int(@{$w[0]});
my @final=();
for(my $i=0; $i<$len; $i++) {
my $h={};
my @col=map { $_->[$i];} @w ;
for my $col ( @col ) { $h->{$col}++; }
my $n= int(keys %$h);
push(@final, ($n==1) ? $col[0] : "\L$col[0]");
}
join("", @final);
}
print "# misreads that map ambiguously to mismatched barcodes:\n";
WORD:
while(my $inst=$iter->next()) {
my $w=join("",@$inst); # potential mismatched
if (exists( $barcodes->{$w})) {
$nexact++;
next WORD;
}
my @codes=mismatch::safe_rescue($w, $mismatch_REs);
$nunknown += (@codes==0);
$nunique += (@codes==1);
$nambiguous += (@codes>1);
if (@codes>1) { # ambiguous
my @mm = map { mismatch::format_mm($w, $_) . " ($barcodes->{$_})" ; } @codes;
print "$w: ". join(' ', @mm) . "\n";
for my $code (@codes) {
my $mm=mismatch::format_mm($w, $code);
$badcodes->{$code}{$mm}++;
}
}
} # WORD
if ($opt_o) {
open(FILE, "> $opt_o") || die "$opt_o: $!";
## for my $code (sort keys %$badcodes) {
for my $id (sort keys %$id2code ) {
my $code=$id2code->{$id}; # mixed case!
if(exists ($badcodes->{$code})) {
my @mm=keys %{$badcodes->{$code}};
print FILE "$id\t". merge_codes(@mm) . "\t # was: $code\n";
} else {
print FILE "$id\t$code\n";
}
}
close(FILE);
}
warn "\nstats:\n";
warn "no mismatch: $nexact\n";
warn "mismatched but unique: $nunique\n";
warn "mismatched ambiguous: $nambiguous\n";
warn "unknown: $nunknown\n";
warn "done\n";
1;