-
Notifications
You must be signed in to change notification settings - Fork 0
/
mir_find.pl
125 lines (100 loc) · 3.54 KB
/
mir_find.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
use warnings;
use strict;
# mir_find.pl - a script that uses ncbiBLAST to search for potential homologs of known miRNAs
# Modified as mir_find.pl, 2021
# Written by Stuart J. Lucas, 2012
# Version History:
# v1.1, 23.01.2013: Added option to specify number of mismatches, more feedback during running, and set to delete temporary files.
print "\n\n+++ mir_find.pl v1.0.0 +++\n\n";
my ( $mirnaquery, $blastdatabase ) = @ARGV or die "Please specify a fasta file containing the miRNA sequences you wish to search with, and the BLAST database you wish to search, with full path if it is not in the current directory";
# Get number of mismatches from user
print "Please specify the maximum number of base mismatches you wish to allow between your query miRNAs and test sequences.\n";
print "Maximum permitted mismatches (1-3 recommended): ";
my $mislimit = <STDIN>;
chomp $mislimit;
$mislimit =~ s/\r//;
if ( $mislimit =~ m/\D+/ )
{
die "Your entry was not a number. Please run again and enter digits only.\n";
}
else
{
print "Searching for miRNA sequences with a maximum of $mislimit mismatches.\n";
}
# Convert fasta file into a table of miRNAs
my $mirnacount = 0;
open ( MIRNAS, $mirnaquery ) or die "Could not open $mirnaquery. Is it in the right folder?";
open ( MIRNATABLE, ">".$mirnaquery.".tbl" ) or die "Could not open an output file!";
print MIRNATABLE "# miRNA ID\tsequence";
while(my $line = <MIRNAS>)
{
chomp $line;
$line =~ s/\r//;
if ($line =~ /^>(\S+)\s*/)
{
print MIRNATABLE "\n$1";
print MIRNATABLE "\t";
$mirnacount++;
}
else
{
print MIRNATABLE "$line";
}
}
print MIRNATABLE "\n";
close MIRNAS;
close MIRNATABLE;
print "$mirnacount miRNA sequences detected in query file.\n";
# Populate hash table of miRNAs from newly generated table
my (%QuerymiRNAs);
open ( MIRNADATA, $mirnaquery.".tbl" ) or die "The miRNA table was not generated";
while (my $line = <MIRNADATA>)
{
chomp $line;
if ( $line =~ /^#/ )
{
next;
}
elsif ( $line =~ /(\S+)\t(\S+)/ )
{
my $idkey = $1;
my $seqval = $2;
$QuerymiRNAs{"$idkey"} = "$seqval";
}
}
close MIRNADATA;
my $mirnacount2 = scalar(keys(%QuerymiRNAs));
print "miRNA data analysed.... Now running BLAST for $mirnacount2 miRNAs. This could take some time.\n";
# Run BLAST for the specified miRNAs
system ("blastn -task blastn-short -query $mirnaquery -db $blastdatabase -ungapped -penalty -1 -reward 1 -outfmt 6 -out $mirnaquery.allhits" );
# Filter blast hit table (in output format 6) to remove alignments with >specified mismatches
print "BLAST complete, now filtering.\n";
open ( BLASTHITS, $mirnaquery.".allhits" ) or die "Couldn't find the BLAST output!";
open ( FILTEREDHITS, ">".$mirnaquery.".results.tbl" ) or die "Results file failure";
print FILTEREDHITS "# Query ID\tSubject ID\t%\tlength\tmism\tgaps\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\n";
my $badcount = 0;
my $goodcount = 0;
while (my $line = <BLASTHITS> )
{
chomp $line;
my ( $qid, $sid, $percent, $allength, $mismatch, $gaps, $qstart, $qend, $sstart, $send, $evalue, $bitscore ) = split /\t/, $line;
my $qlength = length $QuerymiRNAs{ $qid };
my $difference = $qlength - $allength;
if ( $mismatch + $difference > $mislimit )
{
$badcount++;
next;
}
else
{
print FILTEREDHITS "$line\n";
$goodcount++;
}
}
close BLASTHITS;
close FILTEREDHITS;
unlink ($mirnaquery.".tbl");
unlink ($mirnaquery.".allhits");
print "Filtering complete. $badcount hits were rejected due to being too short, or having too many mismatches.\n";
print "$goodcount hits were recorded in the file ".$mirnaquery.".results.tbl\n";