-
Notifications
You must be signed in to change notification settings - Fork 8
/
seqlen.py
60 lines (49 loc) · 2.62 KB
/
seqlen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/usr/bin/env python
"""
Calculate sequence lengths in a multiFASTA file.
This script was inspired by Lesley Sitter's code published on biostars (www.biostars.org/p/148815/).
Command: python seqlen.py -i [input FASTA file] (-a) (-n) > [output TSV file]
Examples:
python seqlen.py -i input.fna -a > seq_lengths.tsv # With sequence annotation in the sequence description
python seqlen.py -i input.fna -n > seq_lengths.tsv # Only keep the sequence ID in the sequence description and ignore 'N' and '-' characters
Any character: a flag to keep sequence annotation in the output.
Copyright (C) 2021 Yu Wan <[email protected]>
Licensed under the GNU General Public Licence version 3 (GPLv3) <https://www.gnu.org/licenses/>.
Release: 2021; latest update: 26 Sep 2022
"""
import sys
import re
from argparse import ArgumentParser
def parse_argument():
parser = ArgumentParser(description = "Calculating lengths of sequences in a FASTA file")
parser.add_argument('-i', '--input', dest = 'i', type = str, required = True, help = "An input FASTA file")
parser.add_argument('-a', '--annot', dest = 'a', action = 'store_true', help = "Keep sequence annotations in addition to sequence names")
parser.add_argument('-n', '--nucl', dest = 'n', action = 'store_true', help = "Ignoring \'-\' and \'N\' in nucleotide sequences.")
return parser.parse_args()
def main():
args = parse_argument()
with open(args.i, "r") as f:
input_fasta = f.read().splitlines() # Newline characters are dropped.
print("\t".join(["Name", "Length"])) # The header line
seq = ""
allow_write = False # A flag indicating that the first sequence name has been completely loaded.
ignore_annot = not args.a
for line in input_fasta:
if line.startswith(">"): # A new sequence is encountered
if allow_write:
if args.n:
seq = re.sub('[N-]', '', seq.upper()) # Stripping multiple characters from a string. Ref: stackoverflow.com/questions/3900054/python-strip-multiple-characters.
print("\t".join([seqid, str(len(seq))])) # Write the name and length of the previous sequence
seq = ""
if ignore_annot:
seqid = line.split(" ")[0] # Get the sequence ID and ignore the sequence annotation
else:
seqid = line
seqid = seqid[1 : ] # Drop the ">" character
allow_write = True
else:
seq += line # concatenate lines of the current sequence
print("\t".join([seqid, str(len(seq))])) # Write the length of the last sequence
return
if __name__ == '__main__':
main()