-
Notifications
You must be signed in to change notification settings - Fork 5
/
parse_big_fasta.py
116 lines (87 loc) · 3.31 KB
/
parse_big_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Parses RVDB formatted FASTA headers so they can be interpreted by
HIVE-hexagon's tablequery
Input:
#######
* -i : input FASTA file to reformat
* -o : specified output file
Output:
#######
* Reformatted FASTA file
Usage:
#######
* python parse_big_fasta.py --version
  *Shows the program's version.*
* python parse_big_fasta.py -h
  *Shows help information.*
* python parse_big_fasta.py -i <filename.fasta> -o <output_file>
  *Runs the program with the specified FASTA file and output file.*
"""
# !/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
# parse_big_fasta.py
################################################################################
# Version string of this script.
__version__ = "1.0"
# Development status label of this script.
__status__ = "Dev"
import argparse
import os
import re
import sys
# ______________________________________________________________________________#
def create_arg_parser():
    """
    Creates and returns the ArgumentParser object.

    The parser defines two options:

    * ``-i`` / ``--inputFASTA`` (required): path to the FASTA file to
      reformat, stored as ``inputFASTA`` on the parsed namespace.
    * ``-o`` / ``--output`` (optional): path of the reformatted FASTA file,
      stored as ``output`` (``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description='Fixes RVDB fasta file header format for HIVE-hexagon and tablequery.')
    # argparse rejects a mix of an option flag ('-i') and a positional name
    # ('inputFASTA') in one add_argument() call, so the long form must be a
    # real option string; dest keeps the attribute name callers already use.
    parser.add_argument('-i', '--inputFASTA', dest='inputFASTA', required=True,
                        help='Path to the input FASTA.')
    parser.add_argument('-o', '--output',
                        help='The output file for the new FASTA. If no output is provided the '
                             'default is the input file name prefixed with "new_".')
    return parser
# ______________________________________________________________________________#
def _default_output_path(input_path):
    """Return the input path with ``new_`` prefixed to its file name."""
    directory, filename = os.path.split(input_path)
    return os.path.join(directory, 'new_' + filename)


def _reformat_header_line(line):
    """
    Rewrite one FASTA header line as ``>gb|<accession>|<rest...>``.

    The first two ``|``-separated fields are dropped and ``>gb|`` is put in
    front of the remaining ones. Returns ``(new_line, accession)``. A header
    that has no non-empty third field is returned unchanged with ``None`` as
    the accession, so such lines are never truncated.
    """
    text = line.rstrip('\r\n')
    # Keep the original line ending so the header never merges with the
    # sequence line that follows it.
    line_ending = line[len(text):]
    fields = text.split('|')[2:]
    if not fields or not fields[0]:
        return line, None
    return '>gb|' + '|'.join(fields) + line_ending, fields[0]


def format_header(parsed_args):
    """
    Parse the RVDB formatted FASTA headers and re-write them in the
    ``>gb|<accession>|...`` format.

    Args:
        parsed_args: namespace with ``inputFASTA`` (path of the FASTA file to
            read) and ``output`` (path of the file to write, may be ``None``
            or empty).

    Returns:
        The list of accessions (third ``|``-separated header field, without
        line ending) in the order they appear in the input.

    The output goes to ``parsed_args.output``. When that is missing, or when
    it cannot be opened because its directory does not exist, the output is
    written next to the input as ``new_<input file name>``. Non-header lines
    are copied unchanged.
    """
    input_path = parsed_args.inputFASTA
    fallback_path = _default_output_path(input_path)
    output_path = parsed_args.output or fallback_path

    accessions = []
    with open(input_path) as infile:
        # Opening the output truncates it, so lines can only be streamed
        # lazily when the output is not the input file itself.
        in_place = (os.path.exists(output_path)
                    and os.path.samefile(input_path, output_path))
        lines = infile.readlines() if in_place else infile

        try:
            outfile = open(output_path, 'w')
        except FileNotFoundError:
            outfile = open(fallback_path, 'w')

        with outfile:
            for line in lines:
                if line.startswith('>'):
                    line, accession = _reformat_header_line(line)
                    if accession is not None:
                        accessions.append(accession)
                outfile.write(line)
    return accessions
# ______________________________________________________________________________#
def main():
    """
    Command-line entry point.

    Parses the command-line arguments, reformats the FASTA file through
    format_header, prints the collected accessions and writes them, one per
    line, to 'accessions.txt' in the current working directory.
    """
    cli_args = create_arg_parser().parse_args(sys.argv[1:])
    accessions = format_header(cli_args)
    print(accessions)
    with open('accessions.txt', 'w') as listing:
        listing.writelines(accession + '\n' for accession in accessions)
# ______________________________________________________________________________#
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()