-
Notifications
You must be signed in to change notification settings - Fork 36
/
mcl2rawcafe.py
60 lines (43 loc) · 2.8 KB
/
mcl2rawcafe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
This script defines and runs the mcl2rawcafe function, which tabulates genes/peptides per species for each cluster from a mcl dump file.
The output is a raw (unfiltered) table that can be used as CAFE's input.
"""
__author__ = "Fabio H. K. Mendes"
import os
import argparse
import copy
def mcl2rawcafe(mcl_dump, path_output_file, species_ids):
"""
[str] mcl_dump: path to mcl's dump file
[str] path_output_file: path to output .txt file
[str] species_ids: unique species identifiers (separated by white spaces, enclosed within double quotes)
(e.g., "ENSBTA ENSCJA ENSCAF ENSECA ENSP00 ENSMMU ENSMUS ENSNLE ENSPTR ENSPAN ENSPPY ENSRNO")
"""
sps_header = "\t".join(sp_id for sp_id in species_ids)
species_ids_dict_0 = dict((sp_id, 0) for sp_id in species_ids) # initializing dictionary template
with open(path_output_file, "w") as output_file:
output_file.write("Desc\tFamily ID\t" + sps_header + "\n") # writing CAFE input header
with open(mcl_dump, "r") as input_file:
for line_count, line in enumerate(input_file):
species_gene_ids = line.rstrip().split() # list of species id in this line
species_ids_dict = copy.deepcopy(species_ids_dict_0) # dictionary for this line
for sp_gene_ids in species_gene_ids:
for sp_id in species_ids:
if sp_gene_ids.find(sp_id) != -1:
species_ids_dict[sp_id] += 1
output_file.write("\t".join(["(null)", str(line_count+1), ""])) # first 2 cols + "\t"
for sp_id in species_ids:
output_file.write(str(species_ids_dict[sp_id]) + "\t") # gene family counts for all species
output_file.write("\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__, prog="mcl2rawcafe.py")
parser.add_argument("-i", "--input-file", action="store", dest="input_file", required=True, type=str, help="full path to mcl's output dump file")
parser.add_argument("-o", "--output-file", action="store", dest="output_file", required=True, type=str, help="full path to file to be written")
parser.add_argument("-sp", "--species-ids", action="store", dest="sps", required=True, type=str, help="list of unique species identifiers (separated by white spaces) present in the input file")
args = parser.parse_args()
if not os.path.isfile(args.input_file):
exit("Could not find input file (mcl's output dump file). Exiting...\n")
if len(args.sps.split(" ")) <= 1:
exit("You specified a single species identifier. Exiting...\n")
print "\nAuthor: " + __author__ + " <[email protected]>\nRunning mcl2rawcafe.py as a standalone...\n"
mcl2rawcafe(args.input_file, args.output_file, args.sps.split(" "))