-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbigscape_converter(old).py
156 lines (139 loc) · 6.83 KB
/
bigscape_converter(old).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
###############################################################################
#### Goal: Work on BiG-SCAPE results #######
#### Usage: python bigscape_converter.py path/to/directory_wt_all_files/ ####
###############################################################################
import argparse
import sys

__author__ = 'Sandra Godinho Silva ([email protected])'
__version__ = '0.2'
__date__ = 'June 25, 2020'

parser = argparse.ArgumentParser(
    description='''Work on BiG-SCAPE results.''',
    epilog=""" """)
parser.add_argument('inputDirectory', help='Path to the input directory where all files are')
# Bug fix: the parser was built but never used -- the script previously read
# sys.argv[1] directly, so `-h` never worked and a missing argument raised a
# bare IndexError.  parse_args() keeps the same positional behaviour while
# enabling argparse's help text and error reporting.
args = parser.parse_args()
inputDirectory = args.inputDirectory

# import standard Python modules
import os
import pandas as pd

# All functions below walk this directory tree.
curdir = inputDirectory
def GetClans(curdir):
    """
    Locate every BiG-SCAPE "clans" file under *curdir*.

    Walks the whole directory tree and returns a list of full paths to the
    files whose name contains the substring "clans".
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(curdir):
        found.extend(
            os.path.join(dirpath, name) for name in filenames if "clans" in name
        )
    return found  # list of clan files to parse
def GetNetworkAnno(curdir):
    """
    Get the annotation file for every class.

    Finds the first "Network_Annotations_Full.tsv" under *curdir*, drops the
    Description/Organism/Taxonomy columns and removes MIBiG reference
    clusters (rows whose BGC name contains "BGC").

    Returns a dataframe with generic information on all BGCs, or None if no
    annotation file is found.
    """
    for subdir, dirs, files in os.walk(curdir):
        for file in files:
            if "Network_Annotations_Full.tsv" in file:
                file_path = os.path.join(subdir, file)
                # The old `sep='\n'` + manual tab-split was deprecated in
                # pandas 1.4 and removed in 2.0; read the tab-separated file
                # directly.  dtype=str matches the all-string result the
                # manual split produced.
                network_anno = pd.read_csv(file_path, sep='\t', dtype=str)
                network_anno.drop(columns=["Description", "Organism", "Taxonomy"], inplace=True)
                # Discard MIBiG clusters -- they are not part of our dataset.
                network_anno = network_anno[~network_anno["BGC"].str.contains("BGC")]
                return network_anno  # dataframe with generic information on all BGCs
def GetPfamsPF(curdir):
    """
    Get the sequence/vector of Pfams per BGC and
    get length of this vector of pfams (count Pfams).

    Walks *curdir* for "pfs" directories and reads every non-MIBiG file in
    them.  Returns a (dataframe, dict) pair: the dataframe has one row per
    BGC with its Pfam vector and the vector's token count; the dict maps each
    BGC name to the raw space-split tokens of the file's first line.
    """
    pfam_vectors = {}
    pfam_counts = {}
    for outer_path, _, _ in os.walk(curdir):
        if "pfs" not in outer_path:
            continue
        for inner_path, _, inner_files in os.walk(os.path.join(outer_path)):
            for fname in inner_files:
                # MIBiG reference clusters ("BGC...") are not from our dataset.
                if "BGC" in fname:
                    continue
                with open(os.path.join(inner_path, fname)) as handle:
                    content = handle.readlines()
                bgc_name = str(fname).split(".")[0]
                pfam_counts[bgc_name] = content[0].split(" ")
                # Token count via the list repr, as the original did.
                pfam_vectors[bgc_name] = [content, len(str(content).split(" "))]
    # Turn the dictionary into a dataframe, one row per BGC.
    df_pfams_pf = pd.DataFrame.from_dict(pfam_vectors, orient='index').reset_index()
    df_pfams_pf.columns = ["BGC", "Pfam_vector", "Pfam_vector_length"]
    return df_pfams_pf, pfam_counts  # dataframe with Pfam vector and dictionary for the count table
def GetPfamsPFD(curdir):
    """
    Open file already converted by the bigscape algorithm from domtable.
    Get the Pfam descriptors.

    Walks *curdir* for "pfd" directories, parses every non-MIBiG file as a
    10-column tab-separated table and joins each BGC's Pfam descriptors with
    ';'.  Returns a dataframe with columns BGC / Pfam_descriptor.
    """
    descriptors = {}
    # Column layout of BiG-SCAPE .pfd files (note the leading space on the
    # fourth name, kept as in the source data).
    pfd_columns = ["Cluster name", "(per-domain) score",
                   "gene id (if present)", " envelope coordinate from",
                   "envelope coordinate to (of the domain prediction, in amino acids)",
                   "pfam id", "pfam descriptor", "start coordinate gene",
                   "end coordinate gene", "internal cds header"]
    for outer_path, _, _ in os.walk(curdir):
        if "pfd" not in outer_path:
            continue
        for inner_path, _, inner_files in os.walk(os.path.join(outer_path)):
            for fname in inner_files:
                # MIBiG reference clusters ("BGC...") are not from our dataset.
                if "BGC" in fname:
                    continue
                table = pd.read_csv(os.path.join(inner_path, fname), header=None, sep="\t")
                table.columns = pfd_columns
                bgc_name = str(fname).split(".")[0]
                descriptors[bgc_name] = ';'.join(table["pfam descriptor"].tolist())
    df_pfams_pfd = pd.DataFrame.from_dict(descriptors, orient='index').reset_index()
    df_pfams_pfd.columns = ["BGC", "Pfam_descriptor"]
    return df_pfams_pfd  # dataframe with Pfam descriptors
def ConcatenateClansFiles(clans_paths):
"""
For every BGC get info on its class, clan and family.
"""
df_all = pd.DataFrame() #create empty dataframe
for file_path in clans_paths:
file_name = os.path.basename(file_path)
file_name = file_name.split("_")[0]
df_clan = pd.read_csv(file_path, header=None, sep='\n', skiprows=1)
df_clan = df_clan[0].str.split('\t', expand=True)
df_clan = df_clan.assign(Class=file_name)[['Class'] + df_clan.columns.tolist()] # add info regarding bigcape class
df_all = pd.concat([df_all, df_clan], sort=False) #join each BGC to main dataframe
return df_all #dataframe with class, clan and family for all BGCs
def JoinAll(df_all, network_anno, df_pfams_pf, df_pfams_pfd):
    """
    Join all the information. Every BGC will have information on: class, clan,
    family, vector of pfams, length of pfam vector and pfam descriptors.

    Parameters
    ----------
    df_all : four-column dataframe from ConcatenateClansFiles (class, name, clan, family).
    network_anno : dataframe from GetNetworkAnno (BGC annotations).
    df_pfams_pf : dataframe from GetPfamsPF (Pfam vector + length per BGC).
    df_pfams_pfd : dataframe from GetPfamsPFD (Pfam descriptors per BGC).

    Returns the final merged dataframe.
    """
    df_all.columns = ["Class", "BGC_name", "Clan", "Family"]
    new_df = pd.merge(df_all, network_anno, how="left",
                      left_on="BGC_name", right_on="BGC")
    # Keep only the part of the name before the first dot so it matches the
    # keys used by the pfs/pfd dataframes.  Bug fix: the old
    # `.str.split(".", expand=True)[0:-1]` produced a DataFrame and sliced
    # ROWS off it instead of selecting the first split component.
    new_df["BGC_name"] = new_df["BGC_name"].str.split(".").str[0]
    new_df2 = pd.merge(new_df, df_pfams_pf, how="left",
                       left_on="BGC_name", right_on="BGC")
    new_df3 = pd.merge(new_df2, df_pfams_pfd, how="left",
                       left_on="BGC_name", right_on="BGC")
    # After the merges the annotation table's BGC column carries the "_x"
    # suffix; keep the full (unsimplified) name under a clearer label.
    new_df3.rename(columns={"BGC_x": "BGC_full"}, inplace=True)
    new_df3 = new_df3[['BGC_name', 'BGC_full', 'Class', 'BiG-SCAPE class',
                       'Product Prediction', 'Clan', 'Family',
                       'Pfam_vector', "Pfam_vector_length", "Pfam_descriptor"]]
    return new_df3  # final dataframe
# ---------------------------------------------------------------------------
# Script driver: run the pipeline over the input directory and write the two
# result tables (CSV) into the current working directory.
# ---------------------------------------------------------------------------
clans_paths = GetClans(curdir)  # paths of every *clans* file found
df_all = ConcatenateClansFiles(clans_paths)  # class/clan/family per BGC
df_pfams_pf, d_counts = GetPfamsPF(curdir)  # Pfam vectors + raw token lists
df_pfams_pfd = GetPfamsPFD(curdir)  # Pfam descriptors per BGC
network_anno = GetNetworkAnno(curdir)  # generic BGC annotations
final_df = JoinAll(df_all, network_anno, df_pfams_pf, df_pfams_pfd)
from collections import Counter
# Build a BGC x Pfam count matrix: Counter(v) tallies each BGC's Pfam tokens,
# the dict-of-Counters dataframe has BGCs as columns, so .T puts one BGC per
# row; missing Pfams become 0.
df_counts_pfams = pd.DataFrame({k:Counter(v) for k, v in d_counts.items()}).fillna(0).astype(int)
df_counts_pfams = df_counts_pfams.T.reset_index()
# NOTE(review): replace(" ", "") only swaps cells whose ENTIRE value is a
# single space -- on this integer matrix it is likely a no-op; presumably a
# whitespace-stripping step was intended.  Verify against real output.
df_counts_pfams = df_counts_pfams.replace(" ","")
df_counts_pfams.rename(columns={"index":"BGC_name"}, inplace=True)
df_counts_pfams.to_csv("Pfams_bgcs_counts.csv", sep=",", index=False)
final_df.to_csv("Clans_and_families.csv", index=False)