-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkg_gen.py
396 lines (291 loc) · 11.9 KB
/
kg_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# -*- coding: utf-8 -*-
"""Utils files with all functions relevant to generation of KG."""
from IPython.display import Image
import logging
import pickle
from collections import defaultdict
import networkx as nx
import pandas as pd
import pubchempy as pcp
import requests
from chembl_webresource_client.http_errors import HttpBadRequest, HttpApplicationError
from chembl_webresource_client.new_client import new_client
from pybel import BELGraph
from pybel.dsl import Protein, Abundance, Pathology, BiologicalProcess
from tqdm import tqdm
import time
import seaborn as sns
import pybel
import json
from utils import *
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython.display import display, HTML
from IPython.display import Markdown, display
# Module-level logger. Pass the module's real name (not the literal string
# "__name__") so log records can be filtered and configured per module.
logger = logging.getLogger(__name__)
def searchDisease(name):
    """Search the Open Targets Platform for diseases matching a free-text name.

    Sends a GraphQL ``search`` query restricted to the ``disease`` entity
    (first page, up to 20 hits). When there are hits, they are also rendered
    as an HTML table via ``display``.

    Args:
        name: Disease name to look up; converted with ``str`` before use.

    Returns:
        pandas.DataFrame: Columns ``index``, ``id`` and ``name`` for the hits,
        or an empty DataFrame when the search returned no hits.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    disease_name = str(name)
    query_string = """
query searchAnything ($disname:String!){
search(queryString:$disname,entityNames:"disease",page:{size:20,index:0}){
total
hits {
id
entity
name
description
}
}
}
"""
    variables = {"disname": disease_name}
    # Base URL of the GraphQL API endpoint
    base_url = "https://api.platform.opentargets.org/api/v4/graphql"
    r = requests.post(base_url, json={"query": query_string, "variables": variables})
    # Surface HTTP failures directly instead of as a KeyError/JSON error below.
    r.raise_for_status()
    api_response = json.loads(r.text)
    df = pd.DataFrame(api_response['data']['search']['hits'])
    if not df.empty:
        df = df.drop(columns=['entity', 'description'])
        # Expose the row position as a regular column: createKG asks the user
        # to pick a disease by this index value.
        df['index'] = df.index
        df = df[['index', 'id', 'name']]
        # The former `df.style.hide(...)` calls were dropped: their result was
        # discarded, so they never affected the table shown here.
        display(HTML(df.to_html(index=False)))
    return df
def printmd(string, color=None):
    """Display ``string`` as Markdown wrapped in a ``<span>`` with a CSS colour.

    ``color`` is interpolated verbatim into the span's ``color`` style,
    including the default ``None``.
    """
    markup = f"<span style='color:{color}'>{string}</span>"
    display(Markdown(markup))
def Generate_KG():
    """Interactively ask for a disease name and return the matching diseases.

    Prints a welcome banner once, then prompts for a disease name and looks it
    up with ``searchDisease``. While the search comes back empty the user is
    prompted again. This is a loop rather than the previous self-recursion, so
    repeated typos neither replay the banner nor grow the call stack.

    Returns:
        pandas.DataFrame: The first non-empty result of ``searchDisease``.
    """
    from IPython.display import Image
    printmd("**Welcome to the KG Generator tool. In the following steps, we will need a couple of inputs from your side.**", color="blue")
    # NOTE(review): this Image object is constructed but never passed to
    # display(), so nothing is rendered here; createKG shows the same file via
    # matplotlib. Kept as-is to avoid changing what the caller sees.
    Image(filename='data/KGG.png')
    while True:
        # Short pause so pending output is flushed before input() is shown.
        time.sleep(0.05)
        dis = input('Please enter the disease you are interested in and we will try to find the best matches for you.' + '\n' + '\n' + 'Input: ')
        temp = searchDisease(dis)
        if not temp.empty:
            printmd('**Here you go! Hopefully your disease of interest is in the list. If so, let\'s get started.**')
            return temp
        print('Ooops!! Did you have a typo in the name. Please try again!')
def GetDiseaseAssociatedProteins(disease_id):
    """Fetch Swiss-Prot proteins associated with a disease and filter by score.

    Queries the Open Targets GraphQL API for the disease's associated targets
    (one page of up to 15000 rows), keeps one record per protein id whose
    source is ``uniprot_swissprot``, shows the first 20 records and a
    histogram of the scores, then asks the user for a score threshold.

    The disease id is passed as a GraphQL variable rather than being pasted
    into the query text, matching the other query helpers in this module.

    Args:
        disease_id: Disease identifier used as ``efoId``; converted with ``str``.

    Returns:
        pandas.DataFrame: Columns ``Protein``, ``ENSG``, ``UniProt``,
        ``Source`` and ``Score``, restricted to rows whose ``Score`` is greater
        than or equal to the threshold entered by the user.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        ValueError: If the entered threshold cannot be parsed as a float.
    """
    import matplotlib.pyplot as plt
    efo_id = str(disease_id)
    query_string = """
query associatedTargets($efo_id: String!){
disease(efoId: $efo_id){
id
name
associatedTargets(page:{size:15000,index:0}){
count
rows {
target {
id
approvedSymbol
proteinIds {
id
source
}
}
score
}
}
}
}
"""
    variables = {"efo_id": efo_id}
    # Base URL of the GraphQL API endpoint
    base_url = "https://api.platform.opentargets.org/api/v4/graphql"
    r = requests.post(base_url, json={"query": query_string, "variables": variables})
    # Surface HTTP failures directly instead of as a KeyError/JSON error below.
    r.raise_for_status()
    api_response = json.loads(r.text)
    temp_list = []
    for item in api_response['data']['disease']['associatedTargets']['rows']:
        target = item['target']
        for obj in target['proteinIds']:
            # Only reviewed (Swiss-Prot) UniProt entries are kept.
            if obj['source'] != 'uniprot_swissprot':
                continue
            temp_list.append({
                'Protein': target['approvedSymbol'],
                'ENSG': target['id'],
                'UniProt': obj['id'],
                'Source': obj['source'],
                'Score': item['score'],
            })
    # Explicit columns (and a float Score) keep the frame usable when no
    # Swiss-Prot protein was found; otherwise df['Score'] raises KeyError.
    columns = ['Protein', 'ENSG', 'UniProt', 'Source', 'Score']
    df = pd.DataFrame(temp_list, columns=columns).astype({'Score': float})
    print('We have identified ' + str(len(df)) + ' proteins (Swiss-Prot) associated with the disease. Following is a histogram that shows '
          + 'distribution of proteins based on scores provided by OpenTargets. The scores are influenced by various factors '
          + 'such as genetic associations, expression, mutations, known pathways, targeting drugs and so on.' + '\n')
    print('Displaying top 20 genes')
    df_display = df.head(20)
    display(HTML(df_display.to_html(index=False)))
    fig, ax = plt.subplots()
    ax.hist(df['Score'])
    ax.set_title('Distribution of proteins based on OpenTargets score')
    ax.set_xlabel('Score')
    ax.set_ylabel('No. of proteins')
    fig.tight_layout()
    plt.show()
    print('\n')
    # Short pause so the plot/output is flushed before input() is shown.
    time.sleep(0.05)
    score = input('We recommend taking a threshold above 0.3 to exclude loosely associated proteins. ' + '\n' + 'Please enter your desired threshold: ')
    df = df.loc[df['Score'] >= float(score), :]
    print('\n')
    print('Alright, we are good to go now. Your KG is now being generated! Sit back and relax!!')
    print('\n', 'Total no. of proteins: ', len(df))
    print('\n', df)
    return df
def getDrugCount(disease_id):
    """Return the ``knownDrugs`` ``count`` value Open Targets reports for a disease.

    Args:
        disease_id: Disease identifier sent as the ``efoId`` GraphQL variable.

    Returns:
        The ``count`` field of the disease's ``knownDrugs`` object, exactly as
        found in the decoded API response.
    """
    query_string = """
query associatedTargets($my_efo_id: String!){
disease(efoId: $my_efo_id){
id
name
knownDrugs{
uniqueTargets
uniqueDrugs
count
}
}
}
"""
    # GraphQL endpoint and request payload (query text plus its variables)
    endpoint = "https://api.platform.opentargets.org/api/v4/graphql"
    payload = {"query": query_string, "variables": {"my_efo_id": disease_id}}
    reply = requests.post(endpoint, json=payload)
    # Decode the JSON body and drill down to the count value
    decoded = json.loads(reply.text)
    return decoded['data']['disease']['knownDrugs']['count']
def GetDiseaseAssociatedDrugs(disease_id, CT_phase):
    """Return the ids of drugs known for a disease at a given clinical-trial phase.

    Asks ``getDrugCount`` how many known-drug rows exist, requests that many
    rows from the Open Targets GraphQL API, keeps the rows whose ``phase``
    equals ``int(CT_phase)`` and returns their distinct ``drugId`` values.

    Args:
        disease_id: Disease identifier sent as the ``efoId`` GraphQL variable.
        CT_phase: Clinical-trial phase; must be convertible with ``int``.

    Returns:
        list: Distinct ``drugId`` values in arbitrary order; empty when the
        API returned no rows or none matched the phase.

    Raises:
        ValueError: If ``CT_phase`` cannot be converted to ``int``.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # Validate the phase first so a bad value fails before any network call.
    phase = int(CT_phase)
    efo_id = disease_id
    size = getDrugCount(efo_id)
    query_string = """
query associatedTargets($my_efo_id: String!, $my_size: Int){
disease(efoId: $my_efo_id){
id
name
knownDrugs(size:$my_size){
uniqueTargets
uniqueDrugs
count
rows{
approvedSymbol
approvedName
prefName
drugType
drugId
phase
ctIds
}
}
}
}
"""
    # Variables object of arguments to be passed to the endpoint
    variables = {"my_efo_id": efo_id, "my_size": size}
    # Base URL of the GraphQL API endpoint
    base_url = "https://api.platform.opentargets.org/api/v4/graphql"
    r = requests.post(base_url, json={"query": query_string, "variables": variables})
    # Surface HTTP failures directly instead of as a KeyError/JSON error below.
    r.raise_for_status()
    api_response = json.loads(r.text)
    rows = api_response['data']['disease']['knownDrugs']['rows']
    if not rows:
        # A frame built from no rows has no 'phase' column; indexing it would
        # raise KeyError, so report "no drugs" explicitly.
        return []
    df = pd.DataFrame(rows)
    df = df.loc[df['phase'] == phase, :]
    chembl_list = list(set(df['drugId']))
    return chembl_list
def KG_namespace_plot(final_kg):
    """Plot the namespace counts of a knowledge graph as a bar chart.

    Uses ``pybel.struct.summary.count_namespaces`` to obtain a count per
    namespace and draws it with seaborn: counts on the x axis, namespaces on
    the y axis.

    Args:
        final_kg: Graph object accepted by ``count_namespaces``
            (presumably a ``pybel.BELGraph`` - confirm against callers).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    nspace_count = dict(pybel.struct.summary.count_namespaces(final_kg))
    nspace = pd.DataFrame({
        'Namespace': list(nspace_count.keys()),
        'Number': list(nspace_count.values()),
    })
    plt.figure()
    # Plot from the DataFrame; it used to be built and discarded while the
    # raw dict was handed to seaborn.
    a = sns.barplot(x="Number", y="Namespace", data=nspace)
    a.set(xlabel='Number', ylabel='Namespace', title='KG Namespace in numbers')
    plt.tight_layout()
    plt.show()
def createKG():
    """Run the interactive workflow that builds a disease knowledge graph.

    Shows the banner image, lets the user search for and pick a disease, asks
    for a clinical-trial phase and a graph name, then gathers
    disease-associated proteins and drugs and adds the resulting relations to
    a new ``pybel.BELGraph``.

    Returns:
        The graph object after all relation-building helpers have been applied.
    """
    # Banner image
    banner = mpimg.imread("data/KGG.png")
    plt.imshow(banner)
    plt.axis('off')
    plt.show()

    # Disease search; the user then picks one hit by its index value
    disease_hits = Generate_KG()
    time.sleep(0.1)
    chosen_index = int(input('Please enter the index value of your disease of interest. Input: '))
    print('\n')
    print('Please enter the clinical trial phase of chemicals which should be identified by the workflow. Use a number between 1 (early phase) and 4 (FDA approved). For example, if you use 3, the KG will fetch chemicals that are in phase 3. Also, remember that lower the input value, higher will be the number of identified chemicals and therefore the running time of workflow also increases.')
    time.sleep(0.05)
    print('\n')
    ct_phase = input('Your desired clinical trial phase: ')
    print('\n')
    kg_name = input('Please provide a name for you KG. Input: ')
    print('\n')

    # Proteins and drugs for the selected disease id
    disease_id = disease_hits['id'][chosen_index]
    df_dis2prot = GetDiseaseAssociatedProteins(disease_id)
    chembl_list = GetDiseaseAssociatedDrugs(disease_id, ct_phase)

    # Start from an empty graph
    kg = pybel.BELGraph(name=kg_name, version="0.0.1")
    uprot_ext = ExtractFromUniProt(df_dis2prot['UniProt'])
    print(f'A total of {len(chembl_list)} drugs have been identified. Now fetching relevant data')

    # Mechanism and activity data for the drugs
    chembl2mech = RetMech(chembl_list)
    chembl2act = RetAct(chembl_list)

    # De-duplicated protein identifiers gathered from both data sets
    prtn_as_chembl = list(set(Ret_chembl_protein(chembl2act) + Ret_chembl_protein(chembl2mech)))
    chembl2uprot = chembl2uniprot(prtn_as_chembl)
    chembl2act = chembl2gene2path(chembl2uprot, chembl2act)
    chembl2mech = chembl2gene2path(chembl2uprot, chembl2mech)

    # Add each group of relations to the graph in turn
    kg = uniprot_rel(uprot_ext, 'HGNC', kg)
    kg = chem2moa_rel(chembl2mech, 'HGNC', kg)
    kg = chem2act_rel(chembl2act, 'HGNC', kg)
    kg = gene2path_rel(chembl2uprot, 'HGNC', kg)
    print('Your KG is now generated!')
    return kg