forked from RTXteam/RTX-KG2
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathjensenlab_tsv_to_kg_jsonl.py
executable file
·191 lines (169 loc) · 8.41 KB
/
jensenlab_tsv_to_kg_jsonl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/env python3
''' jensenlab_tsv_to_kg_jsonl.py: Extracts KG2 JSON Lines node and edge
files from the Jensen Lab filtered text mining channel tsv file.
Usage: jensenlab_tsv_to_kg_jsonl.py [--test] <inputDirectory>
<outputNodesFile.jsonl> <outputEdgesFile.jsonl>
'''
import csv
import sys
import re
import kg2_util
import argparse
import datetime
from collections import defaultdict
from typing import *
__author__ = 'Lindsey Kvarfordt'
__copyright__ = 'Oregon State University'
__credits__ = ['Stephen Ramsey', 'Lindsey Kvarfordt']
__license__ = 'MIT'
__version__ = '0.1.0'
__maintainer__ = ''
__email__ = ''
__status__ = 'Prototype'
csv.field_size_limit(sys.maxsize)
# For now, only HGNC gene ids are used, to keep the size of this ETL manageable.
# regex from https://www.uniprot.org/help/accession_numbers
# REGEX_UNIPROT_ID = re.compile(r'^[P,Q,O][0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]$')
# regex from multi_ont_to_kg_json.py
# REGEX_ENSEMBL_ID = re.compile('ENS[A-Z]{0,3}([PG])[0-9]{11}')
def get_args():
    """Parse this script's command-line arguments.

    Accepts an optional ``--test`` flag (stored as ``test``, default False)
    followed by three required positional strings: ``inputDirectory``,
    ``outputNodesFile`` and ``outputEdgesFile``.

    Returns:
        The ``argparse.Namespace`` produced by parsing ``sys.argv``.
    """
    description = ('jensenlab_tsv_to_kg_json.py: '
                   'Extracts a KG2 JSON file from the '
                   'Jensen Lab filtered text mining channel tsv file.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--test',
                        dest='test',
                        action="store_true",
                        default=False)
    # Original author's note: inputDirectory is kg2-build/jensenlab.
    for positional_name in ('inputDirectory',
                            'outputNodesFile',
                            'outputEdgesFile'):
        parser.add_argument(positional_name, type=str)
    return parser.parse_args()
def date():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d %H:%M:%S}"
def make_gene_id_dictionary(human_names_file:str, human_entities_file:str) -> Dict[str, list]:
    """Map JensenLab string ids to the KG2 identifiers known for them.

    The entities file is read as tab-separated rows where column 2 is used
    as the string id and column 0 as the dictionary serial number. The names
    file is read as tab-separated rows where column 0 is the serial number
    and column 1 is an external name/identifier; only names accepted by
    ``_reformat_id`` are kept.

    Args:
        human_names_file: path to the tab-separated names file.
        human_entities_file: path to the tab-separated entities file.

    Returns:
        A dict mapping each string id to the non-empty list of KG2 ids
        collected for its serial number. String ids with no accepted names
        are omitted.

    Raises:
        IndexError: if a non-blank row has fewer columns than are read.
    """
    # string id -> dictionary serial number
    serial_no_by_string_id = dict()
    with open(human_entities_file, 'r') as inp:
        for row in csv.reader(inp, delimiter="\t"):
            if not row:
                # csv.reader yields [] for a blank line; nothing to index.
                continue
            serial_no_by_string_id[row[2]] = row[0]

    # dictionary serial number -> [external ids usable in KG2]
    kg2_ids_by_serial_no = defaultdict(list)
    with open(human_names_file, "r") as inp:
        for row in csv.reader(inp, delimiter="\t"):
            if not row:
                continue
            identifier = _reformat_id(row[1])
            if identifier is not None:
                kg2_ids_by_serial_no[row[0]].append(identifier)

    # string id -> [external ids usable in KG2]
    gene_id_dict = dict()
    for string_id, serial_no in serial_no_by_string_id.items():
        # .get() avoids inserting an empty list into the defaultdict for
        # every entity that has no accepted names.
        kg2_ids = kg2_ids_by_serial_no.get(serial_no)
        if kg2_ids:
            gene_id_dict[string_id] = kg2_ids
    return gene_id_dict
def make_gene_pmids_dict(gene_ids:set, filename:str) -> Dict[str, set]:
    """Read the gene-to-publications TSV and return PMID CURIEs per gene.

    Each non-blank row must have exactly two tab-separated columns: a gene id
    and a whitespace-separated list of PMID numbers.

    Args:
        gene_ids: the gene ids to keep; rows for any other gene are ignored.
        filename: path to the tab-separated gene/PMIDs file.

    Returns:
        A dict mapping each kept gene id to a set of ``"PMID:<number>"``
        strings. A gene whose PMID column is empty maps to an empty set.

    Raises:
        ValueError: if a non-blank row does not have exactly two columns.
    """
    gene_pmids_dict = dict()
    with open(filename, 'r') as inp:
        tsvin = csv.reader(inp, delimiter="\t")
        for row in tsvin:
            if not row:
                # csv.reader yields [] for a blank line; skip it rather than
                # failing the two-value unpack below.
                continue
            gene_id, pmidlist = row
            if gene_id not in gene_ids:
                continue
            # split() with no argument drops empty tokens, so an empty or
            # double-spaced column never produces the bogus CURIE "PMID:".
            gene_pmids_dict[gene_id] = {"PMID:" + idnum for idnum in pmidlist.split()}
    return gene_pmids_dict
def make_disease_pmids_dict(filename:str) -> Dict[str,set]:
    """Read the disease-to-publications TSV and return PMID CURIEs per disease.

    Each non-blank row must have exactly two tab-separated columns: a disease
    id and a whitespace-separated list of PMID numbers. Only rows whose
    disease id contains the substring ``"DOID"`` are kept.

    Args:
        filename: path to the tab-separated disease/PMIDs file.

    Returns:
        A dict mapping each kept disease id to a set of ``"PMID:<number>"``
        strings. A disease whose PMID column is empty maps to an empty set.

    Raises:
        ValueError: if a non-blank row does not have exactly two columns.
    """
    disease_pmids_dict = dict()
    with open(filename, 'r') as inp:
        tsvin = csv.reader(inp, delimiter="\t")
        for row in tsvin:
            if not row:
                # csv.reader yields [] for a blank line; skip it rather than
                # failing the two-value unpack below.
                continue
            disease_id, pmidlist = row
            if "DOID" not in disease_id:
                continue
            # split() with no argument drops empty tokens, so an empty or
            # double-spaced column never produces the bogus CURIE "PMID:".
            disease_pmids_dict[disease_id] = {"PMID:" + idnum for idnum in pmidlist.split()}
    return disease_pmids_dict
def _reformat_id(id:str):
    """Return ``id`` unchanged if it contains "HGNC", otherwise None.

    HGNC ids are already formatted the same way as KG2 node ids, so no
    rewriting is needed for them.
    """
    # For now, only HGNC gene ids are used, to keep the size of this ETL
    # manageable. Possible future extensions:
    #uniprot_match = REGEX_UNIPROT_ID.match(id)
    #if uniprot_match is not None:
    #    return "UniProtKB:"+id
    #ensembl_match = REGEX_ENSEMBL_ID.match(id)
    #if ensembl_match is not None:
    #    return "ENSEMBL:"+id
    return id if "HGNC" in id else None
def make_edges(input_tsv:str, gene_id_dict:Dict[str,list], pmids_dict:Dict[str,Dict[str,set]], edges_output, test_mode: bool) -> None:
    """Write one gene-disease association edge per qualifying TSV row and KG2 gene id.

    Each non-blank row of ``input_tsv`` must have exactly seven tab-separated
    columns: gene id, gene name, disease id, disease name, z-score, an
    ignored column, and a source URL. A row produces edges only if its gene
    id is in ``gene_id_dict``, its z-score is at least 3.0, and its disease
    id is a key of ``pmids_dict['disease']``.

    Args:
        input_tsv: path to the tab-separated text-mining associations file.
        gene_id_dict: JensenLab gene id -> list of KG2 gene ids.
        pmids_dict: ``{'gene': {gene id: PMID set}, 'disease': {disease id: PMID set}}``.
        edges_output: writer object; each edge dict is passed to its
            ``write`` method.
        test_mode: if true, stop once more than 1000 distinct gene ids have
            been seen (checked only after rows that pass the gene and
            z-score filters).

    Returns:
        None. Edges are emitted through ``edges_output`` and two summary
        lines are printed.

    Raises:
        ValueError: if a non-blank row does not have exactly seven columns
            or its z-score is not a number.
    """
    gene_ids_actually_used = set()
    update_date = datetime.datetime.now().replace(microsecond=0).isoformat()
    gene_pmids = pmids_dict['gene']
    disease_pmids = pmids_dict['disease']
    # A mapped gene may have no row in the gene PMID file; treat it as having
    # no publications instead of raising KeyError.
    no_gene_pmids = frozenset()
    with open(input_tsv) as inp:
        tsvin = csv.reader(inp, delimiter="\t")
        for row in tsvin:
            if not row:
                # csv.reader yields [] for a blank line.
                continue
            [gene_id,
             _gene_name,
             disease_id,
             _disease_name,
             z_score,
             _,
             _source_url] = row
            gene_ids_actually_used.add(gene_id)
            kg2_gene_id_list = gene_id_dict.get(gene_id, None)
            if kg2_gene_id_list is None:
                # print(f"Missing kg2 equivalent gene ids for {gene_id}. Skipping")
                continue
            if float(z_score) < 3.0:
                continue
            # The disease lookup and the shared-publication set depend only
            # on the row, so compute them once rather than per KG2 gene id.
            disease_publications = disease_pmids.get(disease_id, None)
            if disease_publications is not None:
                shared_publications = gene_pmids.get(gene_id, no_gene_pmids).intersection(disease_publications)
                # Sort before truncating so the (at most) 30 publications kept
                # for size constraints are the same on every run; set
                # iteration order of strings varies with hash randomization.
                publications_list = sorted(shared_publications)[:30]
                for kg2_gene_id in kg2_gene_id_list:
                    # NOTE(review): argument meaning is defined by
                    # kg2_util.make_edge; presumably subject, object,
                    # relation CURIE, relation label, source, update date.
                    edge = kg2_util.make_edge(kg2_gene_id,
                                              disease_id,
                                              "JensenLab:associated_with",
                                              "associated_with",
                                              kg2_util.CURIE_ID_JENSENLAB,
                                              update_date)
                    # seems hacky, but following example in rtx_kg1_neo4j_to_kg_json.py
                    publication_info_dict = {'publication date': None,
                                             'sentence': None,
                                             'subject score': None,
                                             'object score': str(z_score)}
                    publications_info = {edge['object']: publication_info_dict}
                    # Fresh list per edge so edges never share mutable state.
                    edge["publications"] = list(publications_list)
                    edge["publications_info"] = publications_info
                    edges_output.write(edge)
            # else: disease id is not in the DOID-filtered PMID dict; no edge.
            if test_mode and len(gene_ids_actually_used) > 1000:
                break
    # NOTE: the first count below is of distinct gene ids without a KG2
    # mapping, not of rows, despite the wording of the message.
    used_genes_missing_ids = gene_ids_actually_used - set(gene_id_dict.keys())
    print(f"Skipped {len(used_genes_missing_ids)} rows for lack of kg2 gene ids.")
    print(f"Found {len(gene_ids_actually_used - used_genes_missing_ids)} used kg2 gene ids.")
if __name__ == '__main__':
    # Script entry point: build the id/publication lookups from the input
    # directory, write the association edges, then write the single source
    # node and finalize both output files.
    print("Start time: ", date())
    args = get_args()
    output_nodes_file_name = args.outputNodesFile
    output_edges_file_name = args.outputEdgesFile
    test_mode = args.test

    # The first element of each returned info object is the writer used below.
    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
    nodes_output, edges_output = nodes_info[0], edges_info[0]

    # Locations of the input files inside the input directory.
    input_dir = args.inputDirectory
    dictionary_dir = f"{input_dir}/human_dictionary"
    human_names_file = f"{dictionary_dir}/human_names.tsv"
    human_entities_file = f"{dictionary_dir}/human_entities.tsv"
    edges_tsv_file = f"{input_dir}/human_disease_textmining_full.tsv"
    gene_publications_file = f"{input_dir}/gene_pmids.tsv"
    disease_publications_file = f"{input_dir}/disease_pmids.tsv"

    # Lookup tables consumed by make_edges.
    gene_id_dict = make_gene_id_dictionary(human_names_file, human_entities_file)
    pmids_dict = {
        "gene": make_gene_pmids_dict(set(gene_id_dict.keys()), gene_publications_file),
        "disease": make_disease_pmids_dict(disease_publications_file),
    }

    make_edges(edges_tsv_file, gene_id_dict, pmids_dict, edges_output, test_mode)

    # Source node describing JensenLab itself, stamped after the edges are written.
    update_date = datetime.datetime.now().replace(microsecond=0).isoformat()
    nodes_output.write(kg2_util.make_node(kg2_util.CURIE_ID_JENSENLAB,
                                          kg2_util.BASE_URL_JENSENLAB,
                                          "Jensen Lab Disease Gene Associations",
                                          kg2_util.SOURCE_NODE_CATEGORY,
                                          update_date,
                                          kg2_util.CURIE_ID_JENSENLAB))

    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
    print("Finish time: ", date())