-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrefseq-wo-uniprot.py
134 lines (126 loc) · 3.82 KB
/
refseq-wo-uniprot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from sys import *
import csv, os, json
"""
Set missing UniProt if there is a RefSeq Id which is referred to only once
by UniProt.
Dependecy: List of UniProt-->RefSeq associations created from idmapping.tar.gz
via the script zgrep RefSeq$'\t' idmapping.dat.gz |grep -v - |sed 's/RefSeq\t//g' |sed 's/\..$//g' >uniprot-refseq.tab
Note: Part of items have RefSeq IDs (with presumably identical proteins)
that are referred to by more than one UniProt entry. We take only one of
these UniProt IDs as new ID.
"""
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--output_qs", help="output to QS",
action="store_true")
parser.add_argument("-q", "--query", help="perform SPARQL query",
action="store_true")
args = parser.parse_args()
QS = args.output_qs
dontquery = not args.query
script = os.path.basename(argv[0])[:-3]
ndate = '2020-05-02'
newd = ndate + 'T00:00:00Z'
if dontquery is False:
print('performing query...')
ret = os.popen('wd sparql {}.rq >{}.json'.format(script, script))
if ret.close() is not None:
raise
with open('{}.json'.format(script)) as file:
s = file.read()
jol = json.loads(s)
# taxa not in UniProt
nontaxa = ['Q56085857', 'Q27944478', 'Q21102943', 'Q21102968',
'Q21102941', 'Q21102930']
print('reading UniProt --> RefSeq map')
refs = {}
with open('uniprot-refseq.tab'.format(script)) as file:
for line in file.readlines():
u = line.rstrip().split(sep='\t')
if len(u) != 2:
continue
r = refs.get(u[1])
if r is None:
refs[u[1]] = u[0]
else:
if type(r) is list:
r.append(u[0])
else:
refs[u[1]] = [r, u[0]]
qs = {}
obsf = {}
stmts = {}
for d in jol:
tax = d.get('taxon')
if tax in nontaxa:
continue
rf = d.get('refseq')
fstop = rf.rfind('.')
if fstop != -1:
rf = rf[:fstop]
it = d.get('item')
stmt = d.get('stmt')
stmts[it] = stmt
p31 = d.get('p31')
obsf[it] = p31 == "Q66826848"
uid = refs.get(rf)
if uid is None:
continue
q = qs.get(it)
if type(uid) is list:
if q is None:
qs[it] = set(uid)
else:
q.union(set(uid))
else:
if q is None:
qs[it] = set([uid])
else:
q.add(uid)
for it in qs.keys():
bag = qs.get(it)
if bag is None:
continue
short_id = None
long_id = None
if len(bag) > 1:
for u in list(bag):
if len(u) > 6:
long_id = u
else:
short_id = u
else:
short_id = bag.pop()
if short_id is None:
short_id = long_id
if QS:
print('{}|P352|"{}"|S248|Q905695|S813|+{}/11'.format(it, short_id, newd))
if obsf.get(it):
print('-{}|P31|Q66826848'.format(it))
else:
if obsf.get(it):
stmt = stmts.get(it)
if stmt is None:
print("CAN'T HAPPEN")
continue
j = {"id": it,
"claims": {
"P31": [{ "id": stmt, "remove": True }],
"P352": [ { "value": short_id,
"references": { "P248": "Q905695",
"P813": ndate }}]
} }
else:
j = {"id": it,
"claims": {
"P352": [ { "value": short_id,
"references": { "P248": "Q905695",
"P813": ndate }}]
} }
f = open('t.json', 'w')
f.write(json.dumps(j))
f.close()
print(json.dumps(j), flush=True)
ret = os.popen('wd ee t.json --summary refseq-wo-uniprot')
print(ret.read())
if ret.close() is not None:
print('ERROR')