-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathPDB.jl
302 lines (275 loc) · 9.82 KB
/
PDB.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# PDB ids from Pfam sequence annotations
# ======================================
const _regex_PDB_from_GS = r"PDB;\s+(\w+)\s+(\w);\s+\w+-\w+;" # i.e.: "PDB; 2VQC A; 4-73;\n"
"""
Generates from a Pfam `msa` a `Dict{String, Vector{Tuple{String,String}}}`.
Keys are sequence IDs and each value is a list of tuples containing PDB code and chain.
```julia
julia> getseq2pdb(msa)
Dict{String,Array{Tuple{String,String},1}} with 1 entry:
"F112_SSV1/3-112" => [("2VQC","A")]
```
"""
function getseq2pdb(msa::AnnotatedMultipleSequenceAlignment)
dict = Dict{String,Vector{Tuple{String,String}}}()
for (k, v) in getannotsequence(msa)
id, annot = k
# i.e.: "#=GS F112_SSV1/3-112 DR PDB; 2VQC A; 4-73;\n"
if annot == "DR" && occursin(_regex_PDB_from_GS, v)
for m in eachmatch(_regex_PDB_from_GS, v)
if haskey(dict, id)
push!(dict[id], (m.captures[1], m.captures[2]))
else
dict[id] = Tuple{String,String}[(m.captures[1], m.captures[2])]
end
end
end
end
sizehint!(dict, length(dict))
end
# Mapping PDB/Pfam
# ================
"""
`msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, siftsfile; strict=false, checkpdbname=false, missings=true)`
This function returns a `OrderedDict{Int,String}` with **MSA column numbers on the input file**
as keys and PDB residue numbers (`""` for missings) as values. The mapping is performed
using SIFTS. This function needs correct *ColMap* and *SeqMap* annotations. This checks
correspondence of the residues between the MSA sequence and SIFTS
(It throws a warning if there are differences). Missing residues are included if the
keyword argument `missings` is `true` (default: `true`). If the keyword argument `strict`
is `true` (default: `false`), throws an Error, instead of a Warning, when residues don't
match. If the keyword argument `checkpdbname` is `true` (default: `false`), throws an Error
if the three letter name of the PDB residue isn't the MSA residue. If you are working with
a **downloaded Pfam MSA without modifications**, you should `read` it using
`generatemapping=true` and `useidcoordinates=true`. If you don't indicate the path to the
`siftsfile` used in the mapping, this function downloads the SIFTS file in the current
folder. If you don't indicate the Pfam accession number (`pfamid`), this function tries to
read the *AC* file annotation.
"""
function msacolumn2pdbresidue(
msa::AnnotatedMultipleSequenceAlignment,
seqid::String,
pdbid::String,
chain::String,
pfamid::String,
siftsfile::String;
strict::Bool = false,
checkpdbname::Bool = false,
missings::Bool = true,
)
siftsres = read_file(siftsfile, SIFTSXML, chain = chain, missings = missings)
up2res = OrderedDict{String,Tuple{String,String,Char}}()
for res in siftsres
if !ismissing(res.Pfam) && res.Pfam.id == uppercase(pfamid)
pfnum = res.Pfam.number
if pfnum == ""
continue
end
pfname = res.Pfam.name
if !ismissing(res.PDB) && (res.PDB.id == lowercase(pdbid)) && !res.missing
up2res[pfnum] =
checkpdbname ? (pfname, res.PDB.number, three2residue(res.PDB.name)) :
(pfname, res.PDB.number, '-')
else
up2res[pfnum] =
checkpdbname ?
(pfname, "", ismissing(res.PDB) ? "" : three2residue(res.PDB.name)) :
(pfname, "", '-')
end
end
end
seq = Char[x for x in vec(getsequence(msa, seqid))]
seqmap = getsequencemapping(msa, seqid)
colmap = getcolumnmapping(msa)
N = ncolumns(msa)
m = OrderedDict{Int,String}()
sizehint!(m, N)
for i = 1:N
up_number = string(seqmap[i])
if up_number != "0"
up_res, pdb_resnum, pdb_res = get(up2res, up_number, ("", "", '-'))
if string(seq[i]) == up_res
m[colmap[i]] = pdb_resnum
else
msg = string(
pfamid,
" ",
seqid,
" ",
pdbid,
" ",
chain,
" : MSA sequence residue at ",
i,
" (",
seq[i],
") != SIFTS residue (UniProt/Pfam: ",
up_res,
", PDB: ",
pdb_resnum,
")",
)
strict ? throw(ErrorException(msg)) : @warn(msg)
end
if (checkpdbname && (seq[i] != pdb_res))
msg = string(
pfamid,
" ",
seqid,
" ",
pdbid,
" ",
chain,
" : MSA sequence residue at ",
i,
" (",
seq[i],
") != PDB residue at ",
pdb_resnum,
" (",
pdb_res,
")",
)
throw(ErrorException(msg))
end
end
end
m
end
function msacolumn2pdbresidue(
msa::AnnotatedMultipleSequenceAlignment,
seqid::String,
pdbid::String,
chain::String,
pfamid::String;
kargs...,
)
msacolumn2pdbresidue(msa, seqid, pdbid, chain, pfamid, downloadsifts(pdbid), kargs...)
end
function msacolumn2pdbresidue(
msa::AnnotatedMultipleSequenceAlignment,
seqid::String,
pdbid::String,
chain::String;
kargs...,
)
msacolumn2pdbresidue(
msa,
seqid,
pdbid,
chain,
String(split(getannotfile(msa, "AC"), '.')[1]),
kargs...,
)
end
"""
Returns a `BitVector` where there is a `true` for each column with PDB residue.
"""
function hasresidues(
msa::AnnotatedMultipleSequenceAlignment,
column2residues::AbstractDict{Int,String},
)
colmap = getcolumnmapping(msa)
ncol = length(colmap)
mask = falses(ncol)
for i = 1:ncol
if get(column2residues, colmap[i], "") != ""
mask[i] = true
end
end
mask
end
# PDB residues for each column
# ============================
"""
This function takes an `AnnotatedMultipleSequenceAlignment` with correct *ColMap*
annotations and two dicts:
1. The first is an `OrderedDict{String,PDBResidue}` from PDB residue number to `PDBResidue`.
2. The second is a `Dict{Int,String}` from MSA column number **on the input file** to PDB residue number.
`msaresidues` returns an `OrderedDict{Int,PDBResidue}` from input column number (ColMap)
to `PDBResidue`. Residues on inserts are not included.
"""
function msaresidues(
msa::AnnotatedMultipleSequenceAlignment,
residues::AbstractDict{String,PDBResidue},
column2residues::AbstractDict{Int,String},
)
colmap = getcolumnmapping(msa)
msares = sizehint!(OrderedDict{Int,PDBResidue}(), length(colmap))
for col in colmap
resnum = get(column2residues, col, "")
if resnum != ""
if haskey(residues, resnum)
msares[col] = residues[resnum]
else
@warn(
"MSA column $col : The residue number $resnum isn't in the residues Dict."
)
end
end
end
sizehint!(msares, length(msares))
end
# Contact Map
# ===========
"""
This function takes an `AnnotatedMultipleSequenceAlignment` with correct *ColMap*
annotations and two dicts:
1. The first is an `OrderedDict{String,PDBResidue}` from PDB residue number to `PDBResidue`.
2. The second is a `Dict{Int,String}` from **MSA column number on the input file** to PDB residue number.
`msacontacts` returns a `PairwiseListMatrix{Float64,false}` of `0.0` and `1.0` where `1.0`
indicates a residue contact. Contacts are defined with an inter residue distance less or
equal to `distance_limit` (default to `6.05`) angstroms between any heavy atom. `NaN`
indicates a missing value.
"""
function msacontacts(
msa::AnnotatedMultipleSequenceAlignment,
residues::AbstractDict{String,PDBResidue},
column2residues::AbstractDict{Int,String},
distance_limit::Float64 = 6.05,
)
colmap = getcolumnmapping(msa)
contacts = columnpairsmatrix(msa)
plm = getarray(contacts)
@inbounds @iterateupper plm false begin
resi = get(column2residues, colmap[i], "")
resj = get(column2residues, colmap[j], "")
if resi != "" && resj != "" && haskey(residues, resi) && haskey(residues, resj)
list[k] = Float64(contact(residues[resi], residues[resj], distance_limit))
else
list[k] = NaN
end
end
contacts
end
# AUC (contact prediction)
# ========================
"""
This function takes a `msacontacts` or its list of contacts `contact_list` with 1.0 for
true contacts and 0.0 for not contacts (NaN or other numbers for missing values).
Returns two `BitVector`s, the first with `true`s where `contact_list` is 1.0 and the second
with `true`s where `contact_list` is 0.0. There are useful for AUC calculations.
"""
function getcontactmasks(contact_list::Vector{T}) where {T<:AbstractFloat}
N = length(contact_list)
true_contacts = falses(N)
false_contacts = falses(N)
@inbounds for i = 1:N
value = contact_list[i]
if value == 1.0
true_contacts[i] = true
elseif value == 0.0
false_contacts[i] = true
end
# If value is NaN, It keeps the false value
end
true_contacts, false_contacts
end
function getcontactmasks(plm::PairwiseListMatrix{T,false,VT}) where {T<:AbstractFloat,VT}
getcontactmasks(getlist(plm))
end
function getcontactmasks(
nplm::NamedArray{T,2,PairwiseListMatrix{T,false,TV},DN},
) where {T,TV,DN}
getcontactmasks(getarray(nplm))
end