-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathAnnotations.jl
467 lines (396 loc) · 13.6 KB
/
Annotations.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
# Annotations
# ===========
"""
The `Annotations` type is basically a container for `Dict`s with the annotations of a
multiple sequence alignment. `Annotations` was designed for storage of annotations of
the **Stockholm format**.
MIToS also uses MSA annotations to keep track of:
- **Modifications** of the MSA (`MIToS_...`) as deletion of sequences or columns.
- Positions numbers in the original MSA file (**column mapping:** `ColMap`)
- Position of the residues in the sequence (**sequence mapping:** `SeqMap`)
"""
@auto_hash_equals mutable struct Annotations
file::OrderedDict{String,String}
sequences::Dict{Tuple{String,String},String}
columns::Dict{String,String}
residues::Dict{Tuple{String,String},String}
end
Annotations() = Annotations(
OrderedDict{String,String}(),
Dict{Tuple{String,String},String}(),
Dict{String,String}(),
Dict{Tuple{String,String},String}(),
)
# Length
# ------
function Base.length(a::Annotations)
length(a.file) + length(a.sequences) + length(a.columns) + length(a.residues)
end
# Filters
# -------
# This function is useful because of the Julia issue #12495
function _filter(str::String, mask::AbstractArray{Bool})
@assert length(str) == length(mask) "The string and the mask must have the same length"
# data readable writable
buffer = IOBuffer(Array{UInt8}(undef, lastindex(str)), read = true, write = true)
# To start at the beginning of the buffer:
truncate(buffer, 0)
i = 1
for char in str
@inbounds if mask[i]
write(buffer, char)
end
i += 1
end
String(take!(buffer))
end
_filter(str::String, indexes::AbstractArray{Int}) = String(collect(str)[indexes])
"""
For filter column and sequence mapping of the format: ",,,,10,11,,12"
"""
_filter_mapping(str_map::String, mask) = join(split(str_map, ',')[mask], ',')
"""
`filtersequences!(data::Annotations, ids::Vector{String}, mask::AbstractArray{Bool,1})`
It is useful for deleting sequence annotations. `ids` should be a list of the sequence
names and `mask` should be a logical vector.
"""
function filtersequences!(
data::Annotations,
ids::Vector{String},
mask::AbstractVector{Bool},
)
@assert length(ids) == length(mask) "It's needed one sequence id per element in the mask."
nresannot = length(data.residues)
nseqannot = length(data.sequences)
if nresannot > 0 || nseqannot > 0
del = Set(ids[.!mask])
end
if nresannot > 0
for key in keys(data.residues)
if key[1] in del
delete!(data.residues, key)
end
end
data.residues = sizehint!(data.residues, length(data.residues))
end
if nseqannot > 0
for key in keys(data.sequences)
if key[1] in del
delete!(data.sequences, key)
end
end
data.sequences = sizehint!(data.sequences, length(data.sequences))
end
data
end
"""
`filtercolumns!(data::Annotations, mask)`
It is useful for deleting column annotations (creating a subset in place).
"""
function filtercolumns!(data::Annotations, mask)
if length(data.residues) > 0
for (key, value) in data.residues
data.residues[key] = _filter(value, mask)
end
end
if length(data.columns) > 0
for (key, value) in data.columns
data.columns[key] = _filter(value, mask)
end
end
if length(data.sequences) > 0
for (key, value) in data.sequences
if key[2] == "SeqMap"
data.sequences[key] = _filter_mapping(value, mask)
end
end
end
# vcat can create multiple ColMap annotations in a single MSA; update all of them
colmap_keys = filter(endswith("ColMap"), keys(data.file))
for key in colmap_keys
data.file[key] = _filter_mapping(data.file[key], mask)
end
data
end
# Copy, deepcopy and empty!
# -------------------------
for fun in [:copy, :deepcopy]
@eval begin
Base.$(fun)(ann::Annotations) = Annotations(
$(fun)(ann.file),
$(fun)(ann.sequences),
$(fun)(ann.columns),
$(fun)(ann.residues),
)
end
end
function Base.empty!(ann::Annotations)
empty!(ann.file)
empty!(ann.sequences)
empty!(ann.columns)
empty!(ann.residues)
ann
end
Base.isempty(ann::Annotations) =
isempty(ann.file) &&
isempty(ann.sequences) &&
isempty(ann.columns) &&
isempty(ann.residues)
# merge! and merge
# ----------------
const _MERGE_NOTE = md"""
NOTE: This function does not check for consistency among the various `Annotations`.
For example, it does not verify that `SeqMap` annotations have consistent lengths.
"""
"""
merge!(target::Annotations, sources::Annotations...)
Merge one or more source `Annotations` into a target `Annotations`. This function calls
`Base.merge!` on each of the four dictionaries in the `Annotations` type: `file`,
`sequences`, `columns`, and `residues`. Consequently, it behaves like `Base.merge!` for
dictionaries; if the same key exists in different `Annotations` objects, the value from the
last one is used.
$_MERGE_NOTE
"""
function Base.merge!(target::Annotations, sources::Annotations...)
for source in sources
merge!(target.file, source.file)
merge!(target.sequences, source.sequences)
merge!(target.columns, source.columns)
merge!(target.residues, source.residues)
end
target
end
"""
merge(target::Annotations, sources::Annotations...)
Create a new `Annotations` object by merging two or more `Annotations`. If the same
annotation exists in different `Annotations` objects, the value from the last one is used.
See also `merge!`.
$_MERGE_NOTE
"""
Base.merge(target::Annotations, sources::Annotations...) =
merge!(deepcopy(target), sources...)
# ncolumns
# --------
"""
`ncolumns(ann::Annotations)` returns the number of columns/residues with annotations.
This function returns `-1` if there is not annotations per column/residue.
"""
function ncolumns(ann::Annotations)
for value in values(ann.columns)
return (length(value))
end
for value in values(ann.residues)
return (length(value))
end
-1
end
# Getters
# -------
for (fun, field) in [(:getannotfile, :(ann.file)), (:getannotcolumn, :(ann.columns))]
@eval begin
$(fun)(ann::Annotations) = $(field)
$(fun)(ann::Annotations, feature::String) = getindex($(field), feature)
$(fun)(ann::Annotations, feature::String, default::String) =
get($(field), feature, default)
end
end
for (fun, field) in
[(:getannotsequence, :(ann.sequences)), (:getannotresidue, :(ann.residues))]
@eval begin
$(fun)(ann::Annotations) = $(field)
$(fun)(ann::Annotations, seqname::String, feature::String) =
getindex($(field), (seqname, feature))
$(fun)(ann::Annotations, seqname::String, feature::String, default::String) =
get($(field), (seqname, feature), default)
end
end
@doc """`getannotfile(ann[, feature[,default]])`
It returns per file annotation for `feature`
""" getannotfile
@doc """`getannotcolumn(ann[, feature[,default]])`
It returns per column annotation for `feature`
""" getannotcolumn
@doc """`getannotsequence(ann[, seqname, feature[,default]])`
It returns per sequence annotation for `(seqname, feature)`
""" getannotsequence
@doc """`getannotresidue(ann[, seqname, feature[,default]])`
It returns per residue annotation for `(seqname, feature)`
""" getannotresidue
# Setters
# -------
function _test_feature_name(feature::String)
@assert length(feature) <= 50 "Feature name has a limit of 50 characters."
@assert !occursin(r"\s", feature) "Feature name must not have spaces."
end
function setannotfile!(ann::Annotations, feature::String, annotation::String)
_test_feature_name(feature)
previous = get(ann.file, feature, "")
ann.file[feature] = previous != "" ? string(previous, '\n', annotation) : annotation
end
function setannotsequence!(
ann::Annotations,
seqname::String,
feature::String,
annotation::String,
)
_test_feature_name(feature)
previous = get(ann.sequences, (seqname, feature), "")
ann.sequences[(seqname, feature)] =
previous != "" ? string(previous, '\n', annotation) : annotation
end
function setannotcolumn!(ann::Annotations, feature::String, annotation::String)
_test_feature_name(feature)
len = ncolumns(ann)
if (len == -1) || (len == length(annotation))
setindex!(ann.columns, annotation, feature)
else
throw(
DimensionMismatch(
string(
"You should have exactly 1 char per column (",
len,
" columns/residues)",
),
),
)
end
end
function setannotresidue!(
ann::Annotations,
seqname::String,
feature::String,
annotation::String,
)
_test_feature_name(feature)
len = ncolumns(ann)
if (len == -1) || (len == length(annotation))
setindex!(ann.residues, annotation, (seqname, feature))
else
throw(
DimensionMismatch(
string(
"You should have exactly 1 char per residue (",
len,
" columns/residues)",
),
),
)
end
end
@doc """`setannotfile!(ann, feature, annotation)`
It stores per file `annotation` for `feature`
""" setannotfile!
@doc """`setannotcolumn!(ann, feature, annotation)`
It stores per column `annotation` (1 char per column) for `feature`
""" setannotcolumn!
@doc """`setannotsequence!(ann, seqname, feature, annotation)`
It stores per sequence `annotation` for `(seqname, feature)`
""" setannotsequence!
@doc """`setannotresidue!(ann, seqname, feature, annotation)`
It stores per residue `annotation` (1 char per residue) for `(seqname, feature)`
""" setannotresidue!
# MIToS modification annotations
# ===============================
"""
Annotates on file annotations the modifications realized by MIToS on the MSA. It always
returns `true`, so It can be used in a boolean context.
"""
function annotate_modification!(ann::Annotations, modification::String)
setannotfile!(ann, string("MIToS_", Dates.now()), modification)
true # generally used in a boolean context: annotate && annotate_modification!(...
end
"""
Deletes all the MIToS annotated modifications
"""
function delete_annotated_modifications!(ann::Annotations)
for key in keys(ann.file)
if startswith(key, "MIToS_")
delete!(ann.file, key)
end
end
end
"""
Prints MIToS annotated modifications
"""
function printmodifications(ann::Annotations)
for (key, value) in ann.file
if startswith(key, "MIToS_")
list_k = split(key, '_')
println("-------------------")
println(list_k[2])
println('\n', value)
end
end
end
# Show & Print Annotations
# ========================
function _printfileannotations(io::IO, ann::Annotations)
if !isempty(ann.file)
for (key, value) in ann.file
for val in split(value, '\n')
println(io, string("#=GF ", key, '\t', val))
end
end
end
end
function _printcolumnsannotations(io::IO, ann::Annotations)
if !isempty(ann.columns)
for (key, value) in ann.columns
println(io, string("#=GC ", key, "\t\t\t", value))
end
end
end
function _printsequencesannotations(io::IO, ann::Annotations)
if !isempty(ann.sequences)
for (key, value) in ann.sequences
for val in split(value, '\n')
println(io, string("#=GS ", key[1], '\t', key[2], '\t', val))
end
end
end
end
function _printresiduesannotations(io::IO, ann::Annotations)
if !isempty(ann.residues)
for (key, value) in ann.residues
println(io, string("#=GR ", key[1], '\t', key[2], '\t', value))
end
end
end
function Base.print(io::IO, ann::Annotations)
_printfileannotations(io, ann)
_printsequencesannotations(io, ann)
_printresiduesannotations(io, ann)
_printcolumnsannotations(io, ann)
end
Base.show(io::IO, ann::Annotations) = print(io, ann)
# Rename sequences
# ================
# This private function is used by rename sequences during MSA concatenation,
# but it could be useful for other purposes
"""
_rename_sequences(annotations::Annotations, old2new::Dict{String,String})
Renames sequences in a given `Annotations` object based on a mapping provided in `old2new`.
This function iterates through residue-level and sequence-level annotations in the
provided `Annotations` object. For each annotation, it checks if the sequence name exists
in the `old2new` dictionary. If so, the sequence name is updated to the new name from
`old2new`; otherwise, the original sequence name is retained.
The function then returns a new `Annotations` object with updated sequence names.
"""
function _rename_sequences(annotations::Annotations, old2new::Dict{String,String})
res_annotations = Dict{Tuple{String,String},String}()
for ((seqname, annot_name), value) in getannotresidue(annotations)
seqname = get(old2new, seqname, seqname)
res_annotations[(seqname, annot_name)] = value
end
seq_annotations = Dict{Tuple{String,String},String}()
for ((seqname, annot_name), value) in getannotsequence(annotations)
seqname = get(old2new, seqname, seqname)
seq_annotations[(seqname, annot_name)] = value
end
Annotations(
copy(getannotfile(annotations)),
seq_annotations,
copy(getannotcolumn(annotations)),
res_annotations,
)
end