-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathMSAEditing.jl
389 lines (342 loc) · 12.1 KB
/
MSAEditing.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
# Filters
# =======
# Creation of a boolean mask array
# --------------------------------
# If the input is a matrix, dropdims the singleton dimension
function _column_mask(mask::AbstractMatrix{Bool}, msa)
@assert size(mask, 1) == 1 "The mask should be a vector or a matrix of size (1,ncol)"
@assert size(mask, 2) == ncolumns(msa) "One boolean value per column is needed."
dropdims(mask, dims = 1)
end
function _sequence_mask(mask::AbstractMatrix{Bool}, msa)
@assert size(mask, 2) == 1 "The mask should be a vector or a matrix of size (nseq,1)"
@assert size(mask, 1) == nsequences(msa) "One boolean value per sequence is needed."
dropdims(mask, dims = 2)
end
# If the input is a function, apply it to the respective slice
function _column_mask(mask::Function, msa)
out = mapslices(mask, msa, dims = 1)
if size(out) != (1, ncolumns(msa)) || eltype(out) != Bool
error("The function must return a Bool element per column.")
end
dropdims(out, dims = 1)
end
function _sequence_mask(mask::Function, msa)
out = mapslices(mask, msa, dims = 2)
if size(out) != (nsequences(msa), 1) || eltype(out) != Bool
error("The function must return a Bool element per column.")
end
dropdims(out, dims = 2)
end
# If the mask is an AbstractVector{Bool}, return it without changes
function _column_mask(mask::AbstractVector{Bool}, msa)
@assert length(mask) == ncolumns(msa) "One boolean value per column is needed."
mask
end
function _sequence_mask(mask::AbstractVector{Bool}, msa)
@assert length(mask) == nsequences(msa) "One boolean value per sequence is needed."
mask
end
# If the mask is list of indexes, return it without changes
_column_mask(mask::Union{AbstractArray,Colon}, msa) = mask
# Annotate modifications
# ----------------------
_annotate_modification!(x, selector::Colon, fun::String, column::Bool) = nothing
function _annotate_modification!(x, selector::AbstractArray, fun::String, column::Bool)
entity = column ? "column" : "sequence"
verb = "has"
if eltype(selector) <: Bool
n_deleted = sum(.~selector)
if n_deleted > 0
if n_deleted > 1
entity *= "s"
verb = "have"
end
str = string("$fun : $n_deleted $entity $verb been deleted.")
annotate_modification!(x, str)
end
else
cleaned_selector = unique(selector)
n_selected = length(cleaned_selector)
if n_selected > 0
if n_selected > 1
entity *= "s"
verb = "have"
end
str = string("$fun : $n_selected $entity $verb been selected.")
annotate_modification!(x, str)
if column && n_selected > 1 && !issorted(selector) # do not store sequence order changes
annotate_modification!(x, "$fun : column order has changed!")
end
end
end
end
function _annotate_col_modification!(x, selector)
_annotate_modification!(x, selector, "filtercolumns!", true)
end
function _annotate_seq_modification!(x, selector)
_annotate_modification!(x, selector, "filtersequences!", false)
end
# Filter sequences
# ----------------
"""
It's similar to `filtersequences!` but for an `AbstractMatrix{Residue}`
"""
filtersequences(msa::AbstractMatrix{Residue}, mask) = msa[_sequence_mask(mask, msa), :]
"""
`filtersequences!(msa, mask[, annotate::Bool=true])`
It allows to filter `msa` sequences using a `AbstractVector{Bool}` `mask`
(It removes sequences with `false` values). `AnnotatedMultipleSequenceAlignment` annotations
are updated if `annotate` is `true` (default).
"""
function filtersequences!(
msa::AnnotatedMultipleSequenceAlignment,
mask,
annotate::Bool = true,
)
boolean_vector = _sequence_mask(mask, msa)
filtersequences!(annotations(msa), sequencenames(msa), boolean_vector)
# Filter annotations first, names will be changed by filtersequences:
msa.matrix = filtersequences(namedmatrix(msa), boolean_vector)
if annotate
_annotate_seq_modification!(msa, boolean_vector)
end
msa
end
function filtersequences!(msa::MultipleSequenceAlignment, mask, annotate::Bool = false) # annotate is useful for calling this
# inside other functions
msa.matrix = filtersequences(namedmatrix(msa), _sequence_mask(mask, msa))
msa
end
function filtersequences(
msa::Union{AnnotatedMultipleSequenceAlignment,MultipleSequenceAlignment},
args...,
)
filtersequences!(deepcopy(msa), args...)
end
# Filter columns
# --------------
"""
It's similar to `filtercolumns!` but for an `AbstractMatrix{Residue}`
"""
function filtercolumns(msa::AbstractMatrix{Residue}, mask)
msa[:, _column_mask(mask, msa)]
end
"""
`filtercolumns!(msa, mask[, annotate::Bool=true])`
It allows to filter MSA or aligned sequence columns/positions using a
`AbstractVector{Bool}` `mask`. Annotations are updated if `annotate` is `true` (default).
"""
function filtercolumns!(
x::Union{AnnotatedMultipleSequenceAlignment,AnnotatedAlignedSequence,AnnotatedSequence},
mask,
annotate::Bool = true,
)
selector = _column_mask(mask, x)
filtercolumns!(annotations(x), selector)
x.matrix = filtercolumns(namedmatrix(x), selector)
if annotate
_annotate_col_modification!(x, selector)
end
x
end
function filtercolumns!(x::UnannotatedAlignedObject, mask, annotate::Bool = false) # annotate is useful for calling this
# inside other functions
x.matrix = filtercolumns(namedmatrix(x), _column_mask(mask, x))
x
end
filtercolumns(x::AbstractResidueMatrix, args...) = filtercolumns!(deepcopy(x), args...)
# Util function
# -------------
"""
This function takes a vector of sequence names and a sequence id.
It returns the position of that id in the vector.
If the id isn't in the vector, It throws an error.
"""
function _get_seqid_index(names::Vector{String}, sequence_id::String)
id_index = something(findfirst(isequal(sequence_id), names), 0)
if id_index == 0
throw(ErrorException("$sequence_id is not in the list of sequence names."))
end
id_index
end
# Reference
# ---------
"""
It swaps the names on the positions `i` and `j` of a `Vector{String}`
"""
function _swap!(names::Vector{String}, i::Int, j::Int)
name = names[i, :]
names[i, :] = names[j, :]
names[j, :] = name
names
end
"""
It swaps the sequences on the positions `i` and `j` of an MSA. Also it's possible to swap
sequences using their sequence names/identifiers when the MSA object as names.
"""
function swapsequences!(matrix::Matrix{Residue}, i::Int, j::Int)
seq = matrix[i, :]
matrix[i, :] = matrix[j, :]
matrix[j, :] = seq
return matrix
end
function swapsequences!(matrix::NamedArray, i::Int, j::Int)
swapsequences!(getarray(matrix), i, j)
setnames!(matrix, _swap!(sequencenames(matrix), i, j), 1)
return matrix
end
function swapsequences!(matrix::NamedArray, i::String, j::String)
seqnames = sequencenames(matrix)
swapsequences!(matrix, _get_seqid_index(seqnames, i), _get_seqid_index(seqnames, j))
end
"""
It puts the sequence `i` (name or position) as reference (first sequence) of the MSA. This
function swaps the sequences 1 and `i`.
"""
function setreference!(
msa::AnnotatedMultipleSequenceAlignment,
i::Int,
annotate::Bool = true,
)
swapsequences!(namedmatrix(msa), 1, i)
if annotate
seqnames = sequencenames(msa)
annotate_modification!(
msa,
string(
"setreference! : Using ",
seqnames[1],
" instead of ",
seqnames[i],
" as reference.",
),
)
end
msa
end
function setreference!(msa::MultipleSequenceAlignment, i::Int, annotate::Bool = false)
# The annotate argument is useful for calling this inside other functions
swapsequences!(namedmatrix(msa), 1, i)
msa
end
function setreference!(
msa::NamedResidueMatrix{T},
i::Int,
annotate::Bool = false,
) where {T<:AbstractArray}
swapsequences!(msa, 1, i)
end
function setreference!(
msa::NamedResidueMatrix{T},
id::String,
annotate::Bool = false,
) where {T<:AbstractArray}
swapsequences!(msa, names(msa, 1)[1], id)
end
function setreference!(
msa::AbstractMultipleSequenceAlignment,
id::String,
annotate::Bool = true,
)
setreference!(msa, _get_seqid_index(sequencenames(msa), id), annotate)
end
function setreference!(msa::Matrix{Residue}, i::Int, annotate::Bool = false)
# The annotate argument is useful for calling this inside other functions
swapsequences!(msa, 1, i)
end
"""
Creates a new matrix of residues. This function deletes positions/columns of the MSA with
gaps in the reference (first) sequence.
"""
function adjustreference(msa::AbstractMatrix{Residue}, annotate::Bool = false)
# The annotate argument is useful for calling this inside other functions
msa[:, msa[1, :].!=GAP]
end
"""
It removes positions/columns of the MSA with gaps in the reference (first) sequence.
"""
function adjustreference!(msa::AbstractMultipleSequenceAlignment, annotate::Bool = true)
filtercolumns!(msa, vec(getresidues(getsequence(msa, 1))) .!= GAP, annotate)
end
"""
Creates a new matrix of `Residue`s (MSA) with deleted sequences and columns/positions. The
MSA is edited in the following way:
1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence
2. Removes all the sequences with a coverage with respect to the number of
columns/positions on the MSA **less** than a `coveragelimit`
(default to `0.75`: sequences with 25% of gaps)
3. Removes all the columns/position on the MSA with **more** than a `gaplimit`
(default to `0.5`: 50% of gaps)
"""
function gapstrip(
msa::AbstractMatrix{Residue};
coveragelimit::Float64 = 0.75,
gaplimit::Float64 = 0.5,
)
msa = adjustreference(msa)
# Remove sequences with pour coverage of the reference sequence
if ncolumns(msa) != 0
msa = filtersequences(msa, vec(coverage(msa) .>= coveragelimit))
else
throw("There are not columns in the MSA after the gap trimming")
end
if nsequences(msa) != 0
msa = filtercolumns(msa, vec(columngapfraction(msa) .<= gaplimit))
else
throw("There are not sequences in the MSA after coverage filter")
end
msa
end
"""
This functions deletes/filters sequences and columns/positions on the MSA on the following
order:
1. Removes all the columns/position on the MSA with gaps on the reference (first) sequence.
2. Removes all the sequences with a coverage with respect to the number of
columns/positions on the MSA **less** than a `coveragelimit`
(default to `0.75`: sequences with 25% of gaps).
3. Removes all the columns/position on the MSA with **more** than a `gaplimit`
(default to `0.5`: 50% of gaps).
"""
function gapstrip!(
msa::AbstractMultipleSequenceAlignment,
annotate::Bool = isa(msa, AnnotatedAlignedObject);
coveragelimit::Float64 = 0.75,
gaplimit::Float64 = 0.5,
)
if annotate
annotate_modification!(
msa,
string("gapstrip! : Deletes columns with gaps in the first sequence."),
)
end
adjustreference!(msa, annotate)
# Remove sequences with pour coverage of the reference sequence
if ncolumns(msa) != 0
if annotate
annotate_modification!(
msa,
string(
"gapstrip! : Deletes sequences with a coverage less than ",
coveragelimit,
),
)
end
filtersequences!(msa, vec(coverage(msa) .>= coveragelimit), annotate)
else
throw("There are not columns in the MSA after the gap trimming")
end
# Remove columns with a porcentage of gap greater than gaplimit
if nsequences(msa) != 0
if annotate
annotate_modification!(
msa,
string("gapstrip! : Deletes columns with more than ", gaplimit, " gaps."),
)
end
filtercolumns!(msa, vec(columngapfraction(msa) .<= gaplimit), annotate)
else
throw("There are not sequences in the MSA after coverage filter")
end
msa
end