-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathHobohm.jl
68 lines (63 loc) · 2.06 KB
/
Hobohm.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Hobohm I
# ========
"""
    _fill_hobohmI!(cluster, clustersize, aln, threshold)

Fill the `cluster` and `clustersize` vectors in place using a greedy single pass over
the sequences of the alignment `aln`.

Both vectors are assumed to contain only zeroes on entry and to have a length equal to
the number of sequences in `aln`; this is not checked, and the element accesses are
performed under `@inbounds`.

Sequences are visited in order. Each sequence that has not been assigned yet opens a new
cluster and becomes its reference; every later unassigned sequence for which
`percentidentity(reference, sequence, threshold)` is `true` joins that cluster.
`threshold` is the minimum identity value between two sequences to be in the same
cluster.

On return, `cluster[i]` holds the cluster number of sequence `i` and `clustersize` has
been resized to the number of clusters found, with `clustersize[k]` holding the number
of sequences in cluster `k`. The resized `clustersize` vector is returned. An empty
alignment produces zero clusters.
"""
function _fill_hobohmI!(
    cluster::Vector{Int},
    clustersize::Vector{Int},
    aln::Vector{Vector{Residue}},
    threshold,
)
    cluster_id = 0
    nseq = length(aln)
    # Iterate up to `nseq` itself: for the last sequence the inner range
    # `(i+1):nseq` is empty, so it simply opens its own cluster when still
    # unassigned. This avoids a separate `cluster[nseq]` access, which would
    # read index 0 under `@inbounds` for an empty alignment.
    @inbounds for i = 1:nseq
        # Already absorbed by an earlier reference sequence.
        cluster[i] == 0 || continue
        cluster_id += 1
        cluster[i] = cluster_id
        clustersize[cluster_id] += 1
        ref_seq = aln[i]
        for j = (i+1):nseq
            if cluster[j] == 0 && percentidentity(ref_seq, aln[j], threshold)
                cluster[j] = cluster_id
                clustersize[cluster_id] += 1
            end
        end
    end
    # Drop the unused trailing slots so the length equals the number of clusters.
    return resize!(clustersize, cluster_id)
end
"""
    _get_sequence_weight(clustersize, cluster)

Compute one weight per sequence: the reciprocal of the size of the cluster the sequence
belongs to. `cluster[i]` is used as an index into `clustersize`.

The weights are returned wrapped in a `Weights` object whose second argument is the
number of clusters (`length(clustersize)`) converted to `Float64`.
"""
function _get_sequence_weight(clustersize, cluster)
    weights = Vector{Float64}(undef, length(cluster))
    @inbounds for (k, cid) in enumerate(cluster)
        weights[k] = 1.0 / clustersize[cid]
    end
    nclusters = Float64(length(clustersize))
    return Weights(weights, nclusters)
end
"""
    hobohmI(msa::AbstractMatrix{Residue}, threshold)

Cluster the sequences of `msa` using the Hobohm I method from Hobohm et al.

The residue sequences are extracted from `msa`, assigned to clusters according to
`threshold`, and each sequence gets a weight derived from the size of its cluster.
Return a `Clusters` object built from the cluster sizes, the per-sequence cluster
assignment, and the sequence weights.

# References
- [Hobohm, Uwe, et al. "Selection of representative protein data sets."
  Protein Science 1.3 (1992): 409-417.](@cite 10.1002/pro.5560010313)
"""
function hobohmI(msa::AbstractMatrix{Residue}, threshold)
    sequences = getresiduesequences(msa)
    n = length(sequences)
    # Both buffers start zeroed; the fill step shrinks `sizes` to the cluster count.
    assignment = zeros(Int, n)
    sizes = zeros(Int, n)
    _fill_hobohmI!(assignment, sizes, sequences, threshold)
    weights = _get_sequence_weight(sizes, assignment)
    return Clusters(sizes, assignment, weights)
end