-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhenikoffweight.cpp
84 lines (72 loc) · 2.44 KB
/
henikoffweight.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include "muscle.h"
#include "msa.h"
/***
Compute Henikoff weights.
Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
J. Mol. Biol., 243(4):574-578.
Award each different residue an equal share of the weight, and then to divide up
that weight equally among the sequences sharing the same residue. So if in a
position of a multiple alignment, r different residues are represented, a residue
represented in only one sequence contributes a score of 1/r to that sequence, whereas a
residue represented in s sequences contributes a score of 1/rs to each of the s
sequences. For each sequence, the contributions from each position are summed to give
a sequence weight.
See also HenikoffWeightPB.
***/
void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const
{
const unsigned uSeqCount = GetSeqCount();
// Compute letter counts in this column
unsigned uLetterCount[MAX_ALPHA];
memset(uLetterCount, 0, sizeof(uLetterCount));
unsigned uDifferentLetterCount = 0;
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
if (uLetter >= 20)
continue;
unsigned uNewCount = uLetterCount[uLetter] + 1;
uLetterCount[uLetter] = uNewCount;
if (1 == uNewCount)
++uDifferentLetterCount;
}
// Compute weight contributions
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
{
unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex);
if (uLetter >= 20)
continue;
const unsigned uCount = uLetterCount[uLetter];
unsigned uDenom = uCount*uDifferentLetterCount;
if (uDenom == 0)
continue;
m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom);
}
}
void MSA::SetHenikoffWeights() const
{
const unsigned uColCount = GetColCount();
const unsigned uSeqCount = GetSeqCount();
if (0 == uSeqCount)
return;
else if (1 == uSeqCount)
{
m_Weights[0] = (WEIGHT) 1.0;
return;
}
else if (2 == uSeqCount)
{
m_Weights[0] = (WEIGHT) 0.5;
m_Weights[1] = (WEIGHT) 0.5;
return;
}
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
m_Weights[uSeqIndex] = 0.0;
for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
CalcHenikoffWeightsCol(uColIndex);
// Set all-gap seqs weight to 0
for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
if (IsGapSeq(uSeqIndex))
m_Weights[uSeqIndex] = 0.0;
Normalize(m_Weights, uSeqCount);
}