-
Notifications
You must be signed in to change notification settings - Fork 18
/
sketch.c
100 lines (96 loc) · 3.01 KB
/
sketch.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <assert.h>
#include <stdio.h>
#include "mppriv.h"
#include "kalloc.h"
#include "kvec-km.h"
static inline uint32_t mp_hash32_mask(uint32_t key, uint32_t mask)
{
key = (key + ~(key << 15)) & mask;
key ^= (key >> 10);
key = (key + (key << 3)) & mask;
key ^= (key >> 6);
key = (key + ~(key << 11)) & mask;
key ^= (key >> 16);
return key;
}
void mp_sketch_prot(void *km, const char *seq, int32_t len, int32_t kmer, int32_t mod_bit, mp64_v *a)
{
int32_t i, l = 0;
uint32_t x = 0, y, mask_k = (1U<<kmer*MP_BITS_PER_AA) - 1, mask_mod = (1U<<mod_bit) - 1;
a->n = 0;
for (i = 0; i < len; ++i) {
int32_t c = ns_tab_aa13[(uint8_t)seq[i]];
if (c < 14) {
#if MP_BITS_PER_AA == 4
x = (x<<MP_BITS_PER_AA | c) & mask_k;
#else
x = (x<<MP_BITS_PER_AA | c>>1) & mask_k;
#endif
if (++l >= kmer) {
y = mp_hash32_mask(x, mask_k);
if ((y&mask_mod) == 0)
kv_push(uint64_t, km, *a, (uint64_t)(y>>mod_bit)<<32 | i);
}
} else x = 0, l = 0;
}
}
void mp_sketch_clean_orf(void *km, const uint8_t *seq, int64_t st, int64_t en, int32_t kmer, int32_t mod_bit, int32_t bbit, int64_t boff, mp64_v *a)
{
int64_t i;
int32_t l;
uint32_t x, y, mask_k = (1U<<kmer*MP_BITS_PER_AA) - 1, mask_mod = (1U<<mod_bit) - 1;
for (i = st, l = 0, x = 0; i < en; i += 3) {
uint8_t codon = seq[i]<<4 | seq[i+1]<<2 | seq[i+2];
assert(seq[i] < 4 && seq[i+1] < 4 && seq[i+2] < 4);
#if MP_BITS_PER_AA == 4
x = (x<<MP_BITS_PER_AA | ns_tab_codon13[codon]) & mask_k;
#else
x = (x<<MP_BITS_PER_AA | ns_tab_codon13[codon]>>1) & mask_k;
#endif
if (++l >= kmer) {
y = mp_hash32_mask(x, mask_k);
assert(y <= mask_k);
if ((y&mask_mod) == 0)
kv_push(uint64_t, km, *a, (uint64_t)(y>>mod_bit)<<32 | (((i+2)>>bbit) + boff));
}
}
}
void mp_sketch_nt4(void *km, const uint8_t *seq, int64_t len, int32_t min_aa_len, int32_t kmer, int32_t mod_bit, int32_t bbit, int64_t boff, mp64_v *a)
{
uint8_t codon, p, q;
int64_t i, j, l, e[3], k[3];
kv_resize(uint64_t, km, *a, len>>2);
a->n = 0;
for (p = 0; p < 3; ++p)
e[p] = -1, k[p] = 0;
for (i = 0, p = 1, codon = 0, l = 0; i < len; ++i, ++p) {
if (p == 3) p = 0;
if (seq[i] < 4) {
codon = (codon << 2 | seq[i]) & 0x3f;
if (++l >= 3) {
uint8_t aa = ns_tab_codon[codon];
if (aa >= 20) {
if (k[p] >= min_aa_len)
mp_sketch_clean_orf(km, seq, e[p] + 1 - k[p] * 3, e[p] + 1, kmer, mod_bit, bbit, boff, a);
k[p] = 0, e[p] = -1;
} else e[p] = i, ++k[p];
}
} else {
for (q = 0; q < 3; ++q) {
if (k[q] >= min_aa_len)
mp_sketch_clean_orf(km, seq, e[q] + 1 - k[q] * 3, e[q] + 1, kmer, mod_bit, bbit, boff, a);
k[q] = 0, e[q] = -1;
}
l = 0, codon = 0;
}
}
for (p = 0; p < 3; ++p)
if (k[p] >= min_aa_len)
mp_sketch_clean_orf(km, seq, e[p] + 1 - k[p] * 3, e[p] + 1, kmer, mod_bit, bbit, boff, a);
if (a->n <= 1) return; // the following loop doesn't work when a->n == 0
radix_sort_mp64(a->a, a->a + a->n);
for (i = 1, j = 0; i < a->n; ++i) // squeeze out identical entries
if (a->a[j] != a->a[i])
a->a[++j] = a->a[i];
a->n = ++j;
}