forked from lh3/minimap2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hit.c
466 lines (438 loc) · 15 KB
/
hit.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include "mmpriv.h"
#include "kalloc.h"
#include "khash.h"
static inline void mm_cal_fuzzy_len(mm_reg1_t *r, const mm128_t *a)
{
int i;
r->mlen = r->blen = 0;
if (r->cnt <= 0) return;
r->mlen = r->blen = a[r->as].y>>32&0xff;
for (i = r->as + 1; i < r->as + r->cnt; ++i) {
int span = a[i].y>>32&0xff;
int tl = (int32_t)a[i].x - (int32_t)a[i-1].x;
int ql = (int32_t)a[i].y - (int32_t)a[i-1].y;
r->blen += tl > ql? tl : ql;
r->mlen += tl > span && ql > span? span : tl < ql? tl : ql;
}
}
static inline void mm_reg_set_coor(mm_reg1_t *r, int32_t qlen, const mm128_t *a, int is_qstrand)
{ // NB: r->as and r->cnt MUST BE set correctly for this function to work
int32_t k = r->as, q_span = (int32_t)(a[k].y>>32&0xff);
r->rev = a[k].x>>63;
r->rid = a[k].x<<1>>33;
r->rs = (int32_t)a[k].x + 1 > q_span? (int32_t)a[k].x + 1 - q_span : 0; // NB: target span may be shorter, so this test is necessary
r->re = (int32_t)a[k + r->cnt - 1].x + 1;
if (!r->rev || is_qstrand) {
r->qs = (int32_t)a[k].y + 1 - q_span;
r->qe = (int32_t)a[k + r->cnt - 1].y + 1;
} else {
r->qs = qlen - ((int32_t)a[k + r->cnt - 1].y + 1);
r->qe = qlen - ((int32_t)a[k].y + 1 - q_span);
}
mm_cal_fuzzy_len(r, a);
}
static inline uint64_t hash64(uint64_t key)
{
key = (~key + (key << 21));
key = key ^ key >> 24;
key = ((key + (key << 3)) + (key << 8));
key = key ^ key >> 14;
key = ((key + (key << 2)) + (key << 4));
key = key ^ key >> 28;
key = (key + (key << 31));
return key;
}
mm_reg1_t *mm_gen_regs(void *km, uint32_t hash, int qlen, int n_u, uint64_t *u, mm128_t *a, int is_qstrand) // convert chains to hits
{
mm128_t *z, tmp;
mm_reg1_t *r;
int i, k;
if (n_u == 0) return 0;
// sort by score
z = (mm128_t*)kmalloc(km, n_u * 16);
for (i = k = 0; i < n_u; ++i) {
uint32_t h;
h = (uint32_t)hash64((hash64(a[k].x) + hash64(a[k].y)) ^ hash);
z[i].x = u[i] ^ h; // u[i] -- higher 32 bits: chain score; lower 32 bits: number of seeds in the chain
z[i].y = (uint64_t)k << 32 | (int32_t)u[i];
k += (int32_t)u[i];
}
radix_sort_128x(z, z + n_u);
for (i = 0; i < n_u>>1; ++i) // reverse, s.t. larger score first
tmp = z[i], z[i] = z[n_u-1-i], z[n_u-1-i] = tmp;
// populate r[]
r = (mm_reg1_t*)calloc(n_u, sizeof(mm_reg1_t));
for (i = 0; i < n_u; ++i) {
mm_reg1_t *ri = &r[i];
ri->id = i;
ri->parent = MM_PARENT_UNSET;
ri->score = ri->score0 = z[i].x >> 32;
ri->hash = (uint32_t)z[i].x;
ri->cnt = (int32_t)z[i].y;
ri->as = z[i].y >> 32;
ri->div = -1.0f;
mm_reg_set_coor(ri, qlen, a, is_qstrand);
}
kfree(km, z);
return r;
}
void mm_mark_alt(const mm_idx_t *mi, int n, mm_reg1_t *r)
{
int i;
if (mi->n_alt == 0) return;
for (i = 0; i < n; ++i)
if (mi->seq[r[i].rid].is_alt)
r[i].is_alt = 1;
}
static inline int mm_alt_score(int score, float alt_diff_frac)
{
if (score < 0) return score;
score = (int)(score * (1.0 - alt_diff_frac) + .499);
return score > 0? score : 1;
}
void mm_split_reg(mm_reg1_t *r, mm_reg1_t *r2, int n, int qlen, mm128_t *a, int is_qstrand)
{
if (n <= 0 || n >= r->cnt) return;
*r2 = *r;
r2->id = -1;
r2->sam_pri = 0;
r2->p = 0;
r2->split_inv = 0;
r2->cnt = r->cnt - n;
r2->score = (int32_t)(r->score * ((float)r2->cnt / r->cnt) + .499);
r2->as = r->as + n;
if (r->parent == r->id) r2->parent = MM_PARENT_TMP_PRI;
mm_reg_set_coor(r2, qlen, a, is_qstrand);
r->cnt -= r2->cnt;
r->score -= r2->score;
mm_reg_set_coor(r, qlen, a, is_qstrand);
r->split |= 1, r2->split |= 2;
}
void mm_set_parent(void *km, float mask_level, int mask_len, int n, mm_reg1_t *r, int sub_diff, int hard_mask_level, float alt_diff_frac) // and compute mm_reg1_t::subsc
{
int i, j, k, *w;
uint64_t *cov;
if (n <= 0) return;
for (i = 0; i < n; ++i) r[i].id = i;
cov = (uint64_t*)kmalloc(km, n * sizeof(uint64_t));
w = (int*)kmalloc(km, n * sizeof(int));
w[0] = 0, r[0].parent = 0;
for (i = 1, k = 1; i < n; ++i) {
mm_reg1_t *ri = &r[i];
int si = ri->qs, ei = ri->qe, n_cov = 0, uncov_len = 0;
if (hard_mask_level) goto skip_uncov;
for (j = 0; j < k; ++j) { // traverse existing primary hits to find overlapping hits
mm_reg1_t *rp = &r[w[j]];
int sj = rp->qs, ej = rp->qe;
if (ej <= si || sj >= ei) continue;
if (sj < si) sj = si;
if (ej > ei) ej = ei;
cov[n_cov++] = (uint64_t)sj<<32 | ej;
}
if (n_cov == 0) {
goto set_parent_test; // no overlapping primary hits; then i is a new primary hit
} else if (n_cov > 0) { // there are overlapping primary hits; find the length not covered by existing primary hits
int j, x = si;
radix_sort_64(cov, cov + n_cov);
for (j = 0; j < n_cov; ++j) {
if ((int)(cov[j]>>32) > x) uncov_len += (cov[j]>>32) - x;
x = (int32_t)cov[j] > x? (int32_t)cov[j] : x;
}
if (ei > x) uncov_len += ei - x;
}
skip_uncov:
for (j = 0; j < k; ++j) { // traverse existing primary hits again
mm_reg1_t *rp = &r[w[j]];
int sj = rp->qs, ej = rp->qe, min, max, ol;
if (ej <= si || sj >= ei) continue; // no overlap
min = ej - sj < ei - si? ej - sj : ei - si;
max = ej - sj > ei - si? ej - sj : ei - si;
ol = si < sj? (ei < sj? 0 : ei < ej? ei - sj : ej - sj) : (ej < si? 0 : ej < ei? ej - si : ei - si); // overlap length; TODO: this can be simplified
if ((float)ol / min - (float)uncov_len / max > mask_level && uncov_len <= mask_len) { // then this is a secondary hit
int cnt_sub = 0, sci = ri->score;
ri->parent = rp->parent;
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
rp->subsc = rp->subsc > sci? rp->subsc : sci;
if (ri->cnt >= rp->cnt) cnt_sub = 1;
if (rp->p && ri->p && (rp->rid != ri->rid || rp->rs != ri->rs || rp->re != ri->re || ol != min)) { // the last condition excludes identical hits after DP
sci = ri->p->dp_max;
if (!rp->is_alt && ri->is_alt) sci = mm_alt_score(sci, alt_diff_frac);
rp->p->dp_max2 = rp->p->dp_max2 > sci? rp->p->dp_max2 : sci;
if (rp->p->dp_max - ri->p->dp_max <= sub_diff) cnt_sub = 1;
}
if (cnt_sub) ++rp->n_sub;
break;
}
}
set_parent_test:
if (j == k) w[k++] = i, ri->parent = i, ri->n_sub = 0;
}
kfree(km, cov);
kfree(km, w);
}
void mm_hit_sort(void *km, int *n_regs, mm_reg1_t *r, float alt_diff_frac)
{
int32_t i, n_aux, n = *n_regs, has_cigar = 0, no_cigar = 0;
mm128_t *aux;
mm_reg1_t *t;
if (n <= 1) return;
aux = (mm128_t*)kmalloc(km, n * 16);
t = (mm_reg1_t*)kmalloc(km, n * sizeof(mm_reg1_t));
for (i = n_aux = 0; i < n; ++i) {
if (r[i].inv || r[i].cnt > 0) { // squeeze out elements with cnt==0 (soft deleted)
int score;
if (r[i].p) score = r[i].p->dp_max, has_cigar = 1;
else score = r[i].score, no_cigar = 1;
if (r[i].is_alt) score = mm_alt_score(score, alt_diff_frac);
aux[n_aux].x = (uint64_t)score << 32 | r[i].hash;
aux[n_aux++].y = i;
} else if (r[i].p) {
free(r[i].p);
r[i].p = 0;
}
}
assert(has_cigar + no_cigar == 1);
radix_sort_128x(aux, aux + n_aux);
for (i = n_aux - 1; i >= 0; --i)
t[n_aux - 1 - i] = r[aux[i].y];
memcpy(r, t, sizeof(mm_reg1_t) * n_aux);
*n_regs = n_aux;
kfree(km, aux);
kfree(km, t);
}
int mm_set_sam_pri(int n, mm_reg1_t *r)
{
int i, n_pri = 0;
for (i = 0; i < n; ++i)
if (r[i].id == r[i].parent) {
++n_pri;
r[i].sam_pri = (n_pri == 1);
} else r[i].sam_pri = 0;
return n_pri;
}
void mm_sync_regs(void *km, int n_regs, mm_reg1_t *regs) // keep mm_reg1_t::{id,parent} in sync; also reset id
{
int *tmp, i, max_id = -1, n_tmp;
if (n_regs <= 0) return;
for (i = 0; i < n_regs; ++i) // NB: doesn't work if mm_reg1_t::id is negative
max_id = max_id > regs[i].id? max_id : regs[i].id;
n_tmp = max_id + 1;
tmp = (int*)kmalloc(km, n_tmp * sizeof(int));
for (i = 0; i < n_tmp; ++i) tmp[i] = -1;
for (i = 0; i < n_regs; ++i)
if (regs[i].id >= 0) tmp[regs[i].id] = i;
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = ®s[i];
r->id = i;
if (r->parent == MM_PARENT_TMP_PRI)
r->parent = i;
else if (r->parent >= 0 && tmp[r->parent] >= 0)
r->parent = tmp[r->parent];
else r->parent = MM_PARENT_UNSET;
}
kfree(km, tmp);
mm_set_sam_pri(n_regs, regs);
}
void mm_select_sub(void *km, float pri_ratio, int min_diff, int best_n, int check_strand, int min_strand_sc, int *n_, mm_reg1_t *r)
{
if (pri_ratio > 0.0f && *n_ > 0) {
int i, k, n = *n_, n_2nd = 0;
for (i = k = 0; i < n; ++i) {
int p = r[i].parent;
if (p == i || r[i].inv) { // primary or inversion
r[k++] = r[i];
} else if ((r[i].score >= r[p].score * pri_ratio || r[i].score + min_diff >= r[p].score) && n_2nd < best_n) {
if (!(r[i].qs == r[p].qs && r[i].qe == r[p].qe && r[i].rid == r[p].rid && r[i].rs == r[p].rs && r[i].re == r[p].re)) // not identical hits
r[k++] = r[i], ++n_2nd;
else if (r[i].p) free(r[i].p);
} else if (check_strand && n_2nd < best_n && r[i].score > min_strand_sc && r[i].rev != r[p].rev) {
r[i].strand_retained = 1;
r[k++] = r[i], ++n_2nd;
} else if (r[i].p) free(r[i].p);
}
if (k != n) mm_sync_regs(km, k, r); // removing hits requires sync()
*n_ = k;
}
}
int mm_filter_strand_retained(int n_regs, mm_reg1_t *r)
{
int i, k;
for (i = k = 0; i < n_regs; ++i) {
int p = r[i].parent;
if (!r[i].strand_retained || r[i].div < r[p].div * 5.0f || r[i].div < 0.01f) {
if (k < i) r[k++] = r[i];
else ++k;
}
}
return k;
}
void mm_filter_regs(const mm_mapopt_t *opt, int qlen, int *n_regs, mm_reg1_t *regs)
{ // NB: after this call, mm_reg1_t::parent can be -1 if its parent filtered out
int i, k;
for (i = k = 0; i < *n_regs; ++i) {
mm_reg1_t *r = ®s[i];
int flt = 0;
if (!r->inv && !r->seg_split && r->cnt < opt->min_cnt) flt = 1;
if (r->p) { // these filters are only applied when base-alignment is available
if (r->mlen < opt->min_chain_score) flt = 1;
else if (r->p->dp_max < opt->min_dp_max) flt = 1;
else if (r->qs > qlen * opt->max_clip_ratio && qlen - r->qe > qlen * opt->max_clip_ratio) flt = 1;
if (flt) free(r->p);
}
if (!flt) {
if (k < i) regs[k++] = regs[i];
else ++k;
}
}
*n_regs = k;
}
int mm_squeeze_a(void *km, int n_regs, mm_reg1_t *regs, mm128_t *a)
{ // squeeze out regions in a[] that are not referenced by regs[]
int i, as = 0;
uint64_t *aux;
aux = (uint64_t*)kmalloc(km, n_regs * 8);
for (i = 0; i < n_regs; ++i)
aux[i] = (uint64_t)regs[i].as << 32 | i;
radix_sort_64(aux, aux + n_regs);
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = ®s[(int32_t)aux[i]];
if (r->as != as) {
memmove(&a[as], &a[r->as], r->cnt * 16);
r->as = as;
}
as += r->cnt;
}
kfree(km, aux);
return as;
}
mm_seg_t *mm_seg_gen(void *km, uint32_t hash, int n_segs, const int *qlens, int n_regs0, const mm_reg1_t *regs0, int *n_regs, mm_reg1_t **regs, const mm128_t *a)
{
int s, i, j, acc_qlen[MM_MAX_SEG+1], qlen_sum = 0;
mm_seg_t *seg;
assert(n_segs <= MM_MAX_SEG);
for (s = 1, acc_qlen[0] = 0; s < n_segs; ++s)
acc_qlen[s] = acc_qlen[s-1] + qlens[s-1];
qlen_sum = acc_qlen[n_segs - 1] + qlens[n_segs - 1];
seg = (mm_seg_t*)kcalloc(km, n_segs, sizeof(mm_seg_t));
for (s = 0; s < n_segs; ++s) {
seg[s].u = (uint64_t*)kmalloc(km, n_regs0 * 8);
for (i = 0; i < n_regs0; ++i)
seg[s].u[i] = (uint64_t)regs0[i].score << 32;
}
for (i = 0; i < n_regs0; ++i) {
const mm_reg1_t *r = ®s0[i];
for (j = 0; j < r->cnt; ++j) {
int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
++seg[sid].u[i];
++seg[sid].n_a;
}
}
for (s = 0; s < n_segs; ++s) {
mm_seg_t *sr = &seg[s];
for (i = 0, sr->n_u = 0; i < n_regs0; ++i) // squeeze out zero-length per-segment chains
if ((int32_t)sr->u[i] != 0)
sr->u[sr->n_u++] = sr->u[i];
sr->a = (mm128_t*)kmalloc(km, sr->n_a * sizeof(mm128_t));
sr->n_a = 0;
}
for (i = 0; i < n_regs0; ++i) {
const mm_reg1_t *r = ®s0[i];
for (j = 0; j < r->cnt; ++j) {
int sid = (a[r->as + j].y&MM_SEED_SEG_MASK)>>MM_SEED_SEG_SHIFT;
mm128_t a1 = a[r->as + j];
// on reverse strand, the segment position is:
// x_for_cat = qlen_sum - 1 - (int32_t)a1.y - 1 + q_span
// (int32_t)new_a1.y = qlens[sid] - (x_for_cat - acc_qlen[sid] + 1 - q_span) - 1 = (int32_t)a1.y - (qlen_sum - (qlens[sid] + acc_qlen[sid]))
a1.y -= a1.x>>63? qlen_sum - (qlens[sid] + acc_qlen[sid]) : acc_qlen[sid];
seg[sid].a[seg[sid].n_a++] = a1;
}
}
for (s = 0; s < n_segs; ++s) {
regs[s] = mm_gen_regs(km, hash, qlens[s], seg[s].n_u, seg[s].u, seg[s].a, 0);
n_regs[s] = seg[s].n_u;
for (i = 0; i < n_regs[s]; ++i) {
regs[s][i].seg_split = 1;
regs[s][i].seg_id = s;
}
}
return seg;
}
void mm_seg_free(void *km, int n_segs, mm_seg_t *segs)
{
int i;
for (i = 0; i < n_segs; ++i) kfree(km, segs[i].u);
for (i = 0; i < n_segs; ++i) kfree(km, segs[i].a);
kfree(km, segs);
}
static void mm_set_inv_mapq(void *km, int n_regs, mm_reg1_t *regs)
{
int i, n_aux;
mm128_t *aux;
if (n_regs < 3) return;
for (i = 0; i < n_regs; ++i)
if (regs[i].inv) break;
if (i == n_regs) return; // no inversion hits
aux = (mm128_t*)kmalloc(km, n_regs * 16);
for (i = n_aux = 0; i < n_regs; ++i)
if (regs[i].parent == i || regs[i].parent < 0)
aux[n_aux].y = i, aux[n_aux++].x = (uint64_t)regs[i].rid << 32 | regs[i].rs;
radix_sort_128x(aux, aux + n_aux);
for (i = 1; i < n_aux - 1; ++i) {
mm_reg1_t *inv = ®s[aux[i].y];
if (inv->inv) {
mm_reg1_t *l = ®s[aux[i-1].y];
mm_reg1_t *r = ®s[aux[i+1].y];
inv->mapq = l->mapq < r->mapq? l->mapq : r->mapq;
}
}
kfree(km, aux);
}
void mm_set_mapq(void *km, int n_regs, mm_reg1_t *regs, int min_chain_sc, int match_sc, int rep_len, int is_sr)
{
static const float q_coef = 40.0f;
int64_t sum_sc = 0;
float uniq_ratio;
int i;
if (n_regs == 0) return;
for (i = 0; i < n_regs; ++i)
if (regs[i].parent == regs[i].id)
sum_sc += regs[i].score;
uniq_ratio = (float)sum_sc / (sum_sc + rep_len);
for (i = 0; i < n_regs; ++i) {
mm_reg1_t *r = ®s[i];
if (r->inv) {
r->mapq = 0;
} else if (r->parent == r->id) {
int mapq, subsc;
float pen_s1 = (r->score > 100? 1.0f : 0.01f * r->score) * uniq_ratio;
float pen_cm = r->cnt > 10? 1.0f : 0.1f * r->cnt;
pen_cm = pen_s1 < pen_cm? pen_s1 : pen_cm;
subsc = r->subsc > min_chain_sc? r->subsc : min_chain_sc;
if (r->p && r->p->dp_max2 > 0 && r->p->dp_max > 0) {
float identity = (float)r->mlen / r->blen;
float x = (float)r->p->dp_max2 * subsc / r->p->dp_max / r->score0;
mapq = (int)(identity * pen_cm * q_coef * (1.0f - x * x) * logf((float)r->p->dp_max / match_sc));
if (!is_sr) {
int mapq_alt = (int)(6.02f * identity * identity * (r->p->dp_max - r->p->dp_max2) / match_sc + .499f); // BWA-MEM like mapQ, mostly for short reads
mapq = mapq < mapq_alt? mapq : mapq_alt; // in case the long-read heuristic fails
}
} else {
float x = (float)subsc / r->score0;
if (r->p) {
float identity = (float)r->mlen / r->blen;
mapq = (int)(identity * pen_cm * q_coef * (1.0f - x) * logf((float)r->p->dp_max / match_sc));
} else {
mapq = (int)(pen_cm * q_coef * (1.0f - x) * logf(r->score));
}
}
mapq -= (int)(4.343f * logf(r->n_sub + 1) + .499f);
mapq = mapq > 0? mapq : 0;
r->mapq = mapq < 60? mapq : 60;
if (r->p && r->p->dp_max > r->p->dp_max2 && r->mapq == 0) r->mapq = 1;
} else r->mapq = 0;
}
mm_set_inv_mapq(km, n_regs, regs);
}