From be7b90c5dc0afc580f7eecf7144dcb1a83926b79 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 7 Oct 2024 15:48:49 -0400 Subject: [PATCH] r262: penalize sites not in --spsc --- align.c | 1 + main.c | 8 +++++++- miniprot.h | 3 ++- nasw-sse.c | 22 ++++++++++++++++------ nasw-tab.c | 1 + nasw.h | 1 + ntseq.c | 5 ++--- options.c | 1 + 8 files changed, 31 insertions(+), 11 deletions(-) diff --git a/align.c b/align.c index 1a7c508..ade9453 100644 --- a/align.c +++ b/align.c @@ -54,6 +54,7 @@ static inline void mp_map2ns_opt(const mp_mapopt_t *mo, ns_opt_t *no) no->go = mo->go, no->ge = mo->ge, no->io = mo->io, no->fs = mo->fs, no->xdrop = mo->xdrop, no->sc = mo->mat; no->ie_coef = mo->ie_coef; no->end_bonus = mo->end_bonus; + no->sp_null_bonus = mo->sp_null_bonus; ns_opt_set_sp(no, mo->sp_model); for (i = 0; i < 6; ++i) no->sp[i] = (int32_t)(no->sp[i] * mo->sp_scale + .499f); } diff --git a/main.c b/main.c index e534408..7dcd927 100644 --- a/main.c +++ b/main.c @@ -21,6 +21,7 @@ static ko_longopt_t long_options[] = { { "trans", ko_no_argument, 315 }, { "no-cs", ko_no_argument, 316 }, { "spsc", ko_required_argument, 317 }, + { "spsc0", ko_required_argument, 318 }, { "version", ko_no_argument, 401 }, { "no-kalloc", ko_no_argument, 501 }, { "dbg-qname", ko_no_argument, 502 }, @@ -81,6 +82,7 @@ static void print_usage(FILE *fp, const mp_idxopt_t *io, const mp_mapopt_t *mo, fprintf(fp, " -B INT bonus score for alignment reaching query ends [%d]\n", mo->end_bonus); fprintf(fp, " -j INT splice model: 2=mammal, 1=general, 0=none (see manual) [%d]\n", mo->sp_model); fprintf(fp, " --spsc=FILE splice score []\n"); + fprintf(fp, " --spsc0=INT splice penalty for sites not in --spsc [%d]\n", mo->sp_null_bonus); fprintf(fp, " Input/output:\n"); fprintf(fp, " -t INT number of threads [%d]\n", n_threads); fprintf(fp, " --gff output in the GFF3 format\n"); @@ -161,7 +163,11 @@ int main(int argc, char *argv[]) else if (c == 504) mp_dbg_flag |= MP_DBG_MORE_DP; // --dbg-aflt else if (c == 505) mp_dbg_flag |= MP_DBG_ANCHOR; // --dbg-anchor else if (c == 506) mp_dbg_flag |= MP_DBG_CHAIN; // --dbg-chain - else if (c == 's') { + else if (c == 318) { // --spsc0 + int32_t s; + s = atoi(o.arg); + mo.sp_null_bonus = s < 0? s : -s; + } else if (c == 's') { fprintf(stderr, "Option '-s' is deprecated.\n"); } else if (c == 401) { printf("%s\n", MP_VERSION); diff --git a/miniprot.h b/miniprot.h index 4c4cd0e..6013500 100644 --- a/miniprot.h +++ b/miniprot.h @@ -3,7 +3,7 @@ #include -#define MP_VERSION "0.13-r261-dirty" +#define MP_VERSION "0.13-r262-dirty" #define MP_F_NO_SPLICE 0x1 #define MP_F_NO_ALIGN 0x2 @@ -65,6 +65,7 @@ typedef struct { int32_t io_end; float ie_coef; int32_t sp_model; + int32_t sp_null_bonus; float sp_scale; int32_t xdrop; int32_t end_bonus; diff --git a/nasw-sse.c b/nasw-sse.c index 68fc7fd..b9df688 100644 --- a/nasw-sse.c +++ b/nasw-sse.c @@ -126,9 +126,14 @@ static uint8_t *ns_prep_seq(void *km, const char *ns, int32_t nl, const char *as } if (ss) { // NB: ss[] uses the offset rule to specify a splice site; donor/acceptor[] uses a different rule. The ss[] way is better but too tricky to change now for (i = 1; i < nl; ++i) { - if (ss[i]>>1 == 0) continue; - if (ss[i]&1) acceptor[i-1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; - else donor[i-1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + if (ss[i] == 0xff) { // score not set + donor[i-1] -= opt->sp_null_bonus; + acceptor[i-1] -= opt->sp_null_bonus; + } else if (ss[i]&1) { // acceptor + acceptor[i-1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + } else { // donor + donor[i-1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + } } } ns_prep_nas(ns, nl, opt, nas); @@ -169,9 +174,14 @@ static uint8_t *ns_prep_seq_left(void *km, const char *ns, int32_t nl, const cha } if (ss) { for (i = 0; i < nl; ++i) { - if (ss[i]>>1 == 0) continue; - if (ss[i]&1) donor[nl - i - 1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; - else acceptor[nl - i - 1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + if (ss[i] == 0xff) { + donor[nl - i - 1] -= opt->sp_null_bonus; + acceptor[nl - i - 1] -= opt->sp_null_bonus; + } else if (ss[i]&1) { + donor[nl - i - 1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + } else { + acceptor[nl - i - 1] -= (int8_t)(ss[i]>>1) - (int8_t)NS_SPSC_OFFSET; + } } } ns_prep_nas(ns, nl, opt, nas); diff --git a/nasw-tab.c b/nasw-tab.c index c89f2f4..581a7b1 100644 --- a/nasw-tab.c +++ b/nasw-tab.c @@ -137,6 +137,7 @@ void ns_opt_init(ns_opt_t *opt) opt->xdrop = 100; opt->end_bonus = 5; ns_opt_set_sp(opt, NS_S_MAMMAL); + opt->sp_null_bonus = -7; opt->asize = 22; opt->ie_coef = .5f; opt->sc = ns_mat_blosum62; diff --git a/nasw.h b/nasw.h index 510c468..9c13c43 100644 --- a/nasw.h +++ b/nasw.h @@ -64,6 +64,7 @@ typedef struct { int32_t xdrop, end_bonus; // xdrop for extension, and bonus for reaching end of proteins int32_t asize; // alphabet size; always 22 in the current implementation int32_t sp[6]; // 0:pos3, 1:GC-AC, 2:AT-AC, 3:other, 4:pos0, 5:poly-Y + int32_t sp_null_bonus; // it is negative float ie_coef; const int8_t *sc; // 22x22 scoring matrix uint8_t *nt4, *aa20, *codon; // char-to-int convertion table and codon table diff --git a/ntseq.c b/ntseq.c index 0709a20..2f090b4 100644 --- a/ntseq.c +++ b/ntseq.c @@ -135,7 +135,7 @@ int64_t mp_ntseq_spsc_get(const mp_ntdb_t *db, int32_t cid, int64_t st0, int64_t if (en0 < 0 || en0 > db->ctg[cid].len) en0 = db->ctg[cid].len; if (!rev) st = st0, en = en0; else st = db->ctg[cid].len - en0, en = db->ctg[cid].len - st0; - memset(sc, 0, en - st); + memset(sc, 0xff, en - st); s = &db->spsc[cid << 1 | (!!rev)]; if (s->n > 0) { int32_t j, l, r; @@ -146,8 +146,7 @@ int64_t mp_ntseq_spsc_get(const mp_ntdb_t *db, int32_t cid, int64_t st0, int64_t uint8_t score = s->a[j] & 0xff; assert(x <= en - st); if (x == en - st) continue; - //fprintf(stderr, "st=%lld, j=%d, pos=%lld\n", st0, j, s->a[j]>>8); - sc[x] = sc[x] > score? sc[x] : score; + if (sc[x] == 0xff || sc[x] < score) sc[x] = score; } } return en - st; diff --git a/options.c b/options.c index 5ea5275..4e31a05 100644 --- a/options.c +++ b/options.c @@ -75,6 +75,7 @@ void mp_mapopt_init(mp_mapopt_t *mo) mo->io_end = 19; mo->ie_coef = .5f; mo->sp_model = NS_S_GENERIC; + mo->sp_null_bonus = -7; mo->sp_scale = 1.0f; mo->end_bonus = 5; mo->xdrop = 100;