From 4a85d52b4540f22b58ef991d2917ca479f269c29 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2024 21:38:06 -0500 Subject: [PATCH] r239: NCBI translation table 1-5 Resolves #56 and #57 --- chain.c | 3 +-- index.c | 1 + main.c | 9 ++++++++- miniprot.h | 7 ++++--- nasw-tab.c | 26 +++++++++++++++++++++----- nasw.h | 6 ++++-- options.c | 1 + 7 files changed, 40 insertions(+), 13 deletions(-) diff --git a/chain.c b/chain.c index 04e1972..bc87de2 100644 --- a/chain.c +++ b/chain.c @@ -161,7 +161,7 @@ uint64_t *mp_chain(int32_t max_dist_x, int32_t max_dist_y, int32_t bw, int32_t m int32_t is_spliced, int32_t kmer, int32_t bbit, int64_t n, uint64_t *a, int32_t *n_u_, uint64_t **_u, void *km) { // TODO: make sure this works when n has more than 32 bits int32_t *f, *t, *v, n_u, n_v, mmax_f = 0, max_drop = bw, hf = 0; - int64_t *p, i, j, st = 0, n_iter = 0, hi = -1; + int64_t *p, i, j, st = 0, hi = -1; uint64_t *u; if (_u) *_u = 0, *n_u_ = 0; @@ -191,7 +191,6 @@ uint64_t *mp_chain(int32_t max_dist_x, int32_t max_dist_y, int32_t bw, int32_t m for (j = i - 1; j >= st; --j) { int32_t sc; sc = comput_sc(a[i], a[j], max_dist_x, max_dist_y, bw, chn_ceof_log, is_spliced, bbit, kmer); - ++n_iter; if (sc == INT32_MIN) continue; sc += f[j]; if (sc > max_f) { diff --git a/index.c b/index.c index f269a21..c1c5977 100644 --- a/index.c +++ b/index.c @@ -214,6 +214,7 @@ mp_idx_t *mp_idx_restore(const char *fn) return 0; mi = Kcalloc(0, mp_idx_t, 1); fread(&mi->opt, sizeof(mi->opt), 1, fp); + ns_make_tables(mi->opt.trans_code); fread(&mi->n_kb, 8, 1, fp); mi->nt = mp_ntseq_restore(fp); mi->ki = Kmalloc(0, int64_t, mp_n_bucket(&mi->opt)); diff --git a/main.c b/main.c index ca9e694..a732880 100644 --- a/main.c +++ b/main.c @@ -55,6 +55,7 @@ static void print_usage(FILE *fp, const mp_idxopt_t *io, const mp_mapopt_t *mo, fprintf(fp, " -k INT k-mer size [%d]\n", io->kmer); fprintf(fp, " -M INT modimisers bit (sample rate = 1/2**M) [%d]\n", io->mod_bit); fprintf(fp, " -L INT min ORF length to index [%d]\n", io->min_aa_len); + fprintf(fp, " -T INT NCBI translation table (from 1 to 5) [%d]\n", io->trans_code); fprintf(fp, " -b INT bits per block [%d]\n", io->bbit); fprintf(fp, " -d FILE save index to FILE []\n"); fprintf(fp, " Mapping:\n"); @@ -104,11 +105,12 @@ int main(int argc, char *argv[]) mp_start(); mp_mapopt_init(&mo); mp_idxopt_init(&io); - while ((c = ketopt(&o, argc, argv, 1, "k:M:L:s:l:b:t:d:c:n:m:K:p:N:SAO:E:J:C:F:G:e:uB:P:w:j:g:I", long_options)) >= 0) { + while ((c = ketopt(&o, argc, argv, 1, "k:M:L:s:l:b:T:t:d:c:n:m:K:p:N:SAO:E:J:C:F:G:e:uB:P:w:j:g:I", long_options)) >= 0) { if (c == 'k') io.kmer = atoi(o.arg); else if (c == 'M') io.mod_bit = atoi(o.arg); else if (c == 'L') io.min_aa_len = atoi(o.arg); else if (c == 'b') io.bbit = atoi(o.arg); + else if (c == 'T') io.trans_code = atoi(o.arg); else if (c == 'd') fn_idx = o.arg; else if (c == 't') n_threads = atoi(o.arg); else if (c == 'l') mo.kmer2 = atoi(o.arg); @@ -171,6 +173,11 @@ int main(int argc, char *argv[]) return 1; } + if (ns_make_tables(io.trans_code) < 0) { + if (mp_verbose >= 1) + fprintf(stderr, "[ERROR]\033[1;31m failed to find translation table %d\033[0m\n", io.trans_code); + return 1; + } mi = mp_idx_load(argv[o.ind], &io, n_threads); if (mi == 0) { if (mp_verbose >= 1) diff --git a/miniprot.h b/miniprot.h index 68ed5c7..2610d57 100644 --- a/miniprot.h +++ b/miniprot.h @@ -3,7 +3,7 @@ #include -#define MP_VERSION "0.12-r237" +#define MP_VERSION "0.12-r239-dirty" #define MP_F_NO_SPLICE 0x1 #define MP_F_NO_ALIGN 0x2 @@ -22,8 +22,8 @@ #define MP_BITS_PER_AA 4 #define MP_BLOCK_BONUS 2 -#define MP_CODON_STD 0 -#define MP_IDX_MAGIC "MPI\2" +#define MP_CODON_STD 1 +#define MP_IDX_MAGIC "MPI\3" #ifdef __cplusplus extern "C" { @@ -37,6 +37,7 @@ typedef struct { int32_t bbit; // block bit int32_t min_aa_len; // ignore ORFs shorter than this int32_t kmer, mod_bit; + uint32_t trans_code; } mp_idxopt_t; typedef struct { diff --git a/nasw-tab.c b/nasw-tab.c index 3025be3..f661429 100644 --- a/nasw-tab.c +++ b/nasw-tab.c @@ -1,4 +1,5 @@ #include +#include #include #include "nasw.h" @@ -11,9 +12,19 @@ char *ns_tab_aa_i2c = "ARNDCQEGHILKMFPSTWYV*X"; uint8_t ns_tab_a2r[22] = { 0, 2, 4, 4, 6, 5, 5, 8, 3, 10, 11, 2, 11, 12, 7, 1, 1, 13, 12, 10, 14, 15 }; // A R N D C Q E G H I L K M F P S T W Y V * X -char *ns_tab_codon_std = "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLFX"; - // 01234567890123456789012345678901234567890123456789012345678901234 - // KKNNRRSSTTTTIMIIEEDDGGGGAAAAVVVVQQHHRRRRPPPPLLLL**YY*WCCSSSSLLFFX <- this is the AGCT order +#define NS_MAX_TRANS_CODE 5 +static const char *ns_tab_codon_all[NS_MAX_TRANS_CODE + 1] = { + 0, + // 0123456789012345678901234567890123456789012345678901234567890123 + // A C G T + // A C G T A C G T A C G T A C G T + // ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT + "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLFX", // 1: The Standard Code; in order of AAA, AAC, AAG, AAT, ACA, ... + "KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLFX", // 2: The Vertebrate Mitochondrial Code + "KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLFX", // 3: The Yeast Mitochondrial Code + "KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLFX", // 4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code + "KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLFX" // 5: The Invertebrate Mitochondrial Code +}; uint8_t ns_tab_nt4[256], ns_tab_aa20[256], ns_tab_aa13[256], ns_tab_codon[64], ns_tab_codon13[64]; @@ -43,10 +54,14 @@ int8_t ns_mat_blosum62[484] = { // 484 = 22*22 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-4,-1 }; -void ns_make_tables(int codon_type) +int ns_make_tables(int codon_type) { + const char *trans_tab; char *p; int i; + if (codon_type < 0 || codon_type > NS_MAX_TRANS_CODE) return -1; // out of range + trans_tab = ns_tab_codon_all[codon_type]; + if (trans_tab == 0) return -2; // not defined memset(ns_tab_nt4, 4, 256); for (p = ns_tab_nt_i2c; *p; ++p) ns_tab_nt4[p - ns_tab_nt_i2c] = ns_tab_nt4[(uint8_t)toupper(*p)] = ns_tab_nt4[(uint8_t)tolower(*p)] = p - ns_tab_nt_i2c; @@ -57,9 +72,10 @@ void ns_make_tables(int codon_type) for (p = ns_tab_aa_i2c; *p; ++p) ns_tab_aa13[p - ns_tab_aa_i2c] = ns_tab_aa13[(uint8_t)toupper(*p)] = ns_tab_aa13[(uint8_t)tolower(*p)] = ns_tab_a2r[p - ns_tab_aa_i2c]; for (i = 0; i < 64; ++i) { - ns_tab_codon[i] = ns_tab_aa20[(uint8_t)ns_tab_codon_std[i]]; + ns_tab_codon[i] = ns_tab_aa20[(uint8_t)trans_tab[i]]; ns_tab_codon13[i] = ns_tab_a2r[ns_tab_codon[i]]; } + return 0; } /* See Sibley et al (2016) diff --git a/nasw.h b/nasw.h index 8c815d2..c4a0c3e 100644 --- a/nasw.h +++ b/nasw.h @@ -84,9 +84,11 @@ extern "C" { * This function should be called before invoking other nasw functions. It is * not thread safe. * - * @param codon_type ignored for now + * @param codon_type NCBI genetic code + * + * @return 0 if tables were initialized; <0 if the translation table is not defined */ -void ns_make_tables(int codon_type); +int ns_make_tables(int codon_type); /** * Initialize alignment parameters diff --git a/options.c b/options.c index 1fac1e6..5ea5275 100644 --- a/options.c +++ b/options.c @@ -10,6 +10,7 @@ void mp_idxopt_init(mp_idxopt_t *io) { memset(io, 0, sizeof(*io)); + io->trans_code = MP_CODON_STD; io->bbit = 8; io->min_aa_len = 30; #if MP_BITS_PER_AA == 4