From 61a352fd7a0d41ca3e66dab8d973a1e51581ce13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 5 Mar 2024 22:04:17 -0500 Subject: [PATCH] r243: in GFF3, the last CDS includes stop codon Resolves #55 --- format.c | 9 ++++++--- miniprot.1 | 5 ++++- miniprot.h | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/format.c b/format.c index 8dd9f9f..47c1031 100644 --- a/format.c +++ b/format.c @@ -356,8 +356,11 @@ static void mp_write_gff(kstring_t *s, void *km, const mp_idx_t *mi, const mp_bs for (j = 0; j < r->n_feat; ++j) { f = &feat[j]; - vs = r->vid&1? ctg->len - f->ve : f->vs; - ve = r->vid&1? ctg->len - f->vs : f->ve; + ve = f->ve; + if (has_stop && f->type == MP_FEAT_CDS && j + 1 < r->n_feat && feat[j+1].type == MP_FEAT_STOP) // in GFF3, the last CDS includes stop codon. GTF is different! + ve += 3; + vs = r->vid&1? ctg->len - ve : f->vs; + ve = r->vid&1? ctg->len - f->vs : ve; mp_sprintf_lite(s, "%s\tminiprot\t%s\t%d\t%d\t%d\t%c\t%d\tParent=%s;Rank=%d", ctg->name, f->type == MP_FEAT_STOP? "stop_codon" : "CDS", (int)vs + 1, (int)ve, f->score, "+-"[r->vid&1], f->phase, id_str, hit_idx); if (f->type == MP_FEAT_CDS) { @@ -399,7 +402,7 @@ static void mp_write_gtf(kstring_t *s, void *km, const mp_idx_t *mi, const mp_bs for (j = 0; j < r->n_feat; ++j) { int64_t vs2, ve2; f = &feat[j]; - if (f->type != MP_FEAT_CDS) continue; + if (f->type != MP_FEAT_CDS) continue; // GTF is simpler without stop_codon and additional attributes vs2 = vs = r->vid&1? ctg->len - f->ve : f->vs; ve2 = ve = r->vid&1? ctg->len - f->vs : f->ve; if (f->ve == r->ve) { // last exon; then adjust for stop codon diff --git a/miniprot.1 b/miniprot.1 index 4e493e1..19506e5 100644 --- a/miniprot.1 +++ b/miniprot.1 @@ -1,4 +1,4 @@ -.TH miniprot 1 "24 June 2023" "miniprot-0.12 (r237)" "Bioinformatics tools" +.TH miniprot 1 "5 March 2024" "miniprot-0.12-dirty (r243)" "Bioinformatics tools" .SH NAME .PP miniprot - protein-to-genome alignment with splicing and frameshifts @@ -47,6 +47,9 @@ Sample k-mers at a rate .BI -L \ INT Minimum ORF length to index [30] .TP +.BI -T \ INT +NCBI translation table (1 through 5) [1] +.TP .BI -b \ INT Number of bits per bin [8]. Miniprot splits the genome into non-overlapping bins of 2^8 bp in size. .TP diff --git a/miniprot.h b/miniprot.h index 2610d57..f97d058 100644 --- a/miniprot.h +++ b/miniprot.h @@ -3,7 +3,7 @@ #include -#define MP_VERSION "0.12-r239-dirty" +#define MP_VERSION "0.12-r243-dirty" #define MP_F_NO_SPLICE 0x1 #define MP_F_NO_ALIGN 0x2