From 87f11cb7cdb15b27939a1b4a2b51f225a50a4aae Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 6 Mar 2023 22:56:18 -0500 Subject: [PATCH] r218: let -G override -I, not the reverse This is more intuitive. Also updated README --- README.md | 64 +++++++++++++++--------------------------------------- main.c | 12 +++++----- miniprot.1 | 6 ++--- miniprot.h | 2 +- 4 files changed, 27 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index be50074..adfc8eb 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,12 @@ cd miniprot && make ./miniprot test/DPP3-hs.gen.fa.gz test/DPP3-mm.pep.fa.gz > aln.paf # PAF output ./miniprot --gff test/DPP3-hs.gen.fa.gz test/DPP3-mm.pep.fa.gz > aln.gff # GFF3+PAF output -# general command line: index and align in one go -./miniprot -ut16 --gff genome.fna protein.faa > aln.gff +# general command line: index and align in one go (-I sets max intron size based on genome size) +./miniprot -Iut16 --gff genome.fna protein.faa > aln.gff # general command line: index first and then align (recommended) ./miniprot -t16 -d genome.mpi genome.fna -./miniprot -ut16 --gff genome.mpi protein.faa > aln.gff +./miniprot -Iut16 --gff genome.mpi protein.faa > aln.gff # output format man ./miniprot.1 @@ -29,7 +29,6 @@ man ./miniprot.1 - [Users' Guide](#uguide) - [Installation](#install) - [Usage](#usage) - - [Evaluation](#eval) - [Algorithm overview](#algo) - [Citing miniprot](#cite) - [Limitations](#limit) @@ -84,47 +83,12 @@ For convenience, miniprot can also output GFF3 with option `--gff`: ```sh miniprot -t8 --gff -d ref.mpi ref.fna > out.gff ``` -The detailed alignment is embedded in `##PAF` lines in the GFF3 output. - -### Evaluation - -We collected Ensembl canonical mouse proteins from Gencode vM30 and longest -proteins per gene for chicken and zebrafish (fish). We then aligned these proteins to -the human reference genome GRCh38. We say a junction is confirmed if it can be -found in the human Gencode annotation v41; a junction is non-overlapping if the -intron in the junction does not overlap with any introns in the Gencode -annotation. - -We only evaluated miniprot-r173 and [spaln][spaln]-2.4.13a as these are the -only tools practical for whole genomes. Running other tools would require to -find approximate protein mapping first and then realign in each local region. -This procedure is complex and does not evaluate the mapping step. In addition, -[Iwata and Gotoh (2012)][spaln2] suggest that spaln2 consistently outperforms -exonerate, GeneWise, ProSplign and genBlastG. - -In the evaluation, both miniprot (mp) and spaln (sp) were set to use 16 CPU -threads. We used option `-Q7 -O0 -Thomosapi -LS -yS` with spaln (local -alignment with the human-specific splicing model). It gives the best accuracy -on these dataset. Note spaln-2.4.13a crashed for a few zebrafish proteins. We -used 98% of zebrafish proteins in the evaluation. Miniprot uses a splice model -for mammals by default. 'mp-j1' applies a more general model and has lower -accuracy for aligning against the human genome. - -|Metric |mouse/mp |mouse/sp |chicken/mp| fish/mp|fish/mp-j1| fish/sp| -|:---------------|--------:|--------:|--------:|--------:|---------:|--------:| -|Elapsed time (s)| 314 | 3,767 | 260 | 470 | 475 | 12703 | -|Peak RAM (Gb) | 15.3 | 5.6 | 14.6 | 18.7 | 18.9 | 5.5 | -|# proteins | 21,844 | 21,844 | 17,007 | 29,706 | 29,706 | 29,706 | -|# mapped | 19,303 | 18,840 | 13,421 | 19,594 | 19,594 | 17,491 | -|# single-exon | 2,810 | | 1,227 | 1,798 | 1,667 | | -|# predicted junc| 165,458 | 171,241 | 132,473 | 174,975 | 177,995 | 180,117 | -|# non-ovlp junc | 316 | 852 | 258 | 457 | 737 | 1,391 | -|# confirmed junc| 161,113 | 162,551 | 123,523 | 162,195 | 161,225 | 162,757 | -|% confirmed | 97.37 | 94.93 | 94.29 | 92.70 | 90.58 | 90.36 | - -Generally, miniprot finds fewer novel splice junctions, implying higher -specificity, but spaln finds more confirmed junctions, implying higher -sensitivity. +The detailed alignment is embedded in `##PAF` lines in the GFF3 output. You can +also get detailed residue alignment with `--aln`. + +If you are aligning proteins to a whole genome, it is recommended to add option +`-I` to let miniprot automatically set the maximum intron size. You can also +use `-G` to explicitly specify the max intron size. ### Algorithm overview @@ -151,9 +115,14 @@ sensitivity. ### Citing miniprot -The miniprot algorithm is described in the following preprint: +If you use miniprot, please cite: + +> Li, H. (2023) Protein-to-genome alignment with miniprot. *Bioinformatics*, **39**, btad014 [[PMID: 36648328]][mp-pmid]. -> Li, H. (2022). Protein-to-genome alignment with miniprot. [arXiv:2210.08052](https://arxiv.org/abs/2210.08052). +The preprint is available at +[arXiv:2210.08052](https://arxiv.org/abs/2210.08052), which +additionally shows metrics on MetaEuk. Please note that the published paper +evaluated miniprot-0.7. The latest version may report different numbers. ## Limitations @@ -167,6 +136,7 @@ The miniprot algorithm is described in the following preprint: [exonerate]: https://pubmed.ncbi.nlm.nih.gov/15713233/ [genewise]: https://pubmed.ncbi.nlm.nih.gov/15123596/ +[mp-pmid]: https://pubmed.ncbi.nlm.nih.gov/36648328/ [zlib]: https://zlib.net [paftools]: https://github.com/lh3/minimap2/blob/master/misc/paftools.js [minimap2]: https://github.com/lh3/minimap2 diff --git a/main.c b/main.c index 919c218..9aa157c 100644 --- a/main.c +++ b/main.c @@ -56,8 +56,8 @@ static void print_usage(FILE *fp, const mp_idxopt_t *io, const mp_mapopt_t *mo, fprintf(fp, " Mapping:\n"); fprintf(fp, " -S no splicing (applying -G1k -J1k -e1k)\n"); fprintf(fp, " -c NUM max k-mer occurrence [%d]\n", mo->max_occ); - fprintf(fp, " -G NUM max intron size [200k]\n"); - fprintf(fp, " -I set max intron size to 3.6*sqrt(refLen); override -G\n"); + fprintf(fp, " -G NUM max intron size; override -I [200k]\n"); + fprintf(fp, " -I set max intron size to 3.6*sqrt(refLen)\n"); fprintf(fp, " -w FLOAT weight of log gap penalty [%g]\n", mo->chn_coef_log); fprintf(fp, " -n NUM minimum number of syncmers in a chain [%d]\n", mo->min_chn_cnt); fprintf(fp, " -m NUM min chaining score [%d]\n", mo->min_chn_sc); @@ -88,7 +88,7 @@ static void print_usage(FILE *fp, const mp_idxopt_t *io, const mp_mapopt_t *mo, int main(int argc, char *argv[]) { - int32_t c, i, set_I = 0, n_threads = 4; + int32_t c, i, set_I = 0, set_G = 0, n_threads = 4; ketopt_t o = KETOPT_INIT; mp_mapopt_t mo; mp_idxopt_t io; @@ -107,14 +107,14 @@ int main(int argc, char *argv[]) else if (c == 't') n_threads = atoi(o.arg); else if (c == 'l') mo.kmer2 = atoi(o.arg); else if (c == 'c') mo.max_occ = mp_parse_num(o.arg); - else if (c == 'G') mo.bw = mo.max_intron = mp_parse_num(o.arg); + else if (c == 'G') mo.bw = mo.max_intron = mp_parse_num(o.arg), set_G = 1; else if (c == 'I') set_I = 1; else if (c == 'n') mo.min_chn_cnt = mp_parse_num(o.arg); else if (c == 'm') mo.min_chn_sc = mp_parse_num(o.arg); else if (c == 'K') mo.mini_batch_size = mp_parse_num(o.arg); else if (c == 'p') mo.pri_ratio = atof(o.arg); else if (c == 'N') mo.best_n = mp_parse_num(o.arg); - else if (c == 'S') mo.flag |= MP_F_NO_SPLICE, mo.bw = mo.max_intron = mo.max_ext = 1000, mo.io = mo.io_end = 10000; + else if (c == 'S') mo.flag |= MP_F_NO_SPLICE, mo.bw = mo.max_intron = mo.max_ext = 1000, mo.io = mo.io_end = 10000, set_G = 1; else if (c == 'A') mo.flag |= MP_F_NO_ALIGN; else if (c == 'O') mo.go = atoi(o.arg); else if (c == 'E') mo.ge = atoi(o.arg); @@ -167,7 +167,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "[ERROR]\033[1;31m failed to open/build the index\033[0m\n"); return 1; } - if (set_I) mp_mapopt_set_max_intron(&mo, mi->nt->l_seq); + if (set_I && !set_G) mp_mapopt_set_max_intron(&mo, mi->nt->l_seq); if (mp_verbose >= 3) mp_idx_print_stat(mi, mo.max_occ); if (fn_idx != 0) mp_idx_dump(fn_idx, mi); for (i = o.ind + 1; i < argc; ++i) diff --git a/miniprot.1 b/miniprot.1 index 76f4f4c..17a8dcc 100644 --- a/miniprot.1 +++ b/miniprot.1 @@ -69,15 +69,15 @@ Ignore k-mers occurring times or more [50k] .TP .BI -G \ NUM -Max intron size [200k] +Max intron size [200k]. This option overrides +.BR -I . .TP .BI -I Set max intron size to .RI min(max(3.6*sqrt( refLen ),10000),300000) where .I refLen -is the total length of the input genome. This option overrides -.BR -G . +is the total length of the input genome. .TP .BI -n \ NUM Min number of syncmers in a chain [10] diff --git a/miniprot.h b/miniprot.h index af6a004..914d869 100644 --- a/miniprot.h +++ b/miniprot.h @@ -3,7 +3,7 @@ #include -#define MP_VERSION "0.7-r217-dirty" +#define MP_VERSION "0.7-r218-dirty" #define MP_F_NO_SPLICE 0x1 #define MP_F_NO_ALIGN 0x2