Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Miniprot Update version v0.10 to v0.12 #5317

Merged
merged 16 commits into from
Jul 13, 2023
2 changes: 1 addition & 1 deletion tools/miniprot/macros.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<macros>
<token name="@TOOL_VERSION@">0.10</token>
<token name="@TOOL_VERSION@">0.11</token>
<xml name="citation">
<citations>
<citation type="doi">10.1093/bioinformatics/btad014</citation>
Expand Down
149 changes: 108 additions & 41 deletions tools/miniprot/miniprot.xml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#if str($adv.mapping.intron_size.mode) == 'manual'
-G $adv.mapping.intron_size.max_intron
#elif str($adv.mapping.intron_size.mode) == 'auto'
-I
-I
#end if

#if str($adv.output.prefix) != 'MP'
Expand All @@ -40,6 +40,7 @@
--outn=$adv.output.outputs_per_query
--outc=$adv.output.output_fraction_query
--outs=$adv.output.output_score_least
$adv.output.output_translated_protein
rlibouba marked this conversation as resolved.
Show resolved Hide resolved
$adv.output.output_residue_alignment
#end if
#if str($db.dbtype) == 'fasta'
Expand All @@ -65,19 +66,19 @@
<option value="preindexed">Pre-indexed</option>
</param>
<when value="fasta">
<param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format" />
<param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing" />
<param name="genomic_fasta" type="data" format="fasta,fasta.gz" label="Genomic sequence (FASTA)" help="Genomic contigs / scaffolds to be aligned against in FASTA format"/>
<param argument="-k" name="kmer_size" type="integer" min="1" value="6" label="K-mer size for genome-wide indexing"/>
<param argument="-b" name="bits_per_block" type="integer" min="1" value="8" label="Number of bits per bin" help="Miniprot splits the genome into non-overlapping bins of 2^8 bp in size" />
<param argument="-M" name="modimisers" type="integer" value="1" label="Sample k-mers at a rate 1/2**INT" help="Increasing this option reduces peak memory but decreases sensitivity" />
<param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index" />
<param argument="-L" name="min_ORF" type="integer" value="30" label="Minimum ORF length to index"/>
</when>
<when value="preindexed">
<!-- refine the datatype here once Miniprot index data type is in Galaxy -->
<param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot" />
<param name="genomic_db" type="data" format="binary" label="Pre-indexed genomic database" help="A pre-indexed database built by miniprot"/>
</when>
</conditional>
<param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format" />
<param name="output_format" type="select" label="Output format" >
<param name="protein_fasta" type="data" format="fasta,fasta.gz" label="Protein sequence (FASTA)" help="Protein sequences to be aligned in FASTA format"/>
<param name="output_format" type="select" label="Output format">
<option value="gff" selected="true">GFF3</option>
<option value="paf">PAF</option>
<option value="gtf">GTF</option>
Expand All @@ -89,34 +90,34 @@
</param>
<when value="yes">
<section name="mapping" title="Mapping">
<param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
<param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurences" />
<param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight" />
<param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain" />
<param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
<param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
<param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
<param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" />
<param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider" />
<param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)"/>
<!-- -c: upper bound on how often a k-mer may occur; label spelling corrected ("occurrences") -->
<param argument="-c" name="max_kmer" type="integer" min="1" value="20000" label="Max k-mer occurrences"/>
<param argument="-w" name="log_gap_penalty_weight" type="float" min="0" max="1" value="0.75" label="Log gap penalty weight"/>
<param argument="-n" name="min_syncmers" type="integer" min="1" value="3" label="Minimum number of syncmers in a chain"/>
<param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score"/>
<param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining"/>
<param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining"/>
<param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio"/>
<param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="30" label="Max secondary alignments to consider"/>
<conditional name="intron_size">
<param name="mode" type="select" label="Maximum intron size">
<option value="manual" selected="true">Manual</option>
<option value="auto">Auto (3.6*sqrt)</option>
</param>
<when value="manual">
<param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size" />
<param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Maximum intron size"/>
</when>
<when value="auto" />
<when value="auto"/>
</conditional>
</section>
<section name="alignment" title="Alignment">
<param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
<param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty"/>
<param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
<param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty" />
<param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals" />
<param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty" />
<param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends" />
<param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none" >
<param argument="-J" name="intron_open" type="integer" min="0" value="29" label="Intron open penalty"/>
<param argument="-C" name="non_canonical_splice" type="float" value="1" label="Weight of splice penalty; 0 to ignore splice signals"/>
<param argument="-F" name="frameshift" type="integer" min="0" value="23" label="Frameshift penalty"/>
<param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="Bonus score for alignment reaching query ends"/>
<param argument="-j" name="splice_model" type="select" label="Splicing model for the target genome" help="2=mammal, 1=general, 0=none">
<!-- -j takes a numeric model id (0 = none, 1 = general, 2 = mammal); the value must be the digit zero, not the letter O -->
<option value="0">None: No splicing model (0)</option>
<option value="1" selected="true">General: Optimal splicing sequence: '|GTR...YAG|' (1)</option>
<option value="2">Mammal: Optimal splicing sequence: 'G|GTR...YYYNYAG|' (2)</option>
Expand All @@ -131,14 +132,16 @@
</valid>
</sanitizer>
</param>
<param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" />
<param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false"/>
<param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" />
<param argument="--aln" name="output_residue_alignment" type="boolean" truevalue="--aln" falsevalue="" checked="false" label="Output residue alignment" help="Only for GFF output" />
<param argument="--outs" name="output_score_least" type="float" min="0" max="1" value="0.99"
label="For each protein, only output alignments with a score higher than 'best_score' multiplied by this value"/>
<param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned" />
<param argument="--outc" name="output_fraction_query" type="float" value="0.1" label="Output if at least this fraction of query is aligned"/>
<!-- Boolean flag for emitting translated protein sequences. The command block and the tests reference it as adv.output.output_translated_protein, so it must be defined; it uses miniprot's long-form (double-dash) option. -->
<param argument="--trans" name="output_translated_protein" type="boolean" truevalue="--trans" falsevalue="" checked="false" label="Output translated protein sequences" help="Skipping frameshift"/>
<!-- Warning: there is a bug with the translated-protein option for GTF output. A release is underway to correct this. -->
rlibouba marked this conversation as resolved.
Show resolved Hide resolved
</section>
<param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
<param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size"/>
</when>
<when value="no">
</when>
Expand All @@ -147,7 +150,7 @@
<outputs>
<data name="output_alignment" format="gff3" label="Miniprot on ${on_string}">
<change_format>
<when input="output_format" value="paf" format="paf" />
<when input="output_format" value="paf" format="paf"/>
<when input="output_format" value="gtf" format="gtf"/>
</change_format>
</data>
Expand All @@ -156,22 +159,22 @@
<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta" />
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/>
<output name="output_alignment" ftype="gff3">
<assert_contents>
<has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375" />
<has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214" />
<has_text text="ID=MP000001;Rank=1;Identity=0.3420;Positive=0.5104;Target=tr|I6YGH7|I6YGH7_MYCTU 1 375"/>
<has_text text="ID=MP000066;Rank=1;Identity=0.3613;Positive=0.5178;Target=sp|P9WQE5|PPSB_MYCTU 1 1214"/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta" />
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
<param name="dbtype" value="fasta"/>
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/>
<param name="output_format" value="paf" />
<output name="output_alignment" ftype="paf">
<assert_contents>
Expand All @@ -183,13 +186,13 @@
<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta" />
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta" />
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"/>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta" />
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"/>
<param name="output_format" value="gff" />
<conditional name="adv">
<param name="options" value="yes" />
<param name="second_round_kmer_size" value="32" />
<param name="second_round_kmer_size" value="32"/>
</conditional>
<output name="output_alignment" ftype="gff3">
<assert_contents>
Expand All @@ -207,16 +210,80 @@
<param name="output_format" value="gtf"></param>
<conditional name="adv">
<param name="options" value="yes"></param>
<param name="second_round_kmer_size" value="32"></param>
</conditional>
<output name="output_alignment" ftype="gtf">
<assert_contents>
<has_text text="NC_000962.3" />
<has_text text='transcript_id "MPT000004"; gene_id "MPG000004"' />
<has_text text='transcript_id "MPT000004"; gene_id "MPG000004"'/>
</assert_contents>
</output>
</test>
<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta"></param>
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param>
<param name="output_format" value="gff"></param>
<conditional name="adv">
<param name="options" value="yes"/>
<section name="output">
<param name="output_translated_protein" value="true"/>
rlibouba marked this conversation as resolved.
Show resolved Hide resolved
</section>
</conditional>
<output name="output_alignment" ftype="gff3">
<assert_contents>
<has_text text="##gff-version 3"/>
<has_text text="tr|I6YGH7|I6YGH7_MYCTU" />
<has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/>
<has_n_lines n="1633"/>
</assert_contents>
</output>
</test>

<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta"></param>
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param>
<param name="output_format" value="gtf"></param>
<conditional name="adv">
<param name="options" value="yes"></param>
<section name="output">
<param name="output_translated_protein" value="true"/>
</section> </conditional>
<output name="output_alignment" ftype="gtf">
<assert_contents>
<has_text text="tr|I6YGH7|I6YGH7_MYCTU"/>
<has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/>
<has_text text="NC_000962.3"/>
<has_n_lines n="2172"/>
</assert_contents>
</output>
</test>

<test expect_num_outputs="1">
<conditional name="db">
<param name="dbtype" value="fasta"></param>
<param name="genomic_fasta" value="input_genome.fasta.gz" ftype="fasta"></param>
</conditional>
<param name="protein_fasta" value="input_query.fasta.gz" ftype="fasta"></param>
<param name="output_format" value="paf"></param>
<conditional name="adv">
<param name="options" value="yes"></param>
<section name="output">
<param name="output_translated_protein" value="true"/>
</section> </conditional>
<output name="output_alignment" ftype="paf">
<assert_contents>
<has_text text="tr|I6YGH7|I6YGH7_MYCTU"/>
<has_text text="VDIDLDPSTEKLRAQIRAEVAALKAMPREPRTVAIAEGGWVLPYLPKPWGRAASPVEQIIIAQEFTAGRVKRPQIAIATWIVPSIVAFGTDNQKQRLLPPTFRGDIFWCQLFSEPGAGSDLASLATKATRVDGGWRITGQKIWTTGAQYSQWGALLARTDPSAPKHNGITYFLLDMKSEGVQVKPLRELTGKEFFNTVYLDDVFVPDELVLGEVNRGWEVSRNTLTAERVSIGGSDSTFLPTLGEFVDFVRDYRFEGQFDQVARHRAGQLIAEGHATKLLNLRSTLLTLAGGDPMAPAAISKLLSMRTGQGYAEFAVSSFGTDAVIGDTERLPGKWGEYLLASRATTIYGGTSEVQLNIIAERLLGLPRDP"/>
<has_text text="sp|O05779|FTSE_MYCTU"/>
<has_n_lines n="650"/>
</assert_contents>
</output>
</test>

</tests>
<help><![CDATA[
Expand All @@ -232,4 +299,4 @@
.. _overview: https://github.com/lh3/miniprot#algorithm-overview
]]></help>
<expand macro="citation"></expand>
</tool>
</tool>
Loading