diff --git a/anvio/__init__.py b/anvio/__init__.py index 995403f138..711a8f9ce2 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -295,13 +295,12 @@ def TABULATE(table, header, numalign="right", max_width=0): ['--prodigal-translation-table'], {'metavar': 'INT', 'default': None, - 'help': "This is a parameter to pass to the Prodigal for a specific translation table. This parameter " + 'help': "This is a parameter to pass to the Pyrodigal-gv for a specific translation table. This parameter " "corresponds to the parameter `-g` in Prodigal, the default value of which is 11 (so if you do " - "not set anything, it will be set to 11 in Prodigal runtime. Please refer to the Prodigal " + "not set anything, it will be set to 11 in Pyrodigal-gv runtime. Please refer to the Prodigal " "documentation to determine what is the right translation table for you if you think you need " "it.)"} ), - 'skip-gene-calling': ( ['--skip-gene-calling'], {'default': False, @@ -314,14 +313,22 @@ def TABULATE(table, header, numalign="right", max_width=0): ['--prodigal-single-mode'], {'default': False, 'action': 'store_true', - 'help': "By default, anvi'o will use prodigal for gene calling (unless you skipped gene calling, or provided " - "anvi'o with external gene calls). One of the flags anvi'o includes in prodigal run is `-p meta`, which " - "optimizes prodigal's ability to identify genes in metagenomic assemblies. In some rare cases, for a " - "given set of contigs prodigal will yield a segmentation fault error due to one or more genes in your " + 'help': "By default, anvi'o will use pyrodigal-gv for gene calling (unless you skipped gene calling, or provided " + "anvi'o with external gene calls). One of the flags anvi'o includes in pyrodigal-gv run is `-p meta`, which " + "optimizes pyrodigal-gv's ability to identify genes in metagenomic assemblies. In some rare cases, for a " + "given set of contigs pyrodigal-gv will yield a segmentation fault error due to one or more genes in your " "collections will confuse the program when it is used with the `-p meta` flag. While anvi'o developers " "are not quite sure under what circumstances this happens, we realized that removal of this flag often " "solves this issue. If you are dealing with such cyrptic errors, the inclusion of `--skip-prodigal-meta-flag` " - "will instruct anvi'o to run prodigal without the `-meta` flag, and may resolve this issue for you."} + "will instruct anvi'o to run pyrodigal-gv without the `-meta` flag, and may resolve this issue for you."} + ), + 'full-gene-calling-report': ( + ['--full-gene-calling-report'], + {'metavar': 'FILE', + 'default': None, + 'help': "When anvi'o is done with gene calling using pyrodigal, it only stores some data about individual gene " + "calls. Using this parameter you can pass an output file to report most comprehensive data on gene calls " + "as a TAB-delimited text file with gene caller ids matching to those that are stored in the contigs-db."} ), 'remove-partial-hits': ( ['--remove-partial-hits'], diff --git a/anvio/constants.py b/anvio/constants.py index 139fe0f321..14a114e5e2 100644 --- a/anvio/constants.py +++ b/anvio/constants.py @@ -114,7 +114,7 @@ pan_default = "presence-absence" trnaseq_default = "cov" -default_gene_caller = "prodigal" +default_gene_caller = "pyrodigal-gv" # see https://github.com/merenlab/anvio/issues/1358 gene_call_types = {'CODING': 1, diff --git a/anvio/dbops.py b/anvio/dbops.py index 7303c7ad51..d67ee3aa78 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -4443,7 +4443,7 @@ def create(self, args): "skipped. Please make up your mind.") if (external_gene_calls_file_path or skip_gene_calling) and prodigal_translation_table: - raise ConfigError("You asked anvi'o to %s, yet you set a specific translation table for prodigal. These " + raise ConfigError("You asked anvi'o to %s, yet you set a specific translation table for pyrodigal-gv. These " "parameters do not make much sense and anvi'o is kindly asking you to make up your " "mind." % ('skip gene calling' if skip_gene_calling else 'use external gene calls')) @@ -4636,7 +4636,7 @@ def create(self, args): self.run.info('External gene calls file have AA sequences?', external_gene_calls_include_amino_acid_sequences, mc='green') self.run.info('Proper frames will be predicted?', (not skip_predict_frame), mc='green') else: - self.run.info('Is prodigal run in single mode?', ('YES' if prodigal_single_mode else 'NO'), mc='green') + self.run.info('Is pyrodigal-gv run in single mode?', ('YES' if prodigal_single_mode else 'NO'), mc='green') self.run.info('Ignoring internal stop codons?', ignore_internal_stop_codons) self.run.info('Splitting pays attention to gene calls?', (not skip_mindful_splitting)) diff --git a/anvio/docs/__init__.py b/anvio/docs/__init__.py index ec4ba01b4d..690d84a171 100644 --- a/anvio/docs/__init__.py +++ b/anvio/docs/__init__.py @@ -14,7 +14,7 @@ "artifacts_accepted": ['fasta-txt'], "anvio_workflows_inherited": [], "third_party_programs_used": [ - ('Gene calling', ['prodigal']), + ('Gene calling', ['pyrodigal-gv']), ('HMM search', ['HMMER']), ('Gene taxonomy', ['krakenuniq', 'centrifuge']), ('Sequence search against various databases', ['DIAMOND']) @@ -35,7 +35,7 @@ ('Quality control of short reads', ['illumina-utils']), ('Assembly', ['IDBA-UD', 'metaSPAdes', 'MEGAHIT']), ('BAM file manipulations', ['samtools']), - ('Gene calling', ['prodigal']), + ('Gene calling', ['pyrodigal-gv']), ('HMM search', ['HMMER']), ('Gene taxonomy', ['krakenuniq', 'centrifuge']), ('Read recruitment', ['Bowtie2']) @@ -116,7 +116,7 @@ 'metaSPAdes': {'link': "https://cab.spbu.ru/software/meta-spades/"}, 'MEGAHIT': {'link': 'https://github.com/voutcn/megahit'}, 'samtools': {'link': 'http://www.htslib.org/'}, - 'prodigal': {'link': 'https://github.com/hyattpd/Prodigal'}, + 'pyrodigal-gv': {'link': 'https://github.com/althonos/pyrodigal-gv'}, 'HMMER': {'link': 'http://hmmer.org/'}, 'Bowtie2': {'link': 'https://github.com/BenLangmead/bowtie2'}, 'krakenuniq': {'link': 'https://github.com/fbreitwieser/krakenuniq'}, diff --git a/anvio/docs/programs/anvi-export-gene-calls.md b/anvio/docs/programs/anvi-export-gene-calls.md index 150631ccea..25923f1367 100644 --- a/anvio/docs/programs/anvi-export-gene-calls.md +++ b/anvio/docs/programs/anvi-export-gene-calls.md @@ -7,7 +7,7 @@ anvi-export-gene-calls -c %(contigs-db)s \ --list-gene-callers {{ codestop }} -Running this will export all of your gene calls identified by the gene caller [prodigal](https://github.com/hyattpd/Prodigal) (assuming it is in your %(contigs-db)s: +Running this will export all of your gene calls identified by the gene caller [pyrodigal-gv](https://github.com/althonos/pyrodigal-gv) (assuming it is in your %(contigs-db)s: {{ codestart }} anvi-export-gene-calls -c %(contigs-db)s \ diff --git a/anvio/docs/programs/anvi-gen-contigs-database.md b/anvio/docs/programs/anvi-gen-contigs-database.md index ea80c54917..8248500a75 100644 --- a/anvio/docs/programs/anvi-gen-contigs-database.md +++ b/anvio/docs/programs/anvi-gen-contigs-database.md @@ -10,7 +10,8 @@ When run on a %(contigs-fasta)s this program will, * **Soft-split contigs** longer than 20,000 bp into smaller ones (you can change the split size using the `--split-length` flag). When the gene calling step is not skipped, the process of splitting contigs will consider where genes are and avoid cutting genes in the middle. For very, very large assemblies this process can take a while, and you can skip it with `--skip-mindful-splitting` flag. -* **Identify open reading frames** using [Prodigal](http://prodigal.ornl.gov/), UNLESS, (1) you have used the flag `--skip-gene-calling` (no gene calls will be made) or (2) you have provided %(external-gene-calls)s. +* **Identify open reading frames** using [pyrodigal-gv](https://github.com/althonos/pyrodigal-gv) which is an extension of [pyrodigal](https://github.com/althonos/pyrodigal) ([doi:10.21105/joss.04296](https://doi.org/10.21105/joss.04296)) (which builds upon [prodigal](http://prodigal.ornl.gov/), the approach originally implemented by Hyatt et al ([doi:10.1186/1471-2105-11-119](https://doi.org/10.1186/1471-2105-11-119)). +Additionally, it includes metagenomics models for giant viruses and viruses with alternative genetic codes by Camargo et al [doi:10.1038/s41587-023-01953-y](https://doi.org/10.1038/s41587-023-01953-y). **UNLESS**, (1) you have used the flag `--skip-gene-calling` (no gene calls will be made) or (2) you have provided %(external-gene-calls)s. See other details related to gene calling below. {:.notice} This program can work with compressed input FASTA files (i.e., the file name ends with a `.gz` extention). @@ -61,6 +62,31 @@ anvi-gen-contigs-database -f %(contigs-fasta)s \ --ignore-internal-stop-codons {{ codestop }} +### Gene calling + +By default, this program will use [pyrodigal](https://github.com/althonos/pyrodigal) for gene calling, and the key aspects of the resulting information about genes in input sequences are stored in the resulting %(contigs-db)s files. That said, gene calls include much more information than what will be stored in the database. If you need a more comprehensive report on gene calls, you can run %(anvi-gen-contigs-database)s with the following parameter: + +{{ codestart }} +anvi-gen-contigs-database -f %(contigs-fasta)s \ + -o %(contigs-db)s \ + --full-gene-calling-report OUTPUT.txt +{{ codestop }} + +In this case the `OUTPUT.txt` will contain additional data, and the gene caller ids will match to those that are stored in the database therefore tractable all downstream analyses that will stem from the resulting %(contigs-db)s. Here is a snippet from an example file: + +|**gene_callers_id**|**contig**|**start**|**stop**|**direction**|**partial**|**partial_begin**|**partial_end**|**confidence**|**gc_cont**|**rbs_motif**|**rbs_spacer**|**score**|**cscore**|**rscore**|**sscore**|**start_type**|**translation_table**|**tscore**|**uscore**|**sequence**|**translated_sequence**| +|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--|:--| +|0|NC_009091|169|1327|f|False|False|False|99.99|0.3013|||153.96808116167358|151.14841116167358|-0.9874499999999999|2.8196700000000003|ATG|11|2.8971|0.9100200000000003|ATGGAAATTATTTGTAATCAAAATGAATTA(...)|MEIICNQNELNNAIQLVSKAVASRPTHPIL(...)| +|1|NC_009091|1388|2036|f|False|False|False|99.99|0.2885|GGA/GAG/AGG|5-10bp|81.87259573141999|71.58136573142|2.71875|10.291229999999999|ATG|11|2.8971|4.67538|ATGGTCCTTAATTATGGAAATGGTGAAAAT(...)|MVLNYGNGENVWMHPPVHRILGWYSRPSNL(...)| +|2|NC_009091|2039|4379|f|False|False|False|99.99|0.3260|GGA/GAG/AGG|5-10bp|352.2303246874159|347.0894946874159|2.71875|5.14083|ATG|11|2.8971|-0.47502|ATGATAAATCAAGAAAATAATGATCTATAT(...)|MINQENNDLYDLNEALQVENLTLNDYEEIC(...)| +|3|NC_009091|4426|5887|f|False|False|False|99.99|0.3347|||194.19481790304437|188.61028790304437|-0.9874499999999999|5.584530000000002|ATG|11|2.8971|3.6748800000000017|ATGTGCGGAATAGTTGGAATCGTTTCTTCA(...)|MCGIVGIVSSDDVNQQIYDSLLLLQHRGQD(...)| +|4|NC_009091|5883|8325|r|False|False|False|99.99|0.2731|||318.93104438253084|315.33533438253085|-0.9874499999999999|3.5957100000000004|ATG|11|2.8971|1.6860600000000003|ATGGATAAGAAAAACTTCACTTCTATCTCA(...)|MDKKNFTSISLQEEMQRSYLEYAMSVIVGR(...)| +|5|NC_009091|8402|9266|r|False|False|False|99.99|0.2719|||99.52927145207937|93.12346145207937|-0.9874499999999999|6.405809999999998|ATG|11|2.8971|4.496159999999998|ATGAAAAAGTTTTTACAAAGAATACTCTGG(...)|MKKFLQRILWISLISFYFLQIKKVQAIVPY(...)| +|6|NC_009091|9262|10219|r|False|False|False|99.99|0.3239|||107.44082411540528|104.45237411540528|-0.9874499999999999|2.9884500000000003|ATG|11|2.8971|1.0788000000000002|ATGATTAATAGGATTCAAGACAAAAAAGAA(...)|MINRIQDKKEISKKLKERAIFEGFTIAGIA(...)| +|7|NC_009091|10365|11100|f|False|False|False|99.99|0.3319|||71.66956065705634|79.71184065705633|-0.9874499999999999|-8.042279999999998|TTG|11|-9.60915|2.55432|TTGGTTGAATCTAATCAAAATCAAGATTCC(...)|MVESNQNQDSNLGSRLQQDLKNDLIAGLLV(...)| +|8|NC_009091|11103|11721|f|False|False|False|99.99|0.2993|||69.10757680331382|69.03014680331381|-0.9874499999999999|0.07743000000000055|ATG|11|2.8971|-1.8322199999999995|ATGCATAATAGATCTCTTTCTAGAGAATTA(...)|MHNRSLSRELSLISLGLIKDKGDLKLNKFQ(...)| +|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)|(...)| + ### Changing k-mer size You can change the k-mer size by modifying the `--kmer-size` parameter: diff --git a/anvio/docs/programs/anvi-report-inversions.md b/anvio/docs/programs/anvi-report-inversions.md index fcdb526852..813ad1d93d 100644 --- a/anvio/docs/programs/anvi-report-inversions.md +++ b/anvio/docs/programs/anvi-report-inversions.md @@ -133,7 +133,7 @@ For every inversion, anvi'o can report the surrounding genes and their function You can use the flag `--num-genes-to-consider-in-context` to choose how many genes to consider upstream/downstream of the inversion. By default, anvi'o report three genes downstream, and three genes upstream. -To select a specific gene caller, you can use `--gene-caller`. The default is prodigal. +To select a specific gene caller, you can use `--gene-caller`. The default is pyrodigal-gv. If you want to skip this step, you can use the flag `--skip-recovering-genomic-context`. diff --git a/anvio/drivers/pyrodigal.py b/anvio/drivers/pyrodigal.py new file mode 100644 index 0000000000..cdc77303c7 --- /dev/null +++ b/anvio/drivers/pyrodigal.py @@ -0,0 +1,174 @@ +# coding: utf-8 +"""Interface for gene calling that uses `pyrodigal-gv`.""" + +import multiprocessing.pool + +import anvio +import anvio.fastalib as f +import anvio.utils as utils +import anvio.terminal as terminal +import anvio.constants as constants +import anvio.filesnpaths as filesnpaths + +from anvio.errors import ConfigError + +try: + import pyrodigal_gv +except ImportError: + raise ConfigError("Your anvi'o environment is missing the `pyrodigal-gv` package. But it is easy to " + "solve. Please run `pip install pyrodigal-gv` first.") + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2024, the Meren Lab (http://merenlab.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__maintainer__ = "A. Murat Eren" +__email__ = "a.murat.eren@gmail.com" + +run = terminal.Run() +progress = terminal.Progress() +pp = terminal.pretty_print + + +def predict(datum): + contig_name, sequence, predictor = datum + return (contig_name, predictor.find_genes(sequence)) + + +class Pyrodigal_gv: + def __init__(self, args=None, progress=progress, run=run): + self.progress = progress + self.run = run + self.args = args + + # user params + A = lambda x: (args.__dict__[x] if x in args.__dict__ else None) if args else None + self.prodigal_translation_table = A('prodigal_translation_table') + self.prodigal_single_mode = A('prodigal_single_mode') + self.full_gene_calling_report = A('full_gene_calling_report') + self.num_threads = A('num_threads') + + self.gene_caller_name = 'pyrodigal-gv' + self.gene_caller_version = pyrodigal_gv.__version__ + + # default header for full report. if you change anything here, please also change + # the corresponding section in `anvio/docs/programs/anvi-gen-contigs-database.md` + self.header_for_full_report = ['gene_callers_id', 'contig', 'start', 'stop', 'direction', 'partial', 'partial_begin', + 'partial_end', 'confidence', 'gc_cont', 'rbs_motif', 'rbs_spacer', 'score', 'cscore', + 'rscore', 'sscore', 'start_type', 'translation_table', 'tscore', 'uscore', 'sequence', + 'translated_sequence'] + + + def process(self, fasta_file_path, output_dir): + """Take the fasta file, run pyrodigal-gv on it, and make sense of the output + + Returns a gene calls dict, and amino acid sequences dict. + """ + + if self.full_gene_calling_report: + filesnpaths.is_output_file_writable(self.full_gene_calling_report) + + if self.prodigal_translation_table: + raise ConfigError("Unfortunately the `--prodigal-translation-table` parameter is not yet implemented :/ " + "This is mostly because anvi'o developers did not have enough expertise to test its use " + "and validity. If you are willing to help us implement this feature, please contact us " + "through Discord (it will be a simple change, but will require someone's supervision).") + + # preparations to set the predictor starting with a check of single vs meta mode + if self.prodigal_single_mode: + # if we are in 'single' mode, that means we will have to first explicitly train + # the gene finder with one of the sequences in the FASTA file. assuming the user + # knows what they're doing, all seqeunces in the input file will be coming from + # the same organism. so, since we have to offer a single sequence, we are going + # to select the longest sequence in the FASTA file + longest_sequence = '' + longest_sequence_length = 0 + fasta = f.SequenceSource(fasta_file_path) + while next(fasta): + if len(fasta.seq) > longest_sequence_length: + longest_sequence_length = len(fasta.seq) + longest_sequence = copy.deepcopy(fasta.seq) + fasta.close() + + if len(longest_sequence) < 20000: + raise ConfigError(f"We have a problem. You are calling pyrodigal-gv with `--prodigal-single-mode` flag, most " + f"likely becasue you are working with a single genome rather than a metagenome. Which " + f"is great. But this mode requires a training step, which cannot be done with a sequence " + f"that is shorter than at least 20,000 nucleotides long. However, the longest sequence in " + f"your FASTA file is only {pp(longest_sequence_length)}. Which means, you will have to drop " + f"the `--prodigal-single-mode` for this to work :/") + + self.predictor = pyrodigal_gv.ViralGeneFinder() + self.predictor.train(longest_sequence) + else: + self.predictor = pyrodigal_gv.ViralGeneFinder(meta=True) + + # since the predictor is now set, next we will read all sequences into the memory :/ + data = [] + fasta = f.SequenceSource(fasta_file_path) + while next(fasta): + data.append((fasta.id, fasta.seq, self.predictor),) + fasta.close() + + self.run.warning("Anvi'o will use 'pyrodigal-gv' by Martin Larralde to identify open reading frames in your data. " + "It is an extension of 'pyrodigal' (doi:10.21105/joss.04296), which builds upon the approach " + "originally implemented by Hyatt et al (doi:10.1186/1471-2105-11-119), with additional metagenomics " + "models for giant viruses and viruses with alternative genetic codes by Camargo et al. " + "(doi:10.1038/s41587-023-01953-y). If you publish your findings, please do not forget to properly credit " + "all three work.", lc='green', header="CITATION") + + # let's learn the number of sequences we will work with early on and report + num_sequences_in_fasta_file = len(data) + + # some nice logs. + self.run.warning('', header=f'Finding ORFs using pyrodigal-gv {pyrodigal_gv.__version__}', lc='green') + self.run.info('Number of sequences', pp(num_sequences_in_fasta_file)) + self.run.info('Procedure', 'Single Genome (with `--prodigal-single-mode`)' if self.prodigal_single_mode else 'Metagenome (without `--prodigal-single-mode`)') + self.run.info('Full gene calling reporting requested?', 'Yes' if self.full_gene_calling_report else 'No') + + self.progress.new('Processing') + self.progress.update(f"Identifying ORFs using {terminal.pluralize('thread', self.num_threads)}.") + + # key variables to fill in + gene_calls_dict = {} + amino_acid_sequences_dict = {} + + gene_callers_id = 0 + with multiprocessing.pool.Pool(self.num_threads) as pool: + for contig_name, predicted_genes in pool.map(predict, data): + for gene in predicted_genes: + gene_calls_dict[gene_callers_id] = {'contig': contig_name, + 'start': gene.begin - 1, + 'stop': gene.end, + 'direction': 'f' if int(gene.strand) == 1 else 'r', + 'partial': gene.partial_begin or gene.partial_end, + 'call_type': constants.gene_call_types['CODING'], + 'source': self.gene_caller_name, + 'version': self.gene_caller_version} + + amino_acid_sequences_dict[gene_callers_id] = gene.translate().replace('*', '') + + # if the user wants a full report, we will update the gene calls dict with additional + # data from the object + if self.full_gene_calling_report: + addtl_data = [('confidence', gene.confidence()), ('gc_cont', gene.gc_cont), ('partial_begin', gene.partial_begin), ('partial_end', gene.partial_end), + ('rbs_motif', gene.rbs_motif), ('rbs_spacer', gene.rbs_spacer), ('score', gene.score), ('cscore', gene.cscore), ('rscore', gene.rscore), ('sscore', gene.sscore), + ('start_type', gene.start_type), ('translation_table', gene.translation_table), ('tscore', gene.tscore), ('uscore', gene.uscore), ('sequence', gene.sequence()), + ('translated_sequence', amino_acid_sequences_dict[gene_callers_id])] + + for k, v in addtl_data: + gene_calls_dict[gene_callers_id][k] = v + + # uppity + gene_callers_id += 1 + + self.progress.end() + + self.run.info('Result', f'Pyrodigal-gv (v{pyrodigal_gv.__version__}) has identified {pp(len(gene_calls_dict))} genes.', nl_after=1) + + if self.full_gene_calling_report: + utils.store_dict_as_TAB_delimited_file(gene_calls_dict, self.full_gene_calling_report, headers=self.header_for_full_report) + self.run.info('Full gene calling report', self.full_gene_calling_report, nl_after=1) + + return gene_calls_dict, amino_acid_sequences_dict diff --git a/anvio/fastalib.py b/anvio/fastalib.py index 6ba64579e4..323bc59d8b 100644 --- a/anvio/fastalib.py +++ b/anvio/fastalib.py @@ -6,7 +6,6 @@ import io import sys import gzip -import numpy import hashlib import anvio diff --git a/anvio/genecalling.py b/anvio/genecalling.py index 1c9eda6265..040d8f5be1 100644 --- a/anvio/genecalling.py +++ b/anvio/genecalling.py @@ -13,6 +13,7 @@ from anvio.errors import ConfigError from anvio.drivers.prodigal import Prodigal +from anvio.drivers.pyrodigal import Pyrodigal_gv __copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" @@ -28,7 +29,7 @@ class GeneCaller: - def __init__(self, fasta_file_path, gene_caller='prodigal', args=None, progress=progress, run=run, debug=False): + def __init__(self, fasta_file_path, gene_caller='pyrodigal-gv', args=None, progress=progress, run=run, debug=False): filesnpaths.is_file_exists(fasta_file_path) filesnpaths.is_file_fasta_formatted(fasta_file_path) @@ -41,13 +42,14 @@ def __init__(self, fasta_file_path, gene_caller='prodigal', args=None, progress= self.debug = debug self.tmp_dirs = [] - self.gene_callers = {'prodigal': Prodigal} + self.gene_callers = {'pyrodigal-gv': Pyrodigal_gv, + 'prodigal': Prodigal} self.gene_caller = gene_caller if self.gene_caller not in self.gene_callers: - raise ConfigError("The gene caller you requested ('%s') is not available at this point. " - "here is a list of what we have: %s." % (', '.join(self.gene_callers))) + raise ConfigError(f"Anvi'o does not know the gene caller you requested: {self.gene_caller} :( Here is a list of " + f"the gene callers she knows about: {', '.join(self.gene_callers)}") def process(self): diff --git a/anvio/interactive.py b/anvio/interactive.py index 07e9c3cb25..417adbf291 100644 --- a/anvio/interactive.py +++ b/anvio/interactive.py @@ -3005,7 +3005,7 @@ def __init__(self, args, run=run, progress=progress): def generate_tables(self): # let's keep track of all keys we will need to access later from the interface. if - # we don't do this, non-standard keys (such as 'Gene caller (prodigal)' becomes very + # we don't do this, non-standard keys (such as 'Gene caller (pyrodigal-gv)' becomes very # inaccessable when we need to access to it the way we access to 'N50' or 'Contig # Lengths'): self.human_readable_keys = [] diff --git a/anvio/inversions.py b/anvio/inversions.py index d6a885e9fe..a4fe905a78 100644 --- a/anvio/inversions.py +++ b/anvio/inversions.py @@ -139,7 +139,7 @@ def __init__(self, args, skip_sanity_check=False, run=terminal.Run(), progress=t # skip learning about the genomic context that surrounds inversions? self.skip_recovering_genomic_context = A('skip_recovering_genomic_context') - self.gene_caller_to_consider_in_context = A('gene_caller') or 'prodigal' + self.gene_caller_to_consider_in_context = A('gene_caller') or 'pyrodigal-gv' self.num_genes_to_consider_in_context = A('num_genes_to_consider_in_context') or 3 # parameters for motif search diff --git a/anvio/tables/genecalls.py b/anvio/tables/genecalls.py index fad279d198..13c891b5ab 100644 --- a/anvio/tables/genecalls.py +++ b/anvio/tables/genecalls.py @@ -472,7 +472,7 @@ def update_gene_call(self, gene_call, frame): return gene_call - def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal'): + def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='pyrodigal-gv'): Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True) # get gene calls and amino acid sequences @@ -485,7 +485,7 @@ def call_genes_and_populate_genes_in_contigs_table(self, gene_caller='prodigal') self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences) - def run_gene_caller(self, gene_caller='prodigal'): + def run_gene_caller(self, gene_caller): """Runs gene caller, and returns gene_calls_dict, and amino acid sequences.""" remove_fasta_after_processing = False @@ -505,15 +505,9 @@ def run_gene_caller(self, gene_caller='prodigal'): try: gene_calls_dict, amino_acid_sequences = gene_caller.process() except Exception as e: - if 'prodigal' in e.e: - self.run.warning("There was a problem with your gene calling, and the error seems to be related to 'prodigal'. Please " - "find additional details below. It is difficult to determine what caused this error, but if you would " - "like to be certain, you can literally copy the command shown below into a single line, and run on " - "the same machine manually (or re-run the same command you run to get this error with the `--debug` flag " - "to keep all the original log files from profigal). If this error is due to a 'segmentation fault', " - "please consider including `--prodigal-single-mode` flag `anvi-gen-contigs-database` command. More " - "information `--prodigal-single-mode` is available in the help menu of `anvi-gen-contigs-database`.", - header="💀 PRODIGAL FAILED 💀") + self.run.warning("There was a problem with your gene calling. Anvi'o will now remove the residual contigs-db file. Please " + "find the actual error message below. If you need more details, re-run your command with `--debug` parameter.", + header="💀 GENE CALLING STEP FAILED 💀") # remove the unfinished contigs-db file os.remove(self.db_path) diff --git a/anvio/utils.py b/anvio/utils.py index f1632d8f38..13599289a8 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -4097,16 +4097,6 @@ def sanity_check_pfam_accessions(pfam_accession_ids): f"start with \"PF\", please double check the following: {','.join(not_pfam_accession_ids)}") -def get_missing_programs_for_hmm_analysis(): - missing_programs = [] - for p in ['prodigal', 'hmmscan']: - try: - is_program_exists(p) - except ConfigError: - missing_programs.append(p) - return missing_programs - - def get_genes_database_path_for_bin(profile_db_path, collection_name, bin_name): if not collection_name or not bin_name: raise ConfigError("Genes database must be associated with a collection name and a bin name :/") diff --git a/bin/anvi-display-functions b/bin/anvi-display-functions index c699578712..a13c37efcd 100755 --- a/bin/anvi-display-functions +++ b/bin/anvi-display-functions @@ -60,7 +60,7 @@ if __name__ == '__main__': groupD.add_argument(*anvio.A('profile-db'), **anvio.K('profile-db')) groupE = parser.add_argument_group('GENES', "By default, anvi'o will look for genes in contigs databases that are identified " - "by `prodigal`. But if you have generated your contigs databse with external gene calls, or have " + "by `pyrodigal-gv`. But if you have generated your contigs databse with external gene calls, or have " "otherwise used another gene caller than the default, you can explicitly ask anvi'o to use that " "one to recover your genes.") groupE.add_argument(*anvio.A('gene-caller'), **anvio.K('gene-caller')) diff --git a/bin/anvi-gen-contigs-database b/bin/anvi-gen-contigs-database index a2ad71a494..82391c1cb4 100755 --- a/bin/anvi-gen-contigs-database +++ b/bin/anvi-gen-contigs-database @@ -54,6 +54,7 @@ if __name__ == '__main__': groupD.add_argument(*anvio.A('skip-gene-calling'), **anvio.K('skip-gene-calling')) groupD.add_argument(*anvio.A('prodigal-single-mode'), **anvio.K('prodigal-single-mode')) groupD.add_argument(*anvio.A('prodigal-translation-table'), **anvio.K('prodigal-translation-table')) + groupD.add_argument(*anvio.A('full-gene-calling-report'), **anvio.K('full-gene-calling-report')) groupD.add_argument(*anvio.A('external-gene-calls'), **anvio.K('external-gene-calls')) groupD.add_argument(*anvio.A('ignore-internal-stop-codons'), **anvio.K('ignore-internal-stop-codons')) groupD.add_argument(*anvio.A('skip-predict-frame'), **anvio.K('skip-predict-frame')) diff --git a/bin/anvi-run-hmms b/bin/anvi-run-hmms index e64581f6ca..c7090c85de 100755 --- a/bin/anvi-run-hmms +++ b/bin/anvi-run-hmms @@ -37,12 +37,6 @@ P = terminal.pluralize @time_program def main(args): - # first check whether this computer is capable of doing an HMM search. - missing_programs = utils.get_missing_programs_for_hmm_analysis() - if missing_programs: - raise ConfigError("Well, in order to run this program, you need %s to be installed on your system." %\ - (', and '.join(missing_programs))) - # then check whether we are going to use the default HMM profiles, or run it for a new one. sources = {} if args.hmm_profile_dir and args.installed_hmm_profile: diff --git a/requirements.txt b/requirements.txt index 08f6dacbec..a4c5bac3c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,5 @@ snakemake multiprocess plotext networkx +pyrodigal pulp==2.7.0 diff --git a/sandbox/anvi-script-gen-function-matrix-across-genomes b/sandbox/anvi-script-gen-function-matrix-across-genomes index c3ea99ed19..07b1301a20 100755 --- a/sandbox/anvi-script-gen-function-matrix-across-genomes +++ b/sandbox/anvi-script-gen-function-matrix-across-genomes @@ -71,7 +71,7 @@ if __name__ == '__main__': "of genomes will be excluded."), type=int) groupE = parser.add_argument_group('GENES', "By default, anvi'o will look for genes in contigs databases that are identified " - "by `prodigal`. But if you have generated your contigs databse with external gene calls, or have " + "by `pyrodigal-gv`. But if you have generated your contigs databse with external gene calls, or have " "otherwise used another gene caller than the default, you can explicitly ask anvi'o to use that " "one to recover your genes.") groupE.add_argument(*anvio.A('gene-caller'), **anvio.K('gene-caller'))