diff --git a/.all-contributorsrc b/.all-contributorsrc index 869b0b43..42740170 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -115,6 +115,15 @@ "code", "ideas" ] + }, + { + "login": "dparks1134", + "name": "Donovan Parks", + "avatar_url": "https://avatars.githubusercontent.com/u/3688336?v=4", + "profile": "https://github.com/dparks1134", + "contributions": [ + "bug" + ] } ], "contributorsPerLine": 7, diff --git a/.flake8 b/.flake8 index c5e8a8d1..051ea600 100644 --- a/.flake8 +++ b/.flake8 @@ -1,5 +1,5 @@ [flake8] -ignore = E203, E231, E266, E501, W503, F403, F401 +ignore = E203, E231, E266, E501, W503, F403, F401, E731 max-line-length = 88 max-complexity = 18 -select = B,C,E,F,W,T4,B9 \ No newline at end of file +select = B,C,E,F,W,T4,B9 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8669c669..391b5f3e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,6 +97,20 @@ This will install all dependencies for running and developing `pyani`, as well a make test ``` +If you want to be able to edit source files and have those changes take immediate effect when calling `pyani` (useful for testing), clone the GitHub repository with: + +```bash +git clone https://github.com/widdowquinn/pyani.git +``` + +then inside the new `pyani` directory run: + +```bash +pip install -e . +``` + +This is the [`pip install --editable`](https://pip.pypa.io/en/stable/cli/pip_install/#install-editable) command, which links the installed package to the specified location (here `.`, i.e. the current directory) rather than the usual package location (`site-packages`). When using this option, edits to the source code are immediately available in the installed package. This allows you to test changes to the source code as you make them, without the need for an additional uninstall/install step. 
+ #### Cleaning up development environment You can remove the `conda` development environment with the following commands: @@ -219,7 +233,7 @@ A good long description could be > This fix improves efficiency of the veeblefetzer. The main change is replacing a > nested loop with asyncio calls to a new function `fetzveebles()`. This commit > affects `veebles.py`, and new tests are added in `test_veeblefetzer.py`. -> +> > fixes #246 A bad long description might be diff --git a/Makefile b/Makefile index e5fa34d2..bdf7101b 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ clean_walkthrough: walkthrough: clean_walkthrough pyani download --email my.email@my.domain -t 203804 C_blochmannia pyani createdb -f - pyani anim C_blochmannia C_blochmannia_ANIm \ + pyani anim -i C_blochmannia -o C_blochmannia_ANIm \ --name "C. blochmannia run 1" \ --labels C_blochmannia/labels.txt --classes C_blochmannia/classes.txt pyani report --runs C_blochmannia_ANIm/ --formats html,excel,stdout diff --git a/README.md b/README.md index ae1b901c..68610afb 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ and we are grateful to all who have contributed to this software:
Balázs Brankovics

💻 🐛
sammywinchester19

πŸ›
Ram Krishna Shrestha

⚠️ 💻 🤔 +
Donovan Parks

πŸ› @@ -63,7 +64,7 @@ DOI: [10.1039/C5AY02550H](https://doi.org/10.1039/C5AY02550H) [![pyani sourcerank](https://img.shields.io/librariesio/sourcerank/pypi/pyani.svg?logo=koding&logoColor=white)](https://libraries.io/pypi/pyani) -[![All Contributors](https://img.shields.io/badge/all_contributors-10-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-11-orange.svg?style=flat-square)](#contributors-) [![pyani PyPi version](https://img.shields.io/pypi/v/pyani "PyPI version")](https://pypi.python.org/pypi/pyani) @@ -132,7 +133,7 @@ DOI: [10.1039/C5AY02550H](https://doi.org/10.1039/C5AY02550H) Where available, `pyani` can take advantage of multicore systems, and integrates with [SGE/OGE](http://gridscheduler.sourceforge.net/)-type job schedulers for the sequence comparisons. -`pyani` installs the prgram `pyani`, which enables command-line based analysis of genomes. +`pyani` installs the program `pyani`, which enables command-line based analysis of genomes. ----- @@ -235,10 +236,10 @@ The first step is to obtain genome data for analysis. `pyani` expects to find ea We'll use the `pyani download` subcommand to download all available genomes for *Candidatus Blochmannia* from NCBI. The taxon ID for this grouping is [203804](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=203804&lvl=3&lin=f&keep=1&srchmode=1&unlock). ```bash -pyani download C_blochmannia --email my.email@my.domain -t 203804 -v -l C_blochmannia_dl.log +pyani download -o C_blochmannia --email my.email@my.domain -t 203804 -v -l C_blochmannia_dl.log ``` -The first argument is the output directory into which the downloaded genomes will be written (`C_blochmannia`). To download anything from NCBI we must provide an email address (`--email my.email@my.domain`), and to specify which taxon subtree we want to download we provide the taxon ID (`-t 203804`). 
+The first argument is the output directory into which the downloaded genomes will be written (`-o C_blochmannia`). To download anything from NCBI we must provide an email address (`--email my.email@my.domain`), and to specify which taxon subtree we want to download we provide the taxon ID (`-t 203804`). Here we also request verbose output (`-v`), and write a log file for reproducible research/diagnosing bugs and errors (`-l C_blochmannia_dl.log`). @@ -249,27 +250,27 @@ $ tree C_blochmannia C_blochmannia ├── GCF_000011745.1_ASM1174v1_genomic.fna ├── GCF_000011745.1_ASM1174v1_genomic.fna.gz -├── GCF_000011745.1_ASM1174v1_genomic.md5 +├── GCF_000011745.1_ASM1174v1_genomic.fna.md5 ├── GCF_000011745.1_ASM1174v1_hashes.txt ├── GCF_000043285.1_ASM4328v1_genomic.fna ├── GCF_000043285.1_ASM4328v1_genomic.fna.gz -├── GCF_000043285.1_ASM4328v1_genomic.md5 +├── GCF_000043285.1_ASM4328v1_genomic.fna.md5 ├── GCF_000043285.1_ASM4328v1_hashes.txt ├── GCF_000185985.2_ASM18598v2_genomic.fna ├── GCF_000185985.2_ASM18598v2_genomic.fna.gz -├── GCF_000185985.2_ASM18598v2_genomic.md5 +├── GCF_000185985.2_ASM18598v2_genomic.fna.md5 ├── GCF_000185985.2_ASM18598v2_hashes.txt ├── GCF_000331065.1_ASM33106v1_genomic.fna ├── GCF_000331065.1_ASM33106v1_genomic.fna.gz -├── GCF_000331065.1_ASM33106v1_genomic.md5 +├── GCF_000331065.1_ASM33106v1_genomic.fna.md5 ├── GCF_000331065.1_ASM33106v1_hashes.txt ├── GCF_000973505.1_ASM97350v1_genomic.fna ├── GCF_000973505.1_ASM97350v1_genomic.fna.gz -├── GCF_000973505.1_ASM97350v1_genomic.md5 +├── GCF_000973505.1_ASM97350v1_genomic.fna.md5 ├── GCF_000973505.1_ASM97350v1_hashes.txt ├── GCF_000973545.1_ASM97354v1_genomic.fna ├── GCF_000973545.1_ASM97354v1_genomic.fna.gz -├── GCF_000973545.1_ASM97354v1_genomic.md5 +├── GCF_000973545.1_ASM97354v1_genomic.fna.md5 ├── 
GCF_000973545.1_ASM97354v1_hashes.txt ├── classes.txt └── labels.txt @@ -279,7 +280,7 @@ Six genomes have been downloaded, and each is represented by four files: - `_genomic.fna.gz`: the compressed genome sequence - `_genomic.fna`: the uncompressed genome sequence -- `_genomic.md5`: an MD5 hash/checksum of the (uncompressed) genome sequence; this was generated during the download +- `_genomic.fna.md5`: an MD5 hash/checksum of the (uncompressed) genome sequence; this was generated during the download - `_hashes.txt`: a list of MD5 hashes; this is provided by NCBI and is a reference to be sure that the download did not corrupt the genome sequence There are two additional plain text files: `classes.txt` and `labels.txt`, which provide alternative labels for use in the analysis. These files are generated during the download. @@ -316,7 +317,7 @@ Subsequent `pyani` commands will assume this location for the database, but you In this walkthrough, we'll run ANIm on the downloaded genomes, using the command: ```bash -pyani anim C_blochmannia C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log \ +pyani anim -i C_blochmannia -o C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log \ --name "C. blochmannia run 1" \ --labels C_blochmannia/labels.txt --classes C_blochmannia/classes.txt ``` @@ -332,10 +333,10 @@ One reason for using a database backend for analysis results is so that, for ver You can test this for yourself by running the analysis command again, as below. You will see a number of messages indicating that genomes have been seen before, and that analyses performed before were skipped: ```bash -$ pyani anim C_blochmannia C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log \ +$ pyani anim -i C_blochmannia -o C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log \ --name "C. 
blochmannia run 2" \ --labels C_blochmannia/labels.txt --classes C_blochmannia/classes.txt -INFO: command-line: pyani anim C_blochmannia C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log +INFO: command-line: pyani anim -i C_blochmannia -o C_blochmannia_ANIm -v -l C_blochmannia_ANIm.log INFO: Running ANIm analysis INFO: Adding analysis information to database .pyani/pyanidb INFO: Current analysis has ID 2 in this database @@ -367,9 +368,9 @@ Once an analysis is run, the results are placed in a local `SQLite` database, wh The report tables are written to a named directory (compulsory argument), and are written by default to a `.tab` plain-text format, but HTML and Excel format can also be requested with the `--formats` argument: ```bash -$ pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel,stdout -INFO: Processed arguments: Namespace(cmdline='./pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel', dbpath='.pyani/pyanidb', formats='html,excel', func=, logfile=None, outdir='C_blochmannia_ANIm/', run_results=False, show_genomes=False, show_genomes_runs=False, show_runs=True, show_runs_genomes=False, verbose=True) -INFO: command-line: ./pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel +$ pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel,stdout +INFO: Processed arguments: Namespace(cmdline='./pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel', dbpath='.pyani/pyanidb', formats='html,excel', func=, logfile=None, outdir='C_blochmannia_ANIm/', run_results=False, show_genomes=False, show_genomes_runs=False, show_runs=True, show_runs_genomes=False, verbose=True) +INFO: command-line: ./pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel INFO: Creating output in formats: ['excel', 'tab', 'html'] INFO: Using database: .pyani/pyanidb INFO: Writing table of pyani runs from the database to C_blochmannia_ANIm/runs.* @@ -385,9 +386,9 @@ C_blochmannia_ANIm/ To see all of the pairwise results for an 
individual run, the run ID must be provided. It is possible to get results for more than one run ID by providing a comma-separated list of run IDs (though each run's results will be provided in a separate file): ```bash -$ pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4 -INFO: Processed arguments: Namespace(cmdline='./pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4', dbpath='.pyani/pyanidb', formats='html,excel', func=, logfile=None, outdir='C_blochmannia_ANIm/', run_results='1,2,3,4', show_genomes=False, show_genomes_runs=False, show_runs=True, show_runs_genomes=False, verbose=True) -INFO: command-line: ./pyani report -v --runs C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4 +$ pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4 +INFO: Processed arguments: Namespace(cmdline='./pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4', dbpath='.pyani/pyanidb', formats='html,excel', func=, logfile=None, outdir='C_blochmannia_ANIm/', run_results='1,2,3,4', show_genomes=False, show_genomes_runs=False, show_runs=True, show_runs_genomes=False, verbose=True) +INFO: command-line: ./pyani report -v --runs -o C_blochmannia_ANIm/ --formats html,excel --run_results 1,2,3,4 INFO: Creating output in formats: ['tab', 'excel', 'html'] INFO: Using database: .pyani/pyanidb INFO: Writing table of pyani runs from the database to C_blochmannia_ANIm/runs.* @@ -402,7 +403,7 @@ INFO: Completed. Time taken: 1.285 You can see a run's results in the terminal by specifying the `stdout` format. For example, to see the identity, coverage, and other output matrices, you would specify `--run_matrices ` and `--formats=stdout` as below: ```bash -$ pyani report C_blochmannia_ANIm --formats=stdout --run_matrices 1 +$ pyani report -o C_blochmannia_ANIm --formats=stdout --run_matrices 1 TABLE: C_blochmannia_ANIm/matrix_identity_1 C. 
Blochmannia pennsylvanicus BPEN C. Blochmannia floridanus C. Blochmannia vafer BVAF C. Blochmannia chromaiodes 640 B. endosymbiont of Polyrhachis (Hedomyrma) turneri 675 B. endosymbiont of Camponotus (Colobopsis) obliquus 757 C. Blochmannia pennsylvanicus BPEN 1.000000 0.834866 0.836903 0.980244 0.843700 0.829509 @@ -454,7 +455,7 @@ B. endosymbiont of Camponotus (Colobopsis) obli... 0. The output of a `pyani` run can also be represented graphically, using the `plot` subcommand. For example, the command: ```bash -pyani plot C_blochmannia_ANIm 1 -v --formats png,pdf +pyani plot -o C_blochmannia_ANIm --run_id 1 -v --formats png,pdf ``` will place `.pdf` and `.png` format output in the `C_blochmannia_ANIm` output directory for the run with ID 1, generated above. Five heatmaps are generated: diff --git a/README_v_0_2_x.md b/README_v_0_2_x.md index c9e4bd02..f8c7e838 100644 --- a/README_v_0_2_x.md +++ b/README_v_0_2_x.md @@ -213,7 +213,7 @@ Command-line options can be viewed using: ```bash $ genbank_get_genomes_by_taxon.py -h -usage: genbacnk_get_genomes_by_taxon.py [-h] [-o OUTDIRNAME] [-t TAXON] [-v] +usage: genbank_get_genomes_by_taxon.py [-h] [-o OUTDIRNAME] [-t TAXON] [-v] [-f] [--noclobber] [-l LOGFILE] [--format FORMAT] [--email EMAIL] [--retries RETRIES] diff --git a/docs/basic_use.rst b/docs/basic_use.rst index 1f1e9289..dc43d44d 100644 --- a/docs/basic_use.rst +++ b/docs/basic_use.rst @@ -12,4 +12,4 @@ Basic Use indexing createdb run_anim - + interpreting_plots diff --git a/docs/citations.rst b/docs/citations.rst index 9f57d618..3d1bddcf 100644 --- a/docs/citations.rst +++ b/docs/citations.rst @@ -62,10 +62,14 @@ If you are using ``pyani``, you are in good company. These authors and manuscrip * Abdel-Glil *et al.* (2021) "Comparative in silico genome analysis of *Clostridium perfringens* unravels stable phylogroups with different genome characteristics and pathogenic potential" *Sci. 
Rep.* `doi:10.1038/s41598-021-86148-8 `_ * Abdullah *et al.* (2021) "Comparative analysis of whole genome sequences of *Leptospira* spp. from RefSeq database provide interspecific divergence and repertoire of virulence factors" *bioRxiv* `doi:10.1101/2021.01.12.426470 `_ * Al Rubaye *et al.* (2021) "Novel genomic islands and a new vanD-subtype in the first sporadic VanD-type vancomycin resistant enterococci in Norway" *PLoS One* `doi:10.1371/journal.pone.0255187 `_ +* Albuquerque *et al.* (2021) "Complete Genome Sequence of Two Deep-Sea *Streptomyces* Isolates from Madeira Archipelago and Evaluation of Their Biosynthetic Potential" *marine drugs* `doi:10.3390/md19110621 `_ +* Alonso-Reyes *et al.* (2021) "Genomic insights into an andean multiresistant soil actinobacterium of biotechnological interest." *World J Microbiol Biotechnol* `doi:10.1007/s11274-021-03129-9 `_ * Asselin *et al.* (2021) "Complete genome sequence resources for the onion pathogen, *Pantoea ananatis* OC5a" *Phytopath.* `doi:10.1094/phyto-09-20-0416-a `_ * Badhai *et al.* (2021) "Genomic plasticity and antibody response of *Bordetella bronchiseptica* strain HT200, a natural variant from a thermal spring" *FEMS Micro. Lett.* `doi:10.1093/femsle/fnab035 `_ * Becken *et al.* (2021) "Genotypic and Phenotypic Diversity among Human Isolates of *Akkermansia muciniphila*" *mBio* `doi:10.1128/mBio.00478-21 `_ +* Bei *et al.* (2021) "Shedding light on the functional role of the *Ignavibacteria* in Italian rice field soil: A meta-genomic/transcriptomic analysis" *Soil Biol. Biochem.* `doi:10.1016/j.soilbio.2021.108444 `_ * Biffignandi *et al.* (2021) "Genome of *Superficieibacter maynardsmithii*, a novel, antibiotic susceptible representative of *Enterobacteriaceae*" *G3* `doi:10.1093/g3journal/jkab019 `_ +* Biggel *et al.* (2021) "Spread of vancomycin-resistant *Enterococcus faecium* ST133 in the aquatic environment in Switzerland" *J. Glob. Antimicrob. 
Res.* `doi:10.1016/j.jgar.2021.08.002 `_ * Boeuf *et al.* (2021) "Meta-pangenomics Reveals Depth-dependent Shifts in Metabolic Potential for the Ubiquitous Marine Bacterial SAR324 Lineage" *Research Squared* `doi:10.21203/rs.3.rs-225427/v1 `_ * Carlin *et al.* (2021) "*Listeria cossartiae sp. nov.*, *Listeria immobilis sp. nov.*, *Listeria portnoyi sp. nov.* and *Listeria rustica sp. nov.*, isolated from agricultural water and natural environments" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004795 `_ * Chen *et al.* (2021) "Integrated Phenotypic-Genotypic Analysis of *Latilactobacillus sakei* from Different Niches" *preprints* `doi:10.20944/preprints202107.0457.v1 `_ @@ -77,8 +81,11 @@ If you are using ``pyani``, you are in good company. These authors and manuscrip * Costatini *et al.* (2021) "Insight into phenotypic and genotypic differences between vaginal *Lactobacillus crispatus* BC5 and *Lactobacillus gasseri* BC12 to unravel nutritional and stress factors influencing their metabolic activity" *Microb. Genomics* `doi:10.1099/mgen.0.000575 `_ * de Silva *et al.* (2021) "Revisiting the *Colletotrichum* species causing anthracnose of almond in Australia" *Aust. Plant Path.* `doi:10.1007/s13313-020-00765-x `_ * Delgado-Blas *et al.* (2021) "Population genomics and antimicrobial resistance dynamics of *Escherichia coli* in wastewater and river environments" *Commun Biol* `doi:10.1038/s42003-021-01949-x `_ +* Devika *et al.* (2021) "In Silico Prediction of Novel Probiotic Species Limiting Pathogenic *Vibrio* Growth Using Constraint-Based Genome Scale Metabolic Modeling" *Front. Cell. Inf. Microbiol.* `doi:10.3389/fcimb.2021.752477 `_ +* DΓ­az *et al.* (2021) "Comparative Genomic Analysis of Novel *Bifidobacterium longum* subsp. 
*longum* Strains Reveals Functional Divergence in the Human Gut Microbiota" *microorganisms* `doi:10.3390/microorganisms9091906 `_ * Dragoš *et al.* (2021) "Phages carry interbacterial weapons encoded by biosynthetic gene clusters" *Curr. Biol.* `doi:10.1016/j.cub.2021.05.046 `_ * Ducarmon *et al.* (2021) "Microbiota-associated risk factors for asymptomatic gut colonisation with multi-drug-resistant organisms in a Dutch nursing home" *Genome Medicine* `doi:10.1186/s13073-021-00869-z `_ +* Fluit *et al.* (2021) "Characterization of clinical *Ralstonia* strains and their taxonomic position." *Antonie van Leeuwenhoek* `doi:10.1007/s10482-021-01637-0 `_ * Foucher *et al.* (2021) "Improving common bacterial blight phenotyping by using rub-inoculation and machine learning: cheaper, better, faster, stronger" *Phytopath.* `doi:10.1094/PHYTO-04-21-0129-R `_ * Friedrich *et al.* (2021) "Complete Genome Sequence of *Stenotrophomonas indicatrix* DAIF1" *Micro Res. Ann.* `doi:10.1128/MRA.01484-20 `_ * Friedrich *et al.* (2021) "Living in a Puddle of Mud: Isolation and Characterization of Two Novel *Caulobacteraceae* Strains *Brevundimonas pondensis sp. nov.* and *Brevundimonas goettingensis sp. nov.*" *appl. microbiol.* `doi:10.3390/applmicrobiol1010005 `_ @@ -86,17 +93,26 @@ If you are using ``pyani``, you are in good company. These authors and manuscrip * Gallardo-Benavente *et al.* (2021) "Genomics Insights into *Pseudomonas* sp. CG01: An Antarctic Cadmium-Resistant Strain Capable of Biosynthesizing CdS Nanoparticles Using Methionine as S-Source" *genes* `doi:10.3390/genes12020187 `_ * Girard *et al.* (2021) "The Ever-Expanding *Pseudomonas* Genus: Description of 43 New Species and Partition of the *Pseudomonas Putida* Group" *preprints* `doi:10.20944/preprints202107.0335.v1 `_ * Ghosh *et al.* (2021) "Reconstructing Draft Genomes Using Genome Resolved Metagenomics Reveal Arsenic Metabolizing Genes and Secondary Metabolites in Fresh Water Lake in Eastern India" *Bioinf. Biol. 
Insights* `doi:10.1177/11779322211025332 <https://doi.org/10.1177/11779322211025332>`_ +* Granehäll *et al.* (2021) "Metagenomic analysis of ancient dental calculus reveals unexplored diversity of oral archaeal *Methanobrevibacter*." *Microbiome* `doi:10.1186/s40168-021-01132-8 `_ * Guerin *et al.* (2021) "Isolation and characterisation of ΦcrAss002, a crAss-like phage from the human gut that infects *Bacteroides xylanisolvens*" *Microbiome* `doi:10.1186/s40168-021-01036-7 `_ * Halary *et al.* (2021) "Unexpected Micro-Spatial Scale Genomic Diversity of the Bloom-Forming Cyanobacterium *Aphanizomenon gracile* and its Phycosphere" *Res. Sq.* `doi:10.21203/rs.3.rs-617160/v1 `_ +* Hansen *et al.* (2021) "Metagenomic sequencing for rapid identification of *Xylella fastidiosa* from leaf samples" *bioRxiv* `doi:10.1101/2021.05.12.443947 `_ * Hoetzinger *et al.* (2021) "Dynamics of Baltic Sea phages driven by environmental changes" *Env. Microbiol.* `doi:10.1111/1462-2920.15651 `_ +* Holzer *et al.* (2021) "Tracking the Distribution of *Brucella abortus* in Egypt Based on Core Genome SNP Analysis and In Silico MLVA-16" *microorganisms* `doi:10.3390/microorganisms9091942 `_ +* von Hoyningen-Huene *et al.* (2021) "*Pontibacillus* sp. ALD_SL1 and *Psychroflexus* sp. ALD_RP9, two novel moderately halophilic bacteria isolated from sediment and water from the Aldabra Atoll, Seychelles" *PLoS ONE* `doi:10.1371/journal.pone.0256639 `_ * Huang *et al.* (2021) "Phenotypic properties and genotyping analysis of *Bacillus cereus* group isolates from dairy and potato products" *LWT* `doi:10.1016/j.lwt.2021.110853 `_ * Huang *et al.* (2021) "Genome-resolved metagenomics using environmental and clinical samples" *Brief. 
Bioinf.* `doi:10.1093/bib/bbab030 `_ +* Huang *et al.* (2021) "Comparative Genomics and Specific Functional Characteristics Analysis of *Lactobacillus acidophilus*" *microorganisms* `doi:10.3390/microorganisms9091992 `_ +* Hugouvieux-Cotte-Pattat & Van Gijsegem (2021) "Diversity within the *Dickeya zeae* complex, identification of *Dickeya zeae* and *Dickeya oryzae* members, proposal of the novel species *Dickeya parazeae* sp. nov." *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.005059 `_ +* Huihui *et al.* (2021) "Partial biological characteristics and genomic analysis of *Vibrio cholerae* typing phage VP2" *Disease Surv.* `doi:10.3784/jbjc.202105190282 `_ * Hünnefeld *et al.* (2021) "Genome Sequence of the Bacteriophage CL31 and Interaction with the Host Strain *Corynebacterium glutamicum* ATCC 13032" *viruses* `doi:10.3390/v13030495 `_ * Ivanova *et al.* (2021) "Draft Genome Assemblies of Two *Campylobacter novaezeelandiae* and Four Unclassified Thermophilic *Campylobacter* Isolates from Canadian Agricultural Surface Water" *Microbiol. Res. Ann.* `doi:10.1128/MRA.00249-21 `_ * Jian *et al.* (2021) "Diversity and distribution of viruses inhabiting the deepest ocean on Earth" *ISME J.* `doi:10.1038/s41396-021-00994-y `_ * Jungblut *et al.* (2021) "Genomic diversity and CRISPR‐Cas systems in the cyanobacterium *Nostoc* in the High Arctic" *Env. Microbiol.* `doi:10.1111/1462-2920.15481 `_ * Karaseva *et al.* (2021) "*Fervidicoccus fontis* Strain 3639Fd, the First Crenarchaeon Capable of Growth on Lipids" *Microbiol.* `doi:10.1134/S002626172104007X `_ +* Keen *et al.* (2021) "Comparative Genomics of *Mycobacterium avium* Complex Reveals Signatures of Environment-Specific Adaptation and Community Acquisition" *mSystems* `doi:10.1128/mSystems.01194-21 `_ * Koirala *et al.* (2021) "Identification of two novel pathovars of *Pantoea stewartii* subsp. *indologenes* affecting Allium sp. 
and millets" *Phytopathology* `doi:10.1094/PHYTO-11-20-0508-R `_ +* Kuźmińska-Bajor *et al.* (2021) "Genomic and functional characterization of five novel *Salmonella*-targeting bacteriophages." *Virol J* `doi:10.1186/s12985-021-01655-4 `_ * Lakra *et al.* (2021) "Genome based reclassification of *Deinococcus swuensis* as a heterotypic synonym of *Deinococcus radiopugnans*" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004879 `_ * Lee *et al.* (2021) "*Bifidobacterium bifidum* strains synergize with immune checkpoint inhibitors to reduce tumour burden in mice" *Nat. Microbiol.* `doi:10.1038/s41564-020-00831-6 `_ * Lee *et al.* (2021) "Identification and Characterization of a Novel Genomic Island Harboring Cadmium and Arsenic Resistance Genes in *Listeria welshimeri*" *biomolecules* `doi:10.3390/biom11040560 `_ @@ -108,35 +124,48 @@ If you are using ``pyani``, you are in good company. These authors and manuscrip * Liao *et al.* (2021) "Nationwide genomic atlas of soil-dwelling *Listeria* reveals effects of selection and population ecology on pangenome evolution" *Nat. Microbiol.* `doi:10.1038/s41564-021-00935-7 `_ * Liu *et al.* (2021) "*Corynebacterium anserum* sp. nov., isolated from the faeces of greater white-fronted geese (*Anser albifrons*) at Poyang Lake, PR China" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004637 `_ * Lood *et al.* (2021) "Genomics of an endemic cystic fibrosis *Burkholderia multivorans* strain reveals low within-patient evolution but high between-patient diversity" *PLoS Pathog.* `doi:10.1371/journal.ppat.1009418 `_ +* López-Pérez *et al.* (2021) "Ecological diversification reveals routes of pathogen emergence in endemic *Vibrio vulnificus* populations" *Proc. Natl. Acad. Sci. USA* `doi:10.1073/pnas.2103470118 `_ * Lu *et al.* (2021) "Asgard archaea in the haima cold seep: Spatial distribution and genomic insights" *Deep Sea Res. 
I* `doi:10.1016/j.dsr.2021.103489 `_ +* Lu *et al.* (2021) "Comparative Genomic Analysis of *Bifidobacterium bifidum* Strains Isolated from Different Niches" *genes* `doi:10.3390/genes12101504 `_ * Luo *et al.* (2021) "Isolation and characterization of new phage vB_CtuP_A24 and application to control *Cronobacter* spp. in infant milk formula and lettuce" *Food Res. Int.* `doi:10.1016/j.foodres.2021.110109 `_ * Ma *et al.* (2021) "Identification of *Pectobacterium versatile* causing blackleg of potato in New York State" *Plant Disease* `doi:10.1094/PDIS-09-20-2089-RE `_ * Majer *et al.* (2021) "Whole genome sequencing of *Streptomyces actuosus* ISP-5337, *Streptomyces sioyaensis* B-5408, and *Actinospica acidiphila* B-2296 reveals secondary metabolomes with antibiotic potential" *Biotech. Rep.* `doi:10.1016/j.btre.2021.e00596 `_ * Matarrita-Carranza *et al.* (2021) "*Streptomyces* sp. M54: an actinobacteria associated with a neotropical social wasp with high potential for antibiotic production." *Antonie van Leeuwenhoek* `doi:10.1007/s10482-021-01520-y `_ +* Matsumoto *et al.* (2021) "Complete Genome Sequence of *Acinetobacter pittii* OCU_Ac17, Isolated from Human Venous Blood" *Microbiol. Res. Ann.* `doi:10.1128/MRA.00696-21 `_ * Mao *et al.* (2021) "Comparative Genomic Analysis of *Lactiplantibacillus plantarum* Isolated from Different Niches" *genes* `doi:10.3390/genes12020241 `_ +* McKay *et al.* (2021) "Sulfur cycling and host-virus interactions in *Aquificales*-dominated biofilms from Yellowstone’s hottest ecosystems." 
*ISME J* `doi:10.1038/s41396-021-01132-4 `_ * Moon *et al.* (2021) "Mobile Colistin Resistance Gene mcr-1 Detected on an IncI2 Plasmid in *Salmonella Typhimurium* Sequence Type 19 from a Healthy Pig in South Korea" *microorganisms* `doi:10.3390/microorganisms9020398 `_ * Moya-Beltrán *et al.* (2021) "Genomic evolution of the class *Acidithiobacillia*: deep-branching *Proteobacteria* living in extreme acidic conditions" *ISME J.* `doi:10.1038/s41396-021-00995-x `_ * Mullins *et al.* (2021) "Discovery of the Pseudomonas Polyyne Protegencin by a Phylogeny-Guided Study of Polyyne Biosynthetic Gene Cluster Diversity" *mBio* `doi:10.1128/mBio.00715-21 `_ +* Nascimento *et al.* (2021) "Genomic Analysis of the 1-Aminocyclopropane-1-Carboxylate Deaminase-Producing *Pseudomonas thivervalensis* SC5 Reveals Its Multifaceted Roles in Soil and in Beneficial Interactions With Plants" *Front. Microbiol.* `doi:10.3389/fmicb.2021.752288 `_ * Nemec *et al.* (2021) "Delineation of a novel environmental phylogroup of the genus *Acinetobacter* encompassing *Acinetobacter terrae sp. nov.*, *Acinetobacter terrestris sp. nov.* and three other tentative species" *Syst. Appl. Microbiol.* `doi:10.1016/j.syapm.2021.126217 `_ * Nikolaisen *et al.* (2021) "First finding of *Streptococcus phocae* infections in mink (*Neovison vison*)" *Res. Vet. 
Sci.* `doi:10.1016/j.rvsc.2021.07.015 `_ * Nooij *et al.* (2021) "Faecal microbiota transplantation influences procarcinogenic *Escherichia coli* in recipient recurrent *Clostridioides difficile* patients" *Gastroenterology* `doi:10.1053/j.gastro.2021.06.009 `_ * Ogg *et al.* (2021) "Pangenome analyses of LuxS-coding genes and enzymatic repertoires in cocoa-related lactic acid bacteria" *Genomics* `doi:10.1016/j.ygeno.2021.04.010 `_ * Öhrman *et al.* (2021) "Reorganized Genomic Taxonomy of *Francisellaceae* Enables Design of Robust Environmental PCR Assays for Detection of *Francisella tularensis*" *Microorganisms* `doi:10.3390/microorganisms9010146 `_ * Öhrman *et al.* (2021) "Complete Genome Sequence of *Francisella* sp. Strain LA11-2445 (FDC406), a Novel *Francisella* Species Isolated from a Human Skin Lesion" *Micro. Res. Ann.* `doi:10.1128/MRA.01233-20 `_ -* Pédron *et al.* (2021) "Early Emergence of *Dickeya solani* Revealed by Analysis of *Dickeya* Diversity of Potato Blackleg and Soft Rot Causing Pathogens in Switzerland" *microorganisms* `doi:10.3390/microorganisms9061187 `_ * Pais *et al.* (2021) "Genomic sequencing of different sequevars of *Ralstonia solanacearum* belonging to the Moko ecotype" *Genet. Mol. Biol.* `doi:10.1590/1678-4685-gmb-2020-0172 `_ +* Pédron *et al.* (2021) "Early Emergence of *Dickeya solani* Revealed by Analysis of *Dickeya* Diversity of Potato Blackleg and Soft Rot Causing Pathogens in Switzerland" *microorganisms* `doi:10.3390/microorganisms9061187 `_ +* Pérez-Carrascal *et al.* (2021) "Single-colony sequencing reveals microbe-by-microbiome phylosymbiosis between the cyanobacterium *Microcystis* and its associated bacteria." *Microbiome* `doi:10.1186/s40168-021-01140-8 `_ * Petriglieri *et al.* (2021) "Candidatus *Dechloromonas phosphoritropha* and Ca. *D. 
phosphorivorans*, novel polyphosphate accumulating organisms abundant in wastewater treatment systems" *ISME J.* `doi:10.1038/s41396-021-01029-2 `_ +* Pidcock *et al.* (2021) "Phylogenetic systematics of *Butyrivibrio* and *Pseudobutyrivibrio* genomes illustrate vast taxonomic diversity, open genomes and an abundance of carbohydrate-active enzyme family isoforms" *Microbial Genomics* `doi:10.1099/mgen.0.000638 `_ * Puri *et al.* (2021) "Phylogenomic Framework for Taxonomic Delineation of *Paracoccus* spp. and Exploration of Core-Pan Genome" *Ind. J. Microbiol.* `doi:10.1007/s12088-021-00929-3 `_ +* Román-Reyna *et al.* (2021) "Metagenomic Sequencing for Identification of *Xylella fastidiosa* from Leaf Samples" *mSystems* `doi:10.1128/mSystems.00591-21 `_ * Reichler *et al.* (2021) "Identification, subtyping, and tracking of dairy spoilage-associated *Pseudomonas* by sequencing the *ileS* gene" *J. Dairy Sci.* `doi:10.3168/jds.2020-19283 `_ * Ryngajłło *et al.* (2021) "Complete genome sequence of lovastatin producer *Aspergillus terreus* ATCC 20542 and evaluation of genomic diversity among *A. terreus* strains" *Appl. Microbiol. Biotechnol.* `doi:10.1007/s00253-021-11133-0 `_ +* Saati-Santamaría *et al.* (2021) "Phylogenomic Analyses of the Genus *Pseudomonas* Lead to the Rearrangement of Several Species and the Definition of New Genera" *Biology* `doi:10.3390/biology10080782 `_ +* Sakiyama *et al.* (2021) "Complete Genome Sequence of a Clinical Isolate of *Acinetobacter baumannii* Harboring 11 Plasmids" *Microbiol. Res. Ann.* `doi:10.1128/MRA.00695-21 `_ * Schlez *et al.* (2021) "*Corynebacterium rouxii*, a recently described member of the *C. diphtheriae* group isolated from three dogs with ulcerative skin lesions" *Ant. van Leeuw.* `doi:10.1007/s10482-021-01605-8 `_ +* Santos *et al.* (2021) "*Phaffia brasiliana* sp. nov., a yeast species isolated from soil in a Cerrado–Atlantic Rain Forest ecotone site in Brazil" *Int. J. Syst. Evol. 
Microbiol.* `doi:10.1099/ijsem.0.005080 `_ * SchΓΆrner *et al.* (2021) "Genomic analysis of *Neisseria elongata* isolate from a patient with infective endocarditis" *FEBS Open Bio* `doi:10.1002/2211-5463.13201 `_ * von Schwartzenberg *et al.* (2021) "Caloric restriction disrupts the microbiota and colonization resistance" *Nature* `doi:10.1038/s41586-021-03663-4 `_ * Sedaghatjoo *et al.* (2021) "Development of a loop-mediated isothermal amplification assay for the detection of *Tilletia controversa* based on genome comparison" *Sci. Reports.* `doi:10.1038/s41598-021-91098-2 `_ +* Seibert *et al.* (2021) "*Chlamydia buteonis* in birds of prey presented to California wildlife rehabilitation facilities" *PLoS One* `doi:10.1371/journal.pone.0258500 `_ * Singh *et al.* (2021) "Enrichment and description of novel bacteria performing syntrophic propionate oxidation at high ammonia level" *Env. Micro.* `doi:10.1111/1462-2920.15388 `_ * Singh *et al.* (2021) "Genome-based reclassification of *Amycolatopsis eurytherma* as a later heterotypic synonym of *Amycolatopsis thermoflava*" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004642 `_ * Son *et al.* (2021) "*Serratia rhizosphaerae sp. nov.*, a novel plant resistance inducer against soft rot disease in tobacco" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004788 `_ * Sorokin *et al.* (2021) "*Natronoglycomyces albus gen. nov., sp. nov*, a haloalkaliphilic actinobacterium from a soda solonchak soil" *Int. J. Syst. Evol. Microbiol.* `doi:10.1099/ijsem.0.004804 `_ +* Strube (2021) "RibDif: can individual species be differentiated by 16S sequencing?" *Bioinf. 
Adv.* `doi:10.1093/bioadv/vbab020 `_ * Suarez *et al.* (2021) "Whole-Genome sequencing and comparative genomics of *Mycobacterium spp.* from farmed Atlantic and coho salmon in Chile" *Antonie van Leeuw.* `doi:10.1007/s10482-021-01592-w `_ * Tian *et al.* (2021) "LINflow: a computational pipeline that combines an alignment-free with an alignment-based method to accelerate generation of similarity matrices for prokaryotic genomes" *PeerJ* `doi:10.7717/peerj.10906 `_ * Tian *et al.* (2021) "Antifungal mechanism of *Bacillus amyloliquefaciens* strain GKT04 against *Fusarium* wilt revealed using genomic and transcriptomic analyses" *Microbiol. Open* `doi:10.1002/mbo3.1192 `_ @@ -147,8 +176,13 @@ If you are using ``pyani``, you are in good company. These authors and manuscrip * Viera *et al.* (2021) "A highly specific *Serratia*-infecting T7-like phage inhibits biofilm formation in two different genera of the Enterobacteriaceae family" *Res. Microbiol.* `doi:10.1016/j.resmic.2021.103869 `_ * Vincent *et al.* (2021) "AsaGEI2d: a new variant of a genomic island identified in a group of *Aeromonas salmonicida* subsp. *salmonicida* isolated from France, which bears the pAsa7 plasmid" *FEMS Micro. Lett.* `doi:10.1093/femsle/fnab021 `_ * Volpiano *et al.* (2021) "Genomic Metrics Applied to *Rhizobiales (Hyphomicrobiales)*: Species Reclassification, Identification of Unauthentic Genomes and False Type Strains" *Front. Microbiol.* `doi:10.3389/fmicb.2021.614957 `_ +* Wang *et al.* (2021). "Spontaneous Bacterial Peritonitis Caused by *Bordetella hinzii*." *Emerging Infectious Diseases* `doi:10.3201/eid2711.211428 `_ +* Wang *et al.* (2021) "Dynamic impact of virome on colitis and colorectal cancer: Immunity, inflammation, prevention and treatment" *Sem. 
Cancer Biol.* `doi:10.1016/j.semcancer.2021.10.004 `_ * Watson *et al.* (2021) "Adaptive ecological processes and metabolic independence drive microbial colonization and resilience in the human gut" *bioRxiv* `doi:10.1101/2021.03.02.433653 `_ * Wu *et al.* (2021) "Metagenomic insights into nitrogen and phosphorus cycling at the soil aggregate scale driven by organic material amendments" *Sci. Tot. Env.* `doi:10.1016/j.scitotenv.2021.147329 `_ +* Wu *et al.* (2021) "Moisture modulates soil reservoirs of active DNA and RNA viruses." *Commun Biol* `doi:10.1038/s42003-021-02514-2 `_ +* Wu *et al.* (2021) "An Effective Preprocessing Method for High-Quality Pan-Genome Analysis of *Bacillus subtilis* and *Escherichia coli*" *Essential Genes and Genomes* `doi:10.1007/978-1-0716-1720-5_21 `_ +* Wu *et al.* (2021) "DNA Viral Diversity, Abundance, and Functional Potential Vary across Grassland Soils with a Range of Historical Moisture Regimes" *mBio* `doi:doi.org/10.1128/mBio.02595-21 `_ * Xiao *et al.* (2021) "Carbapenem-resistant *Acinetobacter Baumannii* Ventilator-Associated Pneumonia in Critically Ill Patients: Potential Inference with Respiratory Tract Microbiota Dysbiosis" *Res. Sq.* `doi:10.21203/rs.3.rs-736916/v1 `_ * Young *et al.* (2021) "Defining the *Rhizobium leguminosarum* Species Complex" *genes* `doi:10.3390/genes12010111 `_ * Zeng *et al.* (2021) "Novel phage vB_CtuP_B1 for controlling *Cronobacter malonaticus* and *Cronobacter turicensis* in ready-to-eat lettuce and powered infant formula" *Food Res. Int.* `doi:10.1016/j.foodres.2021.110255 `_ diff --git a/docs/contributing.rst b/docs/contributing.rst index 46745d46..74a62c99 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -82,6 +82,7 @@ Making changes and pull requests 1. Fork the ``pyani`` `repository`_ under your account at `GitHub`_. 2. Clone your fork to your development machine. 
+ - To be able to edit `pyani` and have changes you make take effect immediately without a reinstall (useful for testing), you can run `pip install -e .` inside the local cloned repository. 3. Create a new branch in your forked repository with an informative name like ``fix_issue_107``, using ``git`` (e.g. with the command ``git checkout -b fix_issue_107``). 4. Make the changes you need and commit them to your local branch. 5. Run the repository tests (see the :ref:`pyani-testing` documentation for more details). diff --git a/docs/download.rst b/docs/download.rst index 49ae1cd4..30cd1235 100644 --- a/docs/download.rst +++ b/docs/download.rst @@ -19,15 +19,15 @@ The basic form of the command is: .. code-block:: bash - pyani download --email my.email@my.domain -t + pyani download --email my.email@my.domain -t -o -This instructs ``pyani`` to use the ``download`` subcommand to obtain all available genome assemblies below the taxon ID ````, passed with the ``-t`` argument, and place the downloaded files - along with label and class information files created by ``pyani`` in the subdirectory ````. +This instructs ``pyani`` to use the ``download`` subcommand to obtain all available genome assemblies below the taxon ID ````, passed with the ``-t`` argument, and place the downloaded files - along with label and class information files created by ``pyani`` in the subdirectory ````, passed with the ``-o`` argument. -For example, if we wished to download all available assemblies for the bacterium *Pseudomonas flexibilis* we would `identify the taxon ID `_ to be 706570, and use this as the argument to ``-t``, placing the output in a convenient subdirectory (e.g. ``genomes``), with the command: +For example, if we wished to download all available assemblies for the bacterium *Pseudomonas flexibilis* we would `identify the taxon ID `_ to be 706570, and use this as the argument to ``-t``, placing the output in a convenient subdirectory (e.g. 
``genomes``, the argument to ``-o``), with the command: .. code-block:: bash - $ pyani download --email my.email@my.domain -t 706570 genomes + $ pyani download --email my.email@my.domain -t 706570 -o genomes GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna.gz: 2097152it [00:00, 3224293.90it/s] GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_hashes.txt: 1048576it [00:00, 110097041.36it/s] GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.fna.gz: 2097152it [00:00, 3125724.89it/s] @@ -47,23 +47,23 @@ This displays each assembly as a download is attempted, and places all output in genomes/ β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.fna β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.fna.gz - β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.md5 + β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.fna.md5 β”œβ”€β”€ GCA_001312105.1_ASM131210v1_hashes.txt β”œβ”€β”€ GCF_000802425.1_ASM80242v1_genomic.fna β”œβ”€β”€ GCF_000802425.1_ASM80242v1_genomic.fna.gz - β”œβ”€β”€ GCF_000802425.1_ASM80242v1_genomic.md5 + β”œβ”€β”€ GCF_000802425.1_ASM80242v1_genomic.fna.md5 β”œβ”€β”€ GCF_000802425.1_ASM80242v1_hashes.txt β”œβ”€β”€ GCF_000806415.1_ASM80641v1_genomic.fna β”œβ”€β”€ GCF_000806415.1_ASM80641v1_genomic.fna.gz - β”œβ”€β”€ GCF_000806415.1_ASM80641v1_genomic.md5 + β”œβ”€β”€ GCF_000806415.1_ASM80641v1_genomic.fna.md5 β”œβ”€β”€ GCF_000806415.1_ASM80641v1_hashes.txt β”œβ”€β”€ GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.fna β”œβ”€β”€ GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.fna.gz - β”œβ”€β”€ GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.md5 + β”œβ”€β”€ GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.fna.md5 β”œβ”€β”€ GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_hashes.txt β”œβ”€β”€ GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna β”œβ”€β”€ GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna.gz - β”œβ”€β”€ 
GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.md5 + β”œβ”€β”€ GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna.md5 β”œβ”€β”€ GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_hashes.txt β”œβ”€β”€ classes.txt └── labels.txt @@ -72,7 +72,7 @@ Each genome is downloaded in compressed format (``.fna.gz`` files) and expanded .. code-block:: bash - $ head genomes/GCA_001312105.1_ASM131210v1_genomic.md5 + $ head genomes/GCA_001312105.1_ASM131210v1_genomic.fna.md5 e55cd3d913a198ac60afd8d509c02ab4 genomes/GCA_001312105.1_ASM131210v1_genomic.fna ``pyani`` also creates two files: @@ -107,13 +107,13 @@ To download genomes from more than one taxon, you can provide a comma-separated .. code-block:: bash - pyani download --email my.email@my.domain -t ,,... + pyani download --email my.email@my.domain -t ,,... -o The following command can be used to download assemblies from three different *Pseudomonas* taxa (*P. flexibilis*: 706570, *P. mosselli*: 78327, and *P. fulva*: 47880): .. code-block:: bash - $ pyani download --email my.email@my.domain -t 706570,78327,47880 multi_taxa + $ pyani download --email my.email@my.domain -t 706570,78327,47880 -o multi_taxa GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna.gz: 2097152it [00:00, 3081776.59it/s] GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_hashes.txt: 1048576it [00:00, 63489526.95it/s] GCF_900101515.1_IMG-taxon_2596583557_annotated_assembly_genomic.fna.gz: 2097152it [00:00, 3194885.99it/s] @@ -127,7 +127,7 @@ If you only want to see which genomes will be downloaded from NCBI with a given .. 
code-block:: bash - $ pyani download --email my.email@my.domain -t 706570,78327,47880 multi_taxa --dry-run + $ pyani download --email my.email@my.domain -t 706570,78327,47880 -o multi_taxa --dry-run WARNING: Dry run only: will not overwrite or download WARNING: (dry-run) skipping download of GCF_900155995.1 WARNING: (dry-run) skipping download of GCF_900101515.1 @@ -145,7 +145,7 @@ The ``pyani download`` command can prepare downloaded genome files for immediate .. code-block:: bash - $ pyani download --email my.email@my.domain -t 706570,78327,47880 genomes_kraken --kraken + $ pyani download --email my.email@my.domain -t 706570,78327,47880 -o genomes_kraken --kraken GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_genomic.fna.gz: 2097152it [00:00, 3085741.03it/s] GCF_900155995.1_IMG-taxon_2681812811_annotated_assembly_hashes.txt: 1048576it [00:00, 140958511.30it/s] WARNING: Modifying downloaded sequence for Kraken compatibility @@ -169,9 +169,9 @@ Using this option does affects downstream performance or use of ``pyani`` only i .. code-block:: bash - $ head multi_taxa/GCA_001312105.1_ASM131210v1_genomic.md5 + $ head multi_taxa/GCA_001312105.1_ASM131210v1_genomic.fna.md5 e55cd3d913a198ac60afd8d509c02ab4 multi_taxa/GCA_001312105.1_ASM131210v1_genomic.fna - $ head genomes_kraken/GCA_001312105.1_ASM131210v1_genomic.md5 + $ head genomes_kraken/GCA_001312105.1_ASM131210v1_genomic.fna.md5 053fd98d8c9ab30de46f56fd601ef529 genomes_kraken/GCA_001312105.1_ASM131210v1_genomic.fna and so will not be considered to be the "same sequence" when repeating comparisons. diff --git a/docs/examples.rst b/docs/examples.rst index 1627d682..ffe37a89 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -11,11 +11,11 @@ Using non-NCBI genomes It is usual to want to include or work only with genomes that have been generated locally, or that were not downloaded from NCBI using `pyani download`. 
To use these genomes with the `pyani` analysis subcommands, the genomes must be *indexed* [1]_. -To *index* a set of genomes, use the `pyani index` subcommand on the input directory. To index the directory ``mygenomes``, for example: +To *index* a set of genomes, use the `pyani index` subcommand on the input directory, which is passed to the ``-i`` argument. To index the directory ``mygenomes``, for example: .. code-block:: bash - pyani index mygenomes + pyani index -i mygenomes This will create a ``.md5`` file (containing the *hash*) for each genome, as well as class and label files listing all the input genomes. @@ -31,7 +31,7 @@ The location of the labels and classes files may be changed using the ``--labels .. code-block:: bash - pyani index mygenomes --classes myclasses.txt --labels mylabels.txt + pyani index -i mygenomes --classes myclasses.txt --labels mylabels.txt diff --git a/docs/images/distribution_identity_run3.png b/docs/images/distribution_identity_run3.png new file mode 100644 index 00000000..6ebb01e9 Binary files /dev/null and b/docs/images/distribution_identity_run3.png differ diff --git a/docs/images/scatterplot.png b/docs/images/scatterplot.png new file mode 100644 index 00000000..2aed9473 Binary files /dev/null and b/docs/images/scatterplot.png differ diff --git a/docs/indexing.rst b/docs/indexing.rst index aba7ceda..b1743dbf 100644 --- a/docs/indexing.rst +++ b/docs/indexing.rst @@ -34,7 +34,7 @@ The basic form of the command is: .. code-block:: bash - pyani index + pyani index -i This instructs ``pyani`` to search ```` for files with a standard FASTA suffix (``.fna``, ``.fasta``, ``.fa``, ``.fas``, ``.fsa_nt``). For each file found, it calculates the MD5 hash and writes it to an accompanying file with extension ``.md5``. The hash is then associated with a genome label and a genome class, written to the two files ``labels.txt`` and ``classes.txt`` (see above). @@ -52,15 +52,15 @@ We could run the ``pyani index`` command: .. 
code-block:: bash - $ pyani index unindexed/ + $ pyani index -i unindexed/ $ tree unindexed unindexed β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.fna - β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.md5 + β”œβ”€β”€ GCA_001312105.1_ASM131210v1_genomic.fna.md5 β”œβ”€β”€ GCF_000834555.1_ASM83455v1_genomic.fna - β”œβ”€β”€ GCF_000834555.1_ASM83455v1_genomic.md5 + β”œβ”€β”€ GCF_000834555.1_ASM83455v1_genomic.fna.md5 β”œβ”€β”€ GCF_005796105.1_ASM579610v1_genomic.fna - β”œβ”€β”€ GCF_005796105.1_ASM579610v1_genomic.md5 + β”œβ”€β”€ GCF_005796105.1_ASM579610v1_genomic.fna.md5 β”œβ”€β”€ classes.txt └── labels.txt @@ -72,7 +72,7 @@ This creates an ``.md5`` file for each genome, and corresponding ``classes.txt`` >BBCY01000001.1 Pseudomonas tuomuerensis JCM 14085 DNA, contig: JCM14085.contig00001, whole genome shotgun sequence ACCAGCATCTGGCGGATCAGGTCGCGGGCCTTCTCGGCCGATTGGCGGATGCGCCCGAGGTAGCGGCCGAGCGGCGCGTC GCCGCGCTCGCCCGCCAGCTCCTCGGCCATCTGCGTGTAGCCGAGCATGCTGGTCAGCAGGTTGTTGAAGTCGTGGGCAA - $ head unindexed/GCA_001312105.1_ASM131210v1_genomic.md5 + $ head unindexed/GCA_001312105.1_ASM131210v1_genomic.fna.md5 e55cd3d913a198ac60afd8d509c02ab4 unindexed/GCA_001312105.1_ASM131210v1_genomic.fna $ head unindexed/classes.txt 527f35b3eb9dd371d8d5309b6043dd9f GCF_000834555.1_ASM83455v1_genomic Pseudomonas fulva strain MEJ086 contig_1, whole genome shotgun sequence diff --git a/docs/interpreting_plots.rst b/docs/interpreting_plots.rst new file mode 100644 index 00000000..70dcb469 --- /dev/null +++ b/docs/interpreting_plots.rst @@ -0,0 +1,187 @@ +================================= +Interpreting the Graphical Output +================================= +.. + Graphical output is obtained by executing the ``pyani plot`` subcommand, specifying the output directory and run ID. Optionally, output file formats and the graphics drawing method can be specified. + + .. 
code-block:: bash + + pyani plot --formats png,pdf --method seaborn C_blochmannia_ANIm 1 + + Supported output methods are: + + - ``seaborn`` + - ``mpl`` (``matplotlib``) + - ``plotly`` + +---------- +The Output +---------- + +``pyani plot`` generates five heatmaps corresponding to the matrices that ``pyani report`` produces: + + - percentage identity across all aligned regions (*Average Nucleotide Identity*, ANI) + - percentage coverage of each genome by aligned regions (*Coverage*, or *Aligned Fraction* (AF)) + - number of bases from each genome contributing to the aligned regions + - number of "similarity errors" on each genome + - a Hadamard matrix of percentage identity multiplied by percentage coverage for each comparison + +For each heatmap, a pair of plots describing the distributions of values in the heatmap/matrix are also generated. These show a histogram (left) and a KDE with rugplot (right) of the values in the heatmap. + +In addition, a scatterplot of ANI vs Coverage/AF for each pairwise comparison is produced. + +-------- +Heatmaps +-------- + +^^^^^^^^^^^^^^^^^^^^^^^ +Percentage Identity/ANI +^^^^^^^^^^^^^^^^^^^^^^^ + +.. figure:: images/matrix_identity_1.png + :alt: percentage identity matrix for *Candidatus Blochmannia* ANIm analysis + + Percentage identity matrix for *Candidatus Blochmannia* ANIm analysis + + Each cell represents a pairwise comparison between the named genomes on rows and columns, and the number in each cell is the pairwise identity *of all aligned regions*. The dendrograms are produced by single-linkage hierarchical clustering trees from the matrix of pairwise identity results. The default colour scheme colours cells with identity > 0.95 as red, and those with < 0.95 as blue. This division corresponds to a widely-used convention for bacterial species boundaries. + +.. note:: + + No single ANI threshold should be considered universally applicable to distinguish between species for all bacterial genomes. 
+ +We can often take the red blocks on the main diagonal of the heatmap to indicate groups of genomes that are coherent with each other and exclude all other genomes in the analysis, which is one of the criteria we would use to delineate a biological species or other taxon. As a rule of thumb, red squares on the main diagonal are a good approximation to species. + +Taking the 95% threshold between red and blue cells to be equivalent to a species boundary, an interpretation of this figure would be that: + +* the two genomes BPEN and 640 could be classified as the same species +* the remaining four genomes each represent a distinct species + +In particular, we can see that the off-diagonal identity values are all around 85%, consistent with the limit of detection for homologous nucleotide regions. + +.. note:: + + ANI reports the average percentage identity for the *aligned regions* only. If the total aligned proportion of either genome is not large, then ANI is not a reliable measure of overall genome similarity, and interpretation of percentage identity thresholds as species boundaries becomes less reliable. Percentage identity should always be considered in conjunction with coverage/aligned fraction. + +^^^^^^^^^^^^^^^^^^^^^^^^^ +Coverage/Aligned Fraction +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. figure:: images/matrix_coverage_1.png + :alt: percentage coverage matrix for *Candidatus Blochmannia* ANIm analysis + + Percentage coverage matrix for *Candidatus Blochmannia* ANIm analysis + + Each cell represents a pairwise comparison between the named genomes on rows and columns, and the number in each cell is the pairwise coverage of each genome by aligned regions in the comparison. The dendrograms are generated by single-linkage hierarchical clustering from the matrix of pairwise coverage results. The default colour scheme colours cells with coverage > 0.50 as red, and those with < 0.50 as blue.
This division corresponds to a strict majority of each genome in the comparison being alignable (a plausible *ad hoc* minimum requirement for two sequences being considered "the same thing"). + +.. note:: + + There is no widely-accepted convention for interpreting coverage/aligned fraction in its own right. Coverage should always be considered in conjunction with percentage identity and other measures. + +The default graphical representation of coverage/AF distinguishes between alignments that cover more than 50% of the query genome (red) from those that cover less than 50% (blue). This is an *ad hoc* boundary with no particular biological meaning, but which does have a useful property that guides interpretation of the output. + +If two genomes A and B align over less than 50% of genome A, then it is possible that the *majority* of genome A aligns to a different genome than genome B, with which genome B shares no homology. For instance, if the alignment of A and B has 30% coverage of genome A, it is possible that 70% of genome A is identical to another genome - C - which shares no sequence at all with genome B. In this case, it is perhaps not reasonable to assert that genomes A and B "are the same thing" or, in technical terms, *belong to the same taxon*. + +However, if two genomes A and B align over more than 50% of their genomes - a majority of each genome - it is possibly reasonable to assert that the two genomes are, in some way, "the same thing" (and possibly correspond to the same taxon). + +.. note:: + + The 50% coverage threshold can be considered as a line of "caution." Where coverage is less than 50%, there is the possibility that the two genomes are not in the same taxon. However, this is not diagnostic. + +Here, taking the 50% coverage threshold between red and blue cells to indicate an approximate boundary between genera, we would consider that BPEN and 640 are the same genus, but that each of the other genomes is representative of a distinct genus. 
+ +.. note:: + + There is no agreed, universal coverage threshold corresponding to genus boundaries, so we should always consider the actual coverage/AF values. + +In this case, nearly all the off-diagonal values in the blue cells are below ≈5%. This indicates that the proportion of each genome in the off-diagonal alignments is very small, and we are safe to assert that these organisms come from different genera. The exception to this is the comparison between BVAF and floridianus: their coverage is higher, at ≈15%. This may indicate a common plasmid or mobile element, or it may indicate a more recent common ancestor than the other comparisons; they may or may not validly be in the same genus - we would need to investigate further to understand their relationship. + +The BPEN/640 comparison is conclusive, however. Their coverage/AF is essentially 100%, so these are closely-related, highly sequence-homologous organisms. + +------------------ +Distribution Plots +------------------ + +.. figure:: images/distribution_identity_run3.png + :alt: distribution plot for an ANI comparison + + ANI value distribution plot + + In the distribution plots for each matrix, two figures are shown. On the left, a histogram of cell values is presented, representing binned values. On the right, a rug plot of individual matrix cell values and corresponding KDE plot (smoothed curve modelling the density as Gaussian distributions) is shown. + +.. note:: + + *Discontinuities* in the distribution of ANI values have been associated with taxonomically-useful boundaries, especially species boundaries (between 94-96% depending on lineage). It is common to see these as gaps in the rug plot. + +By inspection, we can see a discontinuity (i.e. a gap) in the rug plot that spans 95%. This is consistent with many prior observations that species boundaries coincide with a 94-96% ANI threshold.
This plot provides some support for the assertion that comparisons to the right of the gap (red in the heatmap) are within-species comparisons, and those to the left (blue in the heatmap) are between-species comparisons. + +Other gaps/discontinuities are visible. The interpretations of these are highly context-dependent and it is not always clear whether they are taxonomically meaningful (e.g. subspecies or genus boundary), or reflect sampling biases. Further investigation and evidential support is necessary. + + +------------ +Scatterplots +------------ + +.. figure:: images/scatterplot.png + :alt: scatterplot of coverage/aligned fraction vs ANI + + Scatterplot of coverage/aligned fraction vs ANI + + Plotting coverage/aligned fraction (y-axis) against ANI (x-axis) can be informative. Here, as is often the case for larger comparisons, there is a clear piecewise linear appearance to the plot. There is a relatively shallow gradient for high (>50%) coverage comparisons, and a steep gradient for low (<50%) coverage comparisons. There is a discontinuity on the coverage axis between β‰ˆ40% and 60% coverage, corresponding to a shift between the two piecewise linear regimes. This is often interpretable as a genus boundary, but requires further evidence and support to be certain. + +Comparisons with high ANI but relatively low coverage for the dataset in question (these appear *below* the main population) may suggest the presence of a significant proportion of mobile elements in the sequenced genome. + +Often a vertical banding can be seen, due to discontinuities in the distribution of ANI values. Here, the discontinuity at around 95% ANI is consistent with a division between within-species (right of the gap) and between-species (left of the gap) comparisons. Vertical bands to the left of the 95% line may indicate comparisons between particular pairs of species that are more or less recently diverged, but likely fall within the same genus. 
+ +The bulk of comparisons at the lower left of the plot likely indicate comparisons between relatively unrelated genomes, possibly from different genera. + + +-------------- +Plot Asymmetry +-------------- + +.. note:: + + Each ANI method in `pyani` calculates results by a different method. The difference between methods is usually that alternative third-party alignment tools are used. However, there may also be differences between the ways those alignment outputs are used. Please see the relevant documentation for details of each method. + +**Average nucleotide identity** is a measure of similarity between two genomes. Depending on the ANI method used, this may be symmetrical: comparing genome A to genome B is the same as comparing genome B to genome A; or asymmetrical: the result of comparing genome A with genome B can be different from comparing genome B with genome A. + +Asymmetry can arise as a consequence of the way the sequence alignment algorithm used for calculating genome alignments works. For instance, the initial seed alignment for a pair of genomes may be very similar, but not identical, and this difference may propagate through an extension step into differences in the final alignment. Alternatively, an aspect of the ANI algorithm may introduce asymmetry. For instance, the genome fragmentation step in ANIb may break each participating genome in different ways. + +`pyani` provides both symmetrical and asymmetrical ANI methods: + + - ANIm — symmetrical + - FastANI — asymmetrical (only available in version 0.3.0-alpha) + - ANIb — asymmetrical + - ANIblastall — asymmetrical + - TETRA — symmetrical (though please note that this is not strictly an ANI method) + +**Alignment coverage** is the proportion of the query genome that aligns against the reference genome.
This can be asymmetrical even when the alignment itself is symmetrical, as the genomes participating in a pairwise alignment may have differing amounts of genomic sequence that do not contribute to the alignment. In general, comparing genome A to genome B will give different coverage values for A and B. + + - in ANIm this is ``alignment_length / genome_length`` (asymmetrical) + - in fastANI this is ``matched_fragments / all_fragments`` (asymmetrical) + - in ANIb this is ``alignment_length / query_genome_length`` (asymmetrical) + - in ANIblastall this is ``alignment_length / query_genome_length`` (asymmetrical) + +**Alignment length** is the count of bases contributed by each genome to the pairwise alignment between those genomes. + + - in ANIm this is calculated as ``reference_positions_in_alignment + insertions - deletions`` + - in fastANI this is ``matched_fragments * fragment_length`` + - in ANIb this is ``alignment_length - gaps`` + - in ANIblastall this is ``alignment_length - gaps`` + +The **similarity errors** graph shows a measure of the number of bases/positions that do not match exactly. + + - in ANIm this is ``non-identities + insertions + deletions`` + - in fastANI this is ``all_fragments - matched_fragments`` + - in ANIb this is ``gaps + mismatches`` + - in ANIblastall this is ``gaps + mismatches`` + +The **Hadamard** output is the elementwise product of identity and coverage (identity x coverage), as described at `Hadamard product`_. It's meant to provide a measure that allows you to interpret identity and coverage simultaneously. + + - this is always ``ANI * coverage``, but as the plot is not symmetric, coverage may differ for query and reference genomes + + + +``pyani plot`` also outputs a scatterplot of **Average nucleotide identity** versus **Alignment coverage** (calculated as described above). + +..
_Hadamard product: https://en.wikipedia.org/wiki/Hadamard_product_(matrices) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 489d5c7d..b369f7da 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -98,7 +98,7 @@ This creates a new directory (``C_blochmannia``) with the following contents: C_blochmannia β”œβ”€β”€ GCF_000011745.1_ASM1174v1_genomic.fna β”œβ”€β”€ GCF_000011745.1_ASM1174v1_genomic.fna.gz - β”œβ”€β”€ GCF_000011745.1_ASM1174v1_genomic.md5 + β”œβ”€β”€ GCF_000011745.1_ASM1174v1_genomic.fna.md5 [...] β”œβ”€β”€ GCF_000973545.1_ASM97354v1_hashes.txt β”œβ”€β”€ classes.txt diff --git a/docs/run_anib.rst b/docs/run_anib.rst index bba5aac1..5d04c2c7 100644 --- a/docs/run_anib.rst +++ b/docs/run_anib.rst @@ -40,48 +40,47 @@ The basic form of the command is: .. code-block:: bash - pyani anib + pyani anib -i -o -This instructs ``pyani`` to perform ANIb on the genome FASTA files in ````, and write any output files to ````. For example, the following command performs ANIb on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIb``: +This instructs ``pyani`` to perform ANIb on the genome FASTA files in ````, which is passed to the ``-i`` argument, and write any output files to ````, which is passed to the ``-o`` argument. For example, the following command performs ANIb on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIb``: .. code-block:: bash - pyani anib genomes genomes_ANIb + pyani anib -i genomes -o genomes_ANIb .. NOTE:: - While running, ``pyani anim`` will show progress bars unless these are disabled with the option ``--disable_tqdm`` + While running, ``pyani anib`` will show progress bars unless these are disabled with the option ``--disable_tqdm`` -This command will write the intermediate ``nucmer``/``MUMmer`` output to the directory ``genomes_ANIm``, in a subdirectory called ``nucmer_output``, where the results can be inspected if required. .. 
code-block:: bash - $ ls genomes_ANIm/ - nucmer_output + $ ls genomes_ANIb/ + blastn_output .. ATTENTION:: - To view the output ANIm results, you will need to use the ``pyani report`` or ``pyani plot`` subcommands. Please see :ref:`pyani-subcmd-report` and :ref:`pyani-subcmd-plot` for more details. + To view the output ANIb results, you will need to use the ``pyani report`` or ``pyani plot`` subcommands. Please see :ref:`pyani-subcmd-report` and :ref:`pyani-subcmd-plot` for more details. ---------------------------------------------- -Perform ANIm analysis with Open Grid Scheduler +Perform ANIb analysis with Open Grid Scheduler ---------------------------------------------- -The ``MUMmer`` comparison step of ANIm is embarrassingly parallel, and ``nucmer`` jobs can be distributed across cores in a cluster using the `Open Grid Scheduler`. To enable this during the analysis, use the ``--scheduler SGE`` option: +The ``blastn`` comparison step of ANIb is embarrassingly parallel, and ``blastn`` jobs can be distributed across cores in a cluster using the `Open Grid Scheduler`. To enable this during the analysis, use the ``--scheduler SGE`` option: .. code-block:: bash - pyani anim --scheduler SGE genomes genomes_ANIm + pyani anib --scheduler SGE -i genomes -o genomes_ANIb .. NOTE:: Jobs are submitted as *array jobs* to keep the scheduler queue short. .. NOTE:: - If ``--scheduler SGE`` is not specified, all ``MUMmer`` jobs are run locally with ``Python``'s ``multiprocessing`` module. + If ``--scheduler SGE`` is not specified, all ``blastn`` jobs are run locally with ``Python``'s ``multiprocessing`` module. 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Controlling parameters of Open Grid Scheduler ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It is possible to control the following features of `Open Grid Scheduler`_ `via` the ``pyani anim`` subcommand: +It is possible to control the following features of `Open Grid Scheduler`_ `via` the ``pyani anib`` subcommand: - The array job size (by default, comparison jobs are batched in arrays of 10,000) - The prefix string for the job, as reported in the scheduler queue @@ -91,19 +90,19 @@ These allow for useful control of job execution. For example, the command: .. code-block:: bash - pyani anim --scheduler SGE --SGEgroupsize 5000 genomes genomes_ANIm + pyani anib --scheduler SGE --SGEgroupsize 5000 -i genomes -o genomes_ANIb -will batch ``MUMmer`` jobs in groups of 500 for the scheduler. The command: +will batch ``blastn`` jobs in groups of 5000 for the scheduler. The command: .. code-block:: bash - pyani anim --scheduler SGE --jobprefix My_Ace_Job genomes genomes_ANIm + pyani anib --scheduler SGE --jobprefix My_Ace_Job -i genomes -o genomes_ANIb will prepend the string ``My_Ace_Job`` to your job in the scheduler queue. And the command: .. code-block:: bash - pyani anim --scheduler SGE --SGEargs "-m e -M my.name@my.domain" 5000 genomes genomes_ANIm + pyani anib --scheduler SGE --SGEargs "-m e -M my.name@my.domain" -i genomes -o genomes_ANIb will email ``my.name@my.domain`` when the jobs finish. diff --git a/docs/run_anim.rst b/docs/run_anim.rst index b8a31540..379b5585 100644 --- a/docs/run_anim.rst +++ b/docs/run_anim.rst @@ -38,13 +38,13 @@ The basic form of the command is: .. code-block:: bash - pyani anim <INPUT_DIRECTORY> <OUTPUT_DIRECTORY> + pyani anim -i <INPUT_DIRECTORY> -o <OUTPUT_DIRECTORY> -This instructs ``pyani`` to perform ANIm on the genome FASTA files in ``<INPUT_DIRECTORY>``, and write any output files to ``<OUTPUT_DIRECTORY>``. 
For example, the following command performs ANIm on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIm``: +This instructs ``pyani`` to perform ANIm on the genome FASTA files in ``<INPUT_DIRECTORY>``, which is passed to the ``-i`` argument, and write any output files to ``<OUTPUT_DIRECTORY>``, which is passed to the ``-o`` argument. For example, the following command performs ANIm on genomes in the directory ``genomes`` and writes output to a new directory ``genomes_ANIm``: .. code-block:: bash - pyani anim genomes genomes_ANIm + pyani anim -i genomes -o genomes_ANIm .. NOTE:: While running, ``pyani anim`` will show progress bars unless these are disabled with the option ``--disable_tqdm`` @@ -67,7 +67,7 @@ The ``MUMmer`` comparison step of ANIm is embarrassingly parallel, and ``nucmer` .. code-block:: bash - pyani anim --scheduler SGE genomes genomes_ANIm + pyani anim --scheduler SGE -i genomes -o genomes_ANIm .. NOTE:: Jobs are submitted as *array jobs* to keep the scheduler queue short. @@ -89,19 +89,19 @@ These allow for useful control of job execution. For example, the command: .. code-block:: bash - pyani anim --scheduler SGE --SGEgroupsize 5000 genomes genomes_ANIm + pyani anim --scheduler SGE --SGEgroupsize 5000 -i genomes -o genomes_ANIm will batch ``MUMmer`` jobs in groups of 500 for the scheduler. The command: .. code-block:: bash - pyani anim --scheduler SGE --jobprefix My_Ace_Job genomes genomes_ANIm + pyani anim --scheduler SGE --jobprefix My_Ace_Job -i genomes -o genomes_ANIm will prepend the string ``My_Ace_Job`` to your job in the scheduler queue. And the command: .. code-block:: bash - pyani anim --scheduler SGE --SGEargs "-m e -M my.name@my.domain" 5000 genomes genomes_ANIm + pyani anim --scheduler SGE --SGEargs "-m e -M my.name@my.domain" -i genomes -o genomes_ANIm will email ``my.name@my.domain`` when the jobs finish. 
diff --git a/docs/subcmd_anim.rst b/docs/subcmd_anim.rst index 66713b98..86a64f19 100644 --- a/docs/subcmd_anim.rst +++ b/docs/subcmd_anim.rst @@ -4,39 +4,30 @@ ``pyani anim`` ============== -The ``anim`` subcommand will carry out ANIm analysis using genome files contained in the ``indir`` directory, writing result files to the ``outdir`` directory, and recording data about each comparison and run in a local `SQLite3`_ database. +The ``anim`` subcommand will carry out ANIm analysis using genome files contained in the ``INDIR`` directory, writing result files to the ``OUTDIR`` directory, and recording data about each comparison and run in a local `SQLite3`_ database. .. code-block:: text - usage: pyani.py anim [-h] [-l LOGFILE] [-v] [--disable_tqdm] - [--scheduler {multiprocessing,SGE}] [--workers WORKERS] - [--SGEgroupsize SGEGROUPSIZE] [--SGEargs SGEARGS] - [--jobprefix JOBPREFIX] [--name NAME] [--classes CLASSES] - [--labels LABELS] [--recovery] [--dbpath DBPATH] - [--nucmer_exe NUCMER_EXE] [--filter_exe FILTER_EXE] - [--maxmatch] [--nofilter] - indir outdir + usage: pyani anim [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version] + [--citation] [--scheduler {multiprocessing,SGE}] + [--workers WORKERS] [--SGEgroupsize SGEGROUPSIZE] + [--SGEargs SGEARGS] [--jobprefix JOBPREFIX] [--name NAME] + [--classes CLASSES] [--labels LABELS] [--recovery] -i INDIR + -o OUTDIR [--dbpath DBPATH] [--nucmer_exe NUCMER_EXE] + [--filter_exe FILTER_EXE] [--maxmatch] [--nofilter] -.. _SQLite3: https://www.sqlite.org/index.html - --------------------- -Positional arguments --------------------- -``indir`` - Path to the directory containing indexed genome files to be used for the analysis. -``outdir`` - Path to a directory where comparison output files will be written. +.. 
_SQLite3: https://www.sqlite.org/index.html ----------------- Flagged arguments ----------------- ``--classes CLASSFNAME`` - Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``indir``. Default: ``classes.txt`` + Use the set of classes (one per genome sequence file) found in the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt`` ``--dbpath DBPATH`` Path to the location of the local ``pyani`` database to be used. Default: ``.pyani/pyanidb`` @@ -47,6 +38,9 @@ Flagged arguments ``--filter_exe FILTER_EXE`` Path to the ``MUMmer`` ``delta-filter`` executable. Default: ``delta-filter`` +``-i INDIR, --indir INDIR`` + Path to the directory containing indexed genome files to be used for the analysis. + ``-h, --help`` Display usage information for ``pyani anim``. @@ -54,13 +48,7 @@ Flagged arguments Use the string ``JOBPREFIX`` as a prefix for SGE job submission names. Default: ``PYANI`` ``--labels LABELFNAME`` - Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``indir``. Default: ``labels.txt`` - -``--name NAME`` - Use the string ``NAME`` to identify this ANIm run in the ``pyani`` database. - -``--nucmer_exe NUCMER_EXE`` - Path to the ``MUMmer`` ``nucmer`` executable. Default: ``nucmer`` + Use the set of labels (one per genome sequence file) found in the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt`` ``-l LOGFILE, --logfile LOGFILE`` Provide the location ``LOGFILE`` to which a logfile of the download process will be written. @@ -68,9 +56,18 @@ Flagged arguments ``--maxmatch`` Use the ``MUMmer`` ``--maxmatch`` option to include all ``nucmer`` matches. +``--name NAME`` + Use the string ``NAME`` to identify this ANIm run in the ``pyani`` database. + ``--nofilter`` Do not use ``delta-filter`` to restrict ``nucmer`` output to 1:1 matches. +``--nucmer_exe NUCMER_EXE`` + Path to the ``MUMmer`` ``nucmer`` executable. 
Default: ``nucmer`` + +``-o OUTDIR, --outdir OUTDIR`` + Path to a directory where comparison output files will be written. + ``--recovery`` Use existing ``NUCmer`` comparison output if available, e.g. if recovering from a failed job submission. Using this option will not generate a new comparison if the old output files exist. diff --git a/docs/subcmd_createdb.rst b/docs/subcmd_createdb.rst index f08a42c6..b6d986fa 100644 --- a/docs/subcmd_createdb.rst +++ b/docs/subcmd_createdb.rst @@ -8,7 +8,7 @@ The ``createdb`` subcommand creates a new, empty database for ``pyani`` to use i .. code-block:: text - usage: pyani.py createdb [-h] [-l LOGFILE] [-v] [--disable_tqdm] + usage: pyani createdb [-h] [-l LOGFILE] [-v] [--disable_tqdm] [--dbpath DBPATH] [-f] diff --git a/docs/subcmd_download.rst b/docs/subcmd_download.rst index 61c67668..f9baf545 100644 --- a/docs/subcmd_download.rst +++ b/docs/subcmd_download.rst @@ -8,13 +8,14 @@ The ``download`` subcommand controls download of genome files from the `NCBI Ass .. code-block:: text - usage: pyani.py download [-h] [-l LOGFILE] [-v] [--disable_tqdm] -t TAXON - --email EMAIL [--api_key API_KEYPATH] - [--retries RETRIES] [--batchsize BATCHSIZE] - [--timeout TIMEOUT] [-f] [--noclobber] - [--labels LABELFNAME] [--classes CLASSFNAME] + + usage: pyani download [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version] + [--citation] -o OUTDIR -t TAXON --email EMAIL + [--api_key API_KEYPATH] [--retries RETRIES] + [--batchsize BATCHSIZE] [--timeout TIMEOUT] [-f] + [--noclobber] [--labels LABELFNAME] [--classes CLASSFNAME] + [--kraken] [--dry-run] - outdir -------------------- Positional arguments @@ -46,7 +47,7 @@ Flagged arguments **COMPULSORY**. Provide the email address ``EMAIL`` to NCBI so that they can track problems. ``-f, --force`` - Force use of the ``outdir`` directory when downloaded genomes, even if it already exists. + Force use of the ``OUTDIR`` directory when downloaded genomes, even if it already exists. 
``-h, --help`` Display usage information for ``pyani download``. @@ -63,6 +64,9 @@ Flagged arguments ``--noclobber`` Do not overwrite individual files in the ``outdir`` directory, when used with ``-f``. +``-o OUTDIR, --outdir OUTDIR`` + The ``OUTDIR`` argument should be the path to a directory into which genome files will be downloaded. If the directory exists, a warning will be given and the download will not proceed, to avoid overwriting existing data. To force writing into an existing directory, use the ``-f`` option. + ``--retries RETRIES`` The download process will attempt to download each batch of assemblies a maximum of ``RETRIES`` times. Default: 20 diff --git a/docs/subcmd_index.rst b/docs/subcmd_index.rst index 399b6dd0..ea748e70 100644 --- a/docs/subcmd_index.rst +++ b/docs/subcmd_index.rst @@ -4,27 +4,20 @@ ``pyani index`` =============== -The ``index`` subcommand will index the genome files it finds the passed directory ``indir``, generating label and class files, and files that contain an MD5 hash of the nucleotide sequence of each genome. +The ``index`` subcommand will index the genome files it finds the passed directory ``INDIR``, generating label and class files, and files that contain an MD5 hash of the nucleotide sequence of each genome. .. code-block:: text - usage: pyani index [-h] [-l LOGFILE] [-v] [--disable_tqdm] - [--labels LABELFNAME] [--classes CLASSFNAME] - indir - --------------------- -Positional arguments --------------------- - -``indir`` - The ``indir`` argument should be the path to a directory containing genome sequence data as FASTA files (one per genome assembly). + usage: pyani index [-h] [-l LOGFILE] [-v] [--debug] [--disable_tqdm] [--version] + [--citation] -i INDIR [--labels LABELFNAME] + [--classes CLASSFNAME] ----------------- Flagged arguments ----------------- ``--classes CLASSFNAME`` - Write a set of labels (one per genome sequence file) to the file ``CLASSFNAME`` in ``indir``. 
Default: ``classes.txt`` + Write a set of labels (one per genome sequence file) to the file ``CLASSFNAME`` in ``INDIR``. Default: ``classes.txt`` ``--disable_tqdm`` Disable the ``tqdm`` progress bar while the download process runs. This is useful when testing to avoid aesthetic problems with test output. @@ -32,11 +25,14 @@ Flagged arguments ``-h, --help`` Display usage information for ``pyani index``. +``-i INDIR, --indir INDIR`` + The ``INDIR`` argument should be the path to a directory containing genome sequence data as FASTA files (one per genome assembly). + ``-l LOGFILE, --logfile LOGFILE`` Provide the location ``LOGFILE`` to which a logfile of the download process will be written. ``--labels LABELFNAME`` - Write a set of labels (one per genome sequence file) to the file ``LABELFNAME`` in ``indir``. Default: ``labels.txt`` + Write a set of labels (one per genome sequence file) to the file ``LABELFNAME`` in ``INDIR``. Default: ``labels.txt`` ``-v, --verbose`` Provide verbose output to ``STDOUT`` diff --git a/docs/subcmd_listdeps.rst b/docs/subcmd_listdeps.rst index d881043f..c94a16cf 100644 --- a/docs/subcmd_listdeps.rst +++ b/docs/subcmd_listdeps.rst @@ -8,7 +8,7 @@ The ``listdeps`` subcommand writes an account of the local platform, the install .. code-block:: text - usage: pyani.py listdeps [-h] [-l LOGFILE] [-v] + usage: pyani listdeps [-h] [-l LOGFILE] [-v] ----------------- Flagged arguments diff --git a/pyani/anim.py b/pyani/anim.py index 23207aeb..a638a04d 100644 --- a/pyani/anim.py +++ b/pyani/anim.py @@ -248,11 +248,15 @@ def construct_nucmer_cmdline( outdir, called "nucmer_output". 
""" # Cast path strings to pathlib.Path for safety - fname1, fname2 = Path(fname1), Path(fname2) + fname1, fname2 = sorted([Path(fname1), Path(fname2)]) # Compile commands + # Nested output folders to avoid N^2 scaling in files-per-folder + # Create folders incrementally (want an error if outdir does not exist) outsubdir = outdir / pyani_config.ALIGNDIR["ANIm"] outsubdir.mkdir(exist_ok=True) + outsubdir = outdir / pyani_config.ALIGNDIR["ANIm"] / fname1.stem + outsubdir.mkdir(exist_ok=True) outprefix = outsubdir / f"{fname1.stem}_vs_{fname2.stem}" if maxmatch: mode = "--maxmatch" @@ -367,7 +371,7 @@ def process_deltadir( # Process directory to identify input files - as of v0.2.4 we use the # .filter files that result from delta-filter (1:1 alignments) - deltafiles = sorted(delta_dir.glob("*.filter")) + deltafiles = sorted(delta_dir.glob("*/*.filter")) logger.info("%s has %d files to load", delta_dir, len(deltafiles)) if not deltafiles: diff --git a/pyani/pyani_files.py b/pyani/pyani_files.py index 022d9899..3f10d35d 100644 --- a/pyani/pyani_files.py +++ b/pyani/pyani_files.py @@ -74,15 +74,15 @@ def get_fasta_paths( :param dirname: Path, path to directory containing input FASTA files :param extlist: List, file suffixes for FASTA files - Returns the full path to each file. + Returns sorted list of the full path to each file. """ # Lists are dangerous to have as default function arguments extlist = extlist or [".fna", ".fa", ".fasta", ".fas"] - return [ + return sorted( fname for fname in dirname.iterdir() if fname.is_file() and fname.suffix in extlist - ] + ) # Get a list of FASTA files and corresponding hashes from the input directory @@ -116,12 +116,12 @@ def get_fasta_and_hash_paths(dirname: Path = Path(".")) -> List[Tuple[Path, Path # Get list of FASTA files in a directory def get_input_files(dirname: Path, *ext) -> List[Path]: - """Return files in passed directory, filtered by extension. + """Return sorted files in passed directory, filtered by extension. 
:param dirname: Path, path to input directory :param *ext: optional iterable of arguments describing permitted file extensions """ - return [fname for fname in dirname.iterdir() if fname.suffix in ext] + return sorted(fname for fname in dirname.iterdir() if fname.suffix in ext) # Get lengths of input sequences diff --git a/pyani/pyani_graphics/mpl/__init__.py b/pyani/pyani_graphics/mpl/__init__.py index 7b325b21..a2ae3137 100644 --- a/pyani/pyani_graphics/mpl/__init__.py +++ b/pyani/pyani_graphics/mpl/__init__.py @@ -358,3 +358,51 @@ def heatmap(dfr, outfilename=None, title=None, params=None): if outfilename: fig.savefig(outfilename) return fig + + +def scatter( + dfr1, + dfr2, + outfilename=None, + matname1="identity", + matname2="coverage", + title=None, + params=None, +): + """Return matplotlib scatterplot. + + :param dfr1: pandas DataFrame with x-axis data + :param dfr2: pandas DataFrame with y-axis data + :param outfilename: path to output file (indicates output format) + :param matname1: name of x-axis data + :param matname2: name of y-axis data + :param title: title for the plot + :param params: a list of parameters for plotting: [colormap, vmin, vmax] + """ + # Make an empty dataframe to collect the input data in + combined = pd.DataFrame() + + # Add data + combined[matname1] = dfr1.values.flatten() + combined[matname2] = dfr2.values.flatten() + + # Add lable information, if available + # if params.labels: + # hue = "labels" + # combined['labels'] = # add labels to dataframe; unsure of their configuration at this point + # else: + hue = None + + fig, ax = plt.subplots(figsize=(8, 8)) + fig.suptitle(title) + ax.set_xlabel(f"{matname1.title()}") + ax.set_ylabel(f"{matname2.title()}") + + plt.scatter(matname1, matname2, data=combined, c=hue, s=2) + + # Return figure output, and write, if required + plt.subplots_adjust(top=0.85) # Leave room for title + fig.set_tight_layout(True) + if outfilename: + fig.savefig(outfilename) + return fig diff --git 
a/pyani/pyani_graphics/sns/__init__.py b/pyani/pyani_graphics/sns/__init__.py index 85451d1a..c8bcb284 100644 --- a/pyani/pyani_graphics/sns/__init__.py +++ b/pyani/pyani_graphics/sns/__init__.py @@ -176,14 +176,21 @@ def distribution(dfr, outfilename, matname, title=None): :param matname: str, type of matrix being plotted :param title: str, optional title """ + fill = "#A6C8E0" + rug = "#2678B2" fig, axes = plt.subplots(1, 2, figsize=(15, 5)) fig.suptitle(title) - sns.distplot( - dfr.values.flatten(), kde=False, rug=False, ax=axes[0], norm_hist=False - ) - sns.distplot( - dfr.values.flatten(), hist=False, rug=True, ax=axes[1], norm_hist=False + sns.histplot( + dfr.values.flatten(), + ax=axes[0], + stat="count", + element="step", + color=fill, + edgecolor=fill, ) + axes[0].set_ylim(ymin=0) + sns.kdeplot(dfr.values.flatten(), ax=axes[1]) + sns.rugplot(dfr.values.flatten(), ax=axes[1], color=rug) # Modify axes after data is plotted for _ in axes: @@ -203,3 +210,56 @@ def distribution(dfr, outfilename, matname, title=None): fig.savefig(outfilename) return fig + + +def scatter( + dfr1, + dfr2, + outfilename=None, + matname1="identity", + matname2="coverage", + title=None, + params=None, +): + """Return seaborn scatterplot. 
+ + :param dfr1: pandas DataFrame with x-axis data + :param dfr2: pandas DataFrame with y-axis data + :param outfilename: path to output file (indicates output format) + :param matname1: name of x-axis data + :param matname2: name of y-axis data + :param title: title for the plot + :param params: a list of parameters for plotting: [colormap, vmin, vmax] + """ + # Make an empty dataframe to collect the input data in + combined = pd.DataFrame() + + # Add data + combined[matname1] = dfr1.values.flatten() + combined[matname2] = dfr2.values.flatten() + + # Add lable information, if available + # if params.labels: + # hue = "labels" + # combined['labels'] = # add labels to dataframe; unsure of their configuration at this point + # else: + hue = None + + # Create the plot + fig = sns.lmplot( + x=matname1, + y=matname2, + data=combined, + hue=hue, + fit_reg=False, + scatter_kws={"s": 2}, + ) + fig.set(xlabel=matname1.title(), ylabel=matname2.title()) + plt.title(title) + + # Save to file + if outfilename: + fig.savefig(outfilename) + + # Return clustermap + return fig diff --git a/pyani/pyani_report.py b/pyani/pyani_report.py index b02aa753..6de35595 100644 --- a/pyani/pyani_report.py +++ b/pyani/pyani_report.py @@ -189,8 +189,7 @@ def write_dbtable( dfm: pd.DataFrame, path: Path, formats: Sequence[str] = ("tab",), - index: bool = False, - show_index: bool = False, + show_index: bool = True, colour_num: bool = False, ) -> None: """Write database result table to output file in named format. 
@@ -198,18 +197,17 @@ def write_dbtable( :param dfm: pd.Dataframe :param path: Path to output file :param formats: tuple of str, output file formats - :param index: Boolean - :param show_index: Boolean + :param show_index: output row and column labels :param colour_num: use colours for values in HTML output colours are used for identity/coverage tables """ formatdict = { - "tab": (dfm.to_csv, {"sep": "\t", "index": False}, ".tab"), + "tab": (dfm.to_csv, {"sep": "\t", "index": show_index}, ".tab"), "excel": (dfm.to_excel, {"index": show_index}, ".xlsx"), "html": ( write_styled_html, - {"dfm": dfm, "index": index, "colour_num": colour_num}, + {"dfm": dfm, "index": show_index, "colour_num": colour_num}, ".html", ), "stdout": (write_to_stdout, {"dfm": dfm, "show_index": show_index}, ""), diff --git a/pyani/pyani_tools.py b/pyani/pyani_tools.py index b39cdac0..a9b93ae8 100644 --- a/pyani/pyani_tools.py +++ b/pyani/pyani_tools.py @@ -303,8 +303,13 @@ def label_results_matrix(matrix: pd.DataFrame, labels: Dict) -> pd.DataFrame: Applies the labels from the dictionary to the dataframe in matrix, and returns the result. """ - matrix.columns = [f"{labels.get(_, _)}:{_}" for _ in matrix.columns] - matrix.index = [f"{labels.get(_, _)}:{_}" for _ in matrix.index] + # The dictionary uses string keys! + # Create a label function that produces