From b96679e50ce33a4d6b07860d76f18c476c61a046 Mon Sep 17 00:00:00 2001
From: Alejandra Escobar
Date: Fri, 13 Sep 2024 11:05:49 +0100
Subject: [PATCH 01/16] Update databases_setup.sh
limiting biome options to a list
---
bin/databases_setup.sh | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/bin/databases_setup.sh b/bin/databases_setup.sh
index 17702f0..fc0b455 100755
--- a/bin/databases_setup.sh
+++ b/bin/databases_setup.sh
@@ -1,14 +1,24 @@
#!/bin/bash
+# Define the list of valid biomes
+valid_biomes=('chicken-gut-v1-0-1' 'mouse-gut-v1-0' 'non-model-fish-gut-v2-0' 'human-vaginal-v1-0' 'honeybee-gut-v1-0-1'
+ 'sheep-rumen-v1-0' 'marine-v2-0' 'zebrafish-fecal-v1-0' 'human-oral-v1-0-1' 'pig-gut-v1-0'
+ 'cow-rumen-v1-0-1' 'human-gut-v2-0-2')
+
# Parse command-line arguments
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--biome)
BIOME="$2"
+ # Check if the provided biome is in the valid biomes list
+ if [[ ! " ${valid_biomes[@]} " =~ " ${BIOME} " ]]; then
+ echo "The input $BIOME is not a valid biome, please use one of the following: ${valid_biomes[*]}"
+ exit 1
+ fi
shift
shift
- ;;
+ ;;
--catalogue_dbs_path)
CATALOGUE_DBS_PATH="$2"
shift
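For reference, the membership check introduced by this patch can be exercised on its own; the sketch below mirrors the patch's logic, with a shortened biome list and a hard-coded example input standing in for the `--biome` argument.

```bash
#!/bin/bash
# Minimal sketch of the list-based biome validation added in this patch (illustrative only).
valid_biomes=('chicken-gut-v1-0-1' 'mouse-gut-v1-0' 'human-gut-v2-0-2') # shortened list

BIOME="human-gut-v2-0-2" # example value that would normally come from --biome

# Surrounding spaces keep partial names (e.g. "gut-v1-0") from matching a longer entry.
if [[ ! " ${valid_biomes[*]} " =~ " ${BIOME} " ]]; then
    echo "The input $BIOME is not a valid biome, please use one of the following: ${valid_biomes[*]}"
    exit 1
fi
echo "Biome $BIOME accepted"
```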
From 7eadb9ced47f78c2363f06c836129d3a5be9757e Mon Sep 17 00:00:00 2001
From: Alejandra Escobar
Date: Mon, 21 Oct 2024 13:24:11 +0100
Subject: [PATCH 02/16] Update databases_setup.sh
---
bin/databases_setup.sh | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/bin/databases_setup.sh b/bin/databases_setup.sh
index fc0b455..d3229ba 100755
--- a/bin/databases_setup.sh
+++ b/bin/databases_setup.sh
@@ -96,34 +96,38 @@ fi
NEW_BIOME=$(echo $BIOME | sed 's/-vaginal-/-tmp-/;s/-v/|/;s/-tmp-/-vaginal-/' )
PREFIX_BIOME=$(echo "$NEW_BIOME" | cut -d '|' -f1)
VERSION=$(echo "$NEW_BIOME" | cut -d '|' -f2)
-VERSION=$(echo "v$VERSION" | sed 's/-/./g' )
+CAT_VERSION=$(echo "v$VERSION" | sed 's/-/./g' )
echo " *** Downloading catalogue related databases to ${CATALOGUE_DBS_PATH}/${BIOME}"
# Downloading the catalogue metadata file
-wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$VERSION/genomes-all_metadata.tsv"
+wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$CAT_VERSION/genomes-all_metadata.tsv"
+# Setting up the file locations on the FTP server
+TABLES_DIR="https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/mgnify_genomes/${PREFIX_BIOME}_reps"
+FUNCTIONS_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_functions"
+SOURMASH_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_sourmash"
+BWAMEM_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_bwamem2.tar.gz"
# Downloading the pangenome function tables
-wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$VERSION/pangenome_functions/functional_profiles.tar.gz"
+wget --continue "$FUNCTIONS_DIR/functional_profiles.tar.gz"
tar -xvf functional_profiles.tar.gz
rm functional_profiles.tar.gz
-wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$VERSION/pangenome_functions/kegg_completeness.tar.gz"
+wget --continue "$FUNCTIONS_DIR/kegg_completeness.tar.gz"
tar -xvf kegg_completeness.tar.gz
rm kegg_completeness.tar.gz
# Downloading the representative genomes indexed for sourmash
-wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$VERSION/sourmash_db_${HOST}_${VERSION}/sourmash_species_representatives_k21.sbt.zip"
+wget --continue "$SOURMASH_DIR/sourmash_species_representatives_k21.sbt.zip"
# Downloading bwamem2 db index if the option is set
if [ "$DOWNLOAD_BWA" = "true" ]; then
echo " *** Downloading bwamem2 indexed database for $BIOME to ${CATALOGUE_DBS_PATH}/${BIOME}"
- NEW_PREFIX=$(echo "$PREFIX_BIOME" | sed 's/-/_/')
- wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/${NEW_PREFIX}_reps/${NEW_PREFIX}-${VERSION}_bwamem2.tar.gz"
- tar -xvf "${NEW_PREFIX}-${VERSION}_bwamem2.tar.gz"
- mv "${NEW_PREFIX}-${VERSION}_bwamem2"/* .
- rm -r "${BIOME}-${VERSION}_bwamem2" "${NEW_PREFIX}-${VERSION}_bwamem2.tar.gz"
+ wget --continue "$BWAMEM_DIR"
+ tar -xvf "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
+ mv "${PREFIX_BIOME}_${VERSION}_bwamem2"/* .
+ rm -r "${PREFIX_BIOME}_${VERSION}_bwamem2" "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
else
echo " *** Skipping download of bwamem2 indexed database for $BIOME"
echo " Note you will not be able to use --run_bwa true option on shallow-mapping pipeline for this biome"
From be966ebe28748d70ee70eb90f61a25b6f9073fef Mon Sep 17 00:00:00 2001
From: Alejandra Escobar
Date: Mon, 21 Oct 2024 13:50:12 +0100
Subject: [PATCH 03/16] Update README.md
---
README.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index e62aaad..c3e3c28 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
**ebi-metagenomics/shallowmapping** is a bioinformatics pipeline that generates taxonomic and functional profiles for low-yield (shallow shotgun: < 10 M reads) short raw-reads using [`MGnify biome-specific genome catalogues`](https://www.ebi.ac.uk/metagenomics/browse/genomes) as a reference.
-At the moment, the biome selection is limited to the precomputed databases for [chicken-gut-v1-0-1](https://www.ebi.ac.uk/metagenomics/genome-catalogues/chicken-gut-v1-0-1), [mouse-gut-v1-0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/mouse-gut-v1-0), and [human-gut-v2-0-2](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-gut-v2-0-2). Other databases can be build for any of the [`MGnify genome catalogues`](https://www.ebi.ac.uk/metagenomics/browse/genomes) upon request by opening an issue in this repository (they will be built on a best-effort basis).
+The biome selection includes all the biomes available in the [`MGnify genome catalogues`](https://www.ebi.ac.uk/metagenomics/browse/genomes).
The main sections of the pipeline include the following steps:
@@ -40,19 +40,19 @@ git clone https://github.com/EBI-Metagenomics/shallowmapping.git
The first time you run the pipeline you must put available indexed databases for the decontamination step, MGnify genomes catalogue tables, and some external tables for DRAM visuals generation. MGnify hosts most of the databases and setting up can be done in a single step by providing the location for decontamination and MGnify databases where the new files will be added. The directories have to exist already. Please provide full paths.
-Consider that decontamination reference genomes require ~15-20G of storage.
-MGnify catalogue genomes db occupy ~1G.
+Consider that human-phiX decontamination reference genomes require ~15-20G of storage.
+Each MGnify catalogue genomes db occupies ~1G.
```bash
cd shallowmapping
bash bin/databases_setup.sh \
- --biome \ # Any of the MGnify catalogue ID for which databases are available
+ --biome \ # Any of the MGnify catalogue IDs
--catalogue_dbs_path \ # Central location of shallow-mapping dbs. A directory with the biome name will be created
--decont_refs_path \ # Central location of reference genomes for decontamination. Other bwamem2 databases can exist there
--download_bwa default = `false`
```
-Running the pipeline using bwamem2 is optional. If you want to run the pipeline with this option set the `--download_bwa true`. Consider that this database will occupy >15G of storage in your system.
+Running the pipeline using bwamem2 is optional. If you want to run the pipeline with this option, set `--download_bwa true`. This database will occupy considerable storage on your system, depending on the biome.
In addition, instructions to generate the databases from custom catalogues can be found in the [shallowmapping paper's repository](https://github.com/EBI-Metagenomics/shallow_shotgun_paper/tree/main?tab=readme-ov-file#31-processing-custom-genome-catalogues).
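As a concrete illustration of the setup command documented above, a hedged example invocation follows; the biome is one of the supported catalogue IDs, while both paths are placeholders for existing directories on your system.

```bash
cd shallowmapping
bash bin/databases_setup.sh \
    --biome chicken-gut-v1-0-1 \
    --catalogue_dbs_path /data/shallowmapping/catalogue_dbs \
    --decont_refs_path /data/shallowmapping/decontamination_refs \
    --download_bwa false
```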
From 6b6c0f0cfa8cd7a73191b755b1b8d92753c1f032 Mon Sep 17 00:00:00 2001
From: Alejandra Escobar
Date: Mon, 21 Oct 2024 14:35:11 +0100
Subject: [PATCH 04/16] Update nextflow_schema.json
---
nextflow_schema.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6c7dd93..7590c12 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -26,7 +26,7 @@
"biome": {
"type": "string",
"description": "This can be any of the MGnify catalogues for which shallow-mapping databases are currently available",
- "enum": ["chicken-gut-v1-0-1", "mouse-gut-v1-0", "human-gut-v2-0-2"]
+ "enum": ["chicken-gut-v1-0-1", "mouse-gut-v1-0", "non-model-fish-gut-v2-0", "human-vaginal-v1-0", "honeybee-gut-v1-0-1", "sheep-rumen-v1-0", "marine-v2-0", "zebrafish-fecal-v1-0", "human-oral-v1-0-1", "pig-gut-v1-0", "cow-rumen-v1-0-1", "human-gut-v2-0-2"]
},
"run_bwa": {
"type": "boolean",
From 67f806c0c1565c2686a02a35463f34db169d4071 Mon Sep 17 00:00:00 2001
From: Martin Beracochea
Date: Wed, 6 Nov 2024 17:29:23 +0000
Subject: [PATCH 05/16] Fix a few linting issues
---
nextflow.config | 2 +-
nextflow_schema.json | 17 +++++++++++++++--
2 files changed, 16 insertions(+), 3 deletions(-)
diff --git a/nextflow.config b/nextflow.config
index c2fc840..42b68eb 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -131,7 +131,7 @@ singularity.registry = 'quay.io'
// Nextflow plugins
plugins {
- id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet
+ id 'nf-validation@1.1.4'
}
// Export these variables to prevent local Python/R libraries from conflicting with those in the container
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7590c12..7c231c7 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -1,5 +1,5 @@
{
- "$schema": "http://json-schema.org/draft-07/schema",
+ "$schema": "https://json-schema.org/draft-07/schema",
"$id": "https://raw.githubusercontent.com/ebi-metagenomics/shallowmapping/master/nextflow_schema.json",
"title": "ebi-metagenomics/shallowmapping pipeline parameters",
"description": "Shallow-shotgun mapping pipeline",
@@ -26,7 +26,20 @@
"biome": {
"type": "string",
"description": "This can be any of the MGnify catalogues for which shallow-mapping databases are currently available",
- "enum": ["chicken-gut-v1-0-1", "mouse-gut-v1-0", "non-model-fish-gut-v2-0", "human-vaginal-v1-0", "honeybee-gut-v1-0-1", "sheep-rumen-v1-0", "marine-v2-0", "zebrafish-fecal-v1-0", "human-oral-v1-0-1", "pig-gut-v1-0", "cow-rumen-v1-0-1", "human-gut-v2-0-2"]
+ "enum": [
+ "chicken-gut-v1-0-1",
+ "mouse-gut-v1-0",
+ "non-model-fish-gut-v2-0",
+ "human-vaginal-v1-0",
+ "honeybee-gut-v1-0-1",
+ "sheep-rumen-v1-0",
+ "marine-v2-0",
+ "zebrafish-fecal-v1-0",
+ "human-oral-v1-0-1",
+ "pig-gut-v1-0",
+ "cow-rumen-v1-0-1",
+ "human-gut-v2-0-2"
+ ]
},
"run_bwa": {
"type": "boolean",
From 6e54bccb5fa632292914e811ef3b9e017fed26a1 Mon Sep 17 00:00:00 2001
From: Martin Beracochea
Date: Tue, 17 Dec 2024 16:28:04 +0000
Subject: [PATCH 06/16] Change LICENSE to Apache, lint script and adjust dbs.sh
script
---
.nf-core.yml | 3 +
LICENSE | 222 +++++++++++++++++++---
README.md | 23 ++-
bin/bam2cov.py | 5 +-
bin/bam2cov_filt.py | 16 +-
bin/bwa_genome2species.py | 13 +-
bin/check_samplesheet.py | 261 --------------------------
bin/databases_setup.sh | 113 +++++++----
bin/keggcomp_DB.py | 10 +-
bin/matrix_integrator.py | 9 +-
bin/panaroo_inputs_builder.py | 13 +-
bin/panaroo_inputs_builder_custom.py | 13 +-
bin/pangenomeDB_builder_codon.py | 25 +--
bin/pangenomeDB_builder_custom.py | 25 +--
bin/pangenomeDB_builder_mgnify_old.py | 25 +--
bin/sm_genome2species.py | 11 +-
bin/species2functions.py | 25 +--
bin/species2pathways.py | 10 +-
nextflow_schema.json | 14 +-
19 files changed, 362 insertions(+), 474 deletions(-)
delete mode 100755 bin/check_samplesheet.py
diff --git a/.nf-core.yml b/.nf-core.yml
index 6066fba..c94ce15 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -46,6 +46,9 @@ lint:
- process.memory
- process.time
- custom_config
+ - params.max_cpus
+ - params.max_time
+ - params.max_memory
repository_type: pipeline
template:
prefix: ebi-metagenomics
diff --git a/LICENSE b/LICENSE
index 33b0f32..bc757fa 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,21 +1,201 @@
-MIT License
-
-Copyright (c) Microbiome Informatics team
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2024 EMBL-EBI
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
index c3e3c28..bfff8bd 100644
--- a/README.md
+++ b/README.md
@@ -23,28 +23,33 @@ The final output includes a species relative abundance table, Pfam and KEGG Orth
-## Install and dependencies
+## Installation
-This workflow was built using [Nextflow](https://www.nextflow.io/) and follows the [nf-core guidelines](https://nf-co.re/docs/contributing/guidelines). It uses Singularity containers making installation trivial and results highly reproducible. To run the pipeline in your system you need:
+This workflow was built using [Nextflow](https://www.nextflow.io/) and follows [nf-core](https://nf-co.re/) good practices. It is containerized, so users can use either Docker or Apptainer/Singularity to run the pipeline. At the moment, it doesn't support Conda environments.
-- Install [Nextflow version >=21.10](https://www.nextflow.io/docs/latest/getstarted.html#installation)
- Install [Singularity](https://github.com/apptainer/singularity/blob/master/INSTALL.md)
+The pipeline requires [Nextflow version >=21.10](https://www.nextflow.io/docs/latest/getstarted.html#installation) and a container technology such as [Apptainer/Singularity](https://github.com/apptainer/singularity/blob/master/INSTALL.md) or [Docker](https://www.docker.com/).
-Clone the Shallow-mapping pipeline github repo:
+A Linux/macOS system with Bash and wget installed is required to download the reference databases. We will integrate this step into the pipeline itself in the near future.
+
+> **Note:**
+> The pipeline reference databases currently need to be downloaded manually by the user on a Linux/macOS system.
+
+### Required Reference Databases
+
+The first time you run the pipeline, you must provide the indexed databases for the decontamination step, the MGnify genomes catalogue tables, and some external tables for DRAM visuals generation. MGnify hosts most of the databases, and setup can be done in a single step by providing the locations for the decontamination and MGnify databases where the new files will be added. The directories must already exist. Please provide full paths.
+
+Get the Shallow-mapping pipeline GitHub repository:
```bash
git clone https://github.com/EBI-Metagenomics/shallowmapping.git
```
-### Required reference databases
-
-The first time you run the pipeline you must put available indexed databases for the decontamination step, MGnify genomes catalogue tables, and some external tables for DRAM visuals generation. MGnify hosts most of the databases and setting up can be done in a single step by providing the location for decontamination and MGnify databases where the new files will be added. The directories have to exist already. Please provide full paths.
-
Consider that human-phiX decontamination reference genomes require ~15-20G of storage.
Each MGnify catalogue genomes db occupies ~1G.
```bash
cd shallowmapping
+
bash bin/databases_setup.sh \
 --biome \ # Any of the MGnify catalogue IDs
--catalogue_dbs_path \ # Central location of shallow-mapping dbs. A directory with the biome name will be created
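A short, optional pre-flight check for the requirements listed in the updated README; this snippet is not part of the patch and assumes the tools are already on your PATH.

```bash
# Optional pre-flight check for the README requirements
nextflow -version                           # needs >= 21.10
singularity --version || docker --version   # one container engine is enough
wget --version | head -n 1                  # needed by bin/databases_setup.sh
bash --version | head -n 1
```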
diff --git a/bin/bam2cov.py b/bin/bam2cov.py
index e2a6bb2..ae7e7d4 100755
--- a/bin/bam2cov.py
+++ b/bin/bam2cov.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python
import argparse
-import pysam
-import sys
import re
+import pysam
+
##### This script process BWA results to compute per genome coverage
##### Alejandra Escobar, EMBL-EBI
##### v1.0 Nov 10, 2023
@@ -43,7 +43,6 @@ def bam_parser(bam_file):
with pysam.AlignmentFile(bam_file, "rb") as input_bam:
for read in input_bam:
- read_id = str(read.query_name)
ref_genome = str(read.reference_name).split("_")[0]
ani = (
(read.query_alignment_length - read.get_tag("NM"))
diff --git a/bin/bam2cov_filt.py b/bin/bam2cov_filt.py
index 676e0bd..1bc883c 100755
--- a/bin/bam2cov_filt.py
+++ b/bin/bam2cov_filt.py
@@ -1,13 +1,11 @@
#!/usr/bin/env python3
import argparse
+
import pysam
-import sys
-import re
-##### This script process BWA results to compute relative abundance of unique mapped reads
-##### Alejandra Escobar, EMBL-EBI
-##### Nov 10, 2023
+# Constants #
+COV_THRESHOLD = 0.01
def bam_header(bwa_bam):
@@ -37,7 +35,6 @@ def bam_parser(bam_file):
with pysam.AlignmentFile(bam_file, "rb") as input_bam:
for read in input_bam:
- read_id = str(read.query_name)
ref_genome = str(read.reference_name).split("_")[0]
ani = (
(read.query_alignment_length - read.get_tag("NM"))
@@ -52,7 +49,7 @@ def bam_parser(bam_file):
reads_len_sum += read.query_length
# Unique mapping reads don't have XA tag
- if not "XA:Z:" in read.tostring():
+ if "XA:Z:" not in read.tostring():
if ref_genome in unique_matches:
unique_matches[ref_genome] += 1
else:
@@ -70,9 +67,8 @@ def bam_parser(bam_file):
return (unique_matches, ave_read_len)
-def FP_control(out_root, genomes_len, unique_matches, ave_read_len):
+def fp_control(out_root, genomes_len, unique_matches, ave_read_len):
unique_thres01 = []
- COV_THRESHOLD = 0.01
total_unique = 0
for genome in unique_matches:
assembly_len = genomes_len[genome]
@@ -140,7 +136,7 @@ def main():
(unique_matches, ave_read_len) = bam_parser(args.bwa_bam)
- FP_control(out_root, genomes_len, unique_matches, ave_read_len)
+ fp_control(out_root, genomes_len, unique_matches, ave_read_len)
if __name__ == "__main__":
diff --git a/bin/bwa_genome2species.py b/bin/bwa_genome2species.py
index 6b58144..2c1a3d0 100755
--- a/bin/bwa_genome2species.py
+++ b/bin/bwa_genome2species.py
@@ -1,24 +1,17 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-from Bio import SeqIO
-
-##### This script transforms BWA genomes relative abundance into species relative abundance
-##### Alejandra Escobar, EMBL-EBI
-##### Dec 18, 2023
def metadata_parser(catalogue_metadata):
ref_spec_genome = {}
- with open(catalogue_metadata, "r") as input_file:
+ with open(catalogue_metadata) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
genome = l_line[13]
lineage = l_line[14] + ";" + genome
- if not genome in ref_spec_genome:
+ if genome not in ref_spec_genome:
ref_spec_genome[genome] = lineage
return ref_spec_genome
@@ -27,7 +20,7 @@ def metadata_parser(catalogue_metadata):
def aggregate_species(genomes_relab, ref_spec_genome, out_name):
species_reads = {}
total_reads = 0
- with open(genomes_relab, "r") as input_file:
+ with open(genomes_relab) as input_file:
next(input_file)
for line in input_file:
(
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
deleted file mode 100755
index dabf3bc..0000000
--- a/bin/check_samplesheet.py
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env python
-
-
-"""Provide a command line tool to validate and transform tabular samplesheets."""
-
-
-import argparse
-import csv
-import logging
-import sys
-from collections import Counter
-from pathlib import Path
-
-logger = logging.getLogger()
-
-
-class RowChecker:
- """
- Define a service that can validate and transform each given row.
-
- Attributes:
- modified (list): A list of dicts, where each dict corresponds to a previously
- validated and transformed row. The order of rows is maintained.
-
- """
-
- VALID_FORMATS = (
- ".fq.gz",
- ".fastq.gz",
- )
-
- def __init__(
- self,
- sample_col="sample",
- first_col="fastq_1",
- second_col="fastq_2",
- single_col="single_end",
- **kwargs,
- ):
- """
- Initialize the row checker with the expected column names.
-
- Args:
- sample_col (str): The name of the column that contains the sample name
- (default "sample").
- first_col (str): The name of the column that contains the first (or only)
- FASTQ file path (default "fastq_1").
- second_col (str): The name of the column that contains the second (if any)
- FASTQ file path (default "fastq_2").
- single_col (str): The name of the new column that will be inserted and
- records whether the sample contains single- or paired-end sequencing
- reads (default "single_end").
-
- """
- super().__init__(**kwargs)
- self._sample_col = sample_col
- self._first_col = first_col
- self._second_col = second_col
- self._single_col = single_col
- self._seen = set()
- self.modified = []
-
- def validate_and_transform(self, row):
- """
- Perform all validations on the given row and insert the read pairing status.
-
- Args:
- row (dict): A mapping from column headers (keys) to elements of that row
- (values).
-
- """
- self._validate_sample(row)
- self._validate_first(row)
- self._validate_second(row)
- self._validate_pair(row)
- self._seen.add((row[self._sample_col], row[self._first_col]))
- self.modified.append(row)
-
- def _validate_sample(self, row):
- """Assert that the sample name exists and convert spaces to underscores."""
- if len(row[self._sample_col]) <= 0:
- raise AssertionError("Sample input is required.")
- # Sanitize samples slightly.
- row[self._sample_col] = row[self._sample_col].replace(" ", "_")
-
- def _validate_first(self, row):
- """Assert that the first FASTQ entry is non-empty and has the right format."""
- if len(row[self._first_col]) <= 0:
- raise AssertionError("At least the first FASTQ file is required.")
- self._validate_fastq_format(row[self._first_col])
-
- def _validate_second(self, row):
- """Assert that the second FASTQ entry has the right format if it exists."""
- if len(row[self._second_col]) > 0:
- self._validate_fastq_format(row[self._second_col])
-
- def _validate_pair(self, row):
- """Assert that read pairs have the same file extension. Report pair status."""
- if row[self._first_col] and row[self._second_col]:
- row[self._single_col] = False
- first_col_suffix = Path(row[self._first_col]).suffixes[-2:]
- second_col_suffix = Path(row[self._second_col]).suffixes[-2:]
- if first_col_suffix != second_col_suffix:
- raise AssertionError("FASTQ pairs must have the same file extensions.")
- else:
- row[self._single_col] = True
-
- def _validate_fastq_format(self, filename):
- """Assert that a given filename has one of the expected FASTQ extensions."""
- if not any(filename.endswith(extension) for extension in self.VALID_FORMATS):
- raise AssertionError(
- f"The FASTQ file has an unrecognized extension: {filename}\n"
- f"It should be one of: {', '.join(self.VALID_FORMATS)}"
- )
-
- def validate_unique_samples(self):
- """
- Assert that the combination of sample name and FASTQ filename is unique.
-
- In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the
- number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment.
-
- """
- if len(self._seen) != len(self.modified):
- raise AssertionError("The pair of sample name and FASTQ must be unique.")
- seen = Counter()
- for row in self.modified:
- sample = row[self._sample_col]
- seen[sample] += 1
- row[self._sample_col] = f"{sample}_T{seen[sample]}"
-
-
-def read_head(handle, num_lines=10):
- """Read the specified number of lines from the current position in the file."""
- lines = []
- for idx, line in enumerate(handle):
- if idx == num_lines:
- break
- lines.append(line)
- return "".join(lines)
-
-
-def sniff_format(handle):
- """
- Detect the tabular format.
-
- Args:
- handle (text file): A handle to a `text file`_ object. The read position is
- expected to be at the beginning (index 0).
-
- Returns:
- csv.Dialect: The detected tabular format.
-
- .. _text file:
- https://docs.python.org/3/glossary.html#term-text-file
-
- """
- peek = read_head(handle)
- handle.seek(0)
- sniffer = csv.Sniffer()
- dialect = sniffer.sniff(peek)
- return dialect
-
-
-def check_samplesheet(file_in, file_out):
- """
- Check that the tabular samplesheet has the structure expected by nf-core pipelines.
-
- Validate the general shape of the table, expected columns, and each row. Also add
- an additional column which records whether one or two FASTQ reads were found.
-
- Args:
- file_in (pathlib.Path): The given tabular samplesheet. The format can be either
- CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``.
- file_out (pathlib.Path): Where the validated and transformed samplesheet should
- be created; always in CSV format.
-
- Example:
- This function checks that the samplesheet follows the following structure,
- see also the `viral recon samplesheet`_::
-
- sample,fastq_1,fastq_2
- SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
- SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
- SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
-
- .. _viral recon samplesheet:
- https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
-
- """
- required_columns = {"sample", "fastq_1", "fastq_2"}
- # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
- with file_in.open(newline="") as in_handle:
- reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle))
- # Validate the existence of the expected header columns.
- if not required_columns.issubset(reader.fieldnames):
- req_cols = ", ".join(required_columns)
- logger.critical(
- f"The sample sheet **must** contain these column headers: {req_cols}."
- )
- sys.exit(1)
- # Validate each row.
- checker = RowChecker()
- for i, row in enumerate(reader):
- try:
- checker.validate_and_transform(row)
- except AssertionError as error:
- logger.critical(f"{str(error)} On line {i + 2}.")
- sys.exit(1)
- checker.validate_unique_samples()
- header = list(reader.fieldnames)
- header.insert(1, "single_end")
- # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
- with file_out.open(mode="w", newline="") as out_handle:
- writer = csv.DictWriter(out_handle, header, delimiter=",")
- writer.writeheader()
- for row in checker.modified:
- writer.writerow(row)
-
-
-def parse_args(argv=None):
- """Define and immediately parse command line arguments."""
- parser = argparse.ArgumentParser(
- description="Validate and transform a tabular samplesheet.",
- epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv",
- )
- parser.add_argument(
- "file_in",
- metavar="FILE_IN",
- type=Path,
- help="Tabular input samplesheet in CSV or TSV format.",
- )
- parser.add_argument(
- "file_out",
- metavar="FILE_OUT",
- type=Path,
- help="Transformed output samplesheet in CSV format.",
- )
- parser.add_argument(
- "-l",
- "--log-level",
- help="The desired log level (default WARNING).",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"),
- default="WARNING",
- )
- return parser.parse_args(argv)
-
-
-def main(argv=None):
- """Coordinate argument parsing and program execution."""
- args = parse_args(argv)
- logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
- if not args.file_in.is_file():
- logger.error(f"The given input file {args.file_in} was not found!")
- sys.exit(2)
- args.file_out.parent.mkdir(parents=True, exist_ok=True)
- check_samplesheet(args.file_in, args.file_out)
-
-
-if __name__ == "__main__":
- sys.exit(main())
diff --git a/bin/databases_setup.sh b/bin/databases_setup.sh
index d3229ba..bc1fabe 100755
--- a/bin/databases_setup.sh
+++ b/bin/databases_setup.sh
@@ -1,50 +1,95 @@
#!/bin/bash
+# Strict mode for better error handling
+set -euo pipefail
+
+# Default configuration
+DEFAULT_DOWNLOAD_BWA="false"
+DEFAULT_CATALOGUE_DBS_PATH="./catalogue_dbs/"
+DEFAULT_DECONT_REFS_PATH="./decontamination_refs/"
+
# Define the list of valid biomes
-valid_biomes=('chicken-gut-v1-0-1' 'mouse-gut-v1-0' 'non-model-fish-gut-v2-0' 'human-vaginal-v1-0' 'honeybee-gut-v1-0-1'
- 'sheep-rumen-v1-0' 'marine-v2-0' 'zebrafish-fecal-v1-0' 'human-oral-v1-0-1' 'pig-gut-v1-0'
- 'cow-rumen-v1-0-1' 'human-gut-v2-0-2')
+declare -ra VALID_BIOMES=(
+ 'chicken-gut-v1-0-1' 'mouse-gut-v1-0' 'non-model-fish-gut-v2-0'
+ 'human-vaginal-v1-0' 'honeybee-gut-v1-0-1' 'sheep-rumen-v1-0'
+ 'marine-v2-0' 'zebrafish-fecal-v1-0' 'human-oral-v1-0-1'
+ 'pig-gut-v1-0' 'cow-rumen-v1-0-1' 'human-gut-v2-0-2'
+)
+
+# Usage function
+usage() {
+ echo "Usage: $0"
+ echo " --biome "
+ echo " --catalogue_dbs_path [default: ${DEFAULT_CATALOGUE_DBS_PATH}]"
+ echo " --decont_refs_path [default: ${DEFAULT_DECONT_REFS_PATH}]"
+ echo " --download_bwa [default: ${DEFAULT_DOWNLOAD_BWA}]"
+ exit 1
+}
+
+# Validate biome
+validate_biome() {
+ local biome="$1"
+ for valid_biome in "${VALID_BIOMES[@]}"; do
+ if [[ "$biome" == "$valid_biome" ]]; then
+ return 0
+ fi
+ done
+ echo "Error: Invalid biome '$biome'. Valid options are:"
+ printf '%s\n' "${VALID_BIOMES[@]}"
+ exit 1
+}
+
+# Parse command-line arguments with defaults
+BIOME=""
+CATALOGUE_DBS_PATH="${DEFAULT_CATALOGUE_DBS_PATH}"
+DECONT_REFS_PATH="${DEFAULT_DECONT_REFS_PATH}"
+DOWNLOAD_BWA="${DEFAULT_DOWNLOAD_BWA}"
-# Parse command-line arguments
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
--biome)
BIOME="$2"
- # Check if the provided biome is in the valid biomes list
- if [[ ! " ${valid_biomes[@]} " =~ " ${BIOME} " ]]; then
- echo "The input $BIOME is not a valid biome, please use one of the following: ${valid_biomes[*]}"
- exit 1
- fi
- shift
- shift
- ;;
+ validate_biome "$BIOME"
+ shift 2
+ ;;
--catalogue_dbs_path)
CATALOGUE_DBS_PATH="$2"
- shift
- shift
+ shift 2
;;
--decont_refs_path)
DECONT_REFS_PATH="$2"
- shift
- shift
+ shift 2
;;
--download_bwa)
DOWNLOAD_BWA="$2"
- shift
- shift
+ shift 2
+ ;;
+ --help)
+ usage
;;
*)
echo "Unknown option: $1"
- exit 1
+ usage
;;
esac
done
-# Create verbose log file
+# Validate required argument
+[[ -z "$BIOME" ]] && { echo "Error: --biome is required"; usage; }
+
+# Create log file
LOG_FILE="dbs_setup_$(date +'%Y%m%d_%H%M%S').log"
exec > >(tee -a "$LOG_FILE") 2>&1
+# Ensure paths end with slash
+CATALOGUE_DBS_PATH="${CATALOGUE_DBS_PATH%/}/"
+DECONT_REFS_PATH="${DECONT_REFS_PATH%/}/"
+
+# Create directories if they don't exist
+mkdir -p "${DECONT_REFS_PATH}reference_genomes"
+mkdir -p "${CATALOGUE_DBS_PATH}"
+
# Change directory to decontamination references path
cd "$DECONT_REFS_PATH" || exit
if [ ! -d "reference_genomes" ]; then
@@ -55,14 +100,13 @@ else
cd reference_genomes || exit
fi
-
# Check if human_phix.fa.* files exist
if ls human_phix.fa.* &>/dev/null; then
echo " *** The human and phiX reference genomes already exist. Skipping download"
else
# Downloading human+phiX reference genomes
echo " *** Downloading the human and phiX reference genomes to ${DECONT_REFS_PATH}reference_genomes"
- wget --continue https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/human_phiX/human_phix_ref_bwamem2.tar.gz
+ wget -nv --show-progress --progress=bar:force:noscroll --continue https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/human_phiX/human_phix_ref_bwamem2.tar.gz
echo " *** Extracting human and phiX reference genomes"
tar -xvf human_phix_ref_bwamem2.tar.gz
mv bwamem2/* .
@@ -76,7 +120,7 @@ if ls ${HOST}.* &>/dev/null; then
else
# Downloading the host genome
echo " *** Downloading the $HOST reference genome to $DECONT_REFS_PATH/reference_genomes"
- wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/$HOST/${HOST}_ref_bwamem2.tar.gz"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/$HOST/${HOST}_ref_bwamem2.tar.gz"
echo " *** Extracting the $HOST reference genome"
tar -xvf "${HOST}_ref_bwamem2.tar.gz"
mv bwamem2/* .
@@ -101,7 +145,7 @@ CAT_VERSION=$(echo "v$VERSION" | sed 's/-/./g' )
echo " *** Downloading catalogue related databases to ${CATALOGUE_DBS_PATH}/${BIOME}"
# Downloading the catalogue metadata file
-wget --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$CAT_VERSION/genomes-all_metadata.tsv"
+wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$CAT_VERSION/genomes-all_metadata.tsv"
# Setting up the file locations on the FTP server
TABLES_DIR="https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/mgnify_genomes/${PREFIX_BIOME}_reps"
@@ -110,21 +154,21 @@ SOURMASH_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_sourmash"
BWAMEM_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_bwamem2.tar.gz"
# Downloading the pangenome function tables
-wget --continue "$FUNCTIONS_DIR/functional_profiles.tar.gz"
+wget -nv --show-progress --progress=bar:force:noscroll --continue "$FUNCTIONS_DIR/functional_profiles.tar.gz"
tar -xvf functional_profiles.tar.gz
rm functional_profiles.tar.gz
-wget --continue "$FUNCTIONS_DIR/kegg_completeness.tar.gz"
+wget -nv --show-progress --progress=bar:force:noscroll --continue "$FUNCTIONS_DIR/kegg_completeness.tar.gz"
tar -xvf kegg_completeness.tar.gz
rm kegg_completeness.tar.gz
# Downloading the representative genomes indexed for sourmash
-wget --continue "$SOURMASH_DIR/sourmash_species_representatives_k21.sbt.zip"
+wget -nv --show-progress --progress=bar:force:noscroll --continue "$SOURMASH_DIR/sourmash_species_representatives_k21.sbt.zip"
# Downloading bwamem2 db index if the option is set
if [ "$DOWNLOAD_BWA" = "true" ]; then
echo " *** Downloading bwamem2 indexed database for $BIOME to ${CATALOGUE_DBS_PATH}/${BIOME}"
- wget --continue "$BWAMEM_DIR"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "$BWAMEM_DIR"
tar -xvf "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
mv "${PREFIX_BIOME}_${VERSION}_bwamem2"/* .
rm -r "${PREFIX_BIOME}_${VERSION}_bwamem2" "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
@@ -140,19 +184,18 @@ if [ -d "external_dbs" ]; then
else
echo " *** Downloading external dbs to $CATALOGUE_DBS_PATH/external_dbs/dram_distill_dbs"
mkdir -p external_dbs/dram_distill_dbs && cd external_dbs/dram_distill_dbs || exit
- wget --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/amg_database.tsv"
- wget --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/etc_module_database.tsv"
- wget --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/function_heatmap_form.tsv"
- wget --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/genome_summary_form.tsv"
- wget --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/module_step_form.tsv"
- wget --continue "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/amg_database.tsv"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/etc_module_database.tsv"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/function_heatmap_form.tsv"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/genome_summary_form.tsv"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/module_step_form.tsv"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz"
fi
# Creating the CONFIG file for DRAM distill
echo " *** Creating the CONFIG file for DRAM distill"
echo '{"description_db": "None", "kegg": null, "kofam": null, "kofam_ko_list": null, "uniref": null, "pfam": null, "pfam_hmm_dat": null, "dbcan": null, "dbcan_fam_activities": null, "viral": null, "peptidase": null, "vogdb": null, "vog_annotations": null, "genome_summary_form": "/data/genome_summary_form.tsv", "module_step_form": "/data/module_step_form.tsv", "etc_module_database": "/data/etc_module_database.tsv", "function_heatmap_form": "/data/function_heatmap_form.tsv", "amg_database": "/data/amg_database.tsv"}' > CONFIG
-
echo " *** Databases setting up finished successfully for $BIOME"
echo " *** Use the following parameters to test the shallow-mapping pipeline from shallowmapping/test:"
echo " nextflow run ../main.nf \\"
diff --git a/bin/keggcomp_DB.py b/bin/keggcomp_DB.py
index 671dc2d..48b2bcb 100755
--- a/bin/keggcomp_DB.py
+++ b/bin/keggcomp_DB.py
@@ -1,18 +1,12 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-
-##### This script integrates the output of the kegg completeness tool to build a DB at pangenome level
-##### Alejandra Escobar, EMBL-EBI
-##### March 21, 2024
def core_parser(core_table):
all_modules = []
core_values = {}
- with open(core_table, "r") as input_file:
+ with open(core_table) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -27,7 +21,7 @@ def core_parser(core_table):
def pan_parser(all_modules, pan_table):
pan_values = {}
- with open(pan_table, "r") as input_file:
+ with open(pan_table) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
diff --git a/bin/matrix_integrator.py b/bin/matrix_integrator.py
index b9bd2c9..c9ae3d0 100755
--- a/bin/matrix_integrator.py
+++ b/bin/matrix_integrator.py
@@ -1,17 +1,10 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-from Bio import SeqIO
-
-##### This script integrates multiple count matices into a single output
-##### Alejandra Escobar, EMBL-EBI
-##### Jan 12, 2024
def matrix_parser(matrix_file, features_dict, all_features, all_samples):
- with open(matrix_file, "r") as input_file:
+ with open(matrix_file) as input_file:
samples_list = input_file.readline().strip().split("\t")
samples_list.pop(0)
all_samples = all_samples + samples_list
diff --git a/bin/panaroo_inputs_builder.py b/bin/panaroo_inputs_builder.py
index 4c653f0..9b944ae 100755
--- a/bin/panaroo_inputs_builder.py
+++ b/bin/panaroo_inputs_builder.py
@@ -1,23 +1,14 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-import wget
import gzip
+import os.path
import shutil
-import subprocess
-from Bio import SeqIO
-
-
-##### This script prepare the inputs to launch panaroo on the human-gut catalogue v2.0
-##### Alejandra Escobar, EMBL-EBI
-##### June 21, 2024
def metadata_parser(catalogue_metadata):
reps_clusters = {}
- with open(catalogue_metadata, "r") as input_file:
+ with open(catalogue_metadata) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
diff --git a/bin/panaroo_inputs_builder_custom.py b/bin/panaroo_inputs_builder_custom.py
index 4318017..4d7da4d 100755
--- a/bin/panaroo_inputs_builder_custom.py
+++ b/bin/panaroo_inputs_builder_custom.py
@@ -2,22 +2,11 @@
import argparse
import os.path
-import sys
-import wget
-import gzip
-import shutil
-import subprocess
-from Bio import SeqIO
-
-
-##### This script prepare the inputs to launch panaroo on custom databases
-##### Alejandra Escobar, EMBL-EBI
-##### July 3, 2024
def metadata_parser(drep_clstrs, derep_genomes):
clusters = {}
- with open(drep_clstrs, "r") as input_file:
+ with open(drep_clstrs) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split(",")
diff --git a/bin/pangenomeDB_builder_codon.py b/bin/pangenomeDB_builder_codon.py
index 809833e..0d707d9 100755
--- a/bin/pangenomeDB_builder_codon.py
+++ b/bin/pangenomeDB_builder_codon.py
@@ -1,15 +1,10 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-import wget
import gzip
-from Bio import SeqIO
+import os.path
-##### This script find the accessory genes that needs eggNOG annotation in codon catalogues
-##### Alejandra Escobar, EMBL-EBI
-##### March 27, 2024
+from Bio import SeqIO
def pfam_parser(pfam_data):
@@ -35,7 +30,7 @@ def pfam_parser(pfam_data):
def metadata_parser(catalogue_metadata):
reps_clusters = {}
- with open(catalogue_metadata, "r") as input_file:
+ with open(catalogue_metadata) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -67,7 +62,7 @@ def accessory_writer(reps_clusters, loc_prefix):
# Parsing the presence/absence tab
r_tab_loc = pan_loc + "gene_presence_absence.Rtab"
accessory_genes = []
- with open(r_tab_loc, "r") as input_file:
+ with open(r_tab_loc) as input_file:
header = input_file.readline().strip().split("\t")
index = header.index(rep)
for line in input_file:
@@ -125,7 +120,7 @@ def annot_writer(reps_clusters, loc_prefix, pfam_desc):
core_tab_loc = pan_loc + "core_genes.txt"
# Saving the core genes ids
- with open(core_tab_loc, "r") as input_file:
+ with open(core_tab_loc) as input_file:
for line in input_file:
gene_name = line.rstrip()
core_list.append(gene_name)
@@ -137,7 +132,7 @@ def annot_writer(reps_clusters, loc_prefix, pfam_desc):
accesory_genes = {}
relevant_members = []
relevant_genes = []
- with open(acc_tab_loc, "r") as input_file:
+ with open(acc_tab_loc) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split(",")
@@ -152,7 +147,7 @@ def annot_writer(reps_clusters, loc_prefix, pfam_desc):
else:
prefix = member_gen.split("_")[0]
if prefix != rep:
- if not gene_key in accesory_genes:
+ if gene_key not in accesory_genes:
relevant_members.append(prefix)
accesory_genes[gene_key] = member_gen
relevant_genes.append(member_gen)
@@ -190,7 +185,7 @@ def annot_writer(reps_clusters, loc_prefix, pfam_desc):
def gff_parser(gff_file):
gff_dict = {}
- with open(gff_file, "r") as input_file:
+ with open(gff_file) as input_file:
for line in input_file:
l_line = line.rstrip().split("\t")
# Annotation lines have exactly 9 columns
@@ -216,7 +211,7 @@ def gff_parser(gff_file):
def eggnog_parser(eggnog_out, gff_dict, pfam_desc):
ko_annot, cazy_annot, pfam_annot = {}, {}, {}
- with open(eggnog_out, "r") as input_file:
+ with open(eggnog_out) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -297,7 +292,7 @@ def acc_eggnog_parser(eggnog_annot, accesory_genes, acc_gff_dict, pfam_desc):
for pan_gene, genome_gene in accesory_genes.items():
rev_accesory_genes[genome_gene] = pan_gene
kegg_annot, pfam_annot, cazy_annot = {}, {}, {}
- with open(eggnog_annot, "r") as input_file:
+ with open(eggnog_annot) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
diff --git a/bin/pangenomeDB_builder_custom.py b/bin/pangenomeDB_builder_custom.py
index f99232f..0696941 100755
--- a/bin/pangenomeDB_builder_custom.py
+++ b/bin/pangenomeDB_builder_custom.py
@@ -1,15 +1,8 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-import wget
import gzip
-from Bio import SeqIO
-
-##### This script find the accessory genes that needs eggNOG annotation for custom genome catalogues
-##### Alejandra Escobar, EMBL-EBI
-##### July 10, 2024
+import os.path
def pfam_parser(pfam_data):
@@ -35,7 +28,7 @@ def pfam_parser(pfam_data):
def metadata_parser(drep_clstrs, derep_genomes):
clusters = {}
- with open(drep_clstrs, "r") as input_file:
+ with open(drep_clstrs) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split(",")
@@ -85,7 +78,7 @@ def annot_writer(reps_clusters, prokka_path, panaroo_path, pfam_desc):
core_tab_loc = pan_loc + "core_genes.txt"
# Saving the core genes ids
- with open(core_tab_loc, "r") as input_file:
+ with open(core_tab_loc) as input_file:
for line in input_file:
gene_name = line.rstrip()
core_list.append(gene_name)
@@ -93,7 +86,7 @@ def annot_writer(reps_clusters, prokka_path, panaroo_path, pfam_desc):
# Parsing the pangenomic table to keep only one accessory gene per genome
acc_tab_loc = pan_loc + "gene_presence_absence.csv"
pan_genes = {}
- with open(acc_tab_loc, "r") as input_file:
+ with open(acc_tab_loc) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split(",")
@@ -111,7 +104,7 @@ def annot_writer(reps_clusters, prokka_path, panaroo_path, pfam_desc):
if len(paralogs) > 0:
first_copy = paralogs[0]
pan_genes[gene_key].append(first_copy)
- elif not "refound" in member_gen:
+ elif "refound" not in member_gen:
pan_genes[gene_key].append(member_gen)
# Giving priority to genes in the representative genome
@@ -165,7 +158,7 @@ def annot_writer(reps_clusters, prokka_path, panaroo_path, pfam_desc):
def gff_parser(gff_file):
gff_dict = {}
- with open(gff_file, "r") as input_file:
+ with open(gff_file) as input_file:
for line in input_file:
l_line = line.rstrip().split("\t")
# Annotation lines have exactly 9 columns
@@ -191,7 +184,7 @@ def gff_parser(gff_file):
def eggnog_parser(eggnog_out, gff_dict, pfam_desc):
ko_annot, cazy_annot, pfam_annot = {}, {}, {}
- with open(eggnog_out, "r") as input_file:
+ with open(eggnog_out) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -242,7 +235,7 @@ def eggnog_parser(eggnog_out, gff_dict, pfam_desc):
def acc_gff_parser(mem_gff_out, relevant_genes, acc_gff_dict):
- with open(mem_gff_out, "r") as input_file:
+ with open(mem_gff_out) as input_file:
for line in input_file:
l_line = line.rstrip().split("\t")
# Annotation lines have exactly 9 columns
@@ -270,7 +263,7 @@ def acc_gff_parser(mem_gff_out, relevant_genes, acc_gff_dict):
def acc_eggnog_parser(eggnog_annot, relevant_genes, acc_gff_dict, pfam_desc):
kegg_annot, pfam_annot, cazy_annot = {}, {}, {}
- with open(eggnog_annot, "r") as input_file:
+ with open(eggnog_annot) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
diff --git a/bin/pangenomeDB_builder_mgnify_old.py b/bin/pangenomeDB_builder_mgnify_old.py
index d32cdd7..e56f39c 100644
--- a/bin/pangenomeDB_builder_mgnify_old.py
+++ b/bin/pangenomeDB_builder_mgnify_old.py
@@ -1,15 +1,10 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-import wget
import gzip
-from Bio import SeqIO
+import os.path
-##### This script find the accessory genes that needs eggNOG annotation for the huma-gut catalogue
-##### Alejandra Escobar, EMBL-EBI
-##### June 22, 2024
+from Bio import SeqIO
def pfam_parser(pfam_data):
@@ -35,7 +30,7 @@ def pfam_parser(pfam_data):
def metadata_parser(catalogue_metadata):
reps_clusters = {}
- with open(catalogue_metadata, "r") as input_file:
+ with open(catalogue_metadata) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -56,7 +51,7 @@ def accessory_writer(reps_clusters, panaroo_path):
# Parsing the presence/absence tab
r_tab_loc = pan_loc + "gene_presence_absence.Rtab"
accessory_genes = []
- with open(r_tab_loc, "r") as input_file:
+ with open(r_tab_loc) as input_file:
header = input_file.readline().strip().split("\t")
index = header.index(rep)
for line in input_file:
@@ -107,7 +102,7 @@ def annot_writer(reps_clusters, loc_prefix, panaroo_path, pfam_desc):
core_tab_loc = pan_loc + "core_genes.txt"
# Saving the core genes ids
- with open(core_tab_loc, "r") as input_file:
+ with open(core_tab_loc) as input_file:
for line in input_file:
gene_name = line.rstrip()
core_list.append(gene_name)
@@ -119,7 +114,7 @@ def annot_writer(reps_clusters, loc_prefix, panaroo_path, pfam_desc):
accesory_genes = {}
relevant_members = []
relevant_genes = []
- with open(acc_tab_loc, "r") as input_file:
+ with open(acc_tab_loc) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split(",")
@@ -134,7 +129,7 @@ def annot_writer(reps_clusters, loc_prefix, panaroo_path, pfam_desc):
else:
prefix = member_gen.split("_")[0]
if prefix != rep:
- if not gene_key in accesory_genes:
+ if gene_key not in accesory_genes:
relevant_members.append(prefix)
accesory_genes[gene_key] = member_gen
relevant_genes.append(member_gen)
@@ -172,7 +167,7 @@ def annot_writer(reps_clusters, loc_prefix, panaroo_path, pfam_desc):
def gff_parser(gff_file):
gff_dict = {}
- with open(gff_file, "r") as input_file:
+ with open(gff_file) as input_file:
for line in input_file:
l_line = line.rstrip().split("\t")
# Annotation lines have exactly 9 columns
@@ -198,7 +193,7 @@ def gff_parser(gff_file):
def eggnog_parser(eggnog_out, gff_dict, pfam_desc):
ko_annot, cazy_annot, pfam_annot = {}, {}, {}
- with open(eggnog_out, "r") as input_file:
+ with open(eggnog_out) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -279,7 +274,7 @@ def acc_eggnog_parser(eggnog_annot, accesory_genes, acc_gff_dict, pfam_desc):
for pan_gene, genome_gene in accesory_genes.items():
rev_accesory_genes[genome_gene] = pan_gene
kegg_annot, pfam_annot, cazy_annot = {}, {}, {}
- with open(eggnog_annot, "r") as input_file:
+ with open(eggnog_annot) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
diff --git a/bin/sm_genome2species.py b/bin/sm_genome2species.py
index e0e95c8..0e10dbc 100755
--- a/bin/sm_genome2species.py
+++ b/bin/sm_genome2species.py
@@ -1,25 +1,18 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
import gzip
-from Bio import SeqIO
-
-##### This script transforms sourmash genomes relative abundance into species relative abundance
-##### Alejandra Escobar, EMBL-EBI
-##### Dec 20, 2023
def metadata_parser(catalogue_metadata):
ref_spec_genome = {}
- with open(catalogue_metadata, "r") as input_file:
+ with open(catalogue_metadata) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
rep_genome = l_line[13]
lineage = l_line[14] + ";" + rep_genome
- if not rep_genome in ref_spec_genome:
+ if rep_genome not in ref_spec_genome:
ref_spec_genome[rep_genome] = lineage.replace(" ", "_")
return ref_spec_genome
diff --git a/bin/species2functions.py b/bin/species2functions.py
index ad6cd54..d776d6b 100755
--- a/bin/species2functions.py
+++ b/bin/species2functions.py
@@ -1,14 +1,7 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
import gzip
-from Bio import SeqIO
-
-##### This script use the species prediction to generate functional tables from the pangenomic profiles
-##### Alejandra Escobar, EMBL-EBI
-##### Jan 11, 2024
def pfam_parser(pfam_data):
@@ -35,14 +28,14 @@ def pfam_parser(pfam_data):
def dram_parser(dram_form):
dram_desc = {}
- with open(dram_form, "r") as input_file:
+ with open(dram_form) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
gene_id = l_line[0]
gene_description = l_line[1].replace('"', "")
if gene_id in dram_desc:
- if not gene_description in dram_desc[gene_id]:
+ if gene_description not in dram_desc[gene_id]:
dram_desc[gene_id].append(gene_description)
else:
dram_desc[gene_id] = [gene_description]
@@ -52,7 +45,7 @@ def dram_parser(dram_form):
def relab_parser(relab_table):
taxonomy = {}
reps_list = []
- with open(relab_table, "r") as input_file:
+ with open(relab_table) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -82,7 +75,7 @@ def functions_finder_pan(reps_list, db_path):
per_gene_dict[rep_genome] = []
pan_kos = []
pan_pfams = []
- with open(db_file, "r") as input_file:
+ with open(db_file) as input_file:
next(input_file)
next(input_file)
for line in input_file:
@@ -113,7 +106,7 @@ def functions_finder_pan(reps_list, db_path):
kegg_list = [kegg]
pan_kos = pan_kos + kegg_list
for current_ko in kegg_list:
- if not current_ko in species_kos[rep_genome]:
+ if current_ko not in species_kos[rep_genome]:
species_kos[rep_genome].append(current_ko)
if pfam != "-":
@@ -123,7 +116,7 @@ def functions_finder_pan(reps_list, db_path):
pfam_list = [pfam]
pan_pfams = pan_pfams + pfam_list
for current_pfam in pfam_list:
- if not current_pfam in species_pfams[rep_genome]:
+ if current_pfam not in species_pfams[rep_genome]:
species_pfams[rep_genome].append(current_pfam)
pan_kos = list(set(pan_kos))
@@ -172,7 +165,7 @@ def functions_finder_core(reps_list, db_path):
per_gene_dict[rep_genome] = []
pan_kos = []
pan_pfams = []
- with open(db_file, "r") as input_file:
+ with open(db_file) as input_file:
next(input_file)
next(input_file)
for line in input_file:
@@ -204,7 +197,7 @@ def functions_finder_core(reps_list, db_path):
kegg_list = [kegg]
pan_kos = pan_kos + kegg_list
for current_ko in kegg_list:
- if not current_ko in species_kos[rep_genome]:
+ if current_ko not in species_kos[rep_genome]:
species_kos[rep_genome].append(current_ko)
if pfam != "-":
@@ -214,7 +207,7 @@ def functions_finder_core(reps_list, db_path):
pfam_list = [pfam]
pan_pfams = pan_pfams + pfam_list
for current_pfam in pfam_list:
- if not current_pfam in species_pfams[rep_genome]:
+ if current_pfam not in species_pfams[rep_genome]:
species_pfams[rep_genome].append(current_pfam)
pan_kos = list(set(pan_kos))
diff --git a/bin/species2pathways.py b/bin/species2pathways.py
index 5439445..c514ad5 100755
--- a/bin/species2pathways.py
+++ b/bin/species2pathways.py
@@ -1,17 +1,11 @@
#!/usr/bin/env python
import argparse
-import os.path
-import sys
-
-##### This script use the species prediction to generate species pathways completeness profiles from pangenomic tables
-##### Alejandra Escobar, EMBL-EBI
-##### March 22, 2024
def relab_parser(relab_table):
reps_list = []
- with open(relab_table, "r") as input_file:
+ with open(relab_table) as input_file:
next(input_file)
for line in input_file:
l_line = line.rstrip().split("\t")
@@ -25,7 +19,7 @@ def pathways_finder(reps_list, kegg_comp_db, core_mode):
all_pathways = []
for rep_genome in reps_list:
db_file = kegg_comp_db + "/" + rep_genome + "_clstr_kegg_comp.tsv"
- with open(db_file, "r") as input_file:
+ with open(db_file) as input_file:
next(input_file)
for line in input_file:
module, pangenome, core = line.rstrip().split("\t")
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 7c231c7..bf3db35 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -28,17 +28,17 @@
"description": "This can be any of the MGnify catalogues for which shallow-mapping databases are currently available",
"enum": [
"chicken-gut-v1-0-1",
- "mouse-gut-v1-0",
- "non-model-fish-gut-v2-0",
+ "cow-rumen-v1-0-1",
+ "human-gut-v2-0-2",
+ "human-oral-v1-0-1",
"human-vaginal-v1-0",
"honeybee-gut-v1-0-1",
- "sheep-rumen-v1-0",
"marine-v2-0",
- "zebrafish-fecal-v1-0",
- "human-oral-v1-0-1",
+ "mouse-gut-v1-0",
+ "non-model-fish-gut-v2-0",
"pig-gut-v1-0",
- "cow-rumen-v1-0-1",
- "human-gut-v2-0-2"
+ "sheep-rumen-v1-0",
+ "zebrafish-fecal-v1-0"
]
},
"run_bwa": {
From b250809c7fbd812e2e23877b973e45f3d0b00e9f Mon Sep 17 00:00:00 2001
From: Martin Beracochea
Date: Tue, 17 Dec 2024 16:33:20 +0000
Subject: [PATCH 07/16] Adjust nf-core.yml
---
.nf-core.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.nf-core.yml b/.nf-core.yml
index c94ce15..0c263bb 100644
--- a/.nf-core.yml
+++ b/.nf-core.yml
@@ -37,6 +37,7 @@ lint:
- .github/workflows/linting.yml
- .gitignore
- pyproject.toml
+ - LICENSE
multiqc_config:
- report_comment
nextflow_config:
From d45a5f197c117619aab138e8b6c40d6dd165afa8 Mon Sep 17 00:00:00 2001
From: Martin Beracochea
Date: Wed, 18 Dec 2024 16:44:33 +0000
Subject: [PATCH 08/16] Refactor the pipeline and download dbs code.
This ended up being a massive refactoring of the pipeline.
Some of the changes:
- Removed the bash script to download the databases - replaced with a subworkflow
- Swapped the bespoke fastp module with the one from nf-core
- Moved things around based on the linter errors (vscode)
- Adjusted the parameters - based on the db download scripts
- Formatted the code
- Added some test reference dbs (this still needs some work)
---
README.md | 64 +-
bin/databases_setup.sh | 206 -
conf/base.config | 72 +-
conf/codon.config | 31 +-
conf/codon_dbs.config | 10 -
conf/modules.config | 58 +-
conf/test.config | 37 +
main.nf | 34 +-
modules.json | 5 +
.../download_human_phix_bwamem2_index.nf | 15 +
.../download_mgnify_genomes_reference_dbs.nf | 65 +
modules/local/dram/distill.nf | 17 +-
modules/local/dram/download_dram_db.nf | 45 +
modules/local/fastp/main.nf | 82 -
modules/local/fastp/meta.yml | 57 -
.../{local => nf-core}/fastp/environment.yml | 2 -
modules/nf-core/fastp/main.nf | 125 +
modules/nf-core/fastp/meta.yml | 113 +
modules/nf-core/fastp/tests/main.nf.test | 576 +++
modules/nf-core/fastp/tests/main.nf.test.snap | 1331 ++++++
.../fastp/tests/nextflow.interleaved.config | 5 +
.../fastp/tests/nextflow.save_failed.config | 5 +
modules/nf-core/fastp/tests/tags.yml | 2 +
nextflow.config | 147 +-
nextflow_schema.json | 38 +-
nf-test.config | 2 +-
subworkflows/download_references.nf | 71 +
tests/bwa2mem/human_phix.fasta.0123 | Bin 0 -> 10772 bytes
tests/bwa2mem/human_phix.fasta.amb | 1 +
tests/bwa2mem/human_phix.fasta.ann | 3 +
tests/bwa2mem/human_phix.fasta.bwt.2bit.64 | Bin 0 -> 17607 bytes
tests/bwa2mem/human_phix.fasta.pac | Bin 0 -> 1348 bytes
tests/nextflow.config | 8 -
tests/reference_dbs/dram_dbs/DRAM_CONFIG.json | 20 +
.../reference_dbs/dram_dbs/Pfam-A.hmm.dat.gz | Bin 0 -> 628809 bytes
tests/reference_dbs/dram_dbs/amg_database.tsv | 280 ++
.../dram_dbs/etc_module_database.tsv | 20 +
.../dram_dbs/function_heatmap_form.tsv | 81 +
.../dram_dbs/genome_summary_form.tsv | 3715 +++++++++++++++++
.../dram_dbs/module_step_form.tsv | 3289 +++++++++++++++
.../MGYG000303700_clstr.tsv | 1265 ++++++
.../genomes-all_metadata.tsv | 2 +
.../MGYG000303700_clstr_kegg_comp.tsv | 31 +
...urmash_species_representatives_k21.sbt.zip | Bin 0 -> 10873308 bytes
workflows/shallowmapping.nf | 264 +-
45 files changed, 11471 insertions(+), 723 deletions(-)
delete mode 100755 bin/databases_setup.sh
delete mode 100644 conf/codon_dbs.config
create mode 100644 conf/test.config
create mode 100644 modules/local/download_human_phix_bwamem2_index.nf
create mode 100644 modules/local/download_mgnify_genomes_reference_dbs.nf
create mode 100644 modules/local/dram/download_dram_db.nf
delete mode 100644 modules/local/fastp/main.nf
delete mode 100644 modules/local/fastp/meta.yml
rename modules/{local => nf-core}/fastp/environment.yml (76%)
create mode 100644 modules/nf-core/fastp/main.nf
create mode 100644 modules/nf-core/fastp/meta.yml
create mode 100644 modules/nf-core/fastp/tests/main.nf.test
create mode 100644 modules/nf-core/fastp/tests/main.nf.test.snap
create mode 100644 modules/nf-core/fastp/tests/nextflow.interleaved.config
create mode 100644 modules/nf-core/fastp/tests/nextflow.save_failed.config
create mode 100644 modules/nf-core/fastp/tests/tags.yml
create mode 100644 subworkflows/download_references.nf
create mode 100644 tests/bwa2mem/human_phix.fasta.0123
create mode 100644 tests/bwa2mem/human_phix.fasta.amb
create mode 100644 tests/bwa2mem/human_phix.fasta.ann
create mode 100644 tests/bwa2mem/human_phix.fasta.bwt.2bit.64
create mode 100644 tests/bwa2mem/human_phix.fasta.pac
create mode 100644 tests/reference_dbs/dram_dbs/DRAM_CONFIG.json
create mode 100644 tests/reference_dbs/dram_dbs/Pfam-A.hmm.dat.gz
create mode 100644 tests/reference_dbs/dram_dbs/amg_database.tsv
create mode 100644 tests/reference_dbs/dram_dbs/etc_module_database.tsv
create mode 100644 tests/reference_dbs/dram_dbs/function_heatmap_form.tsv
create mode 100644 tests/reference_dbs/dram_dbs/genome_summary_form.tsv
create mode 100644 tests/reference_dbs/dram_dbs/module_step_form.tsv
create mode 100644 tests/reference_dbs/human-vaginal-v1.0/functional_profiles_DB/MGYG000303700_clstr.tsv
create mode 100644 tests/reference_dbs/human-vaginal-v1.0/genomes-all_metadata.tsv
create mode 100644 tests/reference_dbs/human-vaginal-v1.0/kegg_completeness_DB/MGYG000303700_clstr_kegg_comp.tsv
create mode 100644 tests/reference_dbs/human-vaginal-v1.0/sourmash_species_representatives_k21.sbt.zip
diff --git a/README.md b/README.md
index bfff8bd..f6c97b2 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,4 @@
-[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
-[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
-
-## Introduction
+# Introduction
**ebi-metagenomics/shallowmapping** is a bioinformatics pipeline that generates taxonomic and functional profiles for low-yield (shallow shotgun: < 10 M reads) short raw-reads using [`MGnify biome-specific genome catalogues`](https://www.ebi.ac.uk/metagenomics/browse/genomes) as a reference.
@@ -27,35 +24,11 @@ The final output includes a species relative abundance table, Pfam and KEGG Orth
This workflow was built using [Nextflow](https://www.nextflow.io/) and follows [nf-core](https://nf-co.re/) good practices. It is containerized, so users can use either Docker or Apptainer/Singularity to run the pipeline. At the moment, it doesn't support Conda environments.
-The pipeline requires [Nextflow version >=21.10](https://www.nextflow.io/docs/latest/getstarted.html#installation) and a container technology such as [Apptainer/Singularity](https://github.com/apptainer/singularity/blob/master/INSTALL.md) or [Docker](https://www.docker.com/).
-
-A Linux/macOS system with Bash and wget installed is required to download the reference databases. We will integrate this step into the pipeline itself in the near future.
-
-> **Note:**
-> The pipeline reference databases currently need to be downloaded manually by the user on a Linux/macOS system.
+The pipeline requires [Nextflow](https://www.nextflow.io/docs/latest/getstarted.html#installation) and a container technology such as [Apptainer/Singularity](https://github.com/apptainer/singularity/blob/master/INSTALL.md) or [Docker](https://www.docker.com/).
### Required Reference Databases
-The first time you run the pipeline, you must provide available indexed databases for the decontamination step, MGnify genomes catalog tables, and some external tables for DRAM visuals generation. MGnify hosts most of the databases, and setup can be done in a single step by providing the locations for the decontamination and MGnify databases where the new files will be added. The directories must already exist. Please provide full paths.
-
-Get the Shallow-mapping pipeline GitHub repository:
-
-```bash
-git clone https://github.com/EBI-Metagenomics/shallowmapping.git
-```
-
-Consider that human-phiX decontamination reference genomes require ~15-20G of storage.
-Each MGnify catalogue genomes db occupy ~1G.
-
-```bash
-cd shallowmapping
-
-bash bin/databases_setup.sh \
- --biome \ # Any of the MGnify catalogue ID
- --catalogue_dbs_path \ # Central location of shallow-mapping dbs. A directory with the biome name will be created
- --decont_refs_path \ # Central location of reference genomes for decontamination. Other bwamem2 databases can exist there
- --download_bwa default = `false`
-```
+The first time you run the pipeline, it will download the required MGnify genomes catalogue reference files and the human_phiX BWAMEM2 index. If you select a different host for decontamination, you must provide the index yourself.
Running the pipeline using bwamem2 is optional. If you want to run the pipeline with this option, set `--download_bwa true`. This database will occupy considerable storage in your system, depending on the biome.
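As a point of reference, a run that also fetches and uses the bwamem2 index for the chosen catalogue could look like the sketch below. This is illustrative only: the samplesheet name, output directory, and the exact biome version string are placeholders, and both flags follow the options described in this README and the pipeline schema.

```bash
# Illustrative sketch: download the catalogue bwamem2 index and enable bwamem2 mapping.
# The samplesheet and biome string are placeholders; adjust to your setup.
nextflow run ebi-metagenomics/shallowmapping \
    --biome human-vaginal-v1.0 \
    --input samplesheet.csv \
    --download_bwa true \
    --run_bwa true \
    --outdir results
```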
@@ -82,8 +55,8 @@ nextflow run ebi-metagenomics/shallowmapping \
--biome \
--input samplesheet.csv \
--outdir default = `results` \
- --shallow_dbs_path \
- --decont_reference_paths
+ --dbs \
+ --decontamination_indexes
```
The central location for the databases can be set in the config file.
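If you prefer to keep the database locations out of the command line, one option is a small custom config passed with `-c`. A minimal sketch, assuming the `reference_dbs` and `decontamination_indexes` parameter names used in `conf/codon.config` and `conf/test.config` (the paths are placeholders):

```bash
# Minimal sketch: store the database locations in a custom config and pass it with -c.
# Parameter names follow conf/codon.config / conf/test.config; replace the paths with your own.
cat > my_dbs.config << 'EOF'
params {
    reference_dbs           = "/path/to/shallow-mapping/reference_dbs/"
    decontamination_indexes = "/path/to/bwa-mem2/indexes/"
}
EOF

nextflow run ebi-metagenomics/shallowmapping \
    -c my_dbs.config \
    --biome human-vaginal-v1.0 \
    --input samplesheet.csv \
    --outdir results
```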
@@ -99,6 +72,29 @@ Use `--core_mode true` for large catalogues like the human-gut to avoid over-pre
Nextflow option `-profile` can be used to select a suitable config for your computational resources. You can add profile files to the `config` directory.
Nextflow option `-resume` can be used to re-run the pipeline from the last successfully finished step.
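For instance, a re-run of a large-catalogue analysis that reuses cached results might look like the sketch below. The profile name is a placeholder for whatever you have defined in your config, and the biome string should match the catalogue naming expected by your installed version (see the table below).

```bash
# Sketch: resume a previous run with a site-specific profile, using core_mode for a large catalogue.
# "my_cluster" is a placeholder profile name; the biome version string is illustrative.
nextflow run ebi-metagenomics/shallowmapping \
    -profile my_cluster \
    -resume \
    --biome human-gut-v2.0.2 \
    --core_mode true \
    --input samplesheet.csv \
    --outdir results
```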
+#### Available biomes
+
+This can be any of the MGnify catalogues for which shallow-mapping databases are currently available
+
+| Biome | Catalogue Version |
+| ------------------ | ------------------------------------------------------------------------------------ |
+| chicken-gut | [v1.0.1](https://www.ebi.ac.uk/metagenomics/genome-catalogues/chicken-gut/v1.0.1) |
+| cow-rumen | [v1.0.1](https://www.ebi.ac.uk/metagenomics/genome-catalogues/cow-rumen/v1.0.1) |
+| human-gut | [v2.0.2 ⚠️](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-gut/v2.0.2) |
+| human-oral | [v1.0.1](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-oral/v1.0.1) |
+| human-vaginal | [v1.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/human-vaginal/v1.0) |
+| honeybee-gut | [v1.0.1](https://www.ebi.ac.uk/metagenomics/genome-catalogues/honeybee-gut/v1.0.1) |
+| marine | [v2.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/marine/v2.0) |
+| mouse-gut | [v1.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/mouse-gut/v1.0) |
+| non-model-fish-gut | [v2.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/non-model-fish-gut/v2.0) |
+| pig-gut | [v1.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/pig-gut/v1.0) |
+| sheep-rumen | [v1.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/sheep-rumen/v1.0) |
+| zebrafish-fecal | [v1.0](https://www.ebi.ac.uk/metagenomics/genome-catalogues/zebrafish-fecal/v1.0) |
+
+> **⚠️ Note for human-gut**:
+>
+> The human-gut shallow-mapping database was created manually by re-running Panaroo to reconstruct the pangenomes. This is likely to have caused discrepancies in the pangenomes, so please bear that in mind.
+
## Test
To test the installed tool with your downloaded databases, you can run the pipeline using the small test dataset. Even if there are no hits for the biome you are interested in, the pipeline should finish successfully. Add `-profile` if you have set up a config profile for your compute resources.
@@ -108,8 +104,8 @@ cd shallowmapping/tests
nextflow run ../main.nf \
--input test_samplesheet.csv \
--biome \
- --shallow_dbs_path \
- --decont_reference_paths
+ --dbs \
+ --decontamination_indexes
```
## Credits
diff --git a/bin/databases_setup.sh b/bin/databases_setup.sh
deleted file mode 100755
index bc1fabe..0000000
--- a/bin/databases_setup.sh
+++ /dev/null
@@ -1,206 +0,0 @@
-#!/bin/bash
-
-# Strict mode for better error handling
-set -euo pipefail
-
-# Default configuration
-DEFAULT_DOWNLOAD_BWA="false"
-DEFAULT_CATALOGUE_DBS_PATH="./catalogue_dbs/"
-DEFAULT_DECONT_REFS_PATH="./decontamination_refs/"
-
-# Define the list of valid biomes
-declare -ra VALID_BIOMES=(
- 'chicken-gut-v1-0-1' 'mouse-gut-v1-0' 'non-model-fish-gut-v2-0'
- 'human-vaginal-v1-0' 'honeybee-gut-v1-0-1' 'sheep-rumen-v1-0'
- 'marine-v2-0' 'zebrafish-fecal-v1-0' 'human-oral-v1-0-1'
- 'pig-gut-v1-0' 'cow-rumen-v1-0-1' 'human-gut-v2-0-2'
-)
-
-# Usage function
-usage() {
- echo "Usage: $0"
- echo " --biome "
- echo " --catalogue_dbs_path [default: ${DEFAULT_CATALOGUE_DBS_PATH}]"
- echo " --decont_refs_path [default: ${DEFAULT_DECONT_REFS_PATH}]"
- echo " --download_bwa [default: ${DEFAULT_DOWNLOAD_BWA}]"
- exit 1
-}
-
-# Validate biome
-validate_biome() {
- local biome="$1"
- for valid_biome in "${VALID_BIOMES[@]}"; do
- if [[ "$biome" == "$valid_biome" ]]; then
- return 0
- fi
- done
- echo "Error: Invalid biome '$biome'. Valid options are:"
- printf '%s\n' "${VALID_BIOMES[@]}"
- exit 1
-}
-
-# Parse command-line arguments with defaults
-BIOME=""
-CATALOGUE_DBS_PATH="${DEFAULT_CATALOGUE_DBS_PATH}"
-DECONT_REFS_PATH="${DEFAULT_DECONT_REFS_PATH}"
-DOWNLOAD_BWA="${DEFAULT_DOWNLOAD_BWA}"
-
-while [[ $# -gt 0 ]]; do
- key="$1"
- case $key in
- --biome)
- BIOME="$2"
- validate_biome "$BIOME"
- shift 2
- ;;
- --catalogue_dbs_path)
- CATALOGUE_DBS_PATH="$2"
- shift 2
- ;;
- --decont_refs_path)
- DECONT_REFS_PATH="$2"
- shift 2
- ;;
- --download_bwa)
- DOWNLOAD_BWA="$2"
- shift 2
- ;;
- --help)
- usage
- ;;
- *)
- echo "Unknown option: $1"
- usage
- ;;
- esac
-done
-
-# Validate required argument
-[[ -z "$BIOME" ]] && { echo "Error: --biome is required"; usage; }
-
-# Create log file
-LOG_FILE="dbs_setup_$(date +'%Y%m%d_%H%M%S').log"
-exec > >(tee -a "$LOG_FILE") 2>&1
-
-# Ensure paths end with slash
-CATALOGUE_DBS_PATH="${CATALOGUE_DBS_PATH%/}/"
-DECONT_REFS_PATH="${DECONT_REFS_PATH%/}/"
-
-# Create directories if they don't exist
-mkdir -p "${DECONT_REFS_PATH}reference_genomes"
-mkdir -p "${CATALOGUE_DBS_PATH}"
-
-# Change directory to decontamination references path
-cd "$DECONT_REFS_PATH" || exit
-if [ ! -d "reference_genomes" ]; then
- echo " *** Creating the reference_genomes directory in $DECONT_REFS_PATH"
- mkdir reference_genomes && cd reference_genomes || exit
-else
- echo " *** The reference_genomes directory already exists in $DECONT_REFS_PATH"
- cd reference_genomes || exit
-fi
-
-# Check if human_phix.fa.* files exist
-if ls human_phix.fa.* &>/dev/null; then
- echo " *** The human and phiX reference genomes already exist. Skipping download"
-else
- # Downloading human+phiX reference genomes
- echo " *** Downloading the human and phiX reference genomes to ${DECONT_REFS_PATH}reference_genomes"
- wget -nv --show-progress --progress=bar:force:noscroll --continue https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/human_phiX/human_phix_ref_bwamem2.tar.gz
- echo " *** Extracting human and phiX reference genomes"
- tar -xvf human_phix_ref_bwamem2.tar.gz
- mv bwamem2/* .
- rm -r bwamem2 human_phix_ref_bwamem2.tar.gz
-fi
-
-# Check if $HOST.* files exist
-HOST=$(echo "$BIOME" | cut -d '-' -f1)
-if ls ${HOST}.* &>/dev/null; then
- echo " *** The $HOST reference genome already exist. Skipping download"
-else
- # Downloading the host genome
- echo " *** Downloading the $HOST reference genome to $DECONT_REFS_PATH/reference_genomes"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/$HOST/${HOST}_ref_bwamem2.tar.gz"
- echo " *** Extracting the $HOST reference genome"
- tar -xvf "${HOST}_ref_bwamem2.tar.gz"
- mv bwamem2/* .
- rm -r bwamem2 "${HOST}_ref_bwamem2.tar.gz"
-fi
-
-# Downloading the catalogue-related files
-cd "$CATALOGUE_DBS_PATH" || exit
-if [ -d "$BIOME" ]; then
- echo " *** A directory for the catalogue $BIOME already exists. Please remove the current directory to re-download. Exiting..."
- exit 1
-else
- echo " *** Creating $BIOME directory in $CATALOGUE_DBS_PATH"
- mkdir "$BIOME" && cd "$BIOME" || exit
-fi
-
-NEW_BIOME=$(echo $BIOME | sed 's/-vaginal-/-tmp-/;s/-v/|/;s/-tmp-/-vaginal-/' )
-PREFIX_BIOME=$(echo "$NEW_BIOME" | cut -d '|' -f1)
-VERSION=$(echo "$NEW_BIOME" | cut -d '|' -f2)
-CAT_VERSION=$(echo "v$VERSION" | sed 's/-/./g' )
-
-echo " *** Downloading catalogue related databases to ${CATALOGUE_DBS_PATH}/${BIOME}"
-
-# Downloading the catalogue metadata file
-wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/$PREFIX_BIOME/$CAT_VERSION/genomes-all_metadata.tsv"
-
-# Setting up the files location in ftp
-TABLES_DIR="https://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/mgnify_genomes/${PREFIX_BIOME}_reps"
-FUNCTIONS_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_functions"
-SOURMASH_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_sourmash"
-BWAMEM_DIR="$TABLES_DIR/${PREFIX_BIOME}_v${VERSION}_bwamem2.tar.gz"
-
-# Downloading the pangenome function tables
-wget -nv --show-progress --progress=bar:force:noscroll --continue "$FUNCTIONS_DIR/functional_profiles.tar.gz"
-tar -xvf functional_profiles.tar.gz
-rm functional_profiles.tar.gz
-
-wget -nv --show-progress --progress=bar:force:noscroll --continue "$FUNCTIONS_DIR/kegg_completeness.tar.gz"
-tar -xvf kegg_completeness.tar.gz
-rm kegg_completeness.tar.gz
-
-# Downloading the representative genomes indexed for sourmash
-wget -nv --show-progress --progress=bar:force:noscroll --continue "$SOURMASH_DIR/sourmash_species_representatives_k21.sbt.zip"
-
-# Downloading bwamem2 db index if the option is set
-if [ "$DOWNLOAD_BWA" = "true" ]; then
- echo " *** Downloading bwamem2 indexed database for $BIOME to ${CATALOGUE_DBS_PATH}/${BIOME}"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "$BWAMEM_DIR"
- tar -xvf "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
- mv "${PREFIX_BIOME}_${VERSION}_bwamem2"/* .
- rm -r "${PREFIX_BIOME}_${VERSION}_bwamem2" "${PREFIX_BIOME}_${VERSION}_bwamem2.tar.gz"
-else
- echo " *** Skipping download of bwamem2 indexed database for $BIOME"
- echo " Note you will not be able to use --run_bwa true option on shallow-mapping pipeline for this biome"
-fi
-
-# Downloading external databases for dram visualization
-cd "$CATALOGUE_DBS_PATH" || exit
-if [ -d "external_dbs" ]; then
- echo " *** Skipping external dbs downloading. The directory external_dbs already exists in $CATALOGUE_DBS_PATH"
-else
- echo " *** Downloading external dbs to $CATALOGUE_DBS_PATH/external_dbs/dram_distill_dbs"
- mkdir -p external_dbs/dram_distill_dbs && cd external_dbs/dram_distill_dbs || exit
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/amg_database.tsv"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/etc_module_database.tsv"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/function_heatmap_form.tsv"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/genome_summary_form.tsv"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/module_step_form.tsv"
- wget -nv --show-progress --progress=bar:force:noscroll --continue "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz"
-fi
-
-# Creating the CONFIG file for DRAM distill
-echo " *** Creating the CONFIG file for DRAM distill"
-echo '{"description_db": "None", "kegg": null, "kofam": null, "kofam_ko_list": null, "uniref": null, "pfam": null, "pfam_hmm_dat": null, "dbcan": null, "dbcan_fam_activities": null, "viral": null, "peptidase": null, "vogdb": null, "vog_annotations": null, "genome_summary_form": "/data/genome_summary_form.tsv", "module_step_form": "/data/module_step_form.tsv", "etc_module_database": "/data/etc_module_database.tsv", "function_heatmap_form": "/data/function_heatmap_form.tsv", "amg_database": "/data/amg_database.tsv"}' > CONFIG
-
-echo " *** Databases setting up finished successfully for $BIOME"
-echo " *** Use the following parameters to test the shallow-mapping pipeline from shallowmapping/test:"
-echo " nextflow run ../main.nf \\"
-echo " --biome $BIOME \\"
-echo " --input test_samplesheet.csv \\"
-echo " --outdir test_output \\"
-echo " --shallow_dbs_path $CATALOGUE_DBS_PATH \\"
-echo " --decont_reference_paths ${DECONT_REFS_PATH}reference_genomes"
diff --git a/conf/base.config b/conf/base.config
index 1b71db0..e024606 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -10,58 +10,64 @@
process {
- cpus = { check_max( 1 * task.attempt, 'cpus' ) }
- memory = { check_max( 6.GB * task.attempt, 'memory' ) }
- time = { check_max( 4.h * task.attempt, 'time' ) }
+ resourceLimits = [
+ cpus: params.max_cpus,
+ memory: params.max_memory,
+ time: params.max_time,
+ ]
- errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
- maxRetries = 3
- maxErrors = '-1'
+ cpus = { 1 * task.attempt }
+ memory = { 6.GB * task.attempt }
+ time = { 4.h * task.attempt }
- withLabel:process_single {
- cpus = { check_max( 1 , 'cpus' ) }
- memory = { check_max( 6.GB * task.attempt, 'memory' ) }
- time = { check_max( 4.h * task.attempt, 'time' ) }
+ errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+ maxRetries = 3
+ maxErrors = '-1'
+
+ withLabel: process_single {
+ cpus = { 1 }
+ memory = { 6.GB * task.attempt }
+ time = { 4.h * task.attempt }
}
- withLabel:process_low {
- cpus = { check_max( 2 * task.attempt, 'cpus' ) }
- memory = { check_max( 12.GB * task.attempt, 'memory' ) }
- time = { check_max( 4.h * task.attempt, 'time' ) }
+ withLabel: process_low {
+ cpus = { 2 * task.attempt }
+ memory = { 12.GB * task.attempt }
+ time = { 4.h * task.attempt }
}
- withLabel:process_medium {
- cpus = { check_max( 6 * task.attempt, 'cpus' ) }
- memory = { check_max( 36.GB * task.attempt, 'memory' ) }
- time = { check_max( 8.h * task.attempt, 'time' ) }
+ withLabel: process_medium {
+ cpus = { 6 * task.attempt }
+ memory = { 36.GB * task.attempt }
+ time = { 8.h * task.attempt }
}
- withLabel:process_high {
- cpus = { check_max( 12 * task.attempt, 'cpus' ) }
- memory = { check_max( 72.GB * task.attempt, 'memory' ) }
- time = { check_max( 16.h * task.attempt, 'time' ) }
+ withLabel: process_high {
+ cpus = { 12 * task.attempt }
+ memory = { 72.GB * task.attempt }
+ time = { 16.h * task.attempt }
}
- withLabel:process_long {
- time = { check_max( 20.h * task.attempt, 'time' ) }
+ withLabel: process_long {
+ time = { 20.h * task.attempt }
}
- withLabel:process_high_memory {
- memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+ withLabel: process_high_memory {
+ memory = { 200.GB * task.attempt }
}
- withLabel:error_ignore {
+ withLabel: error_ignore {
errorStrategy = 'ignore'
}
- withLabel:error_retry {
+ withLabel: error_retry {
errorStrategy = 'retry'
maxRetries = 2
}
- withName:CUSTOM_DUMPSOFTWAREVERSIONS {
+ withName: CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
- withName:ALIGN_BWAMEM2 {
- cpus = { check_max( 12 * task.attempt, 'cpus' ) }
+ withName: ALIGN_BWAMEM2 {
+ cpus = { 12 * task.attempt }
memory = {
def size = meta.single_end ? reads.size() : reads[0].size()
// Files bigger than 700.MB require more memory
- check_max(size > 700000000 ? 90.GB : 72.GB, 'memory')
+ size > 700000000 ? 90.GB : 72.GB
}
- time = { check_max( 16.h * task.attempt, 'time' ) }
+ time = { 16.h * task.attempt }
}
}
diff --git a/conf/codon.config b/conf/codon.config
index 55bb0bf..dd49658 100644
--- a/conf/codon.config
+++ b/conf/codon.config
@@ -12,36 +12,13 @@ params {
workDir = params.workdir
profiles {
- ebi_lsf {
-
- includeConfig 'codon_dbs.config'
-
- executor {
- name = "lsf"
- queueSize = 200
- queueGlobalStatus = true
- submitRateLimit = "10 sec"
- pollInterval = "10 sec"
- }
-
- process {
- queue = {
- task.memory >= 200.GB ? 'bigmem' : 'production'
- }
- }
-
- conda.enabled = false
-
- singularity {
- enabled = true
- autoMounts = true
- cacheDir = params.singularity_cachedir
- }
- }
ebi_slurm {
- includeConfig 'codon_dbs.config'
+ params {
+ reference_dbs = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/shallow-mapping/"
+ decontamination_indexes = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bwa-mem2/"
+ }
executor {
name = "slurm"
diff --git a/conf/codon_dbs.config b/conf/codon_dbs.config
deleted file mode 100644
index 4ea2548..0000000
--- a/conf/codon_dbs.config
+++ /dev/null
@@ -1,10 +0,0 @@
-/*
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Config to store CODON DB paths and names
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-params {
- shallow_dbs_path = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/shallow-mapping/"
- decont_reference_paths = "/hps/nobackup/rdf/metagenomics/service-team/ref-dbs/bwa-mem2/"
-}
diff --git a/conf/modules.config b/conf/modules.config
index 2b1b3ac..b9bdb75 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -12,28 +12,49 @@
process {
+ withName: DOWNLOAD_MGNIFY_GENOMES_REFERENCE_DBS {
+ publishDir = [
+ path: { "${params.reference_dbs}/${params.biome}" },
+ mode: params.publish_dir_mode,
+ ]
+ }
+
+ withName: DOWNLOAD_DRAM_DB {
+ publishDir = [
+ path: { "${params.reference_dbs}/" },
+ mode: params.publish_dir_mode,
+ ]
+ }
+
+ withName: DOWNLOAD_HUMAN_PHIX_BWAMEM2_INDEX {
+ publishDir = [
+ path: { "${params.decontamination_indexes}/" },
+ mode: params.publish_dir_mode,
+ ]
+ }
+
withName: FASTP {
publishDir = [
path: { "${params.outdir}/quality_control/fastp" },
- mode: params.publish_dir_mode
+ mode: params.publish_dir_mode,
]
}
withName: FASTQC {
- ext.args = '--quiet'
+ ext.args = '--quiet'
publishDir = [
path: { "${params.outdir}/quality_control/decont_fastqc" },
mode: params.publish_dir_mode,
- pattern: '*.html'
+ pattern: '*.html',
]
}
withName: MULTIQC {
- ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : ''
+ ext.args = params.multiqc_title ? "--title \"${params.multiqc_title}\"" : ''
publishDir = [
path: { "${params.outdir}/quality_control/multiqc" },
mode: params.publish_dir_mode,
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
]
}
@@ -41,7 +62,7 @@ process {
publishDir = [
path: { "${params.outdir}/mapping/sourmash" },
mode: params.publish_dir_mode,
- pattern: '*.csv.gz'
+ pattern: '*.csv.gz',
]
}
@@ -49,7 +70,7 @@ process {
publishDir = [
path: { "${params.outdir}/mapping/bwamem2" },
mode: params.publish_dir_mode,
- pattern: '*u_relab_01.tsv'
+ pattern: '*u_relab_01.tsv',
]
}
@@ -57,7 +78,7 @@ process {
publishDir = [
path: { "${params.outdir}/taxonomy_tables" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
@@ -65,7 +86,7 @@ process {
publishDir = [
path: { "${params.outdir}/taxonomy_tables" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
@@ -73,7 +94,7 @@ process {
publishDir = [
path: { "${params.outdir}/function_tables" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
@@ -82,14 +103,13 @@ process {
[
path: { "${params.outdir}/dram_results" },
mode: params.publish_dir_mode,
- pattern: '*_species_*'
-
+ pattern: '*_species_*',
],
[
path: { "${params.outdir}/integrated_annotation" },
mode: params.publish_dir_mode,
- pattern: '*_community_*'
- ]
+ pattern: '*_community_*',
+ ],
]
}
@@ -97,7 +117,7 @@ process {
publishDir = [
path: { "${params.outdir}/kegg_completeness" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
@@ -105,15 +125,15 @@ process {
publishDir = [
path: { "${params.outdir}/kegg_completeness" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
withName: POSTPROC_INTEGRATOR {
- publishDir = [
+ publishDir = [
path: { "${params.outdir}/integrated_annotation" },
mode: params.publish_dir_mode,
- pattern: '*.tsv'
+ pattern: '*.tsv',
]
}
@@ -121,7 +141,7 @@ process {
publishDir = [
path: { "${params.outdir}/pipeline_info" },
mode: params.publish_dir_mode,
- pattern: '*_versions.yml'
+ pattern: '*_versions.yml',
]
}
}
diff --git a/conf/test.config b/conf/test.config
new file mode 100644
index 0000000..830b201
--- /dev/null
+++ b/conf/test.config
@@ -0,0 +1,37 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Defines input files and everything required to run a fast and simple pipeline test.
+
+ Use as follows:
+ nextflow run ebi-metagenomics/shallowmapping -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+profiles {
+
+ test {
+ process {
+ resourceLimits = [
+ cpus: 2,
+ memory: 6.GB,
+ time: 1.h
+ ]
+ }
+ params {
+ input = "${projectDir}/tests/test_samplesheet.csv"
+ biome = "human-vaginal-v1.0"
+ decontamination_indexes = "${projectDir}/tests/bwa2mem/"
+ reference_dbs = "${projectDir}/tests/reference_dbs/"
+ }
+
+ process {
+ errorStrategy = 'fail'
+ maxRetries = 0
+ }
+ }
+}
+
+
diff --git a/main.nf b/main.nf
index f31f99e..9950dc0 100644
--- a/main.nf
+++ b/main.nf
@@ -1,4 +1,3 @@
-#!/usr/bin/env nextflow
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ebi-metagenomics/shallowmapping
@@ -7,8 +6,6 @@
----------------------------------------------------------------------------------------
*/
-nextflow.enable.dsl = 2
-
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
VALIDATE & PRINT PARAMETER SUMMARY
@@ -17,21 +14,6 @@ nextflow.enable.dsl = 2
include { validateParameters; paramsHelp } from 'plugin/nf-validation'
-// Print help message if needed
-if (params.help) {
- def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
- def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
- def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --biome -profile docker"
- log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs)
- System.exit(0)
-}
-
-// Validate input parameters
-if (params.validate_params) {
- validateParameters()
-}
-
-WorkflowMain.initialise(workflow, params, log)
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -45,6 +27,22 @@ include { SHALLOWMAPPING } from './workflows/shallowmapping'
// WORKFLOW: Run main ebi-metagenomics/shallowmapping analysis pipeline
//
workflow EBIMETAGENOMICS_SHALLOWMAPPING {
+ // Print help message if needed
+ if (params.help) {
+ def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs)
+ def citation = '\n' + WorkflowMain.citation(workflow) + '\n'
+ def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --biome -profile docker"
+ log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs)
+ System.exit(0)
+ }
+
+ // Validate input parameters
+ if (params.validate_params) {
+ validateParameters()
+ }
+
+ WorkflowMain.initialise(workflow, params, log)
+
SHALLOWMAPPING ()
}
diff --git a/modules.json b/modules.json
index 8f634a3..4edfa34 100644
--- a/modules.json
+++ b/modules.json
@@ -21,6 +21,11 @@
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"installed_by": ["modules"]
},
+ "fastp": {
+ "branch": "master",
+ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
+ "installed_by": ["modules"]
+ },
"fastqc": {
"branch": "master",
"git_sha": "bd8092b67b5103bdd52e300f75889442275c3117",
diff --git a/modules/local/download_human_phix_bwamem2_index.nf b/modules/local/download_human_phix_bwamem2_index.nf
new file mode 100644
index 0000000..d57b977
--- /dev/null
+++ b/modules/local/download_human_phix_bwamem2_index.nf
@@ -0,0 +1,15 @@
+process DOWNLOAD_HUMAN_PHIX_BWAMEM2_INDEX {
+
+ container "${workflow.containerEngine in ['singularity', 'apptainer']
+ ? 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9'
+ : 'biocontainers/gnu-wget:1.18--h36e9172_9'}"
+
+ output:
+ path ("human_phix.fa*"), emit: human_phix_index
+
+ script:
+ """
+ wget -q --continue ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/human_phiX/human_phix_ref_bwamem2.tar.gz
+ tar -xvf human_phix_ref_bwamem2.tar.gz
+ """
+}
diff --git a/modules/local/download_mgnify_genomes_reference_dbs.nf b/modules/local/download_mgnify_genomes_reference_dbs.nf
new file mode 100644
index 0000000..a711d46
--- /dev/null
+++ b/modules/local/download_mgnify_genomes_reference_dbs.nf
@@ -0,0 +1,65 @@
+process DOWNLOAD_MGNIFY_GENOMES_REFERENCE_DBS {
+
+ container "${workflow.containerEngine in ['singularity', 'apptainer']
+ ? 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9'
+ : 'biocontainers/gnu-wget:1.18--h36e9172_9'}"
+
+ input:
+ val biome
+ val download_bwamem2
+
+ output:
+ tuple val(biome), path("genomes-all_metadata.tsv"), emit: genomes_metadata_tsv
+ tuple val(biome), path("functional_profiles_DB/"), emit: pangenome_functional_anns_db
+ tuple val(biome), path("kegg_completeness_DB/"), emit: kegg_completeness_db
+ tuple val(biome), path("sourmash_species_representatives_k21.sbt.zip"), emit: sourmash_db
+ tuple val(biome), path("bwamem2_index/") , emit: bwamem2_index, optional: true
+
+ script:
+ def matcher = biome =~ /(.+?)(-v[0-9.\.]+)?$/
+ def biome_name = matcher[0][1]
+ def biome_version = matcher[0][2] ? matcher[0][2].substring(1) : null
+ if (!biome_version) {
+ exit("Error the biome version of ${biome} can't be parsed.")
+ }
+ // MGnify genomes catalogue data //
+ // Example: https://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/honeybee-gut/v1.0.1/
+ def biome_catalogue_ftp = "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/${biome_name}/${biome_version}"
+
+ // Shallow mapping specific //
+ // This FTP path contains the MGnify Genomes catalogue processed annotations, ready to be used with this pipeline
+ def ftp_base = "ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/mgnify_genomes/${biome_name}_reps/${biome_version}/"
+
+ def functions_ftp = "${ftp_base}/pangenome_functional_profiles.tar.gz"
+ def kegg_ftp = "${ftp_base}/kegg_completeness.tar.gz"
+ def sourmash_ftp = "${ftp_base}/sourmash_species_representatives_k21.sbt.zip"
+ def reps_bwamem2_index_ftp = "${ftp_base}/reps_bwamem2.tar.gz"
+
+ """
+ if [[ "${download_bwamem2}" == 'True' ]];
+ then
+ # Downloading the host genome #
+ mkdir -p bwamem2_index/
+ echo " *** Downloading the biome reps mgnify genomes bwamem2 index ${reps_bwamem2_index_ftp}"
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "${reps_bwamem2_index_ftp}"
+
+ echo " *** Extracting the bwamem index..."
+ tar -xvf reps_bwamem2.tar.gz -C bwamem2_index/
+ fi
+
+ # Downloading the catalogue-related files #
+ echo " *** Downloading catalogue related reference data"
+
+ # Downloading the catalogue metadata file
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "${biome_catalogue_ftp}/genomes-all_metadata.tsv"
+
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "${functions_ftp}"
+ tar -xvf pangenome_functional_profiles.tar.gz
+
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "${kegg_ftp}"
+ tar -xvf kegg_completeness.tar.gz
+
+ # Downloading the representative genomes indexed for sourmash
+ wget -nv --show-progress --progress=bar:force:noscroll --continue "${sourmash_ftp}"
+ """
+}
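For orientation, the sketch below shows roughly which URLs this process derives for the test biome `human-vaginal-v1.0` (parsed as biome_name `human-vaginal`, biome_version `v1.0`). It is illustrative only and simply follows the FTP layout referenced in the process script above.

```bash
# Illustrative only: URLs DOWNLOAD_MGNIFY_GENOMES_REFERENCE_DBS would derive for "human-vaginal-v1.0",
# following the FTP layout used in the process script.
CATALOGUE_FTP="ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/mgnify_genomes/human-vaginal/v1.0"
REFS_FTP="ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/pipelines/references/mgnify_genomes/human-vaginal_reps/v1.0"

wget -nv --continue "${CATALOGUE_FTP}/genomes-all_metadata.tsv"
wget -nv --continue "${REFS_FTP}/pangenome_functional_profiles.tar.gz"
wget -nv --continue "${REFS_FTP}/kegg_completeness.tar.gz"
wget -nv --continue "${REFS_FTP}/sourmash_species_representatives_k21.sbt.zip"
```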
diff --git a/modules/local/dram/distill.nf b/modules/local/dram/distill.nf
index 95d4378..678f94d 100644
--- a/modules/local/dram/distill.nf
+++ b/modules/local/dram/distill.nf
@@ -7,18 +7,13 @@ process DRAM_DISTILL {
'quay.io/biocontainers/dram:1.3.5--pyhdfd78af_0' }"
containerOptions {
- def arg = ""
- switch (workflow.containerEngine) {
- case 'singularity':
- arg = "--bind"
- break;
- case 'docker':
- arg = "--volume"
- break;
+ def arg = "--volume"
+ if (workflow.containerEngine == 'singularity' || workflow.containerEngine == 'apptainer') {
+ arg = "--bind"
}
- mounts = [
- "${params.shallow_dbs_path}/external_dbs/dram_distill_dbs/:/data/",
- "${params.shallow_dbs_path}/external_dbs/dram_distill_dbs/CONFIG:/usr/local/lib/python3.10/site-packages/mag_annotator/CONFIG"
+ def mounts = [
+ "${params.reference_dbs}/dram_dbs/:/data/",
+ "${params.reference_dbs}/dram_dbs/DRAM_CONFIG.json:/usr/local/lib/python3.10/site-packages/mag_annotator/CONFIG"
]
return "${arg} " + mounts.join(" ${arg} ")
}
diff --git a/modules/local/dram/download_dram_db.nf b/modules/local/dram/download_dram_db.nf
new file mode 100644
index 0000000..55eb2b7
--- /dev/null
+++ b/modules/local/dram/download_dram_db.nf
@@ -0,0 +1,45 @@
+process DOWNLOAD_DRAM_DB {
+
+ container "${workflow.containerEngine in ['singularity', 'apptainer']
+ ? 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9'
+ : 'biocontainers/gnu-wget:1.18--h36e9172_9'}"
+
+ output:
+ path("dram_dbs/"), emit: dram_db
+
+ script:
+ """
+ mkdir -p dram_dbs
+
+ wget -q --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/amg_database.tsv" -O dram_dbs/amg_database.tsv
+ wget -q --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/etc_module_database.tsv" -O dram_dbs/etc_module_database.tsv
+ wget -q --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/function_heatmap_form.tsv" -O dram_dbs/function_heatmap_form.tsv
+ wget -q --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/genome_summary_form.tsv" -O dram_dbs/genome_summary_form.tsv
+ wget -q --continue "https://raw.githubusercontent.com/WrightonLabCSU/DRAM/v1.5.0/data/module_step_form.tsv" -O dram_dbs/module_step_form.tsv
+ wget -q --continue "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.dat.gz" -O dram_dbs/Pfam-A.hmm.dat.gz
+
+ echo "Creating the CONFIG file for DRAM distill"
+ cat > dram_dbs/DRAM_CONFIG.json << EOF
+ {
+ "description_db": "None",
+ "kegg": null,
+ "kofam": null,
+ "kofam_ko_list": null,
+ "uniref": null,
+ "pfam": null,
+ "pfam_hmm_dat": null,
+ "dbcan": null,
+ "dbcan_fam_activities": null,
+ "viral": null,
+ "peptidase": null,
+ "vogdb": null,
+ "vog_annotations": null,
+ "genome_summary_form": "/data/genome_summary_form.tsv",
+ "module_step_form": "/data/module_step_form.tsv",
+ "etc_module_database": "/data/etc_module_database.tsv",
+ "function_heatmap_form": "/data/function_heatmap_form.tsv",
+ "amg_database": "/data/amg_database.tsv"
+ }
+ EOF
+ """
+}
diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf
deleted file mode 100644
index fbf3f74..0000000
--- a/modules/local/fastp/main.nf
+++ /dev/null
@@ -1,82 +0,0 @@
-process FASTP {
- tag "$meta.id"
- label 'process_medium'
-
- conda "${moduleDir}/environment.yml"
- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
- 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
- 'biocontainers/fastp:0.23.4--h5f740d0_0' }"
-
- input:
- tuple val(meta), path(reads)
-
- output:
- tuple val(meta), path('*.fastp.fastq.gz') , emit: reads
- tuple val(meta), path('*.json') , emit: json
- tuple val(meta), path('*.html') , emit: html
- tuple val(meta), path('*.log') , emit: log
- path "versions.yml" , emit: versions
-
- when:
- task.ext.when == null || task.ext.when
-
- script:
- def args = task.ext.args ?: ''
- def prefix = task.ext.prefix ?: "${meta.id}"
- // Added soft-links to original fastqs for consistent naming in MultiQC
- if (meta.single_end) {
- """
- [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
- fastp \\
- --in1 ${prefix}.fastq.gz \\
- --out1 ${prefix}.fastp.fastq.gz \\
- --thread $task.cpus \\
- --json ${prefix}.fastp.json \\
- --html ${prefix}.fastp.html \\
- $args \\
- 2> >(tee ${prefix}.fastp.log >&2)
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
- END_VERSIONS
- """
- } else {
- """
- [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
- [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz
- fastp \\
- --in1 ${prefix}_1.fastq.gz \\
- --in2 ${prefix}_2.fastq.gz \\
- --out1 ${prefix}_1.fastp.fastq.gz \\
- --out2 ${prefix}_2.fastp.fastq.gz \\
- --json ${prefix}.fastp.json \\
- --html ${prefix}.fastp.html \\
- --thread $task.cpus \\
- --detect_adapter_for_pe \\
- $args \\
- 2> >(tee ${prefix}.fastp.log >&2)
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
- END_VERSIONS
- """
- }
-
-
- stub:
- def prefix = task.ext.prefix ?: "${meta.id}"
- def touch_reads = meta.single_end ? "${prefix}.fastp.fastq.gz" : "${prefix}_1.fastp.fastq.gz ${prefix}_2.fastp.fastq.gz"
- """
- touch $touch_reads
- touch "${prefix}.fastp.json"
- touch "${prefix}.fastp.html"
- touch "${prefix}.fastp.log"
-
- cat <<-END_VERSIONS > versions.yml
- "${task.process}":
- fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
- END_VERSIONS
- """
-}
diff --git a/modules/local/fastp/meta.yml b/modules/local/fastp/meta.yml
deleted file mode 100644
index 63260e4..0000000
--- a/modules/local/fastp/meta.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: fastp
-description: Perform adapter/quality trimming on sequencing reads
-keywords:
- - trimming
- - quality control
- - fastq
-tools:
- - fastp:
- description: |
- A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance.
- documentation: https://github.com/OpenGene/fastp
- doi: 10.1093/bioinformatics/bty560
- licence: ["MIT"]
-input:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
- e.g. [ id:'test', single_end:false ]
- - reads:
- type: file
- description: |
- List of input FastQ files of size 1 and 2 for single-end and paired-end data,
- respectively. If you wish to run interleaved paired-end data, supply as single-end data
- but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module.
-output:
- - meta:
- type: map
- description: |
- Groovy Map containing sample information
- e.g. [ id:'test', single_end:false ]
- - reads:
- type: file
- description: The trimmed/modified/unmerged fastq reads
- pattern: "*fastp.fastq.gz"
- - json:
- type: file
- description: Results in JSON format
- pattern: "*.json"
- - html:
- type: file
- description: Results in HTML format
- pattern: "*.html"
- - log:
- type: file
- description: fastq log file
- pattern: "*.log"
- - versions:
- type: file
- description: File containing software versions
- pattern: "versions.yml"
-authors:
- - "@drpatelh"
- - "@kevinmenden"
-maintainers:
- - "@drpatelh"
- - "@kevinmenden"
diff --git a/modules/local/fastp/environment.yml b/modules/nf-core/fastp/environment.yml
similarity index 76%
rename from modules/local/fastp/environment.yml
rename to modules/nf-core/fastp/environment.yml
index 70389e6..26d4aca 100644
--- a/modules/local/fastp/environment.yml
+++ b/modules/nf-core/fastp/environment.yml
@@ -1,7 +1,5 @@
-name: fastp
channels:
- conda-forge
- bioconda
- - defaults
dependencies:
- bioconda::fastp=0.23.4
diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf
new file mode 100644
index 0000000..e1b9f56
--- /dev/null
+++ b/modules/nf-core/fastp/main.nf
@@ -0,0 +1,125 @@
+process FASTP {
+ tag "$meta.id"
+ label 'process_medium'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' :
+ 'biocontainers/fastp:0.23.4--h5f740d0_0' }"
+
+ input:
+ tuple val(meta), path(reads)
+ path adapter_fasta
+ val discard_trimmed_pass
+ val save_trimmed_fail
+ val save_merged
+
+ output:
+ tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads
+ tuple val(meta), path('*.json') , emit: json
+ tuple val(meta), path('*.html') , emit: html
+ tuple val(meta), path('*.log') , emit: log
+ tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail
+ tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : ""
+ def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--failed_out ${prefix}.paired.fail.fastq.gz --unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : ''
+ def out_fq1 = discard_trimmed_pass ?: ( meta.single_end ? "--out1 ${prefix}.fastp.fastq.gz" : "--out1 ${prefix}_1.fastp.fastq.gz" )
+ def out_fq2 = discard_trimmed_pass ?: "--out2 ${prefix}_2.fastp.fastq.gz"
+ // Added soft-links to original fastqs for consistent naming in MultiQC
+ // Use single ended for interleaved. Add --interleaved_in in config.
+ if ( task.ext.args?.contains('--interleaved_in') ) {
+ """
+ [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
+
+ fastp \\
+ --stdout \\
+ --in1 ${prefix}.fastq.gz \\
+ --thread $task.cpus \\
+ --json ${prefix}.fastp.json \\
+ --html ${prefix}.fastp.html \\
+ $adapter_list \\
+ $fail_fastq \\
+ $args \\
+ 2> >(tee ${prefix}.fastp.log >&2) \\
+ | gzip -c > ${prefix}.fastp.fastq.gz
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+ END_VERSIONS
+ """
+ } else if (meta.single_end) {
+ """
+ [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz
+
+ fastp \\
+ --in1 ${prefix}.fastq.gz \\
+ $out_fq1 \\
+ --thread $task.cpus \\
+ --json ${prefix}.fastp.json \\
+ --html ${prefix}.fastp.html \\
+ $adapter_list \\
+ $fail_fastq \\
+ $args \\
+ 2> >(tee ${prefix}.fastp.log >&2)
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+ END_VERSIONS
+ """
+ } else {
+ def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : ''
+ """
+ [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz
+ [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz
+ fastp \\
+ --in1 ${prefix}_1.fastq.gz \\
+ --in2 ${prefix}_2.fastq.gz \\
+ $out_fq1 \\
+ $out_fq2 \\
+ --json ${prefix}.fastp.json \\
+ --html ${prefix}.fastp.html \\
+ $adapter_list \\
+ $fail_fastq \\
+ $merge_fastq \\
+ --thread $task.cpus \\
+ --detect_adapter_for_pe \\
+ $args \\
+ 2> >(tee ${prefix}.fastp.log >&2)
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+ END_VERSIONS
+ """
+ }
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def is_single_output = task.ext.args?.contains('--interleaved_in') || meta.single_end
+ def touch_reads = (discard_trimmed_pass) ? "" : (is_single_output) ? "echo '' | gzip > ${prefix}.fastp.fastq.gz" : "echo '' | gzip > ${prefix}_1.fastp.fastq.gz ; echo '' | gzip > ${prefix}_2.fastp.fastq.gz"
+ def touch_merged = (!is_single_output && save_merged) ? "echo '' | gzip > ${prefix}.merged.fastq.gz" : ""
+ def touch_fail_fastq = (!save_trimmed_fail) ? "" : meta.single_end ? "echo '' | gzip > ${prefix}.fail.fastq.gz" : "echo '' | gzip > ${prefix}.paired.fail.fastq.gz ; echo '' | gzip > ${prefix}_1.fail.fastq.gz ; echo '' | gzip > ${prefix}_2.fail.fastq.gz"
+ """
+ $touch_reads
+ $touch_fail_fastq
+ $touch_merged
+ touch "${prefix}.fastp.json"
+ touch "${prefix}.fastp.html"
+ touch "${prefix}.fastp.log"
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g")
+ END_VERSIONS
+ """
+}
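
For reference (outside this patch), a minimal sketch of how a pipeline could drive the task.ext hooks used above from its modules configuration, assuming the usual nf-core withName selector; the --interleaved_in flag follows the module comment about interleaved input, and the ext.prefix closure is purely illustrative:

    process {
        withName: 'FASTP' {
            // assumed: enables the single-end/interleaved code path described in the module comment
            ext.args   = '--interleaved_in'
            // illustrative prefix override; the module falls back to meta.id when ext.prefix is unset
            ext.prefix = { "${meta.id}.trimmed" }
        }
    }
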
diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml
new file mode 100644
index 0000000..159404d
--- /dev/null
+++ b/modules/nf-core/fastp/meta.yml
@@ -0,0 +1,113 @@
+name: fastp
+description: Perform adapter/quality trimming on sequencing reads
+keywords:
+ - trimming
+ - quality control
+ - fastq
+tools:
+ - fastp:
+ description: |
+        A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading support to afford high performance.
+ documentation: https://github.com/OpenGene/fastp
+ doi: 10.1093/bioinformatics/bty560
+ licence: ["MIT"]
+ identifier: biotools:fastp
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads.
+ e.g. [ id:'test', single_end:false ]
+ - reads:
+ type: file
+ description: |
+ List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+ respectively. If you wish to run interleaved paired-end data, supply as single-end data
+        but with `--interleaved_in` in your `modules.config`'s `ext.args` for the module.
+ - - adapter_fasta:
+ type: file
+ description: File in FASTA format containing possible adapters to remove.
+ pattern: "*.{fasta,fna,fas,fa}"
+ - - discard_trimmed_pass:
+ type: boolean
+      description: Specify true to not write any reads that pass trimming thresholds.
+        This can be used to run fastp for the output report only.
+ - - save_trimmed_fail:
+ type: boolean
+      description: Specify true to save reads that failed to pass trimming thresholds
+        to files ending in `*.fail.fastq.gz`
+ - - save_merged:
+ type: boolean
+ description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz`
+output:
+ - reads:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.fastp.fastq.gz":
+ type: file
+ description: The trimmed/modified/unmerged fastq reads
+ pattern: "*fastp.fastq.gz"
+ - json:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.json":
+ type: file
+ description: Results in JSON format
+ pattern: "*.json"
+ - html:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.html":
+ type: file
+ description: Results in HTML format
+ pattern: "*.html"
+ - log:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.log":
+ type: file
+          description: fastp log file
+ pattern: "*.log"
+ - reads_fail:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.fail.fastq.gz":
+ type: file
+          description: Reads that failed the preprocessing
+ pattern: "*fail.fastq.gz"
+ - reads_merged:
+ - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test', single_end:false ]
+ - "*.merged.fastq.gz":
+ type: file
+ description: Reads that were successfully merged
+          pattern: "*.merged.fastq.gz"
+ - versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "@drpatelh"
+ - "@kevinmenden"
+maintainers:
+ - "@drpatelh"
+ - "@kevinmenden"
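
As a usage sketch (not part of this patch), the five positional inputs documented above map onto a workflow call like the one below; the include path and sample files are illustrative, and the argument order follows the tests added next:

    include { FASTP } from './modules/nf-core/fastp/main'   // illustrative path

    workflow {
        ch_reads = Channel.of([
            [ id: 'sample1', single_end: false ],                         // meta map
            [ file('sample1_R1.fastq.gz'), file('sample1_R2.fastq.gz') ]  // paired-end reads
        ])

        FASTP (
            ch_reads,  // [ meta, reads ]
            [],        // adapter_fasta: no extra adapter file
            false,     // discard_trimmed_pass
            false,     // save_trimmed_fail
            false      // save_merged
        )
    }
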
diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test
new file mode 100644
index 0000000..30dbb8a
--- /dev/null
+++ b/modules/nf-core/fastp/tests/main.nf.test
@@ -0,0 +1,576 @@
+nextflow_process {
+
+ name "Test Process FASTP"
+ script "../main.nf"
+ process "FASTP"
+ tag "modules"
+ tag "modules_nfcore"
+ tag "fastp"
+
+ test("test_fastp_single_end") {
+
+ when {
+
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:true ],
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = false
+ input[4] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("single end (151 cycles)") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("reads passed filter: 99") },
+ { assert snapshot(
+ process.out.json,
+ process.out.reads,
+ process.out.reads_fail,
+ process.out.reads_merged,
+ process.out.versions).match() }
+ )
+ }
+ }
+
+ test("test_fastp_paired_end") {
+
+ when {
+
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:false ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = false
+ input[4] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("The input has little adapter percentage (~0.000000%), probably it's trimmed before.") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("Q30 bases: 12281(88.3716%)") },
+ { assert snapshot(
+ process.out.json,
+ process.out.reads,
+ process.out.reads_fail,
+ process.out.reads_merged,
+ process.out.versions).match() }
+ )
+ }
+ }
+
+ test("fastp test_fastp_interleaved") {
+
+ config './nextflow.interleaved.config'
+ when {
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:true ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = false
+ input[4] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("paired end (151 cycles + 151 cycles)") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("reads passed filter: 162") },
+ { assert process.out.reads_fail == [] },
+ { assert process.out.reads_merged == [] },
+ { assert snapshot(
+ process.out.reads,
+ process.out.json,
+ process.out.versions).match() }
+ )
+ }
+ }
+
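
The interleaved test above loads './nextflow.interleaved.config', which is not shown in this hunk; given the module comment about passing --interleaved_in through ext.args, it presumably contains something along these lines (an assumption, not the file's verbatim contents):

    process {
        withName: 'FASTP' {
            ext.args = '--interleaved_in'   // assumed: switches fastp to interleaved single-file input
        }
    }
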
+ test("test_fastp_single_end_trim_fail") {
+
+ when {
+
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:true ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = true
+ input[4] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("single end (151 cycles)") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("reads passed filter: 99") },
+ { assert snapshot(
+ process.out.json,
+ process.out.reads,
+ process.out.reads_fail,
+ process.out.reads_merged,
+ process.out.versions).match() }
+ )
+ }
+ }
+
+ test("test_fastp_paired_end_trim_fail") {
+
+ config './nextflow.save_failed.config'
+ when {
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:false ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = true
+ input[4] = false
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("The input has little adapter percentage (~0.000000%), probably it's trimmed before.") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("reads passed filter: 162") },
+ { assert snapshot(
+ process.out.reads,
+ process.out.reads_fail,
+ process.out.reads_merged,
+ process.out.json,
+ process.out.versions).match() }
+ )
+ }
+ }
+
+ test("test_fastp_paired_end_merged") {
+
+ when {
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:false ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = []
+ input[2] = false
+ input[3] = false
+ input[4] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("The input has little adapter percentage (~0.000000%), probably it's trimmed before.") },
+ { assert path(process.out.log.get(0).get(1)).getText().contains("total reads: 75") },
+ { assert snapshot(
+ process.out.json,
+ process.out.reads,
+ process.out.reads_fail,
+ process.out.reads_merged,
+ process.out.versions).match() },
+ )
+ }
+ }
+
+ test("test_fastp_paired_end_merged_adapterlist") {
+
+ when {
+ process {
+ """
+ input[0] = Channel.of([
+ [ id:'test', single_end:false ], // meta map
+ [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ]
+ ])
+ input[1] = Channel.of([ file(params.modules_testdata_base_path + 'delete_me/fastp/adapters.fasta', checkIfExists: true) ])
+ input[2] = false
+ input[3] = false
+ input[4] = true
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert path(process.out.html.get(0).get(1)).getText().contains("