diff --git a/README.md b/README.md index a076e2a..6d071d3 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,11 @@ Name of the bowtie index, e.g. hg19_1kgmaj
Run [vcfmix](https://github.com/AlexOrlek/VCFMIX), yes or no. Set to no for synthetic samples
* **resistance_profiler**
Run resistance profiling for Mycobacterium tubercuclosis. Either ["tb-profiler"](https://tbdr.lshtm.ac.uk/) or "none". -* **afanc_myco_db**
Path to the [afanc](https://github.com/ArthurVM/Afanc) database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_7.0.tar.gz -
+* **afanc_myco_db**
+Path to the [afanc](https://github.com/ArthurVM/Afanc) database used for speciation. Obtain from https://s3.climb.ac.uk/microbial-bioin-sp3/Mycobacteriaciae_DB_7.0.tar.gz +* **update_tbprofiler**
+Update tb-profiler. Either "yes" or "no". "yes" may be useful when running outside of a container for the first time as we will not have constructed a tb-profiler database matching our reference. This is not needed with the climb, docker and singularity profiles as the reference has already been added. Alternatively you can run ```tb-profiler update_tbdb --match_ref /resources/tuberculosis.fasta```. + For more information on the parameters run `nextflow run main.nf --help` @@ -83,49 +86,57 @@ NXF_VER=20.11.0-edge nextflow run main.nf -stub -config testing.config ## Checkpoints ## Checkpoints used throughout this workflow to fail a sample/issue warnings: -processes preprocessing:checkFqValidity or preprocessing:checkBamValidity -1. (Fail) If sample does not pass fqtools 'validate' or samtools 'quickcheck', as appropriate. -process preprocessing:countReads\ -2. (Fail) If sample contains < 100k pairs of raw reads. -process preprocessing:fastp\ -3. (Fail) If sample contains < 100k pairs of cleaned reads, required to all be > 50bp (cleaning using fastp with --length_required 50 --average_qual 10 --low_complexity_filter --correction --cut_right --cut_tail --cut_tail_window_size 1 --cut_tail_mean_quality 20). - -process preprocessing:kraken2\ -4. (Fail) If the top family hit is not Mycobacteriaceae\ -5. (Fail) If there are fewer than 100k reads classified as Mycobacteriaceae \ -6. (Warn) If the top family classification is mycobacterial, but this is not consistent with top genus and species classifications\ -7. 
(Warn) If the top family is Mycobacteriaceae but no G1 (species complex) classifications meet minimum thresholds of > 5000 reads or > 0.5% of the total reads (this is not necessarily a concern as not all mycobacteria have a taxonomic classification at this rank)\ -8. (Warn) If sample is mixed or contaminated - defined as containing reads > the 5000/0.5% thresholds from multiple non-human species\ -9. (Warn) If sample contains multiple classifications to mycobacterial species complexes, each meeting the > 5000/0.5% thresholds\ -10. (Warn) If no species classification meets the 5000/0.5% thresholds\ -11. (Warn) If no genus classification meets the 5000/0.5% thresholds +**process preprocessing:fastp** + +3. (*Fail*) If sample contains < 100k pairs of cleaned reads, required to all be > 50bp (cleaning using fastp with --length_required 50 --average_qual 10 --low_complexity_filter --correction --cut_right --cut_tail --cut_tail_window_size 1 --cut_tail_mean_quality 20). + +**process preprocessing:kraken2** + +4. (*Fail*) If the top family hit is not Mycobacteriaceae +5. (*Fail*) If there are fewer than 100k reads classified as Mycobacteriaceae +6. (*Warn*) If the top family classification is mycobacterial, but this is not consistent with top genus and species classifications +7. (*Warn*) If the top family is Mycobacteriaceae but no G1 (species complex) classifications meet minimum thresholds of > 5000 reads or > 0.5% of the total reads (this is not necessarily a concern as not all mycobacteria have a taxonomic classification at this rank) +8. (*Warn*) If sample is mixed or contaminated - defined as containing reads > the 5000/0.5% thresholds from multiple non-human species +9. (*Warn*) If sample contains multiple classifications to mycobacterial species complexes, each meeting the > 5000/0.5% thresholds +10. (*Warn*) If no species classification meets the 5000/0.5% thresholds +11. 
(*Warn*) If no genus classification meets the 5000/0.5% thresholds -process preprocessing:identifyBacterialContaminants\ -12. (Fail) If regardless of what Kraken reports, Afanc does not make a species-level mycobacterial classification (note that we do not use Kraken mycobacterial classifications other than to determine whether 100k reads are family Mycobacteriaceae; for higher-resolution classification, we defer to Afanc)\ -13. (Fail) If the sample is not contaminated and the top species hit is not one of the 10 supported Mycobacteria: abscessus|africanum|avium|bovis|chelonae|chimaera|fortuitum|intracellulare|kansasii|tuberculosis\ -14. (Fail) If the sample is not contaminated and the top species hit is contrary to the species expected (e.g. "avium" rather than "tuberculosis" - only tested if you provide that expectation)\ -15. (Warn) If the top Afanc species hit, on the basis of highest % coverage, does not also have the highest median depth\ -16. (Warn) If we are unable to associate an NCBI taxon ID to any given contaminant species, which means we will not be able to locate its genome, and thereby remove it as a contaminant\ -17. (Warn) If we are unable to determine a URL for the latest RefSeq genome associated with a contaminant species' taxon ID\ -18. (Warn) If no complete genome could be found for a contaminant species. The workflow will proceed with alignment-based contaminant removal, but you're warned that there's reduced confidence in detecting reads from this species +**process preprocessing:identifyBacterialContaminants** + +12. (*Fail*) If regardless of what Kraken reports, Afanc does not make a species-level mycobacterial classification (note that we do not use Kraken mycobacterial classifications other than to determine whether 100k reads are family Mycobacteriaceae; for higher-resolution classification, we defer to Afanc) +13. 
(*Fail*) If the sample is not contaminated and the top species hit is not one of the 10 supported Mycobacteria: abscessus|africanum|avium|bovis|chelonae|chimaera|fortuitum|intracellulare|kansasii|tuberculosis +14. (*Fail*) If the sample is not contaminated and the top species hit is contrary to the species expected (e.g. "avium" rather than "tuberculosis" - only tested if you provide that expectation) +15. (*Warn*) If the top Afanc species hit, on the basis of highest % coverage, does not also have the highest median depth +16. (*Warn*) If we are unable to associate an NCBI taxon ID to any given contaminant species, which means we will not be able to locate its genome, and thereby remove it as a contaminant +17. (*Warn*) If we are unable to determine a URL for the latest RefSeq genome associated with a contaminant species' taxon ID +18. (*Warn*) If no complete genome could be found for a contaminant species. The workflow will proceed with alignment-based contaminant removal, but you're warned that there's reduced confidence in detecting reads from this species -process preprocessing:downloadContamGenomes\ -19. (Fail) If a contaminant is detected but we are unable to download a representative genome, and thereby remove it +**process preprocessing:downloadContamGenomes** + +19. (*Fail*) If a contaminant is detected but we are unable to download a representative genome, and thereby remove it -process preprocessing:summarise\ -20. (Fail) If after having taken an alignment-based approach to decontamination, Kraken still detects a contaminant species\ -21. (Fail) If after having taken an alignment-based approach to decontamination, the top species hit is not one of the 10 supported Mycobacteria\ -22. (Fail) If, after successfully removing contaminants, the top species hit is contrary to the species expected (e.g. "avium" rather than "tuberculosis" - only tested if you provide that expectation) - -process clockwork:alignToRef\ -23. 
(Fail) If < 100k reads could be aligned to the reference genome\ -24. (Fail) If, after aligning to the reference genome, the average read mapping quality < 10\ -25. (Fail) If < 50% of the reference genome was covered at 10-fold depth - -process clockwork:minos\ -26. (Warn) If sample is not TB, then it is not passed to a resistance profiler +**process preprocessing:summarise** + +20. (*Fail*) If after having taken an alignment-based approach to decontamination, Kraken still detects a contaminant species +21. (*Fail*) If after having taken an alignment-based approach to decontamination, the top species hit is not one of the 10 supported Mycobacteria +22. (*Fail*) If, after successfully removing contaminants, the top species hit is contrary to the species expected (e.g. "avium" rather than "tuberculosis" - only tested if you provide that expectation) + +**process clockwork:alignToRef** + +23. (*Fail*) If < 100k reads could be aligned to the reference genome +24. (*Fail*) If, after aligning to the reference genome, the average read mapping quality < 10 +25. (*Fail*) If < 50% of the reference genome was covered at 10-fold depth + +**process clockwork:minos** + +26. (*Warn*) If sample is not TB, then it is not passed to a resistance profiler ## Acknowledgements ## For a list of direct authors of this pipeline, please see the contributors list. 
All of the software dependencies of this pipeline are recorded in the version.json diff --git a/config/containers.config b/config/containers.config index e961b71..a0983db 100644 --- a/config/containers.config +++ b/config/containers.config @@ -1,13 +1,4 @@ -params{ - container_enabled = "true" - container_enabled = "true" -} - - -process { - update_tbprofiler = "false" - - +process { withLabel:low_cpu {cpus = 2} withLabel:normal_cpu { cpus = 8 } withLabel:low_memory { memory = '5GB' } diff --git a/docker/Dockerfile.tbprofiler-0.9.8 b/docker/Dockerfile.tbprofiler-0.9.8 index a176110..b6390e1 100644 --- a/docker/Dockerfile.tbprofiler-0.9.8 +++ b/docker/Dockerfile.tbprofiler-0.9.8 @@ -11,7 +11,8 @@ ARG TBPROFILER_VER="6.2.0" # this version is the shortened commit hash on the `master` branch here https://github.com/jodyphelan/tbdb/ # commits are found on https://github.com/jodyphelan/tbdb/commits/master # this was the latest commit as of 2024-05-01 -ARG TBDB_VER="152d603" + +ARG TBDB_VER="e6a0040" # LABEL instructions tag the image with metadata that might be important to the user LABEL base.image="micromamba:1.3.0" @@ -48,7 +49,8 @@ ENV PATH="/opt/conda/bin:${PATH}" # Version of database can be confirmed at /opt/conda/share/tbprofiler/tbdb.version.json # can also run 'tb-profiler list_db' to find the same version info # In 5.0.1 updating_tbdb does not work with tb-profiler update_tbdb --commit ${TBDB_VER} -RUN tb-profiler update_tbdb --commit ${TBDB_VER} WORKDIR /data -RUN tb-profiler update_tbdb --match_ref tuberculosis.fasta + +#wants full path to reference +RUN tb-profiler update_tbdb --match_ref /data/tuberculosis.fasta --commit ${TBDB_VER} diff --git a/main.nf b/main.nf index 0cd98f2..60a579a 100644 --- a/main.nf +++ b/main.nf @@ -92,12 +92,6 @@ if(!resistance_profilers.contains(params.resistance_profiler)){ exit 1, 'Invalid resistance profiler. Must be one of "tb-profiler" or "none" to skip.' 
} -//tbprofiler container already has the reference genome in the DB, so skip if using docker -if((params.resistance_profiler == "tb-profiler") && (params.container_enabled == true)) { - update_tbprofiler = true -} else { - update_tbprofiler = false -} resistance_profiler = params.resistance_profiler diff --git a/nextflow.config b/nextflow.config index 1ea7d51..25e79c4 100644 --- a/nextflow.config +++ b/nextflow.config @@ -35,7 +35,9 @@ params { vcfmix = 'yes' resistance_profiler = "tb-profiler" - update_tbprofiler = "yes" + + update_tbprofiler = "no" + // path to singularity recipes directory (needed to strip software versions in getversion) sing_dir = "${baseDir}/singularity"