diff --git a/.circleci/config.yml b/.circleci/config.yml index 0390dca9e0..62e2e17cda 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -60,10 +60,9 @@ jobs: - setup_remote_docker - run: command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION} - no_output_timeout: 45m + no_output_timeout: 1.5h - run: command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME} - no_output_timeout: 45m vepgrch38: docker: @@ -77,10 +76,9 @@ jobs: - setup_remote_docker - run: command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION} - no_output_timeout: 45m + no_output_timeout: 1.5h - run: command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME} - no_output_timeout: 45m vepgrcm38: docker: @@ -94,10 +92,9 @@ jobs: - setup_remote_docker - run: command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION} - no_output_timeout: 45m + no_output_timeout: 30m - run: command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME} - no_output_timeout: 45m workflows: version: 2 diff --git a/README.md b/README.md index fadcd83b64..00749a51c7 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,16 @@ **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing**. 
-> :warning: This pipeline is a work in progress being ported to nf-core from [SciLifeLab/Sarek](https://github/SciLifeLab/Sarek) +> :warning: This pipeline is a work in progress being ported to nf-core from [SciLifeLab/Sarek](https://github.com/SciLifeLab/Sarek/) -[![Nextflow version][nextflow-badge]](https://www.nextflow.io) -[![Travis build status][travis-badge]](https://travis-ci.org/nf-core/sarek) -[![CircleCi build status][circleci-badge]](https://circleci.com/gh/nf-core/workflows/sarek) +[![Nextflow version][nextflow-badge]](https://www.nextflow.io/) +[![nf-core][nf-core-badge]](https://nf-co.re/) + +[![Travis build status][travis-badge]](https://travis-ci.com/nf-core/sarek/) +[![CircleCi build status][circleci-badge]](https://circleci.com/gh/nf-core/sarek/) [![Install with bioconda][bioconda-badge]](http://bioconda.github.io/) -[![Docker Container available][docker-sarek-badge]](https://hub.docker.com/r/nfcore/sarek) +[![Docker Container available][docker-sarek-badge]](https://hub.docker.com/r/nfcore/sarek/) [![Install with Singularity][singularity-badge]](https://www.sylabs.io/docs/) [![Join us on Slack][slack-badge]](https://nfcore.slack.com/messages/CGFUX04HZ/) @@ -33,16 +35,12 @@ It is listed on the [Elixir - Tools and Data Services Registry](https://bio.tool The nf-core/sarek pipeline comes with documentation about the pipeline, found in the `docs/` directory: 1. [Installation](https://nf-co.re/usage/installation) - * [Installation documentation](docs/INSTALL.md) - * [Installation documentation specific for UPPMAX `rackham`](docs/INSTALL_RACKHAM.md) - * [Installation documentation specific for UPPMAX `bianca`](docs/INSTALL_BIANCA.md) 2. Pipeline configuration * [Local installation](https://nf-co.re/usage/local_installation) * [Adding your own system config](https://nf-co.re/usage/adding_own_config) * [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. 
[Running the pipeline](docs/usage.md) * [Tests documentation](docs/TESTS.md) - * [Reference files documentation](docs/REFERENCES.md) * [Configuration and profiles documentation](docs/CONFIG.md) * [Intervals documentation](docs/INTERVALS.md) * [Running the pipeline](docs/USAGE.md) @@ -51,8 +49,8 @@ The nf-core/sarek pipeline comes with documentation about the pipeline, found in * [Examples](docs/USE_CASES.md) * [Input files documentation](docs/INPUT.md) * [Processes documentation](docs/PROCESS.md) -4. [Output and how to interpret the results](docs/output.md) * [Documentation about containers](docs/CONTAINERS.md) +4. [Output and how to interpret the results](docs/output.md) * [Complementary information about ASCAT](docs/ASCAT.md) * [Complementary information about annotations](docs/ANNOTATION.md) * [Output documentation structure](docs/OUTPUT.md) @@ -69,7 +67,7 @@ You can choose which variant callers to use, plus the pipeline is capable of acc The worflow steps and tools used are as follows: -1. **Preprocessing** - `main.nf` _(based on [GATK best practices](https://software.broadinstitute.org/gatk/best-practices/))_ +1. **Preprocessing** _(based on [GATK best practices](https://software.broadinstitute.org/gatk/best-practices/))_ * Map reads to Reference * [BWA](http://bio-bwa.sourceforge.net/) * Mark Duplicates @@ -77,13 +75,13 @@ The worflow steps and tools used are as follows: * Base (Quality Score) Recalibration * [GATK BaseRecalibrator](https://github.com/broadinstitute/gatk) * [GATK ApplyBQSR](https://github.com/broadinstitute/gatk) -2. **Germline variant calling** - `germlineVC.nf` +2. **Germline variant calling** * SNVs and small indels * [GATK HaplotypeCaller](https://github.com/broadinstitute/gatk) * [Strelka2](https://github.com/Illumina/strelka) * Structural variants * [Manta](https://github.com/Illumina/manta) -3. **Somatic variant calling** - `somaticVC.nf` _(optional)_ +3. 
**Somatic variant calling** * SNVs and small indels * [MuTect2](https://github.com/broadinstitute/gatk) * [Freebayes](https://github.com/ekg/freebayes) @@ -92,11 +90,19 @@ The worflow steps and tools used are as follows: * [Manta](https://github.com/Illumina/manta) * Sample heterogeneity, ploidy and CNVs * [ASCAT](https://github.com/Crick-CancerGenomics/ascat) -4. **Annotation** - `annotate.nf` _(optional)_ +4. **Annotation** * Variant annotation * [SnpEff](http://snpeff.sourceforge.net/) * [VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html) -5. **Reporting** - `runMultiQC.nf` +5. **QC and Reporting** + * QC + * [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + * [Qualimap bamqc](http://qualimap.bioinfo.cipf.es/doc_html/command_line.html) + * [samtools stats](https://www.htslib.org/doc/samtools.html) + * [GATK MarkDuplicates](https://github.com/broadinstitute/gatk) + * [bcftools stats](http://www.htslib.org/doc/bcftools.html) + * [VCFtools](https://vcftools.github.io/index.html) + * [SnpEff](http://snpeff.sourceforge.net/) * Reporting * [MultiQC](http://multiqc.info) @@ -139,16 +145,17 @@ For further information or help, don't hesitate to get in touch on [Slack](https :-:|:-: [![National Genomics Infrastructure](docs/images/NGI_logo.png)](https://ngisweden.scilifelab.se/) | [![National Bioinformatics Infrastructure Sweden](docs/images/NBIS_logo.png)](https://nbis.se) -[bioconda-badge]: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?logo= +[bioconda-badge]: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?logo= [btb-link]: https://ki.se/forskning/barntumorbanken-0 [circleci-badge]: https://img.shields.io/circleci/project/github/nf-core/sarek.svg?logo=circleci [docker-sarek-badge]: https://img.shields.io/docker/automated/nfcore/sarek.svg?logo=docker [docker-snpeff-badge]: https://img.shields.io/docker/automated/nfcore/sareksnpeff.svg?logo=docker [docker-vep-badge]: 
https://img.shields.io/docker/automated/nfcore/sarekvep.svg?logo=docker [nbis-link]: https://nbis.se -[nextflow-badge]: https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg?logo= +[nextflow-badge]: https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg?logo= +[nf-core-badge]: https://img.shields.io/badge/nf--core-pipeline-brightgreen.svg?logo= [ngi-link]: https://ngisweden.scilifelab.se/ [scilifelab-link]: https://scilifelab.se -[singularity-badge]: https://img.shields.io/badge/use%20with-singularity-purple.svg +[singularity-badge]: https://img.shields.io/badge/use%20with-singularity-purple.svg?logo= [slack-badge]: https://img.shields.io/badge/slack-nfcore/sarek-blue.svg?logo=slack [travis-badge]: https://img.shields.io/travis/nf-core/sarek.svg?logo=travis diff --git a/bin/build_reference.sh b/bin/build_reference.sh index ad01808d69..faa7607645 100755 --- a/bin/build_reference.sh +++ b/bin/build_reference.sh @@ -1,7 +1,6 @@ #!/bin/bash set -xeuo pipefail -BUILD=false TEST=ALL TRAVIS_BUILD_DIR=${TRAVIS_BUILD_DIR:-.} TRAVIS=${TRAVIS:-false} @@ -15,24 +14,16 @@ do shift # past argument shift # past value ;; - -b|--build) - BUILD=true - shift # past value - ;; *) # unknown option shift # past argument ;; esac done -# Always download test data -rm -rf data -git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - # Build references for smallGRCh37 -if [[ BUILD ]] && [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]] +if [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]] then rm -rf references - nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --refdir data/reference --outdir references + nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile test,docker --build --outdir references -ansi-log false -dump-channels rm -rf .nextflow* references/pipeline_info work fi diff --git 
a/bin/run_tests.sh b/bin/run_tests.sh index 98119c2174..beb2b77ac6 100755 --- a/bin/run_tests.sh +++ b/bin/run_tests.sh @@ -26,23 +26,25 @@ do done function run_sarek() { - nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --igenomes_base references $@ + nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile test,docker -ansi-log false -dump-channels $@ } if [[ ALL,GERMLINE =~ $TEST ]] then + rm -rf data + git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data run_sarek --sample data/testdata/tiny/normal --tools HaplotypeCaller,Strelka --noReports - run_sarek --step recalibrate --noReports + run_sarek --step recalibrate --sample results/Preprocessing/TSV/duplicateMarked.tsv --noReports fi if [[ ALL,SOMATIC =~ $TEST ]] then - run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports + run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports fi if [[ ALL,TARGETED =~ $TEST ]] then - run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED data/testdata/target.bed + run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED https://github.com/nf-core/test-datasets/raw/sarek/testdata/target.bed fi if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]] @@ -57,10 +59,10 @@ then then ANNOTATOR=merge,snpEFF,VEP fi - run_sarek --step annotate --tools ${ANNOTATOR} --annotateVCF data/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports + run_sarek --step annotate --tools ${ANNOTATOR} --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports fi if [[ MULTIPLE =~ $TEST ]] then - run_sarek --sample data/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 
--noReports + run_sarek --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports fi diff --git a/build.nf b/build.nf index 979601371c..583fccac5a 100644 --- a/build.nf +++ b/build.nf @@ -21,14 +21,23 @@ Usage: you're reading it BUILD REFERENCES: - nextflow run build.nf [--refdir --outdir ] - --refdir - Specify a directory containing reference files + nextflow run build.nf --build --outdir [--offline] + --build + Will build reference files for smallGRCh37 --outdir Specify an output directory + --offline + Will use data as the source for the reference files + Need to do: + `git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data` + Before transfering the repo to an offline location + DOWNLOAD CACHE: nextflow run build.nf --download_cache [--snpEff_cache ] [--vep_cache ] + [--cadd_cache --cadd_version ] + --download_cache + Will download specified cache --snpEff_cache Specify path to snpEff cache If none, will use snpEff version specified in configuration @@ -54,15 +63,29 @@ DOWNLOAD CACHE: // Show help message if (params.help) exit 0, helpMessage() -ch_referencesFiles = Channel.fromPath("${params.refdir}/*") - // Default value for params +params.build = null +params.offline = null params.cadd_cache = null params.cadd_version = 'v1.5' params.genome = 'smallGRCh37' params.snpEff_cache = null params.vep_cache = null +ch_referencesFiles = Channel.empty() + +if ((params.build) && (params.offline)) ch_referencesFiles = Channel.fromPath("data/reference/*") +if ((params.build) && (!params.offline)) ch_referencesFiles = ch_referencesFiles.mix( + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase1.indels.b37.small.vcf.gz"), + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci"), + 
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"), + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"), + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/dbsnp_138.b37.small.vcf.gz"), + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/human_g1k_v37_decoy.small.fasta.gz"), + Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/small.intervals")) + +ch_referencesFiles = ch_referencesFiles.dump(tag:'Reference Files') + // Check if genome exists in the config file if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" diff --git a/conf/test.config b/conf/test.config index 84e483eb2a..95a9a6ce6d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -8,18 +8,18 @@ */ params { - config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test profile' // Limit resources so that this can run on Travis max_cpus = 2 - max_memory = 6.GB + max_memory = 7.GB max_time = 48.h // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - singleEnd = false - readPaths = [ - ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']], - ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 
'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']] - ] + sample = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv' + // Small reference genome + // To be built with: `nextflow run build.nf --build -profile docker --outdir references` + genome = 'smallGRCh37' + igenomes_base = 'references' + // Use publishDir mode link so that work can be removed + publishDirMode = 'link' } diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 0000000000..5d772a2733 --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,21 @@ +# Genomes and reference files + +## AWS iGenomes +Sarek is using [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitate storing and sharing references. +Both `GRCh37` and `GRCh38` are available with `--genome GRCh37` or `--genome GRCh38` respectively with any profile using the `conf/igenomes.config` file, or you can specify it with `-c conf/igenomes.config`. + +Sarek currently uses `GRCh38` by default. + +Settings in `igenomes.config` can be tailored to your needs. + +The [`build.nf`](#buildnf) script is used to build the indexes for the reference test. + +Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37. + +## build.nf + +The `build.nf` script can build the files needed for smallGRCh37. 
+ +``` +nextflow run build.nf --build -profile docker --outdir references +``` diff --git a/main.nf b/main.nf index 9775904fb9..cf73043548 100644 --- a/main.nf +++ b/main.nf @@ -93,7 +93,6 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome // Default value for params params.annotateTools = null -params.annotateVCF = null params.annotation_cache = null params.cadd_InDels = null params.cadd_InDels_tbi = null @@ -104,10 +103,12 @@ params.noReports = null params.nucleotidesPerSecond = 1000.0 params.sample = null params.sequencing_center = null +params.snpEff_cache = null params.step = 'mapping' params.strelkaBP = true params.targetBED = null params.tools = null +params.vep_cache = null stepList = defineStepList() step = params.step ? params.step.toLowerCase() : '' @@ -117,7 +118,6 @@ if ( step.contains(',') ) exit 1, 'You can choose only one step, see --help for tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : [] -annotateVCF = params.annotateVCF ? 
params.annotateVCF.split(',').collect{it.trim()} : [] toolList = defineToolList() if ( !checkParameterList(tools,toolList) ) exit 1, 'Unknown tool(s), see --help for more information' @@ -148,7 +148,7 @@ ch_output_docs = Channel.fromPath("${baseDir}/docs/output.md") */ tsvPath = null -if (params.sample) if (hasExtension(params.sample,"tsv")) tsvPath = params.sample +if (params.sample) if (hasExtension(params.sample,"tsv") || hasExtension(params.sample,"vcf") || hasExtension(params.sample,"vcf.gz")) tsvPath = params.sample // No need for tsv file for step annotate if (!params.sample) { @@ -166,6 +166,7 @@ if (tsvPath) { case 'mapping': inputFiles = extractSample(tsvFile); break case 'recalibrate': bamFiles = extractRecal(tsvFile); break case 'variantcalling': bamFiles = extractBams(tsvFile); break + case 'annotate': break default: exit 1, "Unknown step ${step}" } } else if (params.sample) if (!hasExtension(params.sample,"tsv")) { @@ -174,9 +175,11 @@ if (tsvPath) { inputFiles = extractFastqFromDir(params.sample) (inputFiles, fastqTmp) = inputFiles.into(2) fastqTmp.toList().subscribe onNext: { - if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'" -} -tsvFile = params.sample // used in the reports + if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'" + } + tsvFile = params.sample // used in the reports +} else if (step == 'annotate') { + println "Annotating ${tsvFile}" } else exit 1, 'No sample were defined, see --help' if (step == 'recalibrate') (patientGenders, bamFiles) = extractGenders(bamFiles) @@ -1558,7 +1561,7 @@ vcfToAnnotate = Channel.create() if (step == 'annotate') { vcfNotToAnnotate = Channel.create() - if (annotateVCF == []) { + if (tsvPath == []) { // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller // Basically it's: 
VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka}/*.vcf.gz @@ -1580,7 +1583,7 @@ if (step == 'annotate') { } else if (annotateTools == []) { // Annotate user-submitted VCFs // If user-submitted, Sarek assume that the idSample should be assumed automatically - vcfToAnnotate = Channel.fromPath(annotateVCF) + vcfToAnnotate = Channel.fromPath(tsvPath) .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]} } else exit 1, "specify only tools or files to annotate, not both" @@ -1620,8 +1623,6 @@ process RunSnpeff { reducedVCF = reduceVCF(vcf) cache = (params.snpEff_cache && params.annotation_cache) ? "-dataDir \${PWD}/${dataDir}" : "" """ - echo ${task.container} - snpEff -Xmx${task.memory.toGiga()}g \ ${snpeffDb} \ -csvStats ${reducedVCF}_snpEff.csv \ diff --git a/nextflow.config b/nextflow.config index c062fb9173..e6de815088 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,17 +9,11 @@ params { // Workflow flags - // TODO nf-core: Specify your pipeline's command line flags - reads = "data/*{1,2}.fastq.gz" - singleEnd = false + genome = 'GRCh38' outdir = './results' + publishDirMode = 'symlink' // Boilerplate options - publishDirMode = 'symlink' - snpEff_cache = '' - cadd_version = '' - vep_cache = '' - genome = 'GRCh38' name = false multiqc_config = "$baseDir/assets/multiqc_config.yaml" email = false