Improve tests and docs (#8)

* improve tests + add some docs
nf-core · May 14, 2019 · f9e13a4 · f9e13a4
1 parent 3c5f236
commit f9e13a4
Show file tree

Hide file tree

Showing 9 changed files with 109 additions and 73 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -60,10 +60,9 @@ jobs:
       - setup_remote_docker
       - run:
           command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
-          no_output_timeout: 45m
+          no_output_timeout: 1.5h
       - run:
           command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
-          no_output_timeout: 45m
 
   vepgrch38:
     docker:
@@ -77,10 +76,9 @@ jobs:
       - setup_remote_docker
       - run:
           command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
-          no_output_timeout: 45m
+          no_output_timeout: 1.5h
       - run:
           command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
-          no_output_timeout: 45m
 
   vepgrcm38:
     docker:
@@ -94,10 +92,9 @@ jobs:
       - setup_remote_docker
       - run:
           command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
-          no_output_timeout: 45m
+          no_output_timeout: 30m
       - run:
           command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
-          no_output_timeout: 45m
 
 workflows:
   version: 2

diff --git a/README.md b/README.md
diff --git a/bin/build_reference.sh b/bin/build_reference.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 set -xeuo pipefail
 
-BUILD=false
 TEST=ALL
 TRAVIS_BUILD_DIR=${TRAVIS_BUILD_DIR:-.}
 TRAVIS=${TRAVIS:-false}
@@ -15,24 +14,16 @@ do
     shift # past argument
     shift # past value
     ;;
-    -b|--build)
-    BUILD=true
-    shift # past value
-    ;;
     *) # unknown option
     shift # past argument
     ;;
   esac
 done
 
-# Always download test data
-rm -rf data
-git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
-
 # Build references for smallGRCh37
-if [[ BUILD ]] && [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]]
+if [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]]
 then
   rm -rf references
-  nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --refdir data/reference --outdir references
+  nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile test,docker --build --outdir references -ansi-log false -dump-channels
   rm -rf .nextflow* references/pipeline_info work
 fi
diff --git a/bin/run_tests.sh b/bin/run_tests.sh
@@ -26,23 +26,25 @@ do
 done
 
 function run_sarek() {
-  nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --igenomes_base references $@
+  nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile test,docker -ansi-log false -dump-channels $@
 }
 
 if [[ ALL,GERMLINE =~ $TEST ]]
 then
+  rm -rf data
+  git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
   run_sarek --sample data/testdata/tiny/normal --tools HaplotypeCaller,Strelka --noReports
-  run_sarek --step recalibrate --noReports
+  run_sarek --step recalibrate --sample results/Preprocessing/TSV/duplicateMarked.tsv --noReports
 fi
 
 if [[ ALL,SOMATIC =~ $TEST ]]
 then
-	run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
+	run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
 fi
 
 if [[ ALL,TARGETED =~ $TEST ]]
 then
-	run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED data/testdata/target.bed
+	run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED https://github.com/nf-core/test-datasets/raw/sarek/testdata/target.bed
 fi
 
 if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]]
@@ -57,10 +59,10 @@ then
   then
     ANNOTATOR=merge,snpEFF,VEP
   fi
-  run_sarek --step annotate --tools ${ANNOTATOR} --annotateVCF data/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports
+  run_sarek --step annotate --tools ${ANNOTATOR} --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports
 fi
 
 if [[ MULTIPLE =~ $TEST ]]
 then
-  run_sarek --sample data/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
+  run_sarek --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
 fi
diff --git a/build.nf b/build.nf
@@ -21,14 +21,23 @@ Usage:
       you're reading it
 
 BUILD REFERENCES:
-  nextflow run build.nf [--refdir <pathToDirectory> --outdir <pathToDirectory>]
-    --refdir <Directoy>
-      Specify a directory containing reference files
+  nextflow run build.nf --build --outdir <pathToDirectory> [--offline]
+    --build
+      Will build reference files for smallGRCh37
     --outdir <Directoy>
       Specify an output directory
 
+    --offline
+      Will use data as the source for the reference files
+      Need to do:
+      `git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data`
+      Before transfering the repo to an offline location
+
 DOWNLOAD CACHE:
   nextflow run build.nf --download_cache [--snpEff_cache <pathToSNPEFFcache>] [--vep_cache <pathToVEPcache>]
+                                         [--cadd_cache <pathToCADDcache> --cadd_version <CADD Version>]
+    --download_cache
+      Will download specified cache
     --snpEff_cache <Directoy>
       Specify path to snpEff cache
       If none, will use snpEff version specified in configuration
@@ -54,15 +63,29 @@ DOWNLOAD CACHE:
 // Show help message
 if (params.help) exit 0, helpMessage()
 
-ch_referencesFiles = Channel.fromPath("${params.refdir}/*")
-
 // Default value for params
+params.build = null
+params.offline = null
 params.cadd_cache = null
 params.cadd_version = 'v1.5'
 params.genome = 'smallGRCh37'
 params.snpEff_cache = null
 params.vep_cache = null
 
+ch_referencesFiles = Channel.empty()
+
+if ((params.build) && (params.offline)) ch_referencesFiles = Channel.fromPath("data/reference/*")
+if ((params.build) && (!params.offline)) ch_referencesFiles = ch_referencesFiles.mix(
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase1.indels.b37.small.vcf.gz"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/dbsnp_138.b37.small.vcf.gz"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/human_g1k_v37_decoy.small.fasta.gz"),
+  Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/small.intervals"))
+
+ch_referencesFiles = ch_referencesFiles.dump(tag:'Reference Files')
+
 // Check if genome exists in the config file
 if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
     exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"

diff --git a/conf/test.config b/conf/test.config
@@ -8,18 +8,18 @@
  */
 
 params {
-  config_profile_name = 'Test profile'
   config_profile_description = 'Minimal test dataset to check pipeline function'
+  config_profile_name = 'Test profile'
   // Limit resources so that this can run on Travis
   max_cpus = 2
-  max_memory = 6.GB
+  max_memory = 7.GB
   max_time = 48.h
   // Input data
-  // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
-  // TODO nf-core: Give any required params for the test so that command line flags are not needed
-  singleEnd = false
-  readPaths = [
-    ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']],
-    ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']]
-  ]
+  sample = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv'
+  // Small reference genome
+  // To be built with: `nextflow run build.nf --build -profile docker --outdir references`
+  genome = 'smallGRCh37'
+  igenomes_base = 'references'
+  // Use publishDir mode link so that work can be removed
+  publishDirMode = 'link'
 }
diff --git a/docs/reference.md b/docs/reference.md
@@ -0,0 +1,21 @@
+# Genomes and reference files
+
+## AWS iGenomes
+Sarek uses [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitates storing and sharing references.
+Both `GRCh37` and `GRCh38` are available, with `--genome GRCh37` or `--genome GRCh38` respectively, in any profile that uses the `conf/igenomes.config` file; otherwise, you can load that file explicitly with `-c conf/igenomes.config`.
+
+Sarek currently uses `GRCh38` by default.
+
+Settings in `igenomes.config` can be tailored to your needs.
+
+The [`build.nf`](#buildnf) script is used to build the indexes for the small test reference genome.
+
+Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37.
+
+## build.nf
+
+The `build.nf` script can build the files needed for smallGRCh37.
+
+```
+nextflow run build.nf --build -profile docker --outdir references
+```
diff --git a/main.nf b/main.nf
@@ -93,7 +93,6 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome
 
 // Default value for params
 params.annotateTools = null
-params.annotateVCF = null
 params.annotation_cache = null
 params.cadd_InDels = null
 params.cadd_InDels_tbi = null
@@ -104,10 +103,12 @@ params.noReports = null
 params.nucleotidesPerSecond = 1000.0
 params.sample = null
 params.sequencing_center = null
+params.snpEff_cache = null
 params.step = 'mapping'
 params.strelkaBP = true
 params.targetBED = null
 params.tools = null
+params.vep_cache = null
 
 stepList = defineStepList()
 step = params.step ? params.step.toLowerCase() : ''
@@ -117,7 +118,6 @@ if ( step.contains(',') ) exit 1, 'You can choose only one step, see --help for
 
 tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : []
 annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : []
-annotateVCF = params.annotateVCF ? params.annotateVCF.split(',').collect{it.trim()} : []
 toolList = defineToolList()
 if ( !checkParameterList(tools,toolList) ) exit 1, 'Unknown tool(s), see --help for more information'
 
@@ -148,7 +148,7 @@ ch_output_docs = Channel.fromPath("${baseDir}/docs/output.md")
  */
 
 tsvPath = null
-if (params.sample) if (hasExtension(params.sample,"tsv")) tsvPath = params.sample
+if (params.sample) if (hasExtension(params.sample,"tsv") || hasExtension(params.sample,"vcf") || hasExtension(params.sample,"vcf.gz")) tsvPath = params.sample
 
  // No need for tsv file for step annotate
 if (!params.sample) {
@@ -166,6 +166,7 @@ if (tsvPath) {
         case 'mapping': inputFiles = extractSample(tsvFile); break
         case 'recalibrate': bamFiles = extractRecal(tsvFile); break
         case 'variantcalling': bamFiles = extractBams(tsvFile); break
+        case 'annotate': break
         default: exit 1, "Unknown step ${step}"
     }
 } else if (params.sample) if (!hasExtension(params.sample,"tsv")) {
@@ -174,9 +175,11 @@ if (tsvPath) {
     inputFiles = extractFastqFromDir(params.sample)
     (inputFiles, fastqTmp) = inputFiles.into(2)
     fastqTmp.toList().subscribe onNext: {
-    if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'"
-}
-tsvFile = params.sample  // used in the reports
+        if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'"
+    }
+    tsvFile = params.sample  // used in the reports
+} else if (step == 'annotate') {
+    println "Annotating ${tsvFile}"
 } else exit 1, 'No sample were defined, see --help'
 
 if (step == 'recalibrate') (patientGenders, bamFiles) = extractGenders(bamFiles)
@@ -1558,7 +1561,7 @@ vcfToAnnotate = Channel.create()
 if (step == 'annotate') {
     vcfNotToAnnotate = Channel.create()
 
-    if (annotateVCF == []) {
+    if (tsvPath == []) {
     // Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory
     // Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller
     // Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka}/*.vcf.gz
@@ -1580,7 +1583,7 @@ if (step == 'annotate') {
     } else if (annotateTools == []) {
     // Annotate user-submitted VCFs
     // If user-submitted, Sarek assume that the idSample should be assumed automatically
-      vcfToAnnotate = Channel.fromPath(annotateVCF)
+      vcfToAnnotate = Channel.fromPath(tsvPath)
         .map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]}
     } else exit 1, "specify only tools or files to annotate, not both"
 
@@ -1620,8 +1623,6 @@ process RunSnpeff {
   reducedVCF = reduceVCF(vcf)
   cache = (params.snpEff_cache && params.annotation_cache) ? "-dataDir \${PWD}/${dataDir}" : ""
   """
-  echo ${task.container}
-
   snpEff -Xmx${task.memory.toGiga()}g \
   ${snpeffDb} \
   -csvStats ${reducedVCF}_snpEff.csv \

diff --git a/nextflow.config b/nextflow.config
@@ -9,17 +9,11 @@
 params {
 
   // Workflow flags
-  // TODO nf-core: Specify your pipeline's command line flags
-  reads = "data/*{1,2}.fastq.gz"
-  singleEnd = false
+  genome = 'GRCh38'
   outdir = './results'
+  publishDirMode = 'symlink'
 
   // Boilerplate options
-  publishDirMode = 'symlink'
-  snpEff_cache = ''
-  cadd_version = ''
-  vep_cache = ''
-  genome = 'GRCh38'
   name = false
   multiqc_config = "$baseDir/assets/multiqc_config.yaml"
   email = false