Skip to content

Commit

Permalink
Improve tests and docs (#8)
Browse files Browse the repository at this point in the history
* improve tests + add some docs
  • Loading branch information
maxulysse authored May 14, 2019
1 parent 3c5f236 commit f9e13a4
Show file tree
Hide file tree
Showing 9 changed files with 109 additions and 73 deletions.
9 changes: 3 additions & 6 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,9 @@ jobs:
- setup_remote_docker
- run:
command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
no_output_timeout: 45m
no_output_timeout: 1.5h
- run:
command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
no_output_timeout: 45m

vepgrch38:
docker:
Expand All @@ -77,10 +76,9 @@ jobs:
- setup_remote_docker
- run:
command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
no_output_timeout: 45m
no_output_timeout: 1.5h
- run:
command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
no_output_timeout: 45m

vepgrcm38:
docker:
Expand All @@ -94,10 +92,9 @@ jobs:
- setup_remote_docker
- run:
command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
no_output_timeout: 45m
no_output_timeout: 30m
- run:
command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}
no_output_timeout: 45m

workflows:
version: 2
Expand Down
43 changes: 25 additions & 18 deletions README.md

Large diffs are not rendered by default.

13 changes: 2 additions & 11 deletions bin/build_reference.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash
set -xeuo pipefail

BUILD=false
TEST=ALL
TRAVIS_BUILD_DIR=${TRAVIS_BUILD_DIR:-.}
TRAVIS=${TRAVIS:-false}
Expand All @@ -15,24 +14,16 @@ do
shift # past argument
shift # past value
;;
-b|--build)
BUILD=true
shift # past value
;;
*) # unknown option
shift # past argument
;;
esac
done

# Always download test data
rm -rf data
git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data

# Build references for smallGRCh37
if [[ BUILD ]] && [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]]
if [[ $TEST != ANNOTATESNPEFF ]] && [[ $TEST != ANNOTATEVEP ]]
then
rm -rf references
nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --refdir data/reference --outdir references
nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile test,docker --build --outdir references -ansi-log false -dump-channels
rm -rf .nextflow* references/pipeline_info work
fi
14 changes: 8 additions & 6 deletions bin/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,25 @@ do
done

function run_sarek() {
nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker -ansi-log false --publishDirMode link --max_memory 7.GB --max_cpus 2 -dump-channels --genome smallGRCh37 --igenomes_base references $@
nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile test,docker -ansi-log false -dump-channels $@
}

if [[ ALL,GERMLINE =~ $TEST ]]
then
rm -rf data
git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data
run_sarek --sample data/testdata/tiny/normal --tools HaplotypeCaller,Strelka --noReports
run_sarek --step recalibrate --noReports
run_sarek --step recalibrate --sample results/Preprocessing/TSV/duplicateMarked.tsv --noReports
fi

if [[ ALL,SOMATIC =~ $TEST ]]
then
run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
fi

if [[ ALL,TARGETED =~ $TEST ]]
then
run_sarek --sample data/testdata/tsv/tiny-manta.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED data/testdata/target.bed
run_sarek --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports --targetBED https://github.com/nf-core/test-datasets/raw/sarek/testdata/target.bed
fi

if [[ ALL,ANNOTATEALL,ANNOTATESNPEFF,ANNOTATEVEP =~ $TEST ]]
Expand All @@ -57,10 +59,10 @@ then
then
ANNOTATOR=merge,snpEFF,VEP
fi
run_sarek --step annotate --tools ${ANNOTATOR} --annotateVCF data/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports
run_sarek --step annotate --tools ${ANNOTATOR} --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/vcf/Strelka_1234N_variants.vcf.gz --noReports
fi

if [[ MULTIPLE =~ $TEST ]]
then
run_sarek --sample data/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
run_sarek --sample https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-multiple.tsv --tools FreeBayes,HaplotypeCaller,Manta,Strelka,Mutect2 --noReports
fi
33 changes: 28 additions & 5 deletions build.nf
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,23 @@ Usage:
you're reading it
BUILD REFERENCES:
nextflow run build.nf [--refdir <pathToDirectory> --outdir <pathToDirectory>]
--refdir <Directoy>
Specify a directory containing reference files
nextflow run build.nf --build --outdir <pathToDirectory> [--offline]
--build
Will build reference files for smallGRCh37
--outdir <Directoy>
Specify an output directory
--offline
Will use data as the source for the reference files
Need to do:
`git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data`
Before transfering the repo to an offline location
DOWNLOAD CACHE:
nextflow run build.nf --download_cache [--snpEff_cache <pathToSNPEFFcache>] [--vep_cache <pathToVEPcache>]
[--cadd_cache <pathToCADDcache> --cadd_version <CADD Version>]
--download_cache
Will download specified cache
--snpEff_cache <Directoy>
Specify path to snpEff cache
If none, will use snpEff version specified in configuration
Expand All @@ -54,15 +63,29 @@ DOWNLOAD CACHE:
// Show help message
if (params.help) exit 0, helpMessage()

ch_referencesFiles = Channel.fromPath("${params.refdir}/*")

// Default value for params
params.build = null
params.offline = null
params.cadd_cache = null
params.cadd_version = 'v1.5'
params.genome = 'smallGRCh37'
params.snpEff_cache = null
params.vep_cache = null

ch_referencesFiles = Channel.empty()

if ((params.build) && (params.offline)) ch_referencesFiles = Channel.fromPath("data/reference/*")
if ((params.build) && (!params.offline)) ch_referencesFiles = ch_referencesFiles.mix(
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase1.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/1000G_phase3_20130502_SNP_maf0.3.small.loci.gc"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/Mills_and_1000G_gold_standard.indels.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/dbsnp_138.b37.small.vcf.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/human_g1k_v37_decoy.small.fasta.gz"),
Channel.fromPath("https://github.com/nf-core/test-datasets/raw/sarek/reference/small.intervals"))

ch_referencesFiles = ch_referencesFiles.dump(tag:'Reference Files')

// Check if genome exists in the config file
if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) {
exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}"
Expand Down
18 changes: 9 additions & 9 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,18 @@
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
config_profile_name = 'Test profile'
// Limit resources so that this can run on Travis
max_cpus = 2
max_memory = 6.GB
max_memory = 7.GB
max_time = 48.h
// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
singleEnd = false
readPaths = [
['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']],
['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']]
]
sample = 'https://github.com/nf-core/test-datasets/raw/sarek/testdata/tsv/tiny-manta-https.tsv'
// Small reference genome
// To be built with: `nextflow run build.nf --build -profile docker --outdir references`
genome = 'smallGRCh37'
igenomes_base = 'references'
// Use publishDir mode link so that work can be removed
publishDirMode = 'link'
}
21 changes: 21 additions & 0 deletions docs/reference.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Genomes and reference files

## AWS iGenomes
Sarek uses [AWS iGenomes](https://ewels.github.io/AWS-iGenomes/), which facilitates storing and sharing references.
Both `GRCh37` and `GRCh38` are available, via `--genome GRCh37` or `--genome GRCh38` respectively, with any profile that uses the `conf/igenomes.config` file; alternatively, you can load that file explicitly with `-c conf/igenomes.config`.

Sarek currently uses `GRCh38` by default.

Settings in `igenomes.config` can be tailored to your needs.

The [`build.nf`](#buildnf) script is used to build the indexes for the small test reference genome.

Use `--genome smallGRCh37` to map against a small reference genome based on GRCh37.

## build.nf

The `build.nf` script can build the files needed for smallGRCh37.

```
nextflow run build.nf
```
21 changes: 11 additions & 10 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome

// Default value for params
params.annotateTools = null
params.annotateVCF = null
params.annotation_cache = null
params.cadd_InDels = null
params.cadd_InDels_tbi = null
Expand All @@ -104,10 +103,12 @@ params.noReports = null
params.nucleotidesPerSecond = 1000.0
params.sample = null
params.sequencing_center = null
params.snpEff_cache = null
params.step = 'mapping'
params.strelkaBP = true
params.targetBED = null
params.tools = null
params.vep_cache = null

stepList = defineStepList()
step = params.step ? params.step.toLowerCase() : ''
Expand All @@ -117,7 +118,6 @@ if ( step.contains(',') ) exit 1, 'You can choose only one step, see --help for

tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : []
annotateTools = params.annotateTools ? params.annotateTools.split(',').collect{it.trim().toLowerCase()} : []
annotateVCF = params.annotateVCF ? params.annotateVCF.split(',').collect{it.trim()} : []
toolList = defineToolList()
if ( !checkParameterList(tools,toolList) ) exit 1, 'Unknown tool(s), see --help for more information'

Expand Down Expand Up @@ -148,7 +148,7 @@ ch_output_docs = Channel.fromPath("${baseDir}/docs/output.md")
*/

tsvPath = null
if (params.sample) if (hasExtension(params.sample,"tsv")) tsvPath = params.sample
if (params.sample) if (hasExtension(params.sample,"tsv") || hasExtension(params.sample,"vcf") || hasExtension(params.sample,"vcf.gz")) tsvPath = params.sample

// No need for tsv file for step annotate
if (!params.sample) {
Expand All @@ -166,6 +166,7 @@ if (tsvPath) {
case 'mapping': inputFiles = extractSample(tsvFile); break
case 'recalibrate': bamFiles = extractRecal(tsvFile); break
case 'variantcalling': bamFiles = extractBams(tsvFile); break
case 'annotate': break
default: exit 1, "Unknown step ${step}"
}
} else if (params.sample) if (!hasExtension(params.sample,"tsv")) {
Expand All @@ -174,9 +175,11 @@ if (tsvPath) {
inputFiles = extractFastqFromDir(params.sample)
(inputFiles, fastqTmp) = inputFiles.into(2)
fastqTmp.toList().subscribe onNext: {
if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'"
}
tsvFile = params.sample // used in the reports
if (it.size() == 0) exit 1, "No FASTQ files found in --sample directory '${params.sample}'"
}
tsvFile = params.sample // used in the reports
} else if (step == 'annotate') {
println "Annotating ${tsvFile}"
} else exit 1, 'No sample were defined, see --help'

if (step == 'recalibrate') (patientGenders, bamFiles) = extractGenders(bamFiles)
Expand Down Expand Up @@ -1558,7 +1561,7 @@ vcfToAnnotate = Channel.create()
if (step == 'annotate') {
vcfNotToAnnotate = Channel.create()

if (annotateVCF == []) {
if (tsvPath == []) {
// Sarek, by default, annotates all available vcfs that it can find in the VariantCalling directory
// Excluding vcfs from FreeBayes, and g.vcf from HaplotypeCaller
// Basically it's: VariantCalling/*/{HaplotypeCaller,Manta,MuTect2,Strelka}/*.vcf.gz
Expand All @@ -1580,7 +1583,7 @@ if (step == 'annotate') {
} else if (annotateTools == []) {
// Annotate user-submitted VCFs
// If user-submitted, Sarek assumes that the idSample is derived automatically from the VCF path
vcfToAnnotate = Channel.fromPath(annotateVCF)
vcfToAnnotate = Channel.fromPath(tsvPath)
.map{vcf -> ['userspecified', vcf.minus(vcf.fileName)[-2].toString(), vcf]}
} else exit 1, "specify only tools or files to annotate, not both"

Expand Down Expand Up @@ -1620,8 +1623,6 @@ process RunSnpeff {
reducedVCF = reduceVCF(vcf)
cache = (params.snpEff_cache && params.annotation_cache) ? "-dataDir \${PWD}/${dataDir}" : ""
"""
echo ${task.container}
snpEff -Xmx${task.memory.toGiga()}g \
${snpeffDb} \
-csvStats ${reducedVCF}_snpEff.csv \
Expand Down
10 changes: 2 additions & 8 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,11 @@
params {

// Workflow flags
// TODO nf-core: Specify your pipeline's command line flags
reads = "data/*{1,2}.fastq.gz"
singleEnd = false
genome = 'GRCh38'
outdir = './results'
publishDirMode = 'symlink'

// Boilerplate options
publishDirMode = 'symlink'
snpEff_cache = ''
cadd_version = ''
vep_cache = ''
genome = 'GRCh38'
name = false
multiqc_config = "$baseDir/assets/multiqc_config.yaml"
email = false
Expand Down

0 comments on commit f9e13a4

Please sign in to comment.