Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: Use annotation-cache and update VEP to v110 #1122

Merged
merged 24 commits into from
Aug 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1158](https://github.com/nf-core/sarek/pull/1158) - Add preprint
- [#1159](https://github.com/nf-core/sarek/pull/1159) - ISMB Poster
- [#1173](https://github.com/nf-core/sarek/pull/1173) - CI tests for VQSR track with stub runs
- [#1122](https://github.com/nf-core/sarek/pull/1122) - Add `annotation cache` functionality

### Changed

Expand Down
40 changes: 8 additions & 32 deletions conf/igenomes.config
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,9 @@ params {
mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem"
snpeff_db = 87
snpeff_genome = 'GRCh37'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh37'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'GATK.GRCh38' {
ascat_alleles = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_alleles_hg38.zip"
Expand Down Expand Up @@ -74,34 +72,28 @@ params {
pon_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz.tbi"
snpeff_db = 105
snpeff_genome = 'GRCh38'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh38'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'Ensembl.GRCh37' {
bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa"
readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt"
snpeff_db = 87
snpeff_genome = 'GRCh37'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh37'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'NCBI.GRCh38' {
bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa"
snpeff_db = 105
snpeff_genome = 'GRCh38'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh38'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'CHM13' {
fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa"
Expand All @@ -123,11 +115,9 @@ params {
readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt"
snpeff_db = 99
snpeff_genome = 'GRCm38'
snpeff_version = '5.1'
vep_cache_version = 102
vep_genome = 'GRCm38'
vep_species = 'mus_musculus'
vep_version = '108.2'
}
'TAIR10' {
bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/"
Expand All @@ -145,34 +135,28 @@ params {
readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt"
snpeff_db = 75
snpeff_genome = 'UMD3.1'
snpeff_version = '5.1'
vep_cache_version = 94
vep_genome = 'UMD3.1'
vep_species = 'bos_taurus'
vep_version = '108.2'
}
'WBcel235' {
bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa"
snpeff_db = 105
snpeff_genome = 'WBcel235'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'WBcel235'
vep_species = 'caenorhabditis_elegans'
vep_version = '108.2'
}
'CanFam3.1' {
bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa"
readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt"
snpeff_db = 99
snpeff_genome = 'CanFam3.1'
snpeff_version = '5.1'
vep_cache_version = 104
vep_genome = 'CanFam3.1'
vep_species = 'canis_lupus_familiaris'
vep_version = '108.2'
}
'GRCz10' {
bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/"
Expand Down Expand Up @@ -228,11 +212,9 @@ params {
fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa"
snpeff_db = 105
snpeff_genome = 'R64-1-1'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'R64-1-1'
vep_species = 'saccharomyces_cerevisiae'
vep_version = '108.2'
}
'EF2' {
bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/"
Expand All @@ -258,35 +240,29 @@ params {
fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa"
snpeff_db = 105
snpeff_genome = 'GRCh38'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh38'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'hg19' {
bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa"
readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt"
snpeff_db = 87
snpeff_genome = 'GRCh37'
snpeff_version = '5.1'
vep_cache_version = 108
vep_cache_version = 110
vep_genome = 'GRCh37'
vep_species = 'homo_sapiens'
vep_version = '108.2'
}
'mm10' {
bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/"
fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa"
readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt"
snpeff_db = 99
snpeff_genome = 'GRCm38'
snpeff_version = '5.1'
maxulysse marked this conversation as resolved.
Show resolved Hide resolved
vep_cache_version = 102
vep_genome = 'GRCm38'
vep_species = 'mus_musculus'
vep_version = '108.2'
}
'bosTau8' {
bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/"
Expand Down
2 changes: 0 additions & 2 deletions conf/modules/annotate.config
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ process {
withName: 'SNPEFF_SNPEFF' {
ext.prefix = { vcf.baseName - ".vcf" + "_snpEff" }
ext.args = '-nodownload -canon -v'
if (!params.snpeff_cache && !params.download_cache) container = { params.snpeff_genome ? "docker.io/nfcore/snpeff:${params.snpeff_version}.${params.snpeff_genome}" : "docker.io/nfcore/snpeff:${params.snpeff_version}.${params.genome}" }
publishDir = [
[
mode: params.publish_dir_mode,
Expand All @@ -46,7 +45,6 @@ process {
].join(' ').trim() }
// If just VEP: <vcf prefix>_VEP.ann.vcf
ext.prefix = { vcf.baseName - ".vcf" + "_VEP.ann" }
if (!params.vep_cache && !params.download_cache) container = { params.vep_genome ? "docker.io/nfcore/vep:${params.vep_version}.${params.vep_genome}" : "docker.io/nfcore/vep:${params.vep_version}.${params.genome}" }
publishDir = [
[
mode: params.publish_dir_mode,
Expand Down
8 changes: 4 additions & 4 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ params {
germline_resource = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz"
intervals = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.interval_list"
known_indels = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz"
snpeff_cache = null
FriederikeHanssen marked this conversation as resolved.
Show resolved Hide resolved
snpeff_db = 105
snpeff_genome = 'WBcel235'
snpeff_version = '5.1'
vep_cache_version = 106
vep_cache = null
vep_cache_version = 110
vep_genome = 'WBcel235'
vep_species = 'caenorhabditis_elegans'
vep_version = '106.1'

// default params
split_fastq = 0 // no FASTQ splitting
tools = 'strelka' // Variant calling with Strelka

// Ignore params that will throw warning through params validation
validationSchemaIgnoreParams = 'genomes,snpeff_version,vep_version'
validationSchemaIgnoreParams = 'genomes'
}

process {
Expand Down
8 changes: 4 additions & 4 deletions conf/test/cache.config
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,20 @@ params {
germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_vcf_gz']
intervals = params.test_data['homo_sapiens']['genome']['genome_interval_list']
known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_vcf_gz']
snpeff_cache = null
snpeff_db = 105
snpeff_genome = 'WBcel235'
snpeff_version = '5.1'
vep_cache_version = 106
vep_cache = null
vep_cache_version = 110
vep_genome = 'WBcel235'
vep_species = 'caenorhabditis_elegans'
vep_version = '106.1'

// default params
split_fastq = 0 // no FASTQ splitting
tools = 'strelka' // Variant calling with Strelka

// Ignore params that will throw warning through params validation
validationSchemaIgnoreParams = 'genomes,test_data,snpeff_version,vep_version'
validationSchemaIgnoreParams = 'genomes,test_data'
}

process {
Expand Down
2 changes: 0 additions & 2 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,9 @@ params.pon = WorkflowMain.getGenomeAttribute(params, 'pon')
params.pon_tbi = WorkflowMain.getGenomeAttribute(params, 'pon_tbi')
params.snpeff_db = WorkflowMain.getGenomeAttribute(params, 'snpeff_db')
params.snpeff_genome = WorkflowMain.getGenomeAttribute(params, 'snpeff_genome')
params.snpeff_version = WorkflowMain.getGenomeAttribute(params, 'snpeff_version')
params.vep_cache_version = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version')
params.vep_genome = WorkflowMain.getGenomeAttribute(params, 'vep_genome')
params.vep_species = WorkflowMain.getGenomeAttribute(params, 'vep_species')
params.vep_version = WorkflowMain.getGenomeAttribute(params, 'vep_version')

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
39 changes: 20 additions & 19 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -73,24 +73,25 @@ params {
sentieon_haplotyper_emit_mode = "variant" // default value for Sentieon haplotyper

// Annotation
vep_out_format = "vcf"
vep_custom_args = "--everything --filter_common --per_gene --total_length --offline --format vcf" // Default arguments for VEP
vep_dbnsfp = null // dbnsfp plugin disabled within VEP
dbnsfp = null // No dbnsfp processed file
dbnsfp_tbi = null // No dbnsfp processed file index
dbnsfp_consequence = null // No default consequence for dbnsfp plugin
dbnsfp_fields = "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF" // Default fields for dbnsfp plugin
vep_loftee = null // loftee plugin disabled within VEP
vep_spliceai = null // spliceai plugin disabled within VEP
spliceai_snv = null // No spliceai_snv file
spliceai_snv_tbi = null // No spliceai_snv file index
spliceai_indel = null // No spliceai_indel file
spliceai_indel_tbi = null // No spliceai_indel file index
vep_spliceregion = null // spliceregion plugin disabled within VEP
outdir_cache = null // No output directory for cache
snpeff_cache = null // No directory for snpEff cache
vep_cache = null // No directory for VEP cache
vep_include_fasta = false // Don't use fasta file for annotation with VEP
dbnsfp = null // No dbnsfp processed file
dbnsfp_consequence = null // No default consequence for dbnsfp plugin
dbnsfp_fields = "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF" // Default fields for dbnsfp plugin
dbnsfp_tbi = null // No dbnsfp processed file index
outdir_cache = null // No default outdir cache
snpeff_cache = 's3://annotation-cache/snpeff_cache/'
spliceai_indel = null // No spliceai_indel file
spliceai_indel_tbi = null // No spliceai_indel file index
spliceai_snv = null // No spliceai_snv file
spliceai_snv_tbi = null // No spliceai_snv file index
use_annotation_cache_keys = true
vep_cache = 's3://annotation-cache/vep_cache/'
vep_custom_args = "--everything --filter_common --per_gene --total_length --offline --format vcf" // Default arguments for VEP
vep_dbnsfp = null // dbnsfp plugin disabled within VEP
vep_include_fasta = false // Don't use fasta file for annotation with VEP
vep_loftee = null // loftee plugin disabled within VEP
vep_out_format = "vcf"
vep_spliceai = null // spliceai plugin disabled within VEP
vep_spliceregion = null // spliceregion plugin disabled within VEP

// MultiQC options
multiqc_config = null
Expand Down Expand Up @@ -128,7 +129,7 @@ params {
// Schema validation default options
validationFailUnrecognisedParams = false
validationLenientMode = true
validationSchemaIgnoreParams = 'genomes,snpeff_version,vep_version,cf_ploidy'
validationSchemaIgnoreParams = 'genomes,cf_ploidy'
validationShowHiddenParams = false
validate_params = true
}
Expand Down
22 changes: 8 additions & 14 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -508,16 +508,24 @@
"hidden": true,
"help_text": "Using this params you can add custom args to VEP."
},
"use_annotation_cache_keys": {
"type": "boolean",
"fa_icon": "fas fa-toolbox",
"description": "Use annotation cache keys for snpeff_cache and vep_cache.",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get a help text with a bit more explanation?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, I am now unsure whether this refers to the new S3 bucket or to a cache that I would define locally.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, basically that's because of how snpEff and VEP store data: they're made for HPC, and I'm storing it in the cloud.
The plan is to elaborate on that in the docs.
But I need to build a website for that first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok awesome :)

"hidden": true
},
"snpeff_cache": {
"type": "string",
"fa_icon": "fas fa-file",
"default": "s3://annotation-cache/snpeff_cache/",
"description": "Path to snpEff cache.",
"help_text": "To be used with `--annotation_cache`.",
"hidden": true
},
"vep_cache": {
"type": "string",
"fa_icon": "fas fa-file",
"default": "s3://annotation-cache/vep_cache/",
"description": "Path to VEP cache.",
"help_text": "To be used with `--annotation_cache`.",
"hidden": true
Expand Down Expand Up @@ -729,13 +737,6 @@
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.",
"hidden": true
},
"snpeff_version": {
"type": "string",
"fa_icon": "fas fa-tag",
"description": "snpEff version.",
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the snpeff version when using the container with pre-downloaded cache.",
"hidden": true
},
"vep_genome": {
"type": "string",
"fa_icon": "fas fa-microscope",
Expand All @@ -757,13 +758,6 @@
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively, the cache version can be used to specify the correct Ensembl Genomes version number, as these differ from the concurrent Ensembl/VEP version numbers.",
"hidden": true
},
"vep_version": {
"type": "string",
"fa_icon": "fas fa-tag",
"description": "VEP version.",
"help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the VEP version when using the container with pre-downloaded cache.",
"hidden": true
},
"save_reference": {
"type": "boolean",
"fa_icon": "fas fa-download",
Expand Down
26 changes: 0 additions & 26 deletions tests/test_annotation_cache.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,3 @@
- path: results/cache/vep_cache
- path: results/annotation
should_exist: false
- name: Download annotation cache and annotate using snpEff and VEP
command: nextflow run main.nf -profile test_cache,annotation --tools snpeff,vep --download_cache --outdir results
tags:
- annotation
- cache
files:
- path: results/multiqc
- path: results/cache/snpeff_cache
- path: results/cache/vep_cache
- path: results/annotation/test/test_VEP.ann.vcf.gz
# conda changes md5sums for test
- path: results/annotation/test/test_VEP.ann.vcf.gz.tbi
# conda changes md5sums for test
- path: results/annotation/test/test_snpEff.ann.vcf.gz
# conda changes md5sums for test
- path: results/annotation/test/test_snpEff.ann.vcf.gz.tbi
# conda changes md5sums for test
- path: results/multiqc
- path: results/reports/EnsemblVEP/test/test_VEP.ann.summary.html
# text-based file changes md5sums on reruns
- path: results/reports/snpeff/test/snpEff_summary.html
# text-based file changes md5sums on reruns
- path: results/reports/snpeff/test/test_snpEff.csv
# text-based file changes md5sums on reruns
- path: results/reports/snpeff/test/test_snpEff.genes.txt
md5sum: 130536bf0237d7f3f746d32aaa32840a
Loading