From 440d2f93b642490a9d0e4e3514f6416784eb2171 Mon Sep 17 00:00:00 2001 From: Rike Date: Sun, 17 Jul 2022 17:22:55 +0200 Subject: [PATCH 1/6] have freebayes only use a single core, parallel implementation isn't adding anything in our setup --- conf/base.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/base.config b/conf/base.config index 5c87237b50..a98a1dec58 100644 --- a/conf/base.config +++ b/conf/base.config @@ -53,4 +53,7 @@ process { memory = { check_max( 60.GB * task.attempt, 'memory' ) } time = { check_max( 48.h * task.attempt, 'time' ) } } + withName: 'FREEBAYES' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + } } From 9f6ee7b4182de3ccd88864ae67c5e03e36f78fed Mon Sep 17 00:00:00 2001 From: Rike Date: Tue, 19 Jul 2022 12:05:24 +0200 Subject: [PATCH 2/6] resources that worked for matched ICGC WGS data --- conf/base.config | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index a98a1dec58..7b588f2dd9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -49,11 +49,28 @@ process { errorStrategy = 'retry' maxRetries = 2 } - withName: 'BWAMEM2_MEM' { - memory = { check_max( 60.GB * task.attempt, 'memory' ) } + withName: 'BWAMEM1_MEM|BWAMEM2_MEM' { + cpus = { check_max( 24 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } time = { check_max( 48.h * task.attempt, 'time' ) } } - withName: 'FREEBAYES' { + withName: 'FASTP'{ + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + } + withName:'FASTQC|FASTP|MOSDEPTH|SAMTOOLS_CONVERT|SAMTOOLS_MERGE'{ + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'APPLYBQSR|APPLYBQSR_SPARK|BASERECALIBRATOR|SAMTOOLS_STATS'{ + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + } + withName:'APPLYBQSR|APPLYBQSR_SPARK|BASERECALIBRATOR|GATHERBQSRREPORTS'{ + memory = { check_max( 46.GB * task.attempt, 'memory' ) } + } + withName: 'MARKDUPLICATES'{ + memory = { check_max( 300.GB * 
task.attempt, 'memory' ) } + } + withName: 'FREEBAYES|SAMTOOLS_STATS|SAMTOOLS_INDEX|UNZIP' { cpus = { check_max( 1 * task.attempt, 'cpus' ) } } + } From 1fcd53e88fdae567365b5c255b0908bb36ce0569 Mon Sep 17 00:00:00 2001 From: Rike Date: Tue, 19 Jul 2022 12:05:51 +0200 Subject: [PATCH 3/6] reduce resource requests in local modules --- modules/local/build_intervals/main.nf | 1 - modules/local/create_intervals_bed/main.nf | 1 - 2 files changed, 2 deletions(-) diff --git a/modules/local/build_intervals/main.nf b/modules/local/build_intervals/main.nf index 38fbb36eec..77f4a5b826 100644 --- a/modules/local/build_intervals/main.nf +++ b/modules/local/build_intervals/main.nf @@ -1,6 +1,5 @@ process BUILD_INTERVALS { tag "$fasta_fai" - label 'process_medium' conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf index 08bb6eb5e6..50376259e5 100644 --- a/modules/local/create_intervals_bed/main.nf +++ b/modules/local/create_intervals_bed/main.nf @@ -1,6 +1,5 @@ process CREATE_INTERVALS_BED { tag "$intervals" - label 'process_medium' conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null) container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
From e91c85ba32e34c90bbeaae118ec70131d58d0092 Mon Sep 17 00:00:00 2001 From: Rike Date: Tue, 19 Jul 2022 12:09:22 +0200 Subject: [PATCH 4/6] fix naming --- conf/base.config | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index 7b588f2dd9..13decc5358 100644 --- a/conf/base.config +++ b/conf/base.config @@ -60,13 +60,13 @@ process { withName:'FASTQC|FASTP|MOSDEPTH|SAMTOOLS_CONVERT|SAMTOOLS_MERGE'{ memory = { check_max( 4.GB * task.attempt, 'memory' ) } } - withName:'APPLYBQSR|APPLYBQSR_SPARK|BASERECALIBRATOR|SAMTOOLS_STATS'{ + withName:'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK|GATK4_BASERECALIBRATOR|SAMTOOLS_STATS'{ cpus = { check_max( 4 * task.attempt, 'cpus' ) } } - withName:'APPLYBQSR|APPLYBQSR_SPARK|BASERECALIBRATOR|GATHERBQSRREPORTS'{ + withName:'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK|GATK4_BASERECALIBRATOR|GATK4_GATHERBQSRREPORTS'{ memory = { check_max( 46.GB * task.attempt, 'memory' ) } } - withName: 'MARKDUPLICATES'{ + withName: 'GATK4_MARKDUPLICATES'{ memory = { check_max( 300.GB * task.attempt, 'memory' ) } } withName: 'FREEBAYES|SAMTOOLS_STATS|SAMTOOLS_INDEX|UNZIP' { From ba1096adc1d08b5471d86cc5312914dd71ac264b Mon Sep 17 00:00:00 2001 From: Rike Date: Tue, 19 Jul 2022 12:11:04 +0200 Subject: [PATCH 5/6] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ed0dc39ca3..0e47008dda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#632](https://github.com/nf-core/sarek/pull/632) - Added params `--snpeff_version` to allow more configuration on the snpeff container definition - [#632](https://github.com/nf-core/sarek/pull/632) - Added params `--vep_include_fasta` to use the fasta file for annotation - [#639](https://github.com/nf-core/sarek/pull/639) - Adding genes-txt-file and summary-html-file to the published output 
from snpEff. +- [#647](https://github.com/nf-core/sarek/pull/647) - Update resource requests for preprocessing based on what worked for 5 ICGC matched WGS samples ### Changed From c7c4d279c40d8cbe927f20ba33451e1d00d54085 Mon Sep 17 00:00:00 2001 From: Rike Date: Tue, 19 Jul 2022 12:25:53 +0200 Subject: [PATCH 6/6] also add some docs on the resource requests --- docs/usage.md | 233 ++++++++++++++++++++++++++------------------------ 1 file changed, 119 insertions(+), 114 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 91b209e509..2318e7e66b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -581,10 +581,6 @@ This list is by no means exhaustive and it will depend on the specific analysis | [Control-FREEC](https://github.com/BoevaLab/FREEC) | x | x | x | - | x | x | | [MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) | x | x | x | - | - | x | -## How to create a panel-of-normals for Mutect2 - -For a detailed tutorial on how to create a panel-of-normals, see [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132). - ## How to run ASCAT with WES _under construction_ @@ -610,39 +606,91 @@ Then, you can derive both loci (just chromosome and position) and allele files ( For further reading and documentation, please take a look at the Battenberg repository. --> -## Where do the used reference genomes originate from +## What are the bwa/bwa-mem2 parameters? 
-_under construction - help needed_ +For mapping, sarek follows the parameter suggestions provided in this [paper](https://www.nature.com/articles/s41467-018-06159-4): -GATK.GRCh38: +`-K 100000000` : for deterministic pipeline results, for more info see [here](https://github.com/CCDG/Pipeline-Standardization/issues/2) -| File | Tools | Origin | Docs | -| :-------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------- | -| ascat_alleles | ASCAT | https://www.dropbox.com/s/uouszfktzgoqfy7/G1000_alleles_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci_gc | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci_rt | ASCAT | https://www.dropbox.com/s/xlp99uneqh6nh6p/RT_G1000_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| bwa | bwa-mem | bwa index -p bwa/${fasta.baseName} $fasta | | -| bwamem2 | bwa-mem2 | bwa-mem2 index -p bwamem2/${fasta} $fasta | | -| dragmap | DragMap | dragen-os --build-hash-table true --ht-reference $fasta 
--output-directory dragmap | | -| dbsnp | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | possibly from an old ftp server dbsnp_146.hg38.vcf.gz | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| dbsnp_tbi | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | | | -| dict | Baserecalibrator(Spark), CNNScoreVariant, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, MarkDulpicates(Spark), MergeVCFs, Mutect2, Variantrecalibrator | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| fasta | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, snpEff, Strelka, Tiddit, Variantrecalibrator | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| fasta_fai | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, snpEff, Strelka, Tiddit, Variantrecalibrator | 
https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| germline_resource | GetPileupsummaries,Mutect2 | ? gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz" | | -| germline_resource_tbi | GetPileupsummaries,Mutect2 | ? gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz.tbi" | | -| intervals | ApplyBQSR(Spark), ASCAT, Baserecalibraotr(Spark), BCFTools, CNNScoreVariants, ControlFREEC, Deepvariant, FilterVariantTranches, FreeBayes, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, Strelka, mpileup, MSISensorPro, Mutect2, VCFTools | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list | | -| known_indels | BaseRecalibrator(Spark), FilterVariantTranches | https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz,beta/Homo_sapiens_assembly38.known_indels}.vcf. 
| | -| known_indels_tbi | BaseRecalibrator(Spark), FilterVariantTranches | https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" | | -| mappability | ControlFREEC | http://xfer.curie.fr/get/vyIi4w8EONl/out100m2_hg38.zip | http://boevalab.inf.ethz.ch/FREEC/tutorial.html | -| pon | Mutect2 | https://console.cloud.google.com/storage/browser/_details/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | -| pon_tbi | Mutect2 | https://console.cloud.google.com/storage/browser/_details/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz.tbi | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | -| snpeff_db | | 'GRCh38.99' | | -| snpeff_genome | | 'GRCh38' | | -| vep_cache_version | | 105 | | -| vep_genome | | 'GRCh38' | | -| chr_dir | | "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes" | | +`-Y`: force soft-clipping rather than default hard-clipping of supplementary alignments + +In addition, currently the mismatch penalty for reads with tumor status in the sample sheet are mapped with a mismatch penalty of `-B 3`. + +## MultiQC related issues + +### Plots for SnpEff are missing + +When plots are missing, it is possible that the fasta and the custom SnpEff database are not matching https://pcingola.github.io/SnpEff/se_faq/#error_chromosome_not_found-details. +The SnpEff completes without throwing an error causing nextflow to complete successfully. 
An indication for the error are these lines in the `.command` files: + +``` +ERRORS: Some errors were detected +Error type Number of errors +ERROR_CHROMOSOME_NOT_FOUND 17522411 +``` + +## How to create a panel-of-normals for Mutect2 + +For a detailed tutorial on how to create a panel-of-normals, see [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132). + +## Spark related issues + +If you have problems running processes that make use of Spark such as `MarkDuplicates`. +You are probably experiencing issues with the limit of open files in your system. +You can check your current limit by typing the following: + +```bash +ulimit -n +``` + +The default limit size is usually 1024 which is quite low to run Spark jobs. +In order to increase the size limit permanently you can: + +Edit the file `/etc/security/limits.conf` and add the lines: + +```bash +* soft nofile 65535 +* hard nofile 65535 +``` + +Edit the file `/etc/sysctl.conf` and add the line: + +```bash +fs.file-max = 65535 +``` + +Edit the file `/etc/sysconfig/docker` and add the new limits to OPTIONS like this: + +```bash +OPTIONS=”—default-ulimit nofile=65535:65535" +``` + +Re-start your session. + +Note that the way to increase the open file limit in your system may be slightly different or require additional steps. + +### Cannot delete work folder when using docker + Spark + +Currently, when running spark-based tools in combination with docker, it is required to set `docker.userEmulation = false`. This can unfortunately causes permission issues when `work/` is being written with root permissions. In case this happens, you might need to configure docker to run without `userEmulation` (see [here](https://github.com/Midnighter/nf-core-adr/blob/main/docs/adr/0008-refrain-from-using-docker-useremulation-in-nextflow.md)). + +## How to handle UMIs + +Sarek can process UMI-reads, using [fgbio](http://fulcrumgenomics.github.io/fgbio/tools/latest/) tools. 
+ +In order to use reads containing UMI tags as your initial input, you need to include `--umi_read_structure [structure]` in your parameters. + +This will enable pre-processing of the reads and UMI consensus reads calling, which will then be used to continue the workflow from the mapping steps. For post-UMI processing depending on the experimental setup, duplicate marking and base quality recalibration can be skipped with [`--skip_tools`]. + +### UMI Read Structure + +This parameter is a string, which follows a [convention](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) to describe the structure of the umi. +If your reads contain a UMI only on one end, the string should only represent one structure (i.e. "2M11S+T"); should your reads contain a UMI on both ends, the string will contain two structures separated by a blank space (i.e. "2M11S+T 2M11S+T"). + +### Limitations and future updates + +Recent updates to Samtools have been introduced, which can speed-up performance of fgbio tools used in this workflow. +The current workflow does not handle duplex UMIs (i.e. where opposite strands of a duplex molecule have been tagged with a different UMI), and best practices have been proposed to process this type of data. +Both changes will be implemented in a future release. 
## How to run sarek when no(t all) reference files are in igenomes @@ -674,6 +722,40 @@ Example for not using known indels, but all other provided reference file: nextflow run nf-core/sarek --known_indels false --genome GRCh38.GATK ``` +### Where do the used reference genomes originate from + +_under construction - help needed_ + +GATK.GRCh38: + +| File | Tools | Origin | Docs | +| :-------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------- | +| ascat_alleles | ASCAT | https://www.dropbox.com/s/uouszfktzgoqfy7/G1000_alleles_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci_gc | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci_rt | ASCAT | https://www.dropbox.com/s/xlp99uneqh6nh6p/RT_G1000_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| bwa | bwa-mem | bwa index -p bwa/${fasta.baseName} $fasta | | +| bwamem2 | bwa-mem2 | bwa-mem2 index -p bwamem2/${fasta} $fasta | | +| dragmap | DragMap | dragen-os --build-hash-table true --ht-reference 
$fasta --output-directory dragmap | | +| dbsnp | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | possibly from an old ftp server dbsnp_146.hg38.vcf.gz | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| dbsnp_tbi | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | | | +| dict | Baserecalibrator(Spark), CNNScoreVariant, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, MarkDulpicates(Spark), MergeVCFs, Mutect2, Variantrecalibrator | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| fasta | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, snpEff, Strelka, Tiddit, Variantrecalibrator | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| fasta_fai | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, snpEff, Strelka, Tiddit, Variantrecalibrator | 
https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| germline_resource | GetPileupsummaries,Mutect2 | ? gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz" | | +| germline_resource_tbi | GetPileupsummaries,Mutect2 | ? gnomAD.r2.1.1.GRCh38.PASS.AC.AF.only.vcf.gz.tbi" | | +| intervals | ApplyBQSR(Spark), ASCAT, Baserecalibraotr(Spark), BCFTools, CNNScoreVariants, ControlFREEC, Deepvariant, FilterVariantTranches, FreeBayes, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, Strelka, mpileup, MSISensorPro, Mutect2, VCFTools | https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/wgs_calling_regions.hg38.interval_list | | +| known_indels | BaseRecalibrator(Spark), FilterVariantTranches | https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz,beta/Homo_sapiens_assembly38.known_indels}.vcf. 
| | +| known_indels_tbi | BaseRecalibrator(Spark), FilterVariantTranches | https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" | | +| mappability | ControlFREEC | http://xfer.curie.fr/get/vyIi4w8EONl/out100m2_hg38.zip | http://boevalab.inf.ethz.ch/FREEC/tutorial.html | +| pon | Mutect2 | https://console.cloud.google.com/storage/browser/_details/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | +| pon_tbi | Mutect2 | https://console.cloud.google.com/storage/browser/_details/gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz.tbi | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | +| snpeff_db | | 'GRCh38.99' | | +| snpeff_genome | | 'GRCh38' | | +| vep_cache_version | | 105 | | +| vep_genome | | 'GRCh38' | | +| chr_dir | | "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes" | | + ## How to customise SnpEff and VEP annotation _under construction help needed_ @@ -769,87 +851,10 @@ nextflow run download_cache.nf --cadd_cache --cadd_version #### SpliceRegions -## What are the bwa/bwa-mem2 parameters? - -For mapping, sarek follows the parameter suggestions provided in this [paper](https://www.nature.com/articles/s41467-018-06159-4): - -`-K 100000000` : for deterministic pipeline results, for more info see [here](https://github.com/CCDG/Pipeline-Standardization/issues/2) - -`-Y`: force soft-clipping rather than default hard-clipping of supplementary alignments - -In addition, currently the mismatch penalty for reads with tumor status in the sample sheet are mapped with a mismatch penalty of `-B 3`. - -## Spark related issues - -If you have problems running processes that make use of Spark such as `MarkDuplicates`. 
-You are probably experiencing issues with the limit of open files in your system. -You can check your current limit by typing the following: - -```bash -ulimit -n -``` - -The default limit size is usually 1024 which is quite low to run Spark jobs. -In order to increase the size limit permanently you can: - -Edit the file `/etc/security/limits.conf` and add the lines: - -```bash -* soft nofile 65535 -* hard nofile 65535 -``` - -Edit the file `/etc/sysctl.conf` and add the line: +## Requested resources for the tools -```bash -fs.file-max = 65535 -``` - -Edit the file `/etc/sysconfig/docker` and add the new limits to OPTIONS like this: - -```bash -OPTIONS=”—default-ulimit nofile=65535:65535" -``` - -Re-start your session. - -Note that the way to increase the open file limit in your system may be slightly different or require additional steps. - -### Cannot delete work folder when using docker + Spark - -Currently, when running spark-based tools in combination with docker, it is required to set `docker.userEmulation = false`. This can unfortunately causes permission issues when `work/` is being written with root permissions. In case this happens, you might need to configure docker to run without `userEmulation` (see [here](https://github.com/Midnighter/nf-core-adr/blob/main/docs/adr/0008-refrain-from-using-docker-useremulation-in-nextflow.md)). - -## How to handle UMIs - -Sarek can process UMI-reads, using [fgbio](http://fulcrumgenomics.github.io/fgbio/tools/latest/) tools. - -In order to use reads containing UMI tags as your initial input, you need to include `--umi_read_structure [structure]` in your parameters. - -This will enable pre-processing of the reads and UMI consensus reads calling, which will then be used to continue the workflow from the mapping steps. For post-UMI processing depending on the experimental setup, duplicate marking and base quality recalibration can be skipped with [`--skip_tools`]. 
- -### UMI Read Structure - -This parameter is a string, which follows a [convention](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) to describe the structure of the umi. -If your reads contain a UMI only on one end, the string should only represent one structure (i.e. "2M11S+T"); should your reads contain a UMI on both ends, the string will contain two structures separated by a blank space (i.e. "2M11S+T 2M11S+T"). - -### Limitations and future updates - -Recent updates to Samtools have been introduced, which can speed-up performance of fgbio tools used in this workflow. -The current workflow does not handle duplex UMIs (i.e. where opposite strands of a duplex molecule have been tagged with a different UMI), and best practices have been proposed to process this type of data. -Both changes will be implemented in a future release. - -## MultiQC related issues - -### Plots for SnpEff are missing - -When plots are missing, it is possible that the fasta and the custom SnpEff database are not matching https://pcingola.github.io/SnpEff/se_faq/#error_chromosome_not_found-details. -The SnpEff completes without throwing an error causing nextflow to complete successfully. An indication for the error are these lines in the `.command` files: - -``` -ERRORS: Some errors were detected -Error type Number of errors -ERROR_CHROMOSOME_NOT_FOUND 17522411 -``` +Resource requests are difficult to generalize and are often dependent on input data size. Currently, the number of cpus and memory requested by default were adapted from tests on 5 ICGC paired whole-genome sequencing samples with approximately 40X and 80X depth. +For targeted data analysis, this is overshooting by a lot. In this case resources for each process can be limited by either setting `--max_memory` and `-max_cpus` or tailoring the request by process name as described [here](#resource-requests). 
If you are using sarek for a certain data type regularly, and would like to make these requests available to others on your system, an institution-specific, pipeline-specific config file can be added [here](https://github.com/nf-core/configs/tree/master/conf/pipeline/sarek). ## How to set sarek up to use sentieon