-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdefaults.toml
173 lines (136 loc) · 6.28 KB
/
defaults.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
[workflow]
# Cohorts to use as inputs.
#input_cohorts = []
# Datasets to load inputs. If not provided, datasets will be determined
# automatically based on the input provider implementation:
#input_datasets = []
# Datasets to skip:
#skip_datasets = []
# Sequencing groups to skip:
#skip_sgs = []
# Only process the following sequencing groups:
#only_sgs = []
# Process the following sequencing groups even if outputs exist:
#force_sgs = []
# Skip these stages:
#skip_stages = []
# Skip all other stages:
#only_stages = []
# Start from this stage:
#first_stages = []
# Finish at this stage:
#last_stages = []
# Force stage rerun:
#force_stages = []
# Map of stages to lists of sequencing groups, to skip for specific stages:
#[workflow.skip_stages_for_sgs]
#CramQC = ['CPGaaa']
# Name of the workflow (to prefix output paths)
#name =
# Description of the workflow (to display in the Batch GUI)
#description =
# Suffix the workflow outputs location (`get_workflow().prefix`) with this string.
# By default, the hash of all input paths will be used.
#output_version = "0.1"
# Check input file existence (e.g. FASTQ files). When they are missing,
# the `skip_sgs_with_missing_input` option controls whether such
# sequencing groups should be ignored, or whether an error should be raised.
check_inputs = true
# For the first (not-skipped) stage, if the input for a target does
# not exist, just skip this target instead of failing. E.g. if the first
# stage is Align, and `sequencing_group.alignment_input` for a sequencing group does not
# exist, remove this sequencing group instead of failing. In other words, ignore
# sequencing groups that are missing results from skipped stages.
skip_sgs_with_missing_input = false
# Within jobs, check all in-job intermediate files for possible reuse.
# If set to false, will overwrite all intermediates. Used by `utils.can_reuse(path)`.
check_intermediates = true
# Before running a stage, check if inputs (i.e. expected outputs from required stages)
# already exist. If they exist, do not submit stage jobs.
check_expected_outputs = true
# Limit to data of this sequencing type:
#sequencing_type = 'genome'
# Realign CRAM when available, instead of using FASTQ.
# The parameter value should correspond to the CRAM version
# (e.g. v0 in gs://cpg-fewgenomes-main/cram/v0/CPGaaa.cram)
#realign_from_cram_version = 'v0'
# Calling intervals (defaults to whole-genome intervals):
#intervals_path =
# The GQ bands used for ReblockGVCF, specified as exclusive upper bounds for reference
# confidence GQ bands (must be in [1, 100] and specified in increasing order). Finer
# granularity bands result in more reference blocks and therefore larger GVCFs.
reblock_gq_bands = [20, 30, 40]
# Only print the final merged config and a list of stages to be submitted.
# Will skip any communication with Metamist, Hail Batch, and Cloud Storage, so
# the code can be run without permissions.
#dry_run = true
# By default, the BamToCram stage will create CRAM analysis types; this can be overridden:
# bam_to_cram_analysis_type = 'pacbio_cram'
# Map internally used validation sample external_ids to truth sample names.
[validation.sample_map]
HG001_NA12878 = 'na12878'
SYNDIP = 'syndip'
[hail]
# Keep the Hail Batch scratch directory after the run finishes
# (presumably useful for debugging — confirm against the Hail Batch driver).
delete_scratch_on_exit = false
[resource_overrides]
# Override default resource requirements for unusually large seq data without
# demanding higher resources for all operations as standard. Examples below.
# Picard MarkDuplicates overrides for unreasonably large sequence groups:
#picard_mem_gb = 100
#picard_storage_gb = 350
# HaplotypeCaller overrides, see production-pipelines PR#381.
# Defaults in code are 40 for genomes, none for exomes:
#haplotypecaller_storage = 80
# Use highmem machine type for the alignment step:
# align_use_highmem = true
# Use additional storage in the postproc_gvcf job for large gVCFs:
# postproc_gvcf_storage = 50
# JointGenotyping GenomicsDBImport job overrides:
# genomicsdb_import_mem_gb = 32
# genomicsdb_import_use_highmem = false
# JointGenotyping GenotypeGVCFs job overrides:
# genotype_gvcfs_mem_gb = 15
# genotype_gvcfs_use_highmem = false
[mito_snv]
# Example config for the Broad WDL found here:
# https://raw.githubusercontent.com/broadinstitute/gatk/master/scripts/mitochondria_m2_wdl/ExampleInputsMitochondriaPipeline.json
# f_score_beta is set to the tool's default value of 1.0.
f_score_beta = 1.0
# Sarah Stenton from Broad runs this pipeline for seqr ingest and indicated they use a
# threshold of 0.01 for seqr cohorts.
vaf_filter_threshold = 0.01
# Use VerifyBamID in addition to haplocheck for the contamination estimate.
use_verifybamid = true
[stripy]
# analysis_type can be "standard" (fast) or "extended" (marginally slower
# but also uses unmapped reads for genotyping).
analysis_type = "extended"
# See https://gitlab.com/andreassh/stripy-pipeline#list-of-loci
# Excluded by default: C9orf72, HTT
target_loci = """AFF2,AR,ARX_1,ARX_2,ATN1,ATXN1,ATXN10,ATXN2,ATXN3,ATXN7,ATXN8OS,\
BEAN1,CACNA1A,CBL,CNBP,COMP,CSTB,DAB1,DIP2B,DMD,DMPK,EIF4A3,FGF14,FMR1,FOXL2,FXN,GIPC1,\
GLS,HOXA13_1,HOXA13_2,HOXA13_3,HOXD13,JPH3,LRP12,MARCHF6,NIPA1,NOP56,NOTCH2NLC,\
NUTM2B-AS1,PABPN1,PHOX2B,PPP2R2B,PRDM12,PRNP,RAPGEF2,RFC1,RILPL1,RUNX2,SAMD12,SOX3,\
STARD7,TBP,TBX1,TCF4,TNRC6A,VWA1,XYLT1,YEATS2,ZFHX3,ZIC2,ZIC3"""
# Path to a bed+ file containing extra loci to include in the analysis. Tab-delimited BED
# file containing at least the following four values: chromosome, start and end position
# of the STR locus, and the motif on the plus strand. Optionally, the locus name/ID can be
# specified as a fifth value. Additionally, you can also specify disease name, inheritance,
# normal range and pathogenic cut-off values, which are then used to colourise
# results, e.g.:
# https://gitlab.com/andreassh/stripy-pipeline/-/blob/main/examples/vntr.bed
# custom_loci_path = "gs://cpg-reference/hg38/loci/seqr/seqr_stripy_custom_loci.bed"
# Set to an empty string if no custom loci are to be used.
custom_loci_path = ""
# Change the path the STRipy report is saved to; useful when testing novel loci.
output_prefix = "stripy"
# Add in specific MultiQC report config options.
# See https://multiqc.info/docs/getting_started/config for more details:
# [workflow.cram_multiqc]
# send_to_slack = true
# [workflow.cram_multiqc.extra_config]
# plots_force_interactive = true
# [workflow.gvcf_multiqc]
# send_to_slack = true
# [workflow.gvcf_multiqc.extra_config]
# plots_force_interactive = true