[workflow]
name = 'large_cohort'
scatter_count = 50
scatter_count_genotype = 50
status_reporter = 'metamist'
# define reference fasta. If not specified here, the reference file is pulled from [references.broad][ref_fasta]
ref_fasta = 'gs://cpg-common-main/references/hg38/v0/dragen_reference/Homo_sapiens_assembly38_masked.fasta'
highmem_workers = true
# Realign CRAM when available, instead of using FASTQ.
# The parameter value should correspond to the CRAM version
# (e.g. v0 in gs://cpg-fewgenomes-main/cram/v0/CPGaaa.cram).
#realign_from_cram_version = 'v0'
# Write the sorted BAM file to the temp bucket after alignment and before MarkDuplicates
checkpoint_sorted_bam = false
# Calling intervals (defaults to whole-genome intervals)
#intervals_path =

[combiner]
force_new_combiner = false
vds_version = "0.1"
memory = "8Gi"
storage = "10Gi"
vds_analysis_ids = []
merge_only_vds = false
# if false, use non-preemptible VMs
preemptible_vms = false
# for each combiner step, generate 5 intermediate VDSs, each combining 50 gVCFs
gvcf_batch_size = 5
branch_factor = 50
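# For orientation only: with the settings above, each combiner step consumes
# roughly gvcf_batch_size x branch_factor = 5 x 50 = 250 gVCFs, and the
# resulting intermediate VDSs are combined in the following step.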

[vqsr]
# VQSR, when applying the model, targets indel_filter_level and snp_filter_level
# sensitivities. The tool matches them internally to a VQSLOD score cutoff
# based on the model's estimated sensitivity to a set of true variants.
snp_filter_level = 99.7
indel_filter_level = 99.0

[cramqc]
assume_sorted = true
num_pcs = 4

[resource_overrides]
# Override default resource requirements for unusually large sequencing data without
# demanding higher resources for all operations as standard. See the example below.
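# Illustrative sketch only: the key below is a placeholder, not a documented
# override; check the workflow's supported resource_overrides keys before use.
#highmem_alignment_storage = '500Gi'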

[large_cohort]
# Kinship threshold to consider two samples as related.
# The default threshold for second-degree relatives in gnomAD is 0.1:
# https://github.com/broadinstitute/gnomad_methods/blob/382fc2c7976d58cc8983cc4c9f0df5d8d5f9fae3/gnomad/sample_qc/relatedness.py#L195
max_kin = 0.1
# Parameters for population inference
n_pcs = 16 # number of principal components for PCA analysis
min_pop_prob = 0.5 # minimum random forest probability for population assignment
training_pop = 'Superpopulation name' # metamist's `Participant/meta` field for the
# known population label for the RF training
# whether or not to key sequencing groups by their external ID
use_external_id = true
pca_plot_name = 'hgdp_1kg_sites'
# Specify the samples to exclude from the PCA analysis.
# Provide a list of sample IDs in 'pca_samples_to_remove'.
# If 'pca_samples_to_remove' is empty or not provided, the PCA analysis will include all samples.
pca_samples_to_remove = []
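# Illustrative format only (placeholder CPG sample IDs, not real samples):
#pca_samples_to_remove = ['CPGaaa', 'CPGbbb']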
# Specify whether to remove related individuals prior to plotting.
# Set 'remove_relateds' to true to remove related individuals from the tables before generating plots.
# By default, 'remove_relateds' is set to false, meaning that related individuals are included in the tables used for plotting.
remove_relateds = false
# In addition to related individuals, remove samples from the PCA that failed QC
remove_failed_qc_pca = true

# Section that specifies background samples for PCA analysis.
# `datasets` must be a list of paths to matrix tables or VDSs, and
# `pop_field`, if defined, specifies the column-level field in the datasets
# from which to extract the population tag for RF training in ancestry inference.
[large_cohort.pca_background]
#datasets = ['gs://cpg-common-main/references/ancestry/oceania_eur.mt']
#pop_field = 'continental_pop'
# Set whether PCA output labels come from the given population names or from
# random-forest-inferred population labels.
inferred_ancestry = false
# Whether or not to infer ploidy using the variant data:
# Using variant sites only for ploidy calculations causes ChrY ploidy inflation.
# Setting `variants_only_x_ploidy` and `variants_only_y_ploidy` to `False` (the default) causes
# the function to use reference blocks for ploidy calculations.
# https://centrepopgen.slack.com/archives/C018KFBCR1C/p1705539231990829?thread_ts=1704233101.883849&cid=C018KFBCR1C
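# Sketch only, assuming these keys are read from this section as named in the
# comment above; both are shown at their default value:
#variants_only_x_ploidy = false
#variants_only_y_ploidy = false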
# Extra functionality at the Ancestry stage to make a PCA at a more granular level,
# by subsetting the background population to individuals of a particular ancestry.
# Specify in list format the superpopulations to filter for.
# https://centrepopgen.slack.com/archives/C018KFBCR1C/p1717731698661969
# superpopulation_to_filter = ['African']
# Set whether to allow column discrepancies when unioning background dataset tables
#allow_missing_columns = false
# Columns to be dropped from the final unioned background table
#drop_columns = ['autosomal_mean_dp']

# Parameters for sample QC. The default values for soft filtering are taken from gnomAD QC:
# https://macarthurlab.org/2019/10/16/gnomad-v3-0, with `max_n_snps` and
# `max_n_singletons` adjusted based on the results we got for the TOB-WGS project.
[large_cohort.sample_qc_cutoffs]
min_coverage = 18 # minimum mean coverage for a sample
max_n_snps = 8000000 # maximum number of SNPs for a sample
min_n_snps = 2400000 # minimum number of SNPs for a sample
max_n_singletons = 800000 # maximum number of unique SNPs for a sample
max_r_duplication = 0.3 # maximum rate of duplicated reads (from picard metrics)
max_r_het_hom = 3.3 # maximum rate of heterozygous to homozygous SNPs

[hail]
pool_label = 'large-cohort'
delete_scratch_on_exit = false

# Autoscaling policy must be created in the project that corresponds
# to the analysis dataset.
#[hail.dataproc]
#combiner_autoscaling_policy = 'vcf-combiner-50'

[references.gnomad]
tel_and_cent_ht = "gs://cpg-common-main/references/gnomad/v0/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht"

[references.hg38_telomeres_and_centromeres_intervals]
# Derived from hg38.telomeresAndMergedCentromeres.bed used in gnomAD v3
interval_list = "gs://cpg-common-main/references/hg38/v0/hg38.telomeresAndMergedCentromeres.interval_list"