[workflow]
name = 'large_cohort'
scatter_count = 50
scatter_count_genotype = 50
status_reporter = 'metamist'
# define reference fasta. If not specified here, the reference file is pulled from [references.broad][ref_fasta]
ref_fasta = 'gs://cpg-common-main/references/hg38/v0/dragen_reference/Homo_sapiens_assembly38_masked.fasta'
highmem_workers = true
# Realign CRAM when available, instead of using FASTQ.
# The parameter value should correspond to the CRAM version
# (e.g. v0 in gs://cpg-fewgenomes-main/cram/v0/CPGaaa.cram).
#realign_from_cram_version = 'v0'
# Write the sorted BAM file to the temp bucket after alignment and before MarkDuplicates
checkpoint_sorted_bam = false
# Calling intervals (defaults to whole-genome intervals)
#intervals_path =

[combiner]
force_new_combiner = false
vds_version = "0.1"
memory = "8Gi"
storage = "10Gi"
vds_analysis_ids = []
merge_only_vds = false
# if false, use non-preemptible VMs
preemptible_vms = false
# for each combiner step, generate 5 intermediate VDSs, each combining 50 gVCFs
gvcf_batch_size = 5
branch_factor = 50
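# For orientation only: with the settings above, each combiner step consumes
# roughly gvcf_batch_size x branch_factor = 5 x 50 = 250 gVCFs, and the
# resulting intermediate VDSs are combined in the following step.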

[vqsr]
# VQSR, when applying the model, targets indel_filter_level and snp_filter_level
# sensitivities. The tool matches them internally to a VQSLOD score cutoff
# based on the model's estimated sensitivity to a set of true variants.
snp_filter_level = 99.7
indel_filter_level = 99.0

[cramqc]
assume_sorted = true
num_pcs = 4

[resource_overrides]
# Override default resource requirements for unusually large sequencing data without
# demanding higher resources for all operations as standard. See the example below.
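# Illustrative sketch only: the key below is a placeholder, not a documented
# override; check the workflow's supported resource_overrides keys before use.
#highmem_alignment_storage = '500Gi'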

[large_cohort]
# Kinship threshold to consider two samples as related.
# The default threshold for second-degree relatives in gnomAD is 0.1:
# https://github.com/broadinstitute/gnomad_methods/blob/382fc2c7976d58cc8983cc4c9f0df5d8d5f9fae3/gnomad/sample_qc/relatedness.py#L195
max_kin = 0.1
# Parameters for population inference
n_pcs = 16 # number of principal components for PCA analysis
min_pop_prob = 0.5 # minimum random forest probability for population assignment
training_pop = 'Superpopulation name' # metamist's `Participant/meta` field for the
# known population label for the RF training
# whether or not to key sequencing groups by their external ID
use_external_id = true
pca_plot_name = 'hgdp_1kg_sites'
# Specify the samples to exclude from the PCA analysis.
# Provide a list of sample IDs in 'pca_samples_to_remove'.
# If 'pca_samples_to_remove' is empty or not provided, the PCA analysis will include all samples.
pca_samples_to_remove = []
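# Illustrative format only (placeholder CPG sample IDs, not real samples):
#pca_samples_to_remove = ['CPGaaa', 'CPGbbb']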
# Specify whether to remove related individuals prior to plotting.
# Set 'remove_relateds' to true to remove related individuals from the tables before generating plots.
# By default, 'remove_relateds' is set to false, meaning that related individuals are included in the tables used for plotting.
remove_relateds = false
# In addition to related individuals, remove samples from the PCA that failed QC
remove_failed_qc_pca = true

# Section that specifies background samples for PCA analysis.
# `datasets` must be a list of paths to matrix tables or VDSs, and
# `pop_field`, if defined, specifies the column-level field in the datasets
# from which to extract the population tag for RF training in ancestry inference.
[large_cohort.pca_background]
#datasets = ['gs://cpg-common-main/references/ancestry/oceania_eur.mt']
#pop_field = 'continental_pop'
# Set whether PCA output labels come from the given population names or from
# random-forest-inferred population labels.
inferred_ancestry = false
# Whether or not to infer ploidy using the variant data:
# Using variant sites only for ploidy calculations causes ChrY ploidy inflation.
# Setting `variants_only_x_ploidy` and `variants_only_y_ploidy` to `False` (the default) causes
# the function to use reference blocks for ploidy calculations.
# https://centrepopgen.slack.com/archives/C018KFBCR1C/p1705539231990829?thread_ts=1704233101.883849&cid=C018KFBCR1C
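# Sketch only, assuming these keys are read from this section as named in the
# comment above; both are shown at their default value:
#variants_only_x_ploidy = false
#variants_only_y_ploidy = false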
# Extra functionality at the Ancestry stage to make a PCA at a more granular level,
# by subsetting the background population to individuals of a particular ancestry.
# Specify in list format the superpopulations to filter for.
# https://centrepopgen.slack.com/archives/C018KFBCR1C/p1717731698661969
# superpopulation_to_filter = ['African']
# Set whether to allow column discrepancies when unioning background dataset tables
#allow_missing_columns = false
# Columns to be dropped from the final unioned background table
#drop_columns = ['autosomal_mean_dp']

# Parameters for sample QC. The default values for soft filtering are taken from gnomAD QC:
# https://macarthurlab.org/2019/10/16/gnomad-v3-0, with `max_n_snps` and
# `max_n_singletons` adjusted based on the results we got for the TOB-WGS project.
[large_cohort.sample_qc_cutoffs]
min_coverage = 18 # minimum mean coverage for a sample
max_n_snps = 8000000 # maximum number of SNPs for a sample
min_n_snps = 2400000 # minimum number of SNPs for a sample
max_n_singletons = 800000 # maximum number of unique SNPs for a sample
max_r_duplication = 0.3 # maximum rate of duplicated reads (from picard metrics)
max_r_het_hom = 3.3 # maximum rate of heterozygous to homozygous SNPs

[hail]
pool_label = 'large-cohort'
delete_scratch_on_exit = false

# Autoscaling policy must be created in the project that corresponds
# to the analysis dataset.
#[hail.dataproc]
#combiner_autoscaling_policy = 'vcf-combiner-50'

[references.gnomad]
tel_and_cent_ht = "gs://cpg-common-main/references/gnomad/v0/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht"

[references.hg38_telomeres_and_centromeres_intervals]
# Derived from hg38.telomeresAndMergedCentromeres.bed used in gnomAD v3
interval_list = "gs://cpg-common-main/references/hg38/v0/hg38.telomeresAndMergedCentromeres.interval_list"