Commit

pass dropped variants
RoriCremer committed Oct 22, 2021
1 parent 7e523f2 commit e2603bb
Showing 1 changed file with 22 additions and 13 deletions.
35 changes: 22 additions & 13 deletions scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl
@@ -82,7 +82,7 @@ workflow GvsSitesOnlyVCF {
project_id = project_id,
dataset_name = dataset_name,
counts_variants = ExtractAnAcAfFromVCF.count_variants,
-        counts_variant_duplicates = ExtractAnAcAfFromVCF.count_duplicates,
+        track_dropped_variants = ExtractAnAcAfFromVCF.track_dropped,
table_suffix = table_suffix,
service_account_json_path = service_account_json_path,
load_jsons_done = BigQueryLoadJson.done
@@ -227,11 +227,14 @@ task ExtractAnAcAfFromVCF {
# "sas"
#]

- ## get the number of +500 alt alleles for Lee
- bcftools view -i 'N_ALT>500' --no-update --no-header ~{local_input_vcf} | wc -l
+ ## track the dropped variants with +500 alt alleles or N's in the reference
+ bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv
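For orientation, the bcftools query format writes one row per dropped site as five tab-separated columns (CHROM, POS, ID, REF, ALT); hypothetical rows, one per drop reason:

chr1	10543	.	N	A	# REF contains an N
chr2	84210	.	G	A,C,T,...	# site with >500 alt alleles (ALT list truncated here)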

+ ## Since Nirvana can't handle N as a base, drop them for now and keep track of the lines dropped
+ ## bcftools view -e 'REF~"N"' ~{local_input_vcf}

## filter out sites with too many alt alleles
- bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} | \
+ bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \
## filter out the non-passing sites
bcftools view -f 'PASS,.' --no-update | \
## normalize, left align and split multi allelic sites to new lines, remove duplicate lines
@@ -240,6 +243,7 @@ task ExtractAnAcAfFromVCF {
bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \
## ensure that we respect the FT tag
bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf}
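A note on the final stage: with -i, records are kept, but sample genotypes whose FT is anything other than PASS or missing are treated as failed, and --set-GTs . rewrites those calls to missing so they stop contributing to AC/AN downstream. A minimal standalone sketch (file names hypothetical):

bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . calls.vcf > ft_respected.vcf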

du -h ~{normalized_vcf}

## clean up
@@ -254,21 +258,23 @@ task ExtractAnAcAfFromVCF {
grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf
rm ~{normalized_vcf} ## clean up
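The dedup step above leans on grep flag semantics worth spelling out: -f duplicates.tsv supplies one pattern per line, -F treats them as fixed strings rather than regexes, -w requires whole-word matches, and -v inverts so that matching VCF lines are dropped. A toy sketch (file names hypothetical):

printf 'chr1\t100\n' > dups.tsv
grep -v -wFf dups.tsv calls.vcf > deduped.vcf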

- wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt
- echo "the following duplicate variants will be dropped due to a bug"
- cat duplicates.tsv
+ cat duplicates.tsv >> track_dropped.tsv
+ cat track_dropped.tsv

rm duplicates.tsv ## clean up

## calculate annotations for all subpopulations
bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \
'%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \
>> ~{custom_annotations_file_name}
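For context, fill-tags recomputes allele-count annotations from the genotypes, and the -S sample-to-subpopulation mapping makes it also emit the suffixed per-subpopulation tags (AC_afr, AN_afr, ...) that the query string above extracts. A minimal sketch using the equivalent +plugin spelling (file names hypothetical):

bcftools +fill-tags deduplicated.vcf -- -S subpopulation_samples.tsv -t AC,AF,AN,AC_het,AC_hom,AC_Hemi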

- ### for validation of the pipeline
+ ## for validation of the pipeline
wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt
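The -7 presumably discounts a fixed-size header block in the custom annotations file: if the file were 1,000,007 lines with 7 of them header, count.txt would read 1000000.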

- ### compress the vcf and index it, make it sites-only
+ ## compress the vcf and index it, make it sites-only
bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed}
- ##bcftools annotate -x INFO Is this overkill?
+ ## bcftools annotate -x INFO Is this overkill?
bcftools index --tbi ~{normalized_vcf_compressed}
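For what it's worth, -Oz writes bgzip-compressed VCF and bcftools index --tbi places a .tbi index alongside it; a quick sanity check on the sites-only output could be (file name hypothetical):

bcftools view -H sites_only.vcf.gz | head -3   # first three records: no header, no genotype columns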

>>>
@@ -286,7 +292,7 @@ task ExtractAnAcAfFromVCF {
output {
File annotations_file = "~{custom_annotations_file_name}"
Int count_variants = read_int("count.txt")
-    Int count_duplicates = read_int("duplicates_count.txt")
+    File track_dropped = "track_dropped.tsv"
File output_vcf = "~{normalized_vcf_compressed}"
File output_vcf_index = "~{normalized_vcf_indexed}"
}
@@ -643,7 +649,7 @@ task BigQuerySmokeTest {
String project_id
String dataset_name
Array[Int] counts_variants
-    Array[Int] counts_variant_duplicates
+    Array[File] track_dropped_variants
String table_suffix
String? service_account_json_path
Boolean load_jsons_done
@@ -671,9 +677,12 @@ task BigQuerySmokeTest {
# sum all the initial input variants across the shards

INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))")
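Since ~{sep=', ' counts_variants} is WDL interpolation, the array is expanded inline before the shell runs; e.g. for shard counts [100, 200] the line becomes:

INITIAL_VARIANT_COUNT=$(python -c "print(sum([100, 200]))")   # → 300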
- DUPLICATE_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variant_duplicates}]))")
+ ## here is the list of dropped-variant files: ~{sep=', ' track_dropped_variants}
+ ## gather the lines from each file
+ awk 'FNR==1{print ""}1' ~{sep=' ' track_dropped_variants} > dropped_variants.txt
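The awk one-liner merits a gloss: FNR resets to 1 at the start of each input file, so FNR==1{print ""} emits a blank separator line before each shard's rows, and the bare trailing 1 is an always-true pattern that prints every input line. A toy sketch:

printf 'a\nb\n' > s1.tsv; printf 'c\n' > s2.tsv
awk 'FNR==1{print ""}1' s1.tsv s2.tsv   # → blank, a, b, blank, c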

echo "Duplicate variant count: $DUPLICATE_VARIANT_COUNT."
echo "Dropped variants:"
cat dropped_variants.txt

# Count number of variants in the VAT
bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv
