Create workflow to create and populate alt_allele table [VS-51] (#7426)
rsasch authored and RoriCremer committed Sep 20, 2021
1 parent 0d4f2e3 commit 67dec79
Showing 7 changed files with 381 additions and 11 deletions.
7 changes: 7 additions & 0 deletions .dockstore.yml
@@ -65,6 +65,13 @@ workflows:
     branches:
     - master
     - ah_var_store
+- name: GvsCreateAltAllele
+  subclass: WDL
+  primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
+  filters:
+    branches:
+    - master
+    - ah_var_store
 - name: GvsExtractCallset
   subclass: WDL
   primaryDescriptorPath: /scripts/variantstore/wdl/GvsExtractCallset.wdl
13 changes: 7 additions & 6 deletions scripts/variantstore/TERRA_QUICKSTART.md
@@ -57,15 +57,16 @@ These are the required parameters which must be supplied to the workflow:
 **NOTE**: if your workflow fails, you will need to manually remove a lockfile from the output directory. It is called LOCKFILE, and can be removed with `gsutil rm`

 ## 2. Create Alt Allele Table
-**NOTE:** This is a bit of a kludge until we gain more confidence that the data loaded into the ALT_ALLELE table for feature training are optimal and we can automate this process
+This step loads data into the ALT_ALLELE table from the `vet_*` tables.

-You'll need to run this from the BigQuery Console for your dataset.
+This is done by running the `GvsCreateAltAllele` workflow with the following parameters:

-Load the SQL script you can find here in the [GATK GitHub Repository](https://github.com/broadinstitute/gatk/blob/ah_var_store/scripts/variantstore/bq/alt_allele_creation.example.sql)
-
-There are three places where you need to replace the string `spec-ops-aou.gvs_tieout_acmg_v1` with your project and dataset name in the form `PROJECT.DATASET`
+| Parameter | Description |
+| ----------------- | ----------- |
+| data_project | The name of the google project containing the dataset |
+| default_dataset | The name of the dataset |

-Execute the script, it should take 30-60 seconds to run resulting in the creation of the `ALT_ALLELE` table in your dataset
+**Note:** This workflow does not use the Terra Entity model to run, so be sure to select `Run workflow with inputs defined by file paths`

 ## 3. Create Filter Set

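For reference, since the step-2 run above uses inputs defined by file paths rather than the Terra Entity model, a minimal inputs JSON might look like the sketch below. The project and dataset values are placeholders; note that the WDL itself (next file) names the project input `dataset_project`:

```json
{
  "GvsCreateAltAllele.dataset_project": "my-gcp-project",
  "GvsCreateAltAllele.default_dataset": "my_gvs_dataset"
}
```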
193 changes: 193 additions & 0 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -0,0 +1,193 @@
version 1.0

workflow GvsCreateAltAllele {
    input {
        String dataset_project
        String query_project_id = dataset_project
        String default_dataset

        String? service_account_json_path
    }

    call GetVetTableNames {
        input:
            query_project_id = query_project_id,
            dataset_project_id = dataset_project,
            dataset_name = default_dataset,
            service_account_json_path = service_account_json_path
    }

    call CreateAltAlleleTable {
        input:
            query_project_id = query_project_id,
            dataset_project_id = dataset_project,
            dataset_name = default_dataset,
            service_account_json_path = service_account_json_path
    }

    scatter (idx in range(length(GetVetTableNames.vet_tables))) {
        call PopulateAltAlleleTable {
            input:
                create_table_done = CreateAltAlleleTable.done,
                vet_table_name = GetVetTableNames.vet_tables[idx],
                query_project_id = query_project_id,
                dataset_project_id = dataset_project,
                dataset_name = default_dataset,
                service_account_json_path = service_account_json_path
        }
    }

    output {
        Array[String] vet_tables_loaded = PopulateAltAlleleTable.done
    }
}

task GetVetTableNames {
    meta {
        volatile: true
    }

    input {
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            gcloud config set project ~{query_project_id}
        fi

        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
        bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
            "SELECT table_name FROM ~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES WHERE table_name LIKE 'vet_%' ORDER BY table_name" > vet_tables.csv

        # remove the header row from the CSV file
        sed -i 1d vet_tables.csv
    >>>

    output {
        Array[String] vet_tables = read_lines("vet_tables.csv")
    }

    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        preemptible: 3
        cpu: 1
    }
}
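For context on the `sed -i 1d` above: `bq query --format=csv` writes a header row before the results, so `vet_tables.csv` initially looks something like the sketch below (the table names are illustrative). Deleting line 1 leaves only the names for `read_lines` to pick up.

```
table_name
vet_001
vet_002
vet_003
```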

task CreateAltAlleleTable {
    input {
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            gcloud config set project ~{query_project_id}
        fi

        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
        bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
            "CREATE OR REPLACE TABLE ~{dataset_project_id}.~{dataset_name}.alt_allele (
                location INT64,
                sample_id INT64,
                ref STRING,
                allele STRING,
                allele_pos INT64,
                call_GT STRING,
                call_GQ INT64,
                as_raw_mq STRING,
                raw_mq INT64,
                as_raw_mqranksum STRING,
                raw_mqranksum_x_10 INT64,
                as_qualapprox STRING,
                qualapprox STRING,
                qual INT64,
                as_raw_readposranksum STRING,
                raw_readposranksum_x_10 INT64,
                as_sb_table STRING,
                sb_ref_plus INT64,
                sb_ref_minus INT64,
                sb_alt_plus INT64,
                sb_alt_minus INT64,
                call_AD STRING,
                ref_ad INT64,
                ad INT64
            ) PARTITION BY RANGE_BUCKET(location, GENERATE_ARRAY(0, 25000000000000, 1000000000000))
            CLUSTER BY location, sample_id;"
    >>>

    output {
        String done = "done"
    }

    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        cpu: 1
    }
}
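A note on the `PARTITION BY RANGE_BUCKET(...)` clause above: assuming the GVS convention of encoding `location` as roughly `chromosome_index * 10^12 + position` (an assumption, not stated in this commit), boundaries every 10^12 from 0 to 25 * 10^12 give each chromosome its own partition. A quick illustration of the `RANGE_BUCKET` semantics:

```sql
-- RANGE_BUCKET returns the count of array boundaries <= the point,
-- i.e. which bucket the point falls into.
SELECT RANGE_BUCKET(1000000123456,
                    GENERATE_ARRAY(0, 25000000000000, 1000000000000)) AS bucket;
-- bucket = 2: a location of 10^12 + 123456 (chr1:123,456 under the
-- assumed encoding) falls in the partition spanning [10^12, 2*10^12).
```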

task PopulateAltAlleleTable {
    input {
        String create_table_done
        String vet_table_name
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            SERVICE_ACCOUNT_STANZA="--sa_key_path local.service_account.json "
        fi

        python3 /app/populate_alt_allele_table.py \
            --query_project ~{query_project_id} \
            --vet_table_name ~{vet_table_name} \
            --fq_dataset ~{dataset_project_id}.~{dataset_name} \
            $SERVICE_ACCOUNT_STANZA
    >>>

    output {
        String done = "~{vet_table_name}"
    }

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210903"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        cpu: 1
    }
}
13 changes: 8 additions & 5 deletions scripts/variantstore/wdl/extract/Dockerfile
@@ -1,14 +1,17 @@
 FROM gcr.io/google.com/cloudsdktool/cloud-sdk:349.0.0

 # Copy the application's requirements.txt and run pip to install
-ADD requirements.txt /app/requirements.txt
+COPY requirements.txt /app/requirements.txt
 RUN pip install -r /app/requirements.txt
 RUN apt-get update && apt-get -y upgrade && apt-get -y install bcftools

-# Add the application source code.
-ADD create_cohort_extract_data_table.py /app
-ADD create_variant_annotation_table.py /app
-ADD extract_subpop.py /app
+# Copy the application source code.
+COPY create_cohort_extract_data_table.py /app
+COPY create_variant_annotation_table.py /app
+COPY extract_subpop.py /app
+COPY populate_alt_allele_table.py /app
+COPY alt_allele_positions.sql /app
+COPY alt_allele_temp_function.sql /app

 WORKDIR /app
 ENTRYPOINT ["/bin/bash"]
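A minimal sketch for building and smoke-testing this image locally (the tag and build-context path are assumptions):

```bash
# Build from the repository root; the context holds the Dockerfile and scripts.
docker build -t variantstore-extract:local scripts/variantstore/wdl/extract

# ENTRYPOINT is /bin/bash, so arguments are passed straight to bash;
# verify the copied scripts are present in /app.
docker run --rm variantstore-extract:local -c "ls /app"
```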
72 changes: 72 additions & 0 deletions scripts/variantstore/wdl/extract/alt_allele_positions.sql
@@ -0,0 +1,72 @@
select location, sample_id,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(0)] as ref,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(1)] as allele,
1 as allele_pos, call_GT, call_GQ,
as_raw_mq,
cast(SPLIT(as_raw_mq,'|')[OFFSET(1)] as int64) raw_mq,
as_raw_mqranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
as_qualapprox,
qualapprox,
cast(SPLIT(as_qualapprox,'|')[OFFSET(0)] as int64) as qual,
as_raw_readposranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
as_sb_table,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(0)] as int64) as sb_alt_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(1)] as int64) as sb_alt_minus,
call_AD,
cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
cast(SPLIT(call_AD,',')[OFFSET(1)] as int64) as ad
from position1

union all

select location, sample_id,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(0)] as ref,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(1)] as allele,
1 as allele_pos, call_GT, call_GQ,
as_raw_mq,
cast(SPLIT(as_raw_mq,'|')[OFFSET(1)] as int64) raw_mq,
as_raw_mqranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
as_qualapprox,
qualapprox,
cast(SPLIT(as_qualapprox,'|')[OFFSET(0)] as int64) as qual,
as_raw_readposranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
as_sb_table,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(0)] as int64) as sb_alt_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(1)] as int64) as sb_alt_minus,
call_AD,
cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
cast(SPLIT(call_AD,',')[OFFSET(1)] as int64) as ad
from position2

union all

select location, sample_id,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(1)]))[OFFSET(0)] as ref,
SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(1)]))[OFFSET(1)] as allele,
2 as allele_pos, call_GT, call_GQ,
as_raw_mq,
cast(SPLIT(as_raw_mq,'|')[OFFSET(2)] as int64) raw_mq,
as_raw_mqranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(1)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
as_qualapprox,
qualapprox,
cast(SPLIT(as_qualapprox,'|')[OFFSET(1)] as int64) as qual,
as_raw_readposranksum,
SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(1)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
as_sb_table,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(2)],',')[OFFSET(0)] as int64) as sb_alt_plus,
cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(2)],',')[OFFSET(1)] as int64) as sb_alt_minus,
call_AD,
cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
cast(SPLIT(call_AD,',')[OFFSET(2)] as int64) as ad
from position2;
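Note that `position1` and `position2` are not defined in this file: presumably the driver script (`populate_alt_allele_table.py`, copied into the image above) prepends their definitions, splitting vet rows by whether the genotype references a first or a second alt allele. A hypothetical sketch of that shape, with all names and filters assumed:

```sql
-- Hypothetical sketch only; the real definitions live in populate_alt_allele_table.py.
WITH position1 AS (
  SELECT * FROM `my-gcp-project.my_gvs_dataset.vet_001`
  WHERE call_GT LIKE '%1%'  -- genotype references the first alt allele
),
position2 AS (
  SELECT * FROM `my-gcp-project.my_gvs_dataset.vet_001`
  WHERE call_GT LIKE '%2%'  -- genotype references the second alt allele
)
SELECT location, sample_id FROM position2;
```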
14 changes: 14 additions & 0 deletions scripts/variantstore/wdl/extract/alt_allele_temp_function.sql
@@ -0,0 +1,14 @@
CREATE TEMPORARY FUNCTION minimize(ref STRING, allele STRING)
RETURNS STRING
LANGUAGE js AS """
let done = false
while (!done && ref.length !== 1) {
    if (ref.slice(-1) === allele.slice(-1)) {
        ref = ref.slice(0, -1)
        allele = allele.slice(0,-1)
    } else {
        done = true
    }
}
return ref+','+allele
""";
[Diff for the seventh changed file (presumably scripts/variantstore/wdl/extract/populate_alt_allele_table.py, per the COPY lines above) was not loaded.]
