Create workflow to create and populate alt_allele table [VS-51] (#7426)
commit 67dec79 (1 parent: 0d4f2e3)
Showing 7 changed files with 381 additions and 11 deletions.
scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -0,0 +1,193 @@
version 1.0

workflow GvsCreateAltAllele {
    input {
        String dataset_project
        String query_project_id = dataset_project
        String default_dataset

        String? service_account_json_path
    }

    call GetVetTableNames {
        input:
            query_project_id = query_project_id,
            dataset_project_id = dataset_project,
            dataset_name = default_dataset,
            service_account_json_path = service_account_json_path
    }

    call CreateAltAlleleTable {
        input:
            query_project_id = query_project_id,
            dataset_project_id = dataset_project,
            dataset_name = default_dataset,
            service_account_json_path = service_account_json_path
    }

    scatter (idx in range(length(GetVetTableNames.vet_tables))) {
        call PopulateAltAlleleTable {
            input:
                create_table_done = CreateAltAlleleTable.done,
                vet_table_name = GetVetTableNames.vet_tables[idx],
                query_project_id = query_project_id,
                dataset_project_id = dataset_project,
                dataset_name = default_dataset,
                service_account_json_path = service_account_json_path
        }
    }

    output {
        Array[String] vet_tables_loaded = PopulateAltAlleleTable.done
    }
}
task GetVetTableNames {
    meta {
        volatile: true
    }

    input {
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            gcloud config set project ~{query_project_id}
        fi

        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
        bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
            "SELECT table_name FROM ~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES WHERE table_name LIKE 'vet_%' ORDER BY table_name" > vet_tables.csv

        # remove the header row from the CSV file
        sed -i 1d vet_tables.csv
    >>>

    output {
        Array[String] vet_tables = read_lines("vet_tables.csv")
    }

    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        preemptible: 3
        cpu: 1
    }
}
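For orientation, the same discovery step can be done in Python with the BigQuery client library instead of the bq CLI. A minimal sketch, not part of this commit (the function name and client-library approach are illustrative):

from google.cloud import bigquery

def get_vet_table_names(query_project_id, dataset_project_id, dataset_name):
    # Same INFORMATION_SCHEMA query the task issues via bq, without the CSV round-trip.
    client = bigquery.Client(project=query_project_id)
    sql = (
        "SELECT table_name "
        f"FROM `{dataset_project_id}.{dataset_name}.INFORMATION_SCHEMA.TABLES` "
        "WHERE table_name LIKE 'vet_%' ORDER BY table_name"
    )
    return [row.table_name for row in client.query(sql).result()]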
task CreateAltAlleleTable {
    input {
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            gcloud config set project ~{query_project_id}
        fi

        echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
        bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
            "CREATE OR REPLACE TABLE ~{dataset_project_id}.~{dataset_name}.alt_allele (
                location INT64,
                sample_id INT64,
                ref STRING,
                allele STRING,
                allele_pos INT64,
                call_GT STRING,
                call_GQ INT64,
                as_raw_mq STRING,
                raw_mq INT64,
                as_raw_mqranksum STRING,
                raw_mqranksum_x_10 INT64,
                as_qualapprox STRING,
                qualapprox STRING,
                qual INT64,
                as_raw_readposranksum STRING,
                raw_readposranksum_x_10 INT64,
                as_sb_table STRING,
                sb_ref_plus INT64,
                sb_ref_minus INT64,
                sb_alt_plus INT64,
                sb_alt_minus INT64,
                call_AD STRING,
                ref_ad INT64,
                ad INT64
            ) PARTITION BY RANGE_BUCKET(location, GENERATE_ARRAY(0, 25000000000000, 1000000000000))
            CLUSTER BY location, sample_id;"
    >>>

    output {
        String done = "done"
    }

    runtime {
        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        cpu: 1
    }
}
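The table is integer-range partitioned on location in steps of 10^12 and clustered by location and sample_id. Assuming the GVS convention that location encodes chromosome index times 10^12 plus position (an assumption; the encoding is not shown in this diff), each chromosome lands in roughly one partition. A quick Python sketch of the RANGE_BUCKET arithmetic:

BUCKET_WIDTH = 1_000_000_000_000  # matches GENERATE_ARRAY(0, 25000000000000, 1000000000000)

def range_bucket(location, start=0, end=25_000_000_000_000, width=BUCKET_WIDTH):
    # Mirrors BigQuery's RANGE_BUCKET: counts generated boundaries <= location,
    # so 0 means "below start" and values >= end share the overflow bucket.
    if location < start:
        return 0
    return min((location - start) // width, (end - start) // width) + 1

# Assumed encoding: location = chromosome_index * 10**12 + position.
location = 12 * BUCKET_WIDTH + 40_000_000   # a hypothetical chr12 position
assert range_bucket(location) == 13          # falls in the chr12 partition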
task PopulateAltAlleleTable {
    input {
        String create_table_done
        String vet_table_name
        String query_project_id
        String dataset_project_id
        String dataset_name

        String? service_account_json_path
    }

    String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

    command <<<
        set -e

        if [ ~{has_service_account_file} = 'true' ]; then
            gsutil cp ~{service_account_json_path} local.service_account.json
            gcloud auth activate-service-account --key-file=local.service_account.json
            SERVICE_ACCOUNT_STANZA="--sa_key_path local.service_account.json "
        fi

        python3 /app/populate_alt_allele_table.py \
            --query_project ~{query_project_id} \
            --vet_table_name ~{vet_table_name} \
            --fq_dataset ~{dataset_project_id}.~{dataset_name} \
            $SERVICE_ACCOUNT_STANZA
    >>>

    output {
        String done = "~{vet_table_name}"
    }

    runtime {
        docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210903"
        memory: "3 GB"
        disks: "local-disk 10 HDD"
        cpu: 1
    }
}
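populate_alt_allele_table.py is not part of this commit, so the sketch below is only a guess at the pattern behind its --sa_key_path flag: build the BigQuery client from an explicit service-account key when one is supplied, otherwise fall back to application-default credentials. All names are illustrative:

from google.cloud import bigquery
from google.oauth2 import service_account

def bigquery_client(query_project, sa_key_path=None):
    # Hypothetical mirror of the workflow's optional service_account_json_path.
    if sa_key_path:
        credentials = service_account.Credentials.from_service_account_file(sa_key_path)
        return bigquery.Client(project=query_project, credentials=credentials)
    return bigquery.Client(project=query_project)  # application-default credentials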
scripts/variantstore/wdl/extract/Dockerfile
@@ -1,14 +1,17 @@
 FROM gcr.io/google.com/cloudsdktool/cloud-sdk:349.0.0

 # Copy the application's requirements.txt and run pip to install
-ADD requirements.txt /app/requirements.txt
+COPY requirements.txt /app/requirements.txt
 RUN pip install -r /app/requirements.txt
 RUN apt-get update && apt-get -y upgrade && apt-get -y install bcftools

-# Add the application source code.
-ADD create_cohort_extract_data_table.py /app
-ADD create_variant_annotation_table.py /app
-ADD extract_subpop.py /app
+# Copy the application source code.
+COPY create_cohort_extract_data_table.py /app
+COPY create_variant_annotation_table.py /app
+COPY extract_subpop.py /app
+COPY populate_alt_allele_table.py /app
+COPY alt_allele_positions.sql /app
+COPY alt_allele_temp_function.sql /app

 WORKDIR /app
 ENTRYPOINT ["/bin/bash"]
scripts/variantstore/wdl/extract/alt_allele_positions.sql
@@ -0,0 +1,72 @@
select location, sample_id,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(0)] as ref,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(1)] as allele,
       1 as allele_pos, call_GT, call_GQ,
       as_raw_mq,
       cast(SPLIT(as_raw_mq,'|')[OFFSET(1)] as int64) raw_mq,
       as_raw_mqranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
       as_qualapprox,
       qualapprox,
       cast(SPLIT(as_qualapprox,'|')[OFFSET(0)] as int64) as qual,
       as_raw_readposranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
       as_sb_table,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(0)] as int64) as sb_alt_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(1)] as int64) as sb_alt_minus,
       call_AD,
       cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
       cast(SPLIT(call_AD,',')[OFFSET(1)] as int64) as ad
from position1

union all

select location, sample_id,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(0)] as ref,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(0)]))[OFFSET(1)] as allele,
       1 as allele_pos, call_GT, call_GQ,
       as_raw_mq,
       cast(SPLIT(as_raw_mq,'|')[OFFSET(1)] as int64) raw_mq,
       as_raw_mqranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
       as_qualapprox,
       qualapprox,
       cast(SPLIT(as_qualapprox,'|')[OFFSET(0)] as int64) as qual,
       as_raw_readposranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(0)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
       as_sb_table,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(0)] as int64) as sb_alt_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(1)],',')[OFFSET(1)] as int64) as sb_alt_minus,
       call_AD,
       cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
       cast(SPLIT(call_AD,',')[OFFSET(1)] as int64) as ad
from position2

union all

select location, sample_id,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(1)]))[OFFSET(0)] as ref,
       SPLIT(minimize(ref, SPLIT(alt,',')[OFFSET(1)]))[OFFSET(1)] as allele,
       2 as allele_pos, call_GT, call_GQ,
       as_raw_mq,
       cast(SPLIT(as_raw_mq,'|')[OFFSET(2)] as int64) raw_mq,
       as_raw_mqranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_mqranksum,',')[SAFE_OFFSET(1)] as float64) * 10.0 as int64) as raw_mqranksum_x_10,
       as_qualapprox,
       qualapprox,
       cast(SPLIT(as_qualapprox,'|')[OFFSET(1)] as int64) as qual,
       as_raw_readposranksum,
       SAFE_cast(SAFE_cast(SPLIT(as_raw_readposranksum,',')[SAFE_OFFSET(1)] as float64) * 10.0 as int64) as raw_readposranksum_x_10,
       as_sb_table,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(0)] as int64) as sb_ref_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(0)],',')[OFFSET(1)] as int64) as sb_ref_minus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(2)],',')[OFFSET(0)] as int64) as sb_alt_plus,
       cast(SPLIT(SPLIT(as_sb_table,'|')[OFFSET(2)],',')[OFFSET(1)] as int64) as sb_alt_minus,
       call_AD,
       cast(SPLIT(call_AD,',')[OFFSET(0)] as int64) as ref_ad,
       cast(SPLIT(call_AD,',')[OFFSET(2)] as int64) as ad
from position2;
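The three SELECTs expand each vet row into one output row per alternate allele: the first block handles rows with a single alternate (position1), and the last two emit allele_pos 1 and 2 for bi-allelic rows (position2), indexing the '|'-delimited allele-specific annotations by allele slot. A simplified Python sketch of that expansion for a few representative columns, assuming dict-shaped rows (the row format is illustrative; minimization of ref/allele is omitted here and shown after the next file):

def split_alt_alleles(row):
    # Expand one bi-allelic vet row into two alt_allele rows, mirroring the
    # position2 branches above.
    rows = []
    for i, alt in enumerate(row["alt"].split(",")[:2]):
        rows.append({
            "location": row["location"],
            "sample_id": row["sample_id"],
            "ref": row["ref"],   # the SQL additionally right-trims shared bases via minimize()
            "allele": alt,
            "allele_pos": i + 1,
            # slot 0 of the '|'-delimited annotation is skipped, matching OFFSET(i + 1)
            "raw_mq": int(row["as_raw_mq"].split("|")[i + 1]),
            "ref_ad": int(row["call_AD"].split(",")[0]),
            "ad": int(row["call_AD"].split(",")[i + 1]),
        })
    return rows

example = {"location": 12_000_040_000_000, "sample_id": 7, "ref": "CTT",
           "alt": "C,CT", "as_raw_mq": "0|29707|14400", "call_AD": "20,10,5"}
assert [r["allele_pos"] for r in split_alt_alleles(example)] == [1, 2]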
scripts/variantstore/wdl/extract/alt_allele_temp_function.sql (14 additions, 0 deletions)
@@ -0,0 +1,14 @@
CREATE TEMPORARY FUNCTION minimize(ref STRING, allele STRING)
RETURNS STRING
LANGUAGE js AS """
    let done = false
    while (!done && ref.length !== 1) {
        if (ref.slice(-1) === allele.slice(-1)) {
            ref = ref.slice(0, -1)
            allele = allele.slice(0, -1)
        } else {
            done = true
        }
    }
    return ref + ',' + allele
""";