Commit c733b4b
Quoting of table names (#7666)
* fixed quoting
* use named parameters
kcibul authored Feb 14, 2022
1 parent 5e1fcb5 commit c733b4b
Showing 7 changed files with 30 additions and 40 deletions.
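
The heart of the fix: BigQuery Standard SQL needs backticks around a table path whenever the project or dataset name contains characters such as hyphens (common in GCP project ids), and wrapping the whole query in shell single quotes lets those backticks pass through to BigQuery unescaped. A minimal before/after sketch with hypothetical names:

# Before: an unquoted identifier breaks on a hyphenated project id, and the
# double-quoted shell string invites accidental expansion of $ and backticks.
bq query --use_legacy_sql=false \
  "SELECT COUNT(*) FROM my-project.my_dataset.sample_info"

# After: backtick-quoted identifier inside shell single quotes; the shell
# passes the backticks through literally, so no escaping is needed.
bq query --use_legacy_sql=false \
  'SELECT COUNT(*) FROM `my-project.my_dataset.sample_info`'
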
4 changes: 4 additions & 0 deletions .dockstore.yml
@@ -65,6 +65,7 @@ workflows:
branches:
- master
- ah_var_store
+ - kc_quoting_bug
- name: GvsAoUReblockGvcf
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsAoUReblockGvcf.wdl
@@ -90,6 +91,7 @@ workflows:
- ah_var_store
- rsa_split_intervals_part_2
- kc_cluster_vqsr
+ - kc_quoting_bug
- name: GvsCreateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -99,6 +101,7 @@
branches:
- master
- ah_var_store
+ - kc_quoting_bug
- name: GvsCreateTables
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateTables.wdl
@@ -129,6 +132,7 @@ workflows:
branches:
- master
- ah_var_store
+ - kc_quoting_bug
- name: GvsPrepareCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareCallset.wdl
12 changes: 6 additions & 6 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -136,20 +136,20 @@ task AssignIds {

# add sample_name to sample_info_table
bq --project_id=~{project_id} query --use_legacy_sql=false \
- 'INSERT into ~{dataset_name}.~{sample_info_table} (sample_name) select sample_name from ~{dataset_name}.sample_id_assignment_lock m where m.sample_name not in (SELECT sample_name FROM ~{dataset_name}.~{sample_info_table})'
+ 'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name) select sample_name from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false "SELECT IFNULL(MAX(sample_id),0) FROM ~{dataset_name}.~{sample_info_table}" > maxid
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
- "UPDATE ~{dataset_name}.~{sample_info_table} m SET m.sample_id = id_assign.id FROM (SELECT sample_name, $offset + ROW_NUMBER() OVER() as id FROM ~{dataset_name}.~{sample_info_table} WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;"
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
+ 'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER() as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples \
- "SELECT sample_name, sample_id from ~{dataset_name}.~{sample_info_table} WHERE sample_id >= $offset" > update.tsv
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
+ 'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv

# get the max id to create tables for
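
The commit's second bullet, "use named parameters", follows from the first: once the query sits in shell single quotes, $offset can no longer be spliced into the string, so the value is handed to bq as a typed query parameter and referenced as @offset in the SQL. The same pattern in isolation (hypothetical table name):

# --parameter takes name:type:value; the query refers to the value as @offset.
offset=100
bq query --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
  'SELECT sample_name FROM `my-project.my_dataset.sample_info` WHERE sample_id >= @offset'
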
8 changes: 4 additions & 4 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -68,7 +68,7 @@ task GetVetTableNames {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
"SELECT table_name FROM ~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES WHERE table_name LIKE 'vet_%' ORDER BY table_name" > vet_tables.csv
'SELECT table_name FROM `~{dataset_project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv

# remove the header row from the CSV file
sed -i 1d vet_tables.csv
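
A knock-on effect of the single-quoted shell string: SQL string literals inside it switch from single to double quotes ('vet_%' becomes "vet_%"), which Standard SQL accepts. The three quoting layers in isolation (hypothetical dataset):

# Single quotes for the shell, backticks for the identifier,
# double quotes for the SQL string literal.
bq query --use_legacy_sql=false \
  'SELECT table_name FROM `my-project.my_dataset.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%"'
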
@@ -113,7 +113,7 @@ task CreateAltAlleleTable {

echo "project_id = ~{query_project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{query_project_id} --format=csv --use_legacy_sql=false \
"CREATE OR REPLACE TABLE ~{dataset_project_id}.~{dataset_name}.alt_allele (
'CREATE OR REPLACE TABLE `~{dataset_project_id}.~{dataset_name}.alt_allele` (
location INT64,
sample_id INT64,
ref STRING,
@@ -139,7 +139,7 @@ task CreateAltAlleleTable {
ref_ad INT64,
ad INT64
) PARTITION BY RANGE_BUCKET(location, GENERATE_ARRAY(0, 25000000000000, 1000000000000))
- CLUSTER BY location, sample_id;"
+ CLUSTER BY location, sample_id;'

>>>
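
A side note on the unchanged partition clause: RANGE_BUCKET(location, GENERATE_ARRAY(0, 25000000000000, 1000000000000)) buckets rows into 25 integer-range partitions of 10^12 locations each. A standalone sanity check of the bucketing (the value is illustrative):

bq query --use_legacy_sql=false \
  'SELECT RANGE_BUCKET(1500000000000, GENERATE_ARRAY(0, 25000000000000, 1000000000000)) AS bucket'
# Expect 2: two boundaries (0 and 1000000000000) are <= the value,
# so a location of 1.5e12 lands in the [1e12, 2e12) partition.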

@@ -193,7 +193,7 @@ task PopulateAltAlleleTable {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210923"
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20220211"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateFilterSet.wdl
@@ -316,7 +316,7 @@ task GetNumSamplesLoaded {

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
"SELECT COUNT(*) as num_rows FROM ~{fq_sample_table} WHERE is_loaded = true" > num_rows.csv
'SELECT COUNT(*) as num_rows FROM `~{fq_sample_table}` WHERE is_loaded = true' > num_rows.csv

NUMROWS=$(python3 -c "csvObj=open('num_rows.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);")

15 changes: 0 additions & 15 deletions scripts/variantstore/wdl/GvsCreateTables.wdl
@@ -11,20 +11,6 @@ workflow CreateBQTables {
String vet_schema_json = '[{"name": "sample_id", "type" :"INTEGER", "mode": "REQUIRED"},{"name": "location", "type" :"INTEGER", "mode": "REQUIRED"},{"name": "ref", "type" :"STRING", "mode": "REQUIRED"},{"name": "alt", "type" :"STRING", "mode": "REQUIRED"},{"name": "AS_RAW_MQ", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_RAW_MQRankSum", "type" :"STRING", "mode": "NULLABLE"},{"name": "QUALapprox", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_QUALapprox", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_RAW_ReadPosRankSum", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_SB_TABLE", "type" :"STRING", "mode": "NULLABLE"},{"name": "AS_VarDP", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_GT", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_AD", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_GQ", "type" :"INTEGER", "mode": "NULLABLE"},{"name": "call_PGT", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_PID", "type" :"STRING", "mode": "NULLABLE"},{"name": "call_PL", "type" :"STRING", "mode": "NULLABLE"}]'
String ref_ranges_schema_json = '[{"name": "location","type": "INTEGER","mode": "REQUIRED"},{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "length","type": "INTEGER","mode": "REQUIRED"},{"name": "state","type": "STRING","mode": "REQUIRED"}]'
Int? preemptible_tries

}

- call CreateTables as CreatePetTables {
- input:
- project_id = project_id,
- dataset_name = dataset_name,
- datatype = "pet",
- max_table_id = max_table_id,
- schema_json = pet_schema_json,
- superpartitioned = "true",
- partitioned = "true",
- service_account_json_path = service_account_json_path,
- preemptible_tries = preemptible_tries
- }
-
call CreateTables as CreateVetTables {
@@ -54,7 +40,6 @@
}

output {
- String petDone = CreatePetTables.done
String vetDone = CreateVetTables.done
String refDone = CreateRefRangesTables.done
}
23 changes: 12 additions & 11 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -131,14 +131,15 @@ task CheckForDuplicateData {

# check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
# this returns the list of sample names that do already have data loaded
- bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false \
- "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded FROM ${TEMP_TABLE} t left outer join ${SAMPLE_INFO_TABLE} s on (s.sample_name = t.sample_name)) " \
- "SELECT i.sample_name FROM ${INFO_SCHEMA_TABLE} p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%' OR table_name like 'pet_%')" \
- "UNION DISTINCT " \
- "SELECT i.sample_name FROM items i WHERE i.is_loaded = True " \
- "UNION DISTINCT " \
- "SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM ~{dataset_name}.sample_load_status) " \
- | sed -e '/sample_name/d' > duplicates
+ echo "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded FROM \`${TEMP_TABLE}\` t left outer join \`${SAMPLE_INFO_TABLE}\` s on (s.sample_name = t.sample_name)) " >> query.sql
+ echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%' OR table_name like 'pet_%')" >> query.sql
+ echo "UNION DISTINCT " >> query.sql
+ echo "SELECT i.sample_name FROM items i WHERE i.is_loaded = True " >> query.sql
+ echo "UNION DISTINCT " >> query.sql
+ echo "SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\`) " >> query.sql

+ cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false | sed -e '/sample_name/d' > duplicates

# remove the temp table
bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}
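
Because ${TEMP_TABLE} and friends are shell variables, the echo lines above must stay double-quoted, and each backtick is escaped as \` so the shell does not read it as command substitution; assembling the statement in query.sql and piping it to bq (which accepts the query on stdin) keeps the quoting manageable. The pattern in miniature (hypothetical table name):

TEMP_TABLE="my-project.my_dataset.sample_names_tmp"
# Double quotes expand ${TEMP_TABLE}; \` produces a literal backtick.
echo "SELECT sample_name FROM \`${TEMP_TABLE}\`" > query.sql
cat query.sql | bq query --use_legacy_sql=false --format=csv
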
@@ -351,7 +352,7 @@ task SetIsLoadedColumn {

# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"UPDATE ~{dataset_name}.sample_info SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from ~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS WHERE partition_id NOT LIKE \"__%\" AND total_logical_bytes > 0 AND table_name LIKE \"vet_%\") OR sample_id IN (SELECT sample_id FROM ~{dataset_name}.sample_load_status GROUP BY 1 HAVING COUNT(1) = 2)"
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)'

>>>
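
The partition_id NOT LIKE "__%" filter screens out BigQuery's special partitions (such as __NULL__ and __UNPARTITIONED__) before the CAST to INT64, which those names would otherwise break. To see what the filter excludes (hypothetical dataset):

bq query --use_legacy_sql=false \
  'SELECT partition_id, total_logical_bytes FROM `my_dataset.INFORMATION_SCHEMA.PARTITIONS` WHERE table_name LIKE "vet_%" ORDER BY partition_id'
# Any __NULL__ or __UNPARTITIONED__ rows are the ones NOT LIKE "__%" removes.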

@@ -417,7 +418,7 @@ task GetSampleIds {

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > results
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results

# prep for being able to return min table id
min_sample_id=$(tail -1 results | cut -d, -f1)
@@ -433,7 +434,7 @@ task GetSampleIds {
python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
"SELECT sample_id, samples.sample_name FROM ~{dataset_name}.~{table_name} AS samples JOIN ${TEMP_TABLE} AS temp ON samples.sample_name=temp.sample_name" > sample_map
"SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > sample_map

cut -d, -f1 sample_map > gvs_ids
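
The ceil divisions above map a sample_id onto its superpartitioned table number (ids 1 through samples_per_table in table 1, and so on). A quick check of the arithmetic with illustrative values:

# With samples_per_table = 4000: id 4000 stays in table 1, id 4001 rolls over to table 2.
python3 -c "from math import ceil; print(ceil(4000/4000), ceil(4001/4000))"  # prints: 1 2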

6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/extract/populate_alt_allele_table.py
@@ -31,10 +31,10 @@ def populate_alt_allele_table(query_project, vet_table_name, fq_dataset, sa_key_
alt_allele_temp_function = Path('alt_allele_temp_function.sql').read_text()
alt_allele_positions = Path('alt_allele_positions.sql').read_text()
fq_vet_table = f"{fq_dataset}.{vet_table_name}"
query_with = f"""INSERT INTO {fq_dataset}.alt_allele
query_with = f"""INSERT INTO `{fq_dataset}.alt_allele`
WITH
- position1 as (select * from {fq_vet_table} WHERE call_GT IN ('0/1', '1/0', '1/1', '0|1', '1|0', '1|1', '0/2', '0|2','2/0', '2|0')),
- position2 as (select * from {fq_vet_table} WHERE call_GT IN ('1/2', '1|2', '2/1', '2|1'))"""
+ position1 as (select * from `{fq_vet_table}` WHERE call_GT IN ('0/1', '1/0', '1/1', '0|1', '1|0', '1|1', '0/2', '0|2','2/0', '2|0')),
+ position2 as (select * from `{fq_vet_table}` WHERE call_GT IN ('1/2', '1|2', '2/1', '2|1'))"""

sql = alt_allele_temp_function + query_with + alt_allele_positions
result = utils.execute_with_retry(client, f"into alt allele from {vet_table_name}", sql)
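
The same quoting discipline lands in the Python path, where it is simpler: backticks are ordinary characters in a Python string, so the f-string needs no escaping. What the template renders to, with hypothetical values:

python3 -c 'fq="proj.ds"; t="vet_001"; print(f"select * from `{fq}.{t}`")'
# prints: select * from `proj.ds.vet_001`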
Expand Down
