VS-222 don't hard code the dataset name! #7704

Merged: 15 commits, Mar 9, 2022
.dockstore.yml (2 changes: 1 addition & 1 deletion)
@@ -89,7 +89,7 @@ workflows:
branches:
- master
- ah_var_store
- - rc-split-intervals-odd
+ - rc-vs-222-dataset-id
- name: GvsCreateAltAllele
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateAltAllele.wdl
scripts/variantstore/wdl/GvsCreateFilterSet.wdl (7 changes: 5 additions & 2 deletions)
@@ -24,7 +24,7 @@ workflow GvsCreateFilterSet {
File? excluded_intervals

String output_file_base_name
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/kc_extract_perf_20220111/gatk-package-4.2.0.0-455-g40a40bc-SNAPSHOT-local.jar"
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/rc_testing_dataset_id_20220303/gatk-package-4.2.0.0-484-gf9e5c0e-SNAPSHOT-local.jar"

File dbsnp_vcf
File dbsnp_vcf_index
@@ -122,6 +122,7 @@ workflow GvsCreateFilterSet {
fq_alt_allele_table = fq_alt_allele_table,
excess_alleles_threshold = excess_alleles_threshold,
read_project_id = query_project,
+ default_dataset_id = default_dataset,
output_file = "${output_file_base_name}_${i}.vcf.gz",
service_account_json_path = service_account_json_path,
query_project = query_project,
@@ -349,6 +350,7 @@ task ExtractFilterTask {

String fq_alt_allele_table
String read_project_id
+ String default_dataset_id
String output_file
Int? excess_alleles_threshold

@@ -391,7 +393,8 @@
~{sep=" " query_label_args} \
-L ~{intervals} \
~{"-XL " + excluded_intervals} \
- --project-id ~{read_project_id}
+ --project-id ~{read_project_id} \
+ --dataset-id ~{default_dataset_id}
>>>

runtime {
@@ -38,6 +38,15 @@ public abstract class ExtractTool extends GATKTool {
)
protected String projectID = null;


+ @Argument(
+ fullName = "dataset-id",
+ doc = "ID of the Google Cloud dataset to use when executing queries",
+ optional = true // I guess, but won't it break otherwise, or require that a dataset be created with the name temp_tables?
[Inline review comment from @RoriCremer (Contributor Author), Mar 4, 2022: I'm going to delete this comment, but why is projectId above optional?]
+ )
+ protected String datasetID = null;


@Argument(
fullName = "sample-table",
doc = "Fully qualified name of a bigquery table containing a single column `sample` that describes the full list of samples to extract",
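Regarding the inline question above about `optional = true`: a minimal sketch, not part of this PR, of how a subclass could keep the flag optional at the parser level but still fail fast when it is omitted. The override body and the message text are illustrative assumptions; `UserException` is GATK's standard user-facing error type, and `onStartup` is the GATKTool lifecycle hook shown in the next hunk.

    import org.broadinstitute.hellbender.exceptions.UserException;

    // Hypothetical guard, not in this PR: --dataset-id stays optional for the
    // argument parser, but a missing value is rejected before any query runs,
    // surfacing a clear user error instead of a later BigQuery failure.
    @Override
    protected void onStartup() {
        super.onStartup();
        if (datasetID == null) {
            throw new UserException("A value must be provided for --dataset-id");
        }
    }
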
@@ -113,6 +113,7 @@ protected void onStartup() {

engine = new ExtractFeaturesEngine(
projectID,
+ datasetID,
vcfWriter,
header,
annotationEngine,
@@ -51,6 +51,7 @@ public class ExtractFeaturesEngine {

private final ReferenceConfidenceVariantContextMerger variantContextMerger;
private final String projectID;
+ private final String datasetID;

private final TableReference altAlleleTable;
private final TableReference sampleListTable;
@@ -69,6 +70,7 @@ public class ExtractFeaturesEngine {
// /** Set of sample names seen in the variant data from BigQuery. */
// private final Set<String> sampleNames = new HashSet<>();
public ExtractFeaturesEngine(final String projectID,
+ final String datasetID,
final VariantContextWriter vcfWriter,
final VCFHeader vcfHeader,
final VariantAnnotatorEngine annotationEngine,
@@ -93,6 +95,7 @@ public ExtractFeaturesEngine(final String projectID,
this.localSortMaxRecordsInRam = localSortMaxRecordsInRam;

this.projectID = projectID;
+ this.datasetID = datasetID;
this.vcfWriter = vcfWriter;
this.refSource = refSource;
this.altAlleleTable = new TableReference(fqAltAlleleTable, SchemaUtils.ALT_ALLELE_FIELDS);
@@ -134,6 +137,7 @@ public void traverse() {
featureQueryString,
SchemaUtils.FEATURE_EXTRACT_FIELDS,
projectID,
+ datasetID,
userDefinedFunctions,
useBatchQueries,
cleanQueryLabels);
@@ -412,21 +412,22 @@ private static long getQueryCostBytesProcessedEstimate(String queryString, Strin
public static StorageAPIAvroReader executeQueryWithStorageAPI(final String queryString,
final List<String> fieldsToRetrieve,
final String projectID,
+ final String datasetID,
final String userDefinedFunctions,
Map<String, String> labels) {

- return executeQueryWithStorageAPI(queryString, fieldsToRetrieve, projectID, userDefinedFunctions, false, labels);
+ return executeQueryWithStorageAPI(queryString, fieldsToRetrieve, projectID, datasetID, userDefinedFunctions, false, labels);
}

public static StorageAPIAvroReader executeQueryWithStorageAPI(final String queryString,
final List<String> fieldsToRetrieve,
final String projectID,
+ final String datasetID,
final String userDefinedFunctions,
final boolean runQueryInBatchMode,
Map<String, String> labels) {
final String tempTableDataset = "temp_tables";
final String tempTableName = UUID.randomUUID().toString().replace('-', '_');
final String tempTableFullyQualified = String.format("%s.%s.%s", projectID, tempTableDataset, tempTableName);
final String tempTableName = String.format("%s_%s", "temp_table", UUID.randomUUID().toString().replace('-', '_'));
final String tempTableFullyQualified = String.format("%s.%s.%s", projectID, datasetID, tempTableName);

final String queryStringWithUDFs = userDefinedFunctions == null ? queryString : userDefinedFunctions + queryString;

@@ -446,7 +447,7 @@ public static StorageAPIAvroReader executeQueryWithStorageAPI(final String query

executeQuery(projectID, queryStringIntoTempTable, runQueryInBatchMode, labels);

- final Table tableInfo = getBigQueryEndPoint(projectID).getTable( TableId.of(projectID, tempTableDataset, tempTableName) );
+ final Table tableInfo = getBigQueryEndPoint(projectID).getTable( TableId.of(projectID, datasetID, tempTableName) );
logger.info(String.format("Query temp table created with %s rows and %s bytes in size", tableInfo.getNumRows(), tableInfo.getNumBytes()));

TableReference tr = new TableReference(tempTableFullyQualified, fieldsToRetrieve);
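To make the new signature concrete, a hedged sketch of what a caller looks like after this change. The project name, dataset name, query, field list, and labels below are hypothetical placeholders, not values from this PR; only the argument order comes from the diff above.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import org.broadinstitute.hellbender.utils.bigquery.BigQueryUtils;
    import org.broadinstitute.hellbender.utils.bigquery.StorageAPIAvroReader;

    // Hypothetical caller after this change; all names are placeholders.
    static StorageAPIAvroReader runExampleQuery() {
        final List<String> fieldsToRetrieve = new ArrayList<>(Arrays.asList("sample_id", "state"));
        final Map<String, String> labels = new HashMap<>();
        labels.put("gatkquery", "example");

        // The dataset that holds the ephemeral temp table is now passed in
        // explicitly instead of being hard-coded to "temp_tables"; the temp
        // table lands at my-project.my_gvs_dataset.temp_table_<uuid>.
        return BigQueryUtils.executeQueryWithStorageAPI(
                "SELECT sample_id, state FROM `my-project.my_gvs_dataset.sample_info`",
                fieldsToRetrieve,
                "my-project",      // query/read project (placeholder)
                "my_gvs_dataset",  // dataset for the temp table (placeholder)
                null,              // no user-defined functions
                labels);
    }
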
@@ -88,7 +88,7 @@ public void testQueryWithStorageAPI() {
fieldsToRetrieve.add("name");
Map<String, String> labels = new HashMap<String, String>();
labels.put("gatktestquery", "teststorageapi" + runUuid);
- final StorageAPIAvroReader result = BigQueryUtils.executeQueryWithStorageAPI(query, fieldsToRetrieve, BIGQUERY_TEST_PROJECT, noUDFs, labels);
+ final StorageAPIAvroReader result = BigQueryUtils.executeQueryWithStorageAPI(query, fieldsToRetrieve, BIGQUERY_TEST_PROJECT, BIGQUERY_TEST_DATASET, noUDFs, labels);

int rowCount = 0;
final Set<String> retrievedNames = new HashSet<>();
@@ -123,7 +123,7 @@ public void testQueryWithEmptyDatasetStorageAPI() {
fieldsToRetrieve.add("name");
Map<String, String> labels = new HashMap<String, String>();
labels.put("gatktestquery", "testapiwithemptydata" + runUuid);
- final StorageAPIAvroReader result = BigQueryUtils.executeQueryWithStorageAPI(query, fieldsToRetrieve, BIGQUERY_TEST_PROJECT, noUDFs, labels);
+ final StorageAPIAvroReader result = BigQueryUtils.executeQueryWithStorageAPI(query, fieldsToRetrieve, BIGQUERY_TEST_PROJECT, BIGQUERY_TEST_DATASET, noUDFs, labels);

int rowCount = 0;
final Set<String> retrievedNames = new HashSet<>();
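A possible follow-up, sketched here as an assumption rather than as part of this PR: a small TestNG check that the new temp-table naming stays BigQuery-safe. The test name and assertions are hypothetical; the name-building expression is copied from the BigQueryUtils change above.

    import java.util.UUID;
    import org.testng.Assert;
    import org.testng.annotations.Test;

    // Hypothetical test, not in this PR: the generated temp-table name should
    // keep the "temp_table" prefix and contain no hyphens, since BigQuery
    // table names do not allow '-'.
    @Test
    public void testTempTableNameFormat() {
        final String tempTableName = String.format("%s_%s", "temp_table",
                UUID.randomUUID().toString().replace('-', '_'));
        Assert.assertTrue(tempTableName.startsWith("temp_table_"));
        Assert.assertFalse(tempTableName.contains("-"));
    }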