From 8634ac3552957e7e066052fc8b8d0f0199d95af0 Mon Sep 17 00:00:00 2001
From: kcibul
Date: Tue, 10 Nov 2020 15:58:57 -0500
Subject: [PATCH] Rudimentary support for other output types, support for
 scattering extract cohort (#6949)

* cohort extract work for WGS

* rudimentary support for configurable output types

* support for nocalls

* extract wdl, missing classes, and compiler warning

* tidying
---
 build.gradle                                  |   2 +
 .../ngs_cohort_extract.inputs.json            |  13 ++
 .../variantstore_wdl/ngs_cohort_extract.wdl   | 115 ++++++++++++++++++
 .../tools/variantdb/CommonCode.java           |   6 +
 .../tools/variantdb/IngestUtils.java          |   2 +-
 .../nextgen/CreateVariantIngestFiles.java     |  23 +++-
 .../variantdb/nextgen/ExtractCohort.java      |  27 ++--
 .../nextgen/ExtractCohortEngine.java          |  27 +++-
 .../tools/variantdb/nextgen/PetOrcWriter.java |  59 +++++++++
 .../variantdb/nextgen/PetParquetWriter.java   |  53 ++++++++
 .../variantdb/nextgen/PetTsvCreator.java      |  84 ++++++++++---
 .../tools/variantdb/nextgen/VetFieldEnum.java |   3 +-
 12 files changed, 377 insertions(+), 37 deletions(-)
 create mode 100644 scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
 create mode 100644 scripts/variantstore_wdl/ngs_cohort_extract.wdl
 create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
 create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java

diff --git a/build.gradle b/build.gradle
index de0b77ac280..cb30dd7e177 100644
--- a/build.gradle
+++ b/build.gradle
@@ -332,6 +332,8 @@ dependencies {
     compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
     compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
 
+    compile('org.apache.orc:orc:1.6.5')
+
     compile('de.javakaffee:kryo-serializers:0.41') {
         exclude module: 'kryo' // use Spark's version
     }
diff --git a/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json b/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
new file mode 100644
index 00000000000..18cbb04f681
--- /dev/null
+++ b/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
@@ -0,0 +1,13 @@
+{
+  "NgsCohortExtract.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+  "NgsCohortExtract.reference_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
+  "NgsCohortExtract.reference_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
+
+  "NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-140-g8aa14d3-SNAPSHOT-local.jar",
+
+  "NgsCohortExtract.fq_sample_table": "spec-ops-aou.kc_high_cov_ccdg.cohort_100_of_194",
+  "NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.kc_high_cov_ccdg.exported_cohort_100_test",
+  "NgsCohortExtract.query_project": "spec-ops-aou",
+
+  "NgsCohortExtract.output_file_base_name": "ccdg_high_cov_export_100"
+}
diff --git a/scripts/variantstore_wdl/ngs_cohort_extract.wdl b/scripts/variantstore_wdl/ngs_cohort_extract.wdl
new file mode 100644
index 00000000000..9ce621fae7c
--- /dev/null
+++ b/scripts/variantstore_wdl/ngs_cohort_extract.wdl
@@ -0,0 +1,115 @@
+version 1.0
+
+workflow NgsCohortExtract {
+    input {
+        Int max_chrom_id = 24
+
+        # bug in cromwell, can't support large integers...
+        # https://github.com/broadinstitute/cromwell/issues/2685
+        String chrom_offset = "1000000000000"
+
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+        String fq_cohort_extract_table
+        String query_project
+
+        String output_file_base_name
+        File? gatk_override
+    }
+
+    scatter(i in range(max_chrom_id)) {
+        call ExtractTask {
+            input:
+                gatk_override           = gatk_override,
+                reference               = reference,
+                reference_index         = reference_index,
+                reference_dict          = reference_dict,
+                fq_sample_table         = fq_sample_table,
+                chrom_offset            = chrom_offset,
+                chrom_id                = i+1,
+                fq_cohort_extract_table = fq_cohort_extract_table,
+                read_project_id         = query_project,
+                output_file             = "${output_file_base_name}_${i}.vcf.gz"
+        }
+    }
+}
+
+################################################################################
+task ExtractTask {
+    # indicates that this task should NOT be call cached
+    meta {
+        volatile: true
+    }
+
+    input {
+        # ------------------------------------------------
+        # Input args:
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+
+        # bug in cromwell, can't support large integers...
+        # https://github.com/broadinstitute/cromwell/issues/2685
+        String chrom_offset
+        Int chrom_id
+
+        String fq_cohort_extract_table
+        String read_project_id
+        String output_file
+
+        # Runtime Options:
+        File? gatk_override
+
+        Int? local_sort_max_records_in_ram = 10000000
+    }
+
+
+    # ------------------------------------------------
+    # Run our command:
+    command <<<
+        set -e
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        df -h
+        min_location=$(echo "~{chrom_id} * ~{chrom_offset}" | bc)
+        max_location=$(echo "( ~{chrom_id} + 1 ) * ~{chrom_offset}" | bc)
+
+        gatk --java-options "-Xmx4g" \
+            ExtractCohort \
+                --mode GENOMES --ref-version 38 --query-mode LOCAL_SORT \
+                -R "~{reference}" \
+                -O "~{output_file}" \
+                --local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
+                --sample-table ~{fq_sample_table} \
+                --cohort-extract-table ~{fq_cohort_extract_table} \
+                --min-location ${min_location} --max-location ${max_location} \
+                --project-id ~{read_project_id}
+    >>>
+
+    # ------------------------------------------------
+    # Runtime settings:
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
+        memory: "7 GB"
+        disks: "local-disk 10 HDD"
+        bootDiskSizeGb: 15
+        preemptible: 3
+        cpu: 2
+    }
+
+    # ------------------------------------------------
+    # Outputs:
+    output {
+        File output_vcf = "~{output_file}"
+        File output_vcf_index = "~{output_file}.tbi"
+    }
+}
+
+
+
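The scatter above fans one ExtractTask out per chromosome, turning a 1-based chromosome id and the 10^12 chrom_offset into a --min-location/--max-location window. A minimal sketch of that arithmetic, assuming locations are encoded as chromosome_index * 10^12 + position (which is what the chrom_offset value implies); the class below is illustrative, not part of this patch:

```java
// Illustrative only: mirrors the bc arithmetic in ExtractTask's command block.
// Assumes the extract tables encode a position as chrom_index * 10^12 + position.
public class ShardMathSketch {
    private static final long CHROM_OFFSET = 1_000_000_000_000L; // matches chrom_offset above

    public static void main(String[] args) {
        int maxChromId = 24; // matches max_chrom_id above
        for (int chromId = 1; chromId <= maxChromId; chromId++) {
            long minLocation = chromId * CHROM_OFFSET;       // "~{chrom_id} * ~{chrom_offset}" | bc
            long maxLocation = (chromId + 1) * CHROM_OFFSET; // "( ~{chrom_id} + 1 ) * ~{chrom_offset}" | bc
            System.out.printf("shard %d: --min-location %d --max-location %d%n",
                    chromId, minLocation, maxLocation);
        }
    }
}
```

Each window becomes the BigQuery Storage API row restriction built in ExtractCohortEngine further down.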
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
index b8491f750da..552b16d260d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
@@ -197,4 +197,10 @@ public enum ModeEnum {
         GENOMES,
         ARRAYS
     }
+
+    public enum OutputType {
+        TSV,
+        ORC,
+        PARQUET
+    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
index 7b4372824ba..cbf6b82bc44 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
@@ -69,7 +69,7 @@ public static int getTableNumber(String sampleId, int sampleMod) { // this is ba
     public static int getTableNumber(long sampleId, int sampleMod) { // this is based on sample id
         // sample ids 1-4000 will go in directory 001
         // subtract 1 from the sample id to make it 1-index (or do we want to 0-index?) and add 1 to the dir
-        int directoryNumber = new Long(Math.floorDiv((sampleId - 1), sampleMod) + 1).intValue(); // TODO omg write some unit tests
+        int directoryNumber = (int) (Math.floorDiv((sampleId - 1), sampleMod) + 1); // TODO omg write some unit tests
         return directoryNumber;
     }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
index affb9035388..e48c37c826e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
@@ -14,6 +14,7 @@ import org.broadinstitute.hellbender.engine.ReadsContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.engine.VariantWalker;
+import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.variantdb.*;
 import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;
@@ -22,6 +23,7 @@
 import java.util.*;
 import java.io.File;
+import java.io.IOException;
 
 /**
  * Ingest variant walker
@@ -78,6 +80,12 @@ public final class CreateVariantIngestFiles extends VariantWalker {
             optional = true)
     public CommonCode.ModeEnum mode = CommonCode.ModeEnum.EXOMES;
 
+    @Argument(fullName = "output-type",
+            shortName = "ot",
+            doc = "[Experimental] Output file format: TSV, ORC or PARQUET [default=TSV].",
+            optional = true)
+    public CommonCode.OutputType outputType = CommonCode.OutputType.TSV;
+
     @Argument(
             fullName = "ref-version",
             doc = "Remove this option!!!! only for ease of testing. Valid options are 37 or 38",
@@ -137,7 +145,7 @@ public void onTraversalStart() {
         final GenomeLocSortedSet genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
         intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));
 
-        petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir);
+        petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir, outputType);
         switch (mode) {
             case EXOMES:
             case GENOMES:
@@ -177,12 +185,21 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,
         if (!variant.isReferenceBlock()) {
             vetTsvCreator.apply(variant, readsContext, referenceContext, featureContext);
         }
-        petTsvCreator.apply(variant, intervalsToWrite);
+        try {
+            petTsvCreator.apply(variant, intervalsToWrite);
+        } catch (IOException ioe) {
+            throw new GATKException("Error writing PET", ioe);
+        }
+
     }
 
     @Override
     public Object onTraversalSuccess() {
-        petTsvCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
+        try {
+            petTsvCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
+        } catch (IOException ioe) {
+            throw new GATKException("Error writing missing intervals", ioe);
+        }
         return 0;
     }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
index 7be9a0d1ed3..d270d9e56fe 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
@@ -1,26 +1,14 @@
 package org.broadinstitute.hellbender.tools.variantdb.nextgen;
 
-import htsjdk.variant.variantcontext.writer.VariantContextWriter;
 import htsjdk.variant.vcf.VCFHeader;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
-import org.broadinstitute.hellbender.engine.GATKTool;
-import org.broadinstitute.hellbender.tools.variantdb.ChromosomeEnum;
 import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
 import org.broadinstitute.hellbender.tools.variantdb.SampleList;
-import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
-import org.broadinstitute.hellbender.tools.variantdb.arrays.ExtractCohortBQ;
-import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
-import org.broadinstitute.hellbender.tools.walkers.annotator.StandardAnnotation;
-import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
-import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_StandardAnnotation;
-import org.broadinstitute.hellbender.utils.bigquery.TableReference;
-import org.broadinstitute.hellbender.utils.io.IOUtils;
 
 import java.util.*;
 
@@ -49,6 +37,19 @@ public class ExtractCohort extends ExtractTool {
     )
     private String cohortTable = null;
 
+    @Argument(
+        fullName = "min-location",
+        doc = "When extracting data, only include locations >= this value",
"When extracting data, only include locations >= this value", + optional = true + ) + private Long minLocation = null; + + @Argument( + fullName = "max-location", + doc = "When extracting data, only include locations <= this value", + optional = true + ) + private Long maxLocation = null; @Override protected void onStartup() { @@ -68,6 +69,8 @@ protected void onStartup() { sampleNames, mode, cohortTable, + minLocation, + maxLocation, filteringFQTableName, localSortMaxRecordsInRam, printDebugInformation, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java index 7dc2cef3af1..e3b397cf9b5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java @@ -42,6 +42,8 @@ public class ExtractCohortEngine { private final boolean printDebugInformation; private final int localSortMaxRecordsInRam; private final TableReference cohortTableRef; + private final Long minLocation; + private final Long maxLocation; private final TableReference filteringTableRef; private final ReferenceDataSource refSource; private double vqsLodSNPThreshold = 0; @@ -75,6 +77,8 @@ public ExtractCohortEngine(final String projectID, final Set sampleNames, final CommonCode.ModeEnum mode, final String cohortTableName, + final Long minLocation, + final Long maxLocation, final String filteringTableName, final int localSortMaxRecordsInRam, final boolean printDebugInformation, @@ -89,8 +93,12 @@ public ExtractCohortEngine(final String projectID, this.refSource = refSource; this.sampleNames = sampleNames; this.mode = mode; + this.cohortTableRef = new TableReference(cohortTableName, SchemaUtils.COHORT_FIELDS); - this.filteringTableRef = new TableReference(filteringTableName, SchemaUtils.YNG_FIELDS); + this.minLocation = minLocation; + this.maxLocation = maxLocation; + this.filteringTableRef = filteringTableName == null || "".equals(filteringTableName) ? 
+
         this.printDebugInformation = printDebugInformation;
         this.vqsLodSNPThreshold = vqsLodSNPThreshold;
         this.vqsLodINDELThreshold = vqsLodINDELThreshold;
@@ -110,7 +118,13 @@ public void traverse() {
                 if (printDebugInformation) {
                     logger.debug("using storage api with local sort");
                 }
-                final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef);
+
+                String rowRestriction = null;
+                if (minLocation != null && maxLocation != null) {
+                    rowRestriction = "location >= " + minLocation + " AND location <= " + maxLocation;
+                }
+
+                final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, projectID);
                 createVariantsFromUngroupedTableResult(storageAPIAvroReader);
                 break;
             case QUERY:
@@ -447,12 +461,17 @@ private VariantContext createVariantContextFromSampleRecord(final GenericRecord
                 final String genotypeAttributeName = columnName.substring(SchemaUtils.GENOTYPE_FIELD_PREFIX.length());
 
                 if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_KEY) ) {
-                    final List<Allele> genotypeAlleles =
+                    if ("./.".equals(columnValueString)) {
+                        genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
+
+                    } else {
+                        final List<Allele> genotypeAlleles =
                             Arrays.stream(columnValueString.split("[/|]"))
                                     .map(Integer::parseInt)
                                     .map(alleleIndex -> alleles.get(alleleIndex))
                                     .collect(Collectors.toList());
-                    genotypeBuilder.alleles(genotypeAlleles);
+                        genotypeBuilder.alleles(genotypeAlleles);
+                    }
                 } else if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
                     genotypeBuilder.GQ(Integer.parseInt(columnValueString));
                 } else if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_PL_KEY) ) {
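The no-call branch above short-circuits "./." before the numeric split, since Integer.parseInt would otherwise throw on ".". A self-contained sketch of the same parsing, with plain strings standing in for htsjdk Allele objects (illustrative, not part of the patch):

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Mirrors the genotype decoding above: "./." maps to two no-calls; anything
// else is split on '/' or '|' and the indices look up entries in the allele list.
public class GenotypeParseSketch {
    public static void main(String[] args) {
        List<String> alleles = Arrays.asList("A", "G"); // index 0 = ref, 1 = alt
        for (String gt : new String[]{"0/1", "1|1", "./."}) {
            if ("./.".equals(gt)) {
                System.out.println(gt + " -> [NO_CALL, NO_CALL]");
            } else {
                List<String> called = Arrays.stream(gt.split("[/|]"))
                        .map(Integer::parseInt)
                        .map(alleles::get)
                        .collect(Collectors.toList());
                System.out.println(gt + " -> " + called);
            }
        }
    }
}
```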
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
new file mode 100644
index 00000000000..c250444ead0
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
@@ -0,0 +1,59 @@
+package org.broadinstitute.hellbender.tools.variantdb.nextgen;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.*;
+import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
+import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
+import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
+
+public class PetOrcWriter implements Closeable {
+    private Writer writer;
+    private VectorizedRowBatch batch;
+    private LongColumnVector locationCV;
+    private LongColumnVector sampleCV;
+    private BytesColumnVector stateCV;
+
+    static private TypeDescription schema = TypeDescription.fromString("struct<location:bigint,sample_id:bigint,state:string>");
+
+    public PetOrcWriter(String outputFile) throws IOException {
+        Configuration conf = new Configuration();
+
+        this.writer = OrcFile.createWriter(new Path(outputFile),
+                          OrcFile.writerOptions(conf)
+                                 .setSchema(schema)
+                                 .overwrite(true)
+                                 .compress(CompressionKind.SNAPPY)
+                                 .stripeSize( 16 * 1024 * 1024 )
+                                 .blockSize( 1 * 1024 * 1024 )
+                          );
+        this.batch = schema.createRowBatch();
+        this.locationCV = (LongColumnVector) batch.cols[0];
+        this.sampleCV = (LongColumnVector) batch.cols[1];
+        this.stateCV = (BytesColumnVector) batch.cols[2];
+    }
+
+    public void addRow(long location, long sampleId, String state) throws IOException {
+        int row = batch.size++;
+        locationCV.vector[row] = location;
+        sampleCV.vector[row] = sampleId;
+        stateCV.setVal(row, state.getBytes());
+
+        // If the batch is full, write it out and reset the counter
+        if (batch.size == batch.getMaxSize()) {
+            writer.addRowBatch(batch);
+            batch.reset();
+        }
+    }
+
+    public void close() throws IOException {
+        if (batch.size != 0) {
+            writer.addRowBatch(batch);
+            batch.reset();
+        }
+        writer.close();
+    }
+}
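A hypothetical caller of the new ORC writer (the file name, sample id, location, and state values below are made up): rows buffer in the VectorizedRowBatch and are flushed whenever the batch fills, and close() flushes the final partial batch before writing the ORC footer.

```java
public class PetOrcWriterDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder values; "v" follows the short PET state codes used in variantdb.
        try (PetOrcWriter pet = new PetOrcWriter("pet_001_sample.orc")) {
            pet.addRow(1_000_000_010_001L, 42L, "v"); // location, sample_id, state
            pet.addRow(1_000_000_010_002L, 42L, "v");
        } // close() flushes any remaining rows
    }
}
```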
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java
new file mode 100644
index 00000000000..af6adc11f01
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java
@@ -0,0 +1,53 @@
+package org.broadinstitute.hellbender.tools.variantdb.nextgen;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.avro.AvroParquetWriter;
+import org.apache.parquet.column.ParquetProperties.WriterVersion;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.ParquetFileWriter.Mode;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+
+public class PetParquetWriter implements Closeable {
+    private ParquetWriter<GenericData.Record> writer;
+    private Schema schema;
+
+    public PetParquetWriter(String outputFile) throws IOException {
+        // location and sample_id must be longs: encoded locations exceed the int range
+        this.schema = SchemaBuilder.record("pet")
+                .namespace("org.broadinstitute.dsp")
+                .fields().requiredLong("location").requiredLong("sample_id").requiredString("state")
+                .endRecord();
+
+        writer = AvroParquetWriter.<GenericData.Record>
+                builder(new Path(outputFile))
+                .withRowGroupSize(ParquetWriter.DEFAULT_BLOCK_SIZE)
+                .withPageSize(ParquetWriter.DEFAULT_PAGE_SIZE)
+                .withSchema(schema)
+                .withWriteMode(Mode.OVERWRITE)
+                .withConf(new Configuration())
+                .withCompressionCodec(CompressionCodecName.SNAPPY)
+                .withValidation(false)
+                .withDictionaryEncoding(true)
+                .withWriterVersion(WriterVersion.PARQUET_1_0)
+                .build();
+
+    }
+
+    public void addRow(long location, long sampleId, String state) throws IOException {
+        GenericData.Record record = new GenericData.Record(schema);
+        record.put("location", location);
+        record.put("sample_id", sampleId);
+        record.put("state", state);
+        writer.write(record);
+    }
+
+    public void close() throws IOException {
+        writer.close();
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
index 960b5ab01bc..f06bfa15b2e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
@@ -6,6 +6,7 @@ import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
+import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
 import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;
 import org.broadinstitute.hellbender.utils.*;
 import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
@@ -20,7 +21,12 @@ public final class PetTsvCreator {
     private static final Logger logger = LogManager.getLogger(PetTsvCreator.class);
 
-    private SimpleXSVWriter petWriter = null;
+    private CommonCode.OutputType outputType;
+
+    private SimpleXSVWriter petTsvWriter = null;
+    private PetOrcWriter petOrcWriter = null;
+    private PetParquetWriter petParquetWriter = null;
+
     private final String sampleId;
     private SimpleInterval previousInterval;
     private final SAMSequenceDictionary seqDictionary;
@@ -28,17 +34,28 @@ public final class PetTsvCreator {
     private GenomeLocSortedSet coverageLocSortedSet;
     private static String PET_FILETYPE_PREFIX = "pet_";
 
-    public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory) {
+    public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory, final CommonCode.OutputType outputType) {
         this.sampleId = sampleId;
         this.seqDictionary = seqDictionary;
         this.gqStateToIgnore = gqStateToIgnore;
+        this.outputType = outputType;
+
         coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
 
         try {
-            final File petOutputFile = new File(outputDirectory, PET_FILETYPE_PREFIX + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
-            List<String> petHeader = PetTsvCreator.getHeaders();
-            petWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
-            petWriter.setHeaderLine(petHeader);
+            final File petOutputFile = new File(outputDirectory, PET_FILETYPE_PREFIX + tableNumberPrefix + sampleName + "." + outputType.toString().toLowerCase());
+            switch (outputType) {
+                case TSV:
+                    List<String> petHeader = PetTsvCreator.getHeaders();
+                    petTsvWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
+                    petTsvWriter.setHeaderLine(petHeader);
+                    break;
+                case ORC:
+                    petOrcWriter = new PetOrcWriter(petOutputFile.getCanonicalPath());
+                    break;
+                case PARQUET:
+                    petParquetWriter = new PetParquetWriter(petOutputFile.getCanonicalPath());
+            }
         } catch (final IOException e) {
             throw new UserException("Could not create pet outputs", e);
         }
@@ -79,7 +96,7 @@ public String getValue() {
 
     }
 
-    public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) {
+    public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
         boolean firstInterval = true;
         final String variantChr = variant.getContig();
 
@@ -122,7 +139,21 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) {
                 // write the position to the XSV
                 for (List<String> TSVLineToCreatePet : TSVLinesToCreatePet) {
-                    petWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                    long location = Long.parseLong(TSVLineToCreatePet.get(0));
+                    long sampleId = Long.parseLong(TSVLineToCreatePet.get(1));
+                    String state = TSVLineToCreatePet.get(2);
+
+                    switch (outputType) {
+                        case TSV:
+                            petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                            break;
+                        case ORC:
+                            petOrcWriter.addRow(location, sampleId, state);
+                            break;
+                        case PARQUET:
+                            petParquetWriter.addRow(location, sampleId, state);
+                            break;
+                    }
                 }
             }
             firstInterval = false;
         }
     }
 
-    public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) {
+    public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) throws IOException {
         GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
         logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
         logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
@@ -142,7 +173,21 @@ public void writeMissingIntervals(GenomeLocSortedSo
                     SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()),
                     sampleId
             )) {
-                petWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                long location = Long.parseLong(TSVLineToCreatePet.get(0));
+                long sampleId = Long.parseLong(TSVLineToCreatePet.get(1));
+                String state = TSVLineToCreatePet.get(2);
+
+                switch (outputType) {
+                    case TSV:
+                        petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                        break;
+                    case ORC:
+                        petOrcWriter.addRow(location, sampleId, state);
+                        break;
+                    case PARQUET:
+                        petParquetWriter.addRow(location, sampleId, state);
+                        break;
+                }
             }
         }
     }
@@ -258,13 +303,20 @@ public static List<String> getHeaders() {
     }
 
     public void closeTool() {
-        if (petWriter != null) {
-            try {
-                petWriter.close();
-            } catch (final Exception e) {
-                throw new IllegalArgumentException("Couldn't close VET writer", e);
+        try {
+            switch (outputType) {
+                case TSV:
+                    if (petTsvWriter != null) petTsvWriter.close();
+                    break;
+                case ORC:
+                    if (petOrcWriter != null) petOrcWriter.close();
+                    break;
+                case PARQUET:
+                    if (petParquetWriter != null) petParquetWriter.close();
+                    break;
             }
+        } catch (final Exception e) {
+            throw new IllegalArgumentException("Couldn't close PET writer", e);
         }
-    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
index 8fe03e6075e..f8d27cd5f98 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
@@ -80,7 +80,8 @@ public String getColumnValue(final VariantContext variant) {
         }
         if (out != null) {
             if(!out.endsWith("|0.00")) {
-                logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString());
+//                KC: we are seeing a TON of these!
+//                logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString());
             }
             out = out.substring(0, out.lastIndexOf("|"));
             String[] outValues = out.split("\\|");
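For reference, a sketch of the AS_RAW_MQ trimming performed above, run on a made-up raw value (the real values come from the annotation engine; the format shown is only an assumption based on the "|0.00" check):

```java
// Drops the trailing "|"-delimited bucket, then splits the remainder per allele,
// matching the substring/split logic in VetFieldEnum above.
public class RawMqTrimSketch {
    public static void main(String[] args) {
        String out = "29707.00|39366.00|0.00"; // hypothetical AS_RAW_MQ value
        out = out.substring(0, out.lastIndexOf("|")); // -> "29707.00|39366.00"
        String[] outValues = out.split("\\|");
        System.out.println(String.join(", ", outValues)); // 29707.00, 39366.00
    }
}
```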