From 8634ac3552957e7e066052fc8b8d0f0199d95af0 Mon Sep 17 00:00:00 2001
From: kcibul
Date: Tue, 10 Nov 2020 15:58:57 -0500
Subject: [PATCH] Rudimentary support for other output types, support for
 scattering extract cohort (#6949)

* cohort extract work for WGS

* rudimentary support for configurable output types

* support for nocalls

* extract wdl, missing classes, and compiler warning

* tidying
---
 build.gradle                                  |   2 +
 .../ngs_cohort_extract.inputs.json            |  13 ++
 .../variantstore_wdl/ngs_cohort_extract.wdl   | 115 ++++++++++++++++++
 .../tools/variantdb/CommonCode.java           |   6 +
 .../tools/variantdb/IngestUtils.java          |   2 +-
 .../nextgen/CreateVariantIngestFiles.java     |  23 +++-
 .../variantdb/nextgen/ExtractCohort.java      |  27 ++--
 .../nextgen/ExtractCohortEngine.java          |  27 +++-
 .../tools/variantdb/nextgen/PetOrcWriter.java |  59 +++++++++
 .../variantdb/nextgen/PetParquetWriter.java   |  53 ++++++++
 .../variantdb/nextgen/PetTsvCreator.java      |  84 ++++++++++---
 .../tools/variantdb/nextgen/VetFieldEnum.java |   3 +-
 12 files changed, 377 insertions(+), 37 deletions(-)
 create mode 100644 scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
 create mode 100644 scripts/variantstore_wdl/ngs_cohort_extract.wdl
 create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
 create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java

diff --git a/build.gradle b/build.gradle
index de0b77ac280..cb30dd7e177 100644
--- a/build.gradle
+++ b/build.gradle
@@ -332,6 +332,8 @@ dependencies {
     compile('org.apache.hadoop:hadoop-client:' + hadoopVersion) // should be a 'provided' dependency
     compile('com.github.jsr203hadoop:jsr203hadoop:1.0.3')
 
+    compile('org.apache.orc:orc:1.6.5')
+
     compile('de.javakaffee:kryo-serializers:0.41') {
         exclude module: 'kryo' // use Spark's version
     }
diff --git a/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json b/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
new file mode 100644
index 00000000000..18cbb04f681
--- /dev/null
+++ b/scripts/variantstore_wdl/ngs_cohort_extract.inputs.json
@@ -0,0 +1,13 @@
+{
+  "NgsCohortExtract.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+  "NgsCohortExtract.reference_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
+  "NgsCohortExtract.reference_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
+
+  "NgsCohortExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-140-g8aa14d3-SNAPSHOT-local.jar",
+
+  "NgsCohortExtract.fq_sample_table": "spec-ops-aou.kc_high_cov_ccdg.cohort_100_of_194",
+  "NgsCohortExtract.fq_cohort_extract_table": "spec-ops-aou.kc_high_cov_ccdg.exported_cohort_100_test",
+  "NgsCohortExtract.query_project": "spec-ops-aou",
+
+  "NgsCohortExtract.output_file_base_name": "ccdg_high_cov_export_100"
+}
diff --git a/scripts/variantstore_wdl/ngs_cohort_extract.wdl b/scripts/variantstore_wdl/ngs_cohort_extract.wdl
new file mode 100644
index 00000000000..9ce621fae7c
--- /dev/null
+++ b/scripts/variantstore_wdl/ngs_cohort_extract.wdl
@@ -0,0 +1,115 @@
+version 1.0
+
+workflow NgsCohortExtract {
+    input {
+        Int max_chrom_id = 24
+
+        # bug in cromwell, can't support large integers...
+        # https://github.com/broadinstitute/cromwell/issues/2685
+        String chrom_offset = "1000000000000"
+
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+        String fq_cohort_extract_table
+        String query_project
+
+        String output_file_base_name
+        File? gatk_override
+    }
+
+    scatter(i in range(max_chrom_id)) {
+        call ExtractTask {
+            input:
+                gatk_override           = gatk_override,
+                reference               = reference,
+                reference_index         = reference_index,
+                reference_dict          = reference_dict,
+                fq_sample_table         = fq_sample_table,
+                chrom_offset            = chrom_offset,
+                chrom_id                = i+1,
+                fq_cohort_extract_table = fq_cohort_extract_table,
+                read_project_id         = query_project,
+                output_file             = "${output_file_base_name}_${i}.vcf.gz"
+        }
+    }
+}
+
+################################################################################
+task ExtractTask {
+    # indicates that this task should NOT be call cached
+    meta {
+        volatile: true
+    }
+
+    input {
+        # ------------------------------------------------
+        # Input args:
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+
+        # bug in cromwell, can't support large integers...
+        # https://github.com/broadinstitute/cromwell/issues/2685
+        String chrom_offset
+        Int chrom_id
+
+        String fq_cohort_extract_table
+        String read_project_id
+        String output_file
+
+        # Runtime Options:
+        File? gatk_override
+
+        Int? local_sort_max_records_in_ram = 10000000
+    }
+
+
+    # ------------------------------------------------
+    # Run our command:
+    command <<<
+        set -e
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        df -h
+        min_location=$(echo "~{chrom_id} * ~{chrom_offset}" | bc)
+        max_location=$(echo "( ~{chrom_id} + 1 ) * ~{chrom_offset}" | bc)
+
+        gatk --java-options "-Xmx4g" \
+            ExtractCohort \
+                --mode GENOMES --ref-version 38 --query-mode LOCAL_SORT \
+                -R "~{reference}" \
+                -O "~{output_file}" \
+                --local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
+                --sample-table ~{fq_sample_table} \
+                --cohort-extract-table ~{fq_cohort_extract_table} \
+                --min-location ${min_location} --max-location ${max_location} \
+                --project-id ~{read_project_id}
+    >>>
+
+    # ------------------------------------------------
+    # Runtime settings:
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
+        memory: "7 GB"
+        disks: "local-disk 10 HDD"
+        bootDiskSizeGb: 15
+        preemptible: 3
+        cpu: 2
+    }
+
+    # ------------------------------------------------
+    # Outputs:
+    output {
+        File output_vcf = "~{output_file}"
+        File output_vcf_index = "~{output_file}.tbi"
+    }
+}
+
+
+
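The scatter above fans one ExtractTask out per chromosome, turning a 1-based chromosome id and the 10^12 chrom_offset into a --min-location/--max-location window. A minimal sketch of that arithmetic, assuming locations are encoded as chromosome_index * 10^12 + position (which is what the chrom_offset value implies); the class below is illustrative, not part of this patch:

```java
// Illustrative only: mirrors the bc arithmetic in ExtractTask's command block.
// Assumes the extract tables encode a position as chrom_index * 10^12 + position.
public class ShardMathSketch {
    private static final long CHROM_OFFSET = 1_000_000_000_000L; // matches chrom_offset above

    public static void main(String[] args) {
        int maxChromId = 24; // matches max_chrom_id above
        for (int chromId = 1; chromId <= maxChromId; chromId++) {
            long minLocation = chromId * CHROM_OFFSET;       // "~{chrom_id} * ~{chrom_offset}" | bc
            long maxLocation = (chromId + 1) * CHROM_OFFSET; // "( ~{chrom_id} + 1 ) * ~{chrom_offset}" | bc
            System.out.printf("shard %d: --min-location %d --max-location %d%n",
                    chromId, minLocation, maxLocation);
        }
    }
}
```

Each window becomes the BigQuery Storage API row restriction built in ExtractCohortEngine further down.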
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
index b8491f750da..552b16d260d 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/CommonCode.java
@@ -197,4 +197,10 @@ public enum ModeEnum {
         GENOMES,
         ARRAYS
     }
+
+    public enum OutputType {
+        TSV,
+        ORC,
+        PARQUET
+    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
index 7b4372824ba..cbf6b82bc44 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/IngestUtils.java
@@ -69,7 +69,7 @@ public static int getTableNumber(String sampleId, int sampleMod) { // this is ba
     public static int getTableNumber(long sampleId, int sampleMod) { // this is based on sample id
         // sample ids 1-4000 will go in directory 001
         // subtract 1 from the sample id to make it 1-index (or do we want to 0-index?) and add 1 to the dir
-        int directoryNumber = new Long(Math.floorDiv((sampleId - 1), sampleMod) + 1).intValue(); // TODO omg write some unit tests
+        int directoryNumber = (int) (Math.floorDiv((sampleId - 1), sampleMod) + 1); // TODO omg write some unit tests
         return directoryNumber;
     }
 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
index affb9035388..e48c37c826e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/CreateVariantIngestFiles.java
@@ -14,6 +14,7 @@ import org.broadinstitute.hellbender.engine.ReadsContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
 import org.broadinstitute.hellbender.engine.VariantWalker;
+import org.broadinstitute.hellbender.exceptions.GATKException;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.variantdb.*;
 import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;
@@ -22,6 +23,7 @@
 import java.util.*;
 import java.io.File;
+import java.io.IOException;
 
 /**
  * Ingest variant walker
@@ -78,6 +80,12 @@ public final class CreateVariantIngestFiles extends VariantWalker {
             optional = true)
     public CommonCode.ModeEnum mode = CommonCode.ModeEnum.EXOMES;
 
+    @Argument(fullName = "output-type",
+            shortName = "ot",
+            doc = "[Experimental] Output file format: TSV, ORC or PARQUET [default=TSV].",
+            optional = true)
+    public CommonCode.OutputType outputType = CommonCode.OutputType.TSV;
+
     @Argument(
             fullName = "ref-version",
             doc = "Remove this option!!!! only for ease of testing. Valid options are 37 or 38",
@@ -137,7 +145,7 @@ public void onTraversalStart() {
         final GenomeLocSortedSet genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
         intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));
 
-        petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir);
+        petTsvCreator = new PetTsvCreator(sampleName, sampleId, tableNumberPrefix, seqDictionary, gqStateToIgnore, outputDir, outputType);
         switch (mode) {
             case EXOMES:
             case GENOMES:
@@ -177,12 +185,21 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,
         if (!variant.isReferenceBlock()) {
             vetTsvCreator.apply(variant, readsContext, referenceContext, featureContext);
         }
-        petTsvCreator.apply(variant, intervalsToWrite);
+        try {
+            petTsvCreator.apply(variant, intervalsToWrite);
+        } catch (IOException ioe) {
+            throw new GATKException("Error writing PET", ioe);
+        }
+
     }
 
     @Override
     public Object onTraversalSuccess() {
-        petTsvCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
+        try {
+            petTsvCreator.writeMissingIntervals(intervalArgumentGenomeLocSortedSet);
+        } catch (IOException ioe) {
+            throw new GATKException("Error writing missing intervals", ioe);
+        }
         return 0;
     }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
index 7be9a0d1ed3..d270d9e56fe 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohort.java
@@ -1,26 +1,14 @@
 package org.broadinstitute.hellbender.tools.variantdb.nextgen;
 
-import htsjdk.variant.variantcontext.writer.VariantContextWriter;
 import htsjdk.variant.vcf.VCFHeader;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
 import org.broadinstitute.barclay.help.DocumentedFeature;
-import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
-import org.broadinstitute.hellbender.engine.GATKTool;
-import org.broadinstitute.hellbender.tools.variantdb.ChromosomeEnum;
 import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
 import org.broadinstitute.hellbender.tools.variantdb.SampleList;
-import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
-import org.broadinstitute.hellbender.tools.variantdb.arrays.ExtractCohortBQ;
-import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
-import org.broadinstitute.hellbender.tools.walkers.annotator.StandardAnnotation;
-import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
-import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_StandardAnnotation;
-import org.broadinstitute.hellbender.utils.bigquery.TableReference;
-import org.broadinstitute.hellbender.utils.io.IOUtils;
 
 import java.util.*;
 
@@ -49,6 +37,19 @@ public class ExtractCohort extends ExtractTool {
     )
     private String cohortTable = null;
 
+    @Argument(
+        fullName = "min-location",
+        doc = "When extracting data, only include locations >= this value",
"When extracting data, only include locations >= this value", + optional = true + ) + private Long minLocation = null; + + @Argument( + fullName = "max-location", + doc = "When extracting data, only include locations <= this value", + optional = true + ) + private Long maxLocation = null; @Override protected void onStartup() { @@ -68,6 +69,8 @@ protected void onStartup() { sampleNames, mode, cohortTable, + minLocation, + maxLocation, filteringFQTableName, localSortMaxRecordsInRam, printDebugInformation, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java index 7dc2cef3af1..e3b397cf9b5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/ExtractCohortEngine.java @@ -42,6 +42,8 @@ public class ExtractCohortEngine { private final boolean printDebugInformation; private final int localSortMaxRecordsInRam; private final TableReference cohortTableRef; + private final Long minLocation; + private final Long maxLocation; private final TableReference filteringTableRef; private final ReferenceDataSource refSource; private double vqsLodSNPThreshold = 0; @@ -75,6 +77,8 @@ public ExtractCohortEngine(final String projectID, final Set sampleNames, final CommonCode.ModeEnum mode, final String cohortTableName, + final Long minLocation, + final Long maxLocation, final String filteringTableName, final int localSortMaxRecordsInRam, final boolean printDebugInformation, @@ -89,8 +93,12 @@ public ExtractCohortEngine(final String projectID, this.refSource = refSource; this.sampleNames = sampleNames; this.mode = mode; + this.cohortTableRef = new TableReference(cohortTableName, SchemaUtils.COHORT_FIELDS); - this.filteringTableRef = new TableReference(filteringTableName, SchemaUtils.YNG_FIELDS); + this.minLocation = minLocation; + this.maxLocation = maxLocation; + this.filteringTableRef = filteringTableName == null || "".equals(filteringTableName) ? 
+
         this.printDebugInformation = printDebugInformation;
         this.vqsLodSNPThreshold = vqsLodSNPThreshold;
         this.vqsLodINDELThreshold = vqsLodINDELThreshold;
@@ -110,7 +118,13 @@ public void traverse() {
                 if (printDebugInformation) {
                     logger.debug("using storage api with local sort");
                 }
-                final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef);
+
+                String rowRestriction = null;
+                if (minLocation != null && maxLocation != null) {
+                    rowRestriction = "location >= " + minLocation + " AND location <= " + maxLocation;
+                }
+
+                final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, projectID);
                 createVariantsFromUngroupedTableResult(storageAPIAvroReader);
                 break;
             case QUERY:
@@ -447,12 +461,17 @@ private VariantContext createVariantContextFromSampleRecord(final GenericRecord
                 final String genotypeAttributeName = columnName.substring(SchemaUtils.GENOTYPE_FIELD_PREFIX.length());
 
                 if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_KEY) ) {
-                    final List<Allele> genotypeAlleles =
+                    if ("./.".equals(columnValueString)) {
+                        genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
+
+                    } else {
+                        final List<Allele> genotypeAlleles =
                             Arrays.stream(columnValueString.split("[/|]"))
                                     .map(Integer::parseInt)
                                     .map(alleleIndex -> alleles.get(alleleIndex))
                                     .collect(Collectors.toList());
-                    genotypeBuilder.alleles(genotypeAlleles);
+                        genotypeBuilder.alleles(genotypeAlleles);
+                    }
                 } else if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_QUALITY_KEY) ) {
                     genotypeBuilder.GQ(Integer.parseInt(columnValueString));
                 } else if ( genotypeAttributeName.equals(VCFConstants.GENOTYPE_PL_KEY) ) {
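The no-call branch above short-circuits "./." before the numeric split, since Integer.parseInt would otherwise throw on ".". A self-contained sketch of the same parsing, with plain strings standing in for htsjdk Allele objects (illustrative, not part of the patch):

```java
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

// Mirrors the genotype decoding above: "./." maps to two no-calls; anything
// else is split on '/' or '|' and the indices look up entries in the allele list.
public class GenotypeParseSketch {
    public static void main(String[] args) {
        List<String> alleles = Arrays.asList("A", "G"); // index 0 = ref, 1 = alt
        for (String gt : new String[]{"0/1", "1|1", "./."}) {
            if ("./.".equals(gt)) {
                System.out.println(gt + " -> [NO_CALL, NO_CALL]");
            } else {
                List<String> called = Arrays.stream(gt.split("[/|]"))
                        .map(Integer::parseInt)
                        .map(alleles::get)
                        .collect(Collectors.toList());
                System.out.println(gt + " -> " + called);
            }
        }
    }
}
```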
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
new file mode 100644
index 00000000000..c250444ead0
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetOrcWriter.java
@@ -0,0 +1,59 @@
+package org.broadinstitute.hellbender.tools.variantdb.nextgen;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.orc.*;
+import org.apache.orc.storage.ql.exec.vector.BytesColumnVector;
+import org.apache.orc.storage.ql.exec.vector.LongColumnVector;
+import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;
+
+public class PetOrcWriter implements Closeable {
+    private Writer writer;
+    private VectorizedRowBatch batch;
+    private LongColumnVector locationCV;
+    private LongColumnVector sampleCV;
+    private BytesColumnVector stateCV;
+
+    static private TypeDescription schema = TypeDescription.fromString("struct<location:bigint,sample_id:bigint,state:string>");
+
+    public PetOrcWriter(String outputFile) throws IOException {
+        Configuration conf = new Configuration();
+
+        this.writer = OrcFile.createWriter(new Path(outputFile),
+                          OrcFile.writerOptions(conf)
+                                 .setSchema(schema)
+                                 .overwrite(true)
+                                 .compress(CompressionKind.SNAPPY)
+                                 .stripeSize( 16 * 1024 * 1024 )
+                                 .blockSize( 1 * 1024 * 1024 )
+                          );
+        this.batch = schema.createRowBatch();
+        this.locationCV = (LongColumnVector) batch.cols[0];
+        this.sampleCV = (LongColumnVector) batch.cols[1];
+        this.stateCV = (BytesColumnVector) batch.cols[2];
+    }
+
+    public void addRow(long location, long sampleId, String state) throws IOException {
+        int row = batch.size++;
+        locationCV.vector[row] = location;
+        sampleCV.vector[row] = sampleId;
+        stateCV.setVal(row, state.getBytes());
+
+        // If the batch is full, write it out and reset the counter
+        if (batch.size == batch.getMaxSize()) {
+            writer.addRowBatch(batch);
+            batch.reset();
+        }
+    }
+
+    public void close() throws IOException {
+        if (batch.size != 0) {
+            writer.addRowBatch(batch);
+            batch.reset();
+        }
+        writer.close();
+    }
+}
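A hypothetical caller of the new ORC writer (the file name, sample id, location, and state values below are made up): rows buffer in the VectorizedRowBatch and are flushed whenever the batch fills, and close() flushes the final partial batch before writing the ORC footer.

```java
public class PetOrcWriterDemo {
    public static void main(String[] args) throws Exception {
        // Placeholder values; "v" follows the short PET state codes used in variantdb.
        try (PetOrcWriter pet = new PetOrcWriter("pet_001_sample.orc")) {
            pet.addRow(1_000_000_010_001L, 42L, "v"); // location, sample_id, state
            pet.addRow(1_000_000_010_002L, 42L, "v");
        } // close() flushes any remaining rows
    }
}
```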
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java
new file mode 100644
index 00000000000..af6adc11f01
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetParquetWriter.java
@@ -0,0 +1,53 @@
+package org.broadinstitute.hellbender.tools.variantdb.nextgen;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.avro.Schema;
+import org.apache.avro.SchemaBuilder;
+import org.apache.avro.generic.GenericData;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.parquet.avro.AvroParquetWriter;
+import org.apache.parquet.column.ParquetProperties.WriterVersion;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.ParquetFileWriter.Mode;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+
+public class PetParquetWriter implements Closeable {
+    private ParquetWriter<GenericData.Record> writer;
+    private Schema schema;
+
+    public PetParquetWriter(String outputFile) throws IOException {
+        // location and sample_id must be longs: encoded locations exceed the int range
+        this.schema = SchemaBuilder.record("pet")
+                .namespace("org.broadinstitute.dsp")
+                .fields().requiredLong("location").requiredLong("sample_id").requiredString("state")
+                .endRecord();
+
+        writer = AvroParquetWriter.<GenericData.Record>
+                builder(new Path(outputFile))
+                .withRowGroupSize(ParquetWriter.DEFAULT_BLOCK_SIZE)
+                .withPageSize(ParquetWriter.DEFAULT_PAGE_SIZE)
+                .withSchema(schema)
+                .withWriteMode(Mode.OVERWRITE)
+                .withConf(new Configuration())
+                .withCompressionCodec(CompressionCodecName.SNAPPY)
+                .withValidation(false)
+                .withDictionaryEncoding(true)
+                .withWriterVersion(WriterVersion.PARQUET_1_0)
+                .build();
+
+    }
+
+    public void addRow(long location, long sampleId, String state) throws IOException {
+        GenericData.Record record = new GenericData.Record(schema);
+        record.put("location", location);
+        record.put("sample_id", sampleId);
+        record.put("state", state);
+        writer.write(record);
+    }
+
+    public void close() throws IOException {
+        writer.close();
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
index 960b5ab01bc..f06bfa15b2e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/PetTsvCreator.java
@@ -6,6 +6,7 @@ import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.variantdb.SchemaUtils;
+import org.broadinstitute.hellbender.tools.variantdb.CommonCode;
 import org.broadinstitute.hellbender.tools.variantdb.IngestConstants;
 import org.broadinstitute.hellbender.utils.*;
 import org.broadinstitute.hellbender.utils.tsv.SimpleXSVWriter;
@@ -20,7 +21,12 @@ public final class PetTsvCreator {
     private static final Logger logger = LogManager.getLogger(PetTsvCreator.class);
 
-    private SimpleXSVWriter petWriter = null;
+    private CommonCode.OutputType outputType;
+
+    private SimpleXSVWriter petTsvWriter = null;
+    private PetOrcWriter petOrcWriter = null;
+    private PetParquetWriter petParquetWriter = null;
+
     private final String sampleId;
     private SimpleInterval previousInterval;
     private final SAMSequenceDictionary seqDictionary;
@@ -28,17 +34,28 @@ public final class PetTsvCreator {
     private GenomeLocSortedSet coverageLocSortedSet;
     private static String PET_FILETYPE_PREFIX = "pet_";
 
-    public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory) {
+    public PetTsvCreator(String sampleName, String sampleId, String tableNumberPrefix, SAMSequenceDictionary seqDictionary, GQStateEnum gqStateToIgnore, final File outputDirectory, final CommonCode.OutputType outputType) {
         this.sampleId = sampleId;
         this.seqDictionary = seqDictionary;
         this.gqStateToIgnore = gqStateToIgnore;
+        this.outputType = outputType;
+
         coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
 
         try {
-            final File petOutputFile = new File(outputDirectory, PET_FILETYPE_PREFIX + tableNumberPrefix + sampleName + IngestConstants.FILETYPE);
-            List<String> petHeader = PetTsvCreator.getHeaders();
-            petWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
-            petWriter.setHeaderLine(petHeader);
+            final File petOutputFile = new File(outputDirectory, PET_FILETYPE_PREFIX + tableNumberPrefix + sampleName + "." + outputType.toString().toLowerCase());
+            switch (outputType) {
+                case TSV:
+                    List<String> petHeader = PetTsvCreator.getHeaders();
+                    petTsvWriter = new SimpleXSVWriter(petOutputFile.toPath(), IngestConstants.SEPARATOR);
+                    petTsvWriter.setHeaderLine(petHeader);
+                    break;
+                case ORC:
+                    petOrcWriter = new PetOrcWriter(petOutputFile.getCanonicalPath());
+                    break;
+                case PARQUET:
+                    petParquetWriter = new PetParquetWriter(petOutputFile.getCanonicalPath());
+            }
         } catch (final IOException e) {
             throw new UserException("Could not create pet outputs", e);
         }
@@ -79,7 +96,7 @@ public String getValue() {
 
     }
 
-    public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) {
+    public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) throws IOException {
         boolean firstInterval = true;
         final String variantChr = variant.getContig();
 
@@ -122,7 +139,21 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) {
                 // write the position to the XSV
                 for (List<String> TSVLineToCreatePet : TSVLinesToCreatePet) {
-                    petWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                    long location = Long.parseLong(TSVLineToCreatePet.get(0));
+                    long sampleId = Long.parseLong(TSVLineToCreatePet.get(1));
+                    String state = TSVLineToCreatePet.get(2);
+
+                    switch (outputType) {
+                        case TSV:
+                            petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                            break;
+                        case ORC:
+                            petOrcWriter.addRow(location, sampleId, state);
+                            break;
+                        case PARQUET:
+                            petParquetWriter.addRow(location, sampleId, state);
+                            break;
+                    }
                 }
             }
             firstInterval = false;
         }
     }
 
-    public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) {
+    public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) throws IOException {
         GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
         logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
         logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
@@ -142,7 +173,21 @@ public void writeMissingIntervals(GenomeLocSortedSo
                     SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()),
                     sampleId
             )) {
-                petWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                long location = Long.parseLong(TSVLineToCreatePet.get(0));
+                long sampleId = Long.parseLong(TSVLineToCreatePet.get(1));
+                String state = TSVLineToCreatePet.get(2);
+
+                switch (outputType) {
+                    case TSV:
+                        petTsvWriter.getNewLineBuilder().setRow(TSVLineToCreatePet).write();
+                        break;
+                    case ORC:
+                        petOrcWriter.addRow(location, sampleId, state);
+                        break;
+                    case PARQUET:
+                        petParquetWriter.addRow(location, sampleId, state);
+                        break;
+                }
             }
         }
     }
@@ -258,13 +303,20 @@ public static List<String> getHeaders() {
     }
 
     public void closeTool() {
-        if (petWriter != null) {
-            try {
-                petWriter.close();
-            } catch (final Exception e) {
-                throw new IllegalArgumentException("Couldn't close VET writer", e);
+        try {
+            switch (outputType) {
+                case TSV:
+                    if (petTsvWriter != null) petTsvWriter.close();
+                    break;
+                case ORC:
+                    if (petOrcWriter != null) petOrcWriter.close();
+                    break;
+                case PARQUET:
+                    if (petParquetWriter != null) petParquetWriter.close();
+                    break;
             }
+        } catch (final Exception e) {
+            throw new IllegalArgumentException("Couldn't close PET writer", e);
         }
-    }
 }
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
index 8fe03e6075e..f8d27cd5f98 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/nextgen/VetFieldEnum.java
@@ -80,7 +80,8 @@ public String getColumnValue(final VariantContext variant) {
         }
         if (out != null) {
             if(!out.endsWith("|0.00")) {
-                logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString());
+//                KC: we are seeing a TON of these!
+//                logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString());
             }
             out = out.substring(0, out.lastIndexOf("|"));
             String[] outValues = out.split("\\|");
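For reference, a sketch of the AS_RAW_MQ trimming performed above, run on a made-up raw value (the real values come from the annotation engine; the format shown is only an assumption based on the "|0.00" check):

```java
// Drops the trailing "|"-delimited bucket, then splits the remainder per allele,
// matching the substring/split logic in VetFieldEnum above.
public class RawMqTrimSketch {
    public static void main(String[] args) {
        String out = "29707.00|39366.00|0.00"; // hypothetical AS_RAW_MQ value
        out = out.substring(0, out.lastIndexOf("|")); // -> "29707.00|39366.00"
        String[] outValues = out.split("\\|");
        System.out.println(String.join(", ", outValues)); // 29707.00, 39366.00
    }
}
```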