From 8853098507a98907a03e3d8ecd9fa319e4e6fd7f Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Wed, 16 Sep 2020 12:07:41 -0400 Subject: [PATCH 1/2] remove compressed data, change GT to single char --- .../variantdb/arrays/ArrayExtractCohort.java | 22 +- .../arrays/ArrayExtractCohortEngine.java | 295 ++++++++++++------ .../variantdb/arrays/BasicArrayData.java | 2 + .../variantdb/arrays/ExtractCohortBQ.java | 18 +- .../tools/variantdb/arrays/GT_encoding.java | 38 +++ .../variantdb/arrays/RawArrayFieldEnum.java | 13 +- .../variantdb/arrays/RawArrayTsvCreator.java | 17 - 7 files changed, 275 insertions(+), 130 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/GT_encoding.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java index b98da97f1a5..02cb94655aa 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java @@ -57,8 +57,8 @@ public enum QueryMode { private String projectID = null; @Argument( - fullName = "sample-info-table", - doc = "Fully qualified name of a bigquery table containing a single column `sample` that describes the full list of samples to evoque", + fullName = "cohort-sample-table", + doc = "Fully qualified name of a bigquery table containing the sample_id and sample_name for the samples in the cohort you are extracting ", optional = true ) private String sampleTableName = null; @@ -98,6 +98,12 @@ public enum QueryMode { ) private String cohortTable = null; + @Argument( + fullName = "gt-only", + doc = "If true, only get the genotype info", + optional = true) + private boolean gtDataOnly = false; + @Argument( fullName = "use-compressed-data", doc = "If true, use bit-packed fields for data", @@ -117,6 +123,14 @@ public enum QueryMode { ) private int localSortMaxRecordsInRam = DEFAULT_LOCAL_SORT_MAX_RECORDS_IN_RAM; + // TODO remove before production + @Argument( + fullName = "use-legacy-gt-encoding", + doc = "If the GT encodoing was AA, AB, BB", + optional = true + ) + private Boolean useLegacyGTEncoding = false; + @Override public boolean requiresReference() { return true; @@ -170,12 +184,14 @@ protected void onStartup() { sampleIdMap, probeIdMap, cohortTable, + gtDataOnly, minProbeId, maxProbeId, localSortMaxRecordsInRam, useCompressedData, printDebugInformation, - progressMeter); + progressMeter, + useLegacyGTEncoding); vcfWriter.writeHeader(header); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java index 86bbf6de10d..1dd44d767d7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java @@ -1,9 +1,6 @@ package org.broadinstitute.hellbender.tools.variantdb.arrays; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.GenotypeBuilder; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; import org.apache.avro.generic.GenericRecord; @@ -33,10 +30,11 @@ public class ArrayExtractCohortEngine { private final VariantContextWriter vcfWriter; + private boolean gtDataOnly; private final Integer minProbeId; private final Integer maxProbeId; - private final boolean useCompressedData; +// private final boolean useCompressedData; private final boolean printDebugInformation; private final int localSortMaxRecordsInRam; private final TableReference cohortTableRef; @@ -55,6 +53,8 @@ public class ArrayExtractCohortEngine { private int totalNumberOfVariants = 0; private int totalNumberOfSites = 0; + private final boolean useLegacyGTEncoding; //TODO remove + public ArrayExtractCohortEngine(final String projectID, final VariantContextWriter vcfWriter, final VCFHeader vcfHeader, @@ -63,12 +63,14 @@ public ArrayExtractCohortEngine(final String projectID, final Map sampleIdMap, final Map probeIdMap, final String cohortTableName, + final boolean gtDataOnly, final Integer minProbeId, final Integer maxProbeId, final int localSortMaxRecordsInRam, final boolean useCompressedData, final boolean printDebugInformation, - final ProgressMeter progressMeter) { + final ProgressMeter progressMeter, + final boolean useLegacyGTEncoding) { this.df.setMaximumFractionDigits(3); this.df.setGroupingSize(0); @@ -80,19 +82,21 @@ public ArrayExtractCohortEngine(final String projectID, this.refSource = refSource; this.sampleIdMap = sampleIdMap; this.sampleNames = new HashSet<>(sampleIdMap.values()); + this.gtDataOnly = gtDataOnly; this.probeIdMap = probeIdMap; this.cohortTableRef = new TableReference(cohortTableName, useCompressedData? SchemaUtils.RAW_ARRAY_COHORT_FIELDS_COMPRESSED:SchemaUtils.RAW_ARRAY_COHORT_FIELDS_UNCOMPRESSED); this.minProbeId = minProbeId; this.maxProbeId = maxProbeId; - this.useCompressedData = useCompressedData; +// this.useCompressedData = useCompressedData; this.printDebugInformation = printDebugInformation; this.progressMeter = progressMeter; // TODO: what is the right variant context merger for arrays? this.variantContextMerger = new ReferenceConfidenceVariantContextMerger(annotationEngine, vcfHeader); + this.useLegacyGTEncoding = useLegacyGTEncoding; } int getTotalNumberOfVariants() { return totalNumberOfVariants; } @@ -120,7 +124,7 @@ private void createVariantsFromUngroupedTableResult(final GATKAvroReader avroRea final Set columnNames = new HashSet<>(); schema.getFields().forEach(field -> columnNames.add(field.name())); - Comparator comparator = this.useCompressedData ? COMPRESSED_PROBE_ID_COMPARATOR : UNCOMPRESSED_PROBE_ID_COMPARATOR; + Comparator comparator = UNCOMPRESSED_PROBE_ID_COMPARATOR; SortingCollection sortingCollection = getAvroProbeIdSortingCollection(schema, localSortMaxRecordsInRam, comparator); for ( final GenericRecord queryRow : avroReader ) { @@ -135,13 +139,13 @@ private void createVariantsFromUngroupedTableResult(final GATKAvroReader avroRea for ( final GenericRecord sortedRow : sortingCollection ) { long probeId; - if (useCompressedData) { - final long bits = (Long) sortedRow.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME); - BasicArrayData data = new BasicArrayData(bits); - probeId = data.probeId; - } else { +// if (useCompressedData) { +// final long bits = (Long) sortedRow.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME); +// BasicArrayData data = new BasicArrayData(bits); +// probeId = data.probeId; +// } else { probeId = (Long) sortedRow.get("probe_id"); - } +// } if ( probeId != currentProbeId && currentProbeId != -1 ) { ++totalNumberOfSites; @@ -177,16 +181,16 @@ private void processSampleRecordsForLocation(final long probeId, final Iterable< for ( final GenericRecord sampleRecord : sampleRecordsAtPosition ) { final long sampleId; - if (useCompressedData) { - final long bits = (Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME); - BasicArrayData data = new BasicArrayData(bits); - sampleId = data.sampleId; - } else { +// if (useCompressedData) { +// final long bits = (Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME); +// BasicArrayData data = new BasicArrayData(bits); +// sampleId = data.sampleId; +// } else { sampleId = (Long) sampleRecord.get(SchemaUtils.SAMPLE_ID_FIELD_NAME); // TODO: hack to test roundtrip - } +// } // TODO: handle missing values String sampleName = sampleIdMap.get((int) sampleId); @@ -199,8 +203,12 @@ private void processSampleRecordsForLocation(final long probeId, final Iterable< } ++totalNumberOfVariants; - unmergedCalls.add(createVariantContextFromSampleRecord(probeInfo, sampleRecord, columnNames, contig, position, sampleName)); + if (useLegacyGTEncoding) { + unmergedCalls.add(createVariantContextFromSampleRecordLegacyGT(probeInfo, sampleRecord, columnNames, contig, position, sampleName)); + } else { + unmergedCalls.add(createVariantContextFromSampleRecord(probeInfo, sampleRecord, columnNames, contig, position, sampleName)); + } } if ( printDebugInformation ) { @@ -266,9 +274,108 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob builder.chr(contig); builder.start(startPosition); builder.id(probeInfo.name); - + + final List alleles = createAllelesFromProbeInfo(probeInfo); + + builder.alleles(alleles); + builder.stop(startPosition + alleles.get(0).length() - 1); + + List genotypeAlleles = new ArrayList(); + + Object gtObj = sampleRecord.get(RawArrayFieldEnum.GT_encoded.name()); + GT_encoding gt; + if (gtObj == null) { + gt = RawArrayTsvCreator.value_to_drop; + } else { + gt = GT_encoding.getGTEncodingFromValue(gtObj.toString()); + } + + switch (gt) { + case HOM_REF: + genotypeAlleles.add(alleles.get(0)); + genotypeAlleles.add(alleles.get(0)); + break; + case HET0_1: + genotypeAlleles.add(alleles.get(0)); + genotypeAlleles.add(alleles.get(1)); + break; + case HOM_VAR: + genotypeAlleles.add(alleles.get(1)); + genotypeAlleles.add(alleles.get(1)); + break; + case HET1_2: + genotypeAlleles.add(alleles.get(1)); + genotypeAlleles.add(alleles.get(2)); + break; + case HOM_ALT2: + genotypeAlleles.add(alleles.get(2)); + genotypeAlleles.add(alleles.get(2)); + break; + case MISSING: + default: + genotypeAlleles.add(Allele.NO_CALL); + genotypeAlleles.add(Allele.NO_CALL); + break; + } + + genotypeBuilder.alleles(genotypeAlleles); + + if (!gtDataOnly) { + genotypeBuilder.attribute(RawArrayTsvCreator.NORMX, formatFloatForVcf(getNullableFloatFromDouble(sampleRecord.get(RawArrayFieldEnum.NORMX.name())))); + genotypeBuilder.attribute(RawArrayTsvCreator.NORMY, formatFloatForVcf(getNullableFloatFromDouble(sampleRecord.get(RawArrayFieldEnum.NORMY.name())))); + genotypeBuilder.attribute(RawArrayTsvCreator.BAF, formatFloatForVcf(getNullableFloatFromDouble(sampleRecord.get(RawArrayFieldEnum.BAF.name())))); + genotypeBuilder.attribute(RawArrayTsvCreator.LRR, formatFloatForVcf(getNullableFloatFromDouble(sampleRecord.get(RawArrayFieldEnum.LRR.name())))); + } + + genotypeBuilder.name(sample); + + builder.genotypes(genotypeBuilder.make()); + + try { + VariantContext vc = builder.make(); + return vc; + } catch (Exception e) { + System.out.println("Error: "+ e.getMessage() + " processing " + sampleRecord + " PI: " + probeInfo.alleleA + "/" +probeInfo.alleleB + " with ga " + genotypeAlleles + " and alleles " + alleles); + throw e; + } + + } + + List createAllelesFromProbeInfo(final ProbeInfo probeInfo) { final List alleles = new ArrayList<>(); - Allele ref = Allele.create(probeInfo.ref, true); + Allele ref = Allele.create(probeInfo.ref, true); + alleles.add(ref); + + Allele alleleA = Allele.create(probeInfo.alleleA, false); + Allele alleleB = Allele.create(probeInfo.alleleB, false); + + boolean alleleAisRef = probeInfo.ref.equals(probeInfo.alleleA); + boolean alleleBisRef = probeInfo.ref.equals(probeInfo.alleleB); + + if (alleleAisRef) { + alleleA = ref; + } else { + alleles.add(alleleA); + } + + if (alleleBisRef) { + alleleB = ref; + } else { + alleles.add(alleleB); + } + return alleles; + } + + private VariantContext createVariantContextFromSampleRecordLegacyGT(final ProbeInfo probeInfo, final GenericRecord sampleRecord, final Set columnNames, final String contig, final long startPosition, final String sample) { + final VariantContextBuilder builder = new VariantContextBuilder(); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(); + + builder.chr(contig); + builder.start(startPosition); + builder.id(probeInfo.name); + + List alleles = createAllelesFromProbeInfo(probeInfo); + Allele ref = Allele.create(probeInfo.ref, true); alleles.add(ref); Allele alleleA = Allele.create(probeInfo.alleleA, false); @@ -298,80 +405,80 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob Float lrr; List genotypeAlleles = new ArrayList(); - if (this.useCompressedData) { - final BasicArrayData basicData = new BasicArrayData((Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)); - Object rd = sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME); - - final RawArrayData rawData = new RawArrayData((Long) rd); - normx = rawData.normx; - normy = rawData.normy; - lrr = rawData.lrr; - baf = rawData.baf; - - if (basicData.genotype == ArrayGenotype.AA) { - genotypeAlleles.add(alleleA); - genotypeAlleles.add(alleleA); - } else if (basicData.genotype == ArrayGenotype.AB) { - genotypeAlleles.add(alleleA); - genotypeAlleles.add(alleleB); - } else if (basicData.genotype == ArrayGenotype.BB) { - genotypeAlleles.add(alleleB); - genotypeAlleles.add(alleleB); - } else { - genotypeAlleles.add(Allele.NO_CALL); - genotypeAlleles.add(Allele.NO_CALL); - } - } else { - Object gt = sampleRecord.get("GT_encoded"); - ArrayGenotype agt; - if ("AA".equals(gt.toString())) { - genotypeAlleles.add(alleleA); - genotypeAlleles.add(alleleA); - agt = ArrayGenotype.AA; - } else if ("AB".equals(gt.toString())) { - genotypeAlleles.add(alleleA); - genotypeAlleles.add(alleleB); - agt = ArrayGenotype.AB; - } else if ("BB".equals(gt.toString())) { - genotypeAlleles.add(alleleB); - genotypeAlleles.add(alleleB); - agt = ArrayGenotype.BB; - } else if (".".equals(gt.toString())) { - genotypeAlleles.add(Allele.NO_CALL); - genotypeAlleles.add(Allele.NO_CALL); - agt = ArrayGenotype.NO_CALL; - } else { - System.out.println("Processing getnotype " + gt.toString()); - throw new RuntimeException(); - } - - // TODO: constantize - try { - normx = getNullableFloatFromDouble(sampleRecord.get("NORMX")); - normy = getNullableFloatFromDouble(sampleRecord.get("NORMY")); - baf = getNullableFloatFromDouble(sampleRecord.get("BAF")); - lrr = getNullableFloatFromDouble(sampleRecord.get("LRR")); - - // Hack to pack and unpack data - BasicArrayData b = new BasicArrayData(0, (int) probeInfo.probeId, agt); - RawArrayData d = new RawArrayData(normx, normy, lrr, baf); - - long bits = d.encode(); - RawArrayData d2 = new RawArrayData(bits); - normx = d2.normx; - normy = d2.normy; - baf = d2.baf; - lrr = d2.lrr; - - } catch (NullPointerException npe) { - System.out.println("NPE on " + sampleRecord); - System.out.println("NPE on BAF " + sampleRecord.get("BAF")); - System.out.println("NPE on LRR " +sampleRecord.get("LRR")); - throw npe; - } +// if (this.useCompressedData) { +// final BasicArrayData basicData = new BasicArrayData((Long) sampleRecord.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)); +// Object rd = sampleRecord.get(SchemaUtils.RAW_ARRAY_DATA_FIELD_NAME); +// +// final RawArrayData rawData = new RawArrayData((Long) rd); +// normx = rawData.normx; +// normy = rawData.normy; +// lrr = rawData.lrr; +// baf = rawData.baf; +// +// if (basicData.genotype == ArrayGenotype.AA) { +// genotypeAlleles.add(alleleA); +// genotypeAlleles.add(alleleA); +// } else if (basicData.genotype == ArrayGenotype.AB) { +// genotypeAlleles.add(alleleA); +// genotypeAlleles.add(alleleB); +// } else if (basicData.genotype == ArrayGenotype.BB) { +// genotypeAlleles.add(alleleB); +// genotypeAlleles.add(alleleB); +// } else { +// genotypeAlleles.add(Allele.NO_CALL); +// genotypeAlleles.add(Allele.NO_CALL); +// } +// } else { + Object gt = sampleRecord.get("GT_encoded"); + ArrayGenotype agt; + // for compatibility with old GT encoding + if ("AA".equals(gt.toString())) { + genotypeAlleles.add(alleleA); + genotypeAlleles.add(alleleA); + agt = ArrayGenotype.AA; + } else if ("AB".equals(gt.toString())) { + genotypeAlleles.add(alleleA); + genotypeAlleles.add(alleleB); + agt = ArrayGenotype.AB; + } else if ("BB".equals(gt.toString())) { + genotypeAlleles.add(alleleB); + genotypeAlleles.add(alleleB); + agt = ArrayGenotype.BB; + } else if (".".equals(gt.toString())) { + genotypeAlleles.add(Allele.NO_CALL); + genotypeAlleles.add(Allele.NO_CALL); + agt = ArrayGenotype.NO_CALL; + } else { + System.out.println("Processing getnotype " + gt.toString()); + throw new RuntimeException(); + } + // TODO: constantize + try { + normx = getNullableFloatFromDouble(sampleRecord.get("NORMX")); + normy = getNullableFloatFromDouble(sampleRecord.get("NORMY")); + baf = getNullableFloatFromDouble(sampleRecord.get("BAF")); + lrr = getNullableFloatFromDouble(sampleRecord.get("LRR")); + + // Hack to pack and unpack data + BasicArrayData b = new BasicArrayData(0, (int) probeInfo.probeId, agt); + RawArrayData d = new RawArrayData(normx, normy, lrr, baf); + + long bits = d.encode(); + RawArrayData d2 = new RawArrayData(bits); + normx = d2.normx; + normy = d2.normy; + baf = d2.baf; + lrr = d2.lrr; + + } catch (NullPointerException npe) { + System.out.println("NPE on " + sampleRecord); + System.out.println("NPE on BAF " + sampleRecord.get("BAF")); + System.out.println("NPE on LRR " +sampleRecord.get("LRR")); + throw npe; } + genotypeBuilder.alleles(genotypeAlleles); genotypeBuilder.attribute(RawArrayTsvCreator.NORMX, formatFloatForVcf(normx)); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/BasicArrayData.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/BasicArrayData.java index a0f55cab8f9..d21b014c1a8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/BasicArrayData.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/BasicArrayData.java @@ -5,6 +5,8 @@ import org.broadinstitute.hellbender.exceptions.GATKException; public class BasicArrayData { + // TODO remove this before prod + // replace with public static enum ArrayGenotype { // Order is critical here, the ordinal is the int encoding AA,AB, BB, NO_CALL diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java index bc932599858..18497ff16dd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ExtractCohortBQ.java @@ -53,15 +53,15 @@ public static SortingCollection getAvroProbeIdSortingCollection(o return SortingCollection.newInstance(GenericRecord.class, sortingCollectionCodec, comparator, localSortMaxRecordsInRam); } - public final static Comparator COMPRESSED_PROBE_ID_COMPARATOR = new Comparator() { - @Override - public int compare( GenericRecord o1, GenericRecord o2 ) { - final long firstProbeId = new BasicArrayData((Long) o1.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId; - final long secondProbeId = new BasicArrayData((Long) o2.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId; - - return Long.compare(firstProbeId, secondProbeId); - } - }; +// public final static Comparator COMPRESSED_PROBE_ID_COMPARATOR = new Comparator() { +// @Override +// public int compare( GenericRecord o1, GenericRecord o2 ) { +// final long firstProbeId = new BasicArrayData((Long) o1.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId; +// final long secondProbeId = new BasicArrayData((Long) o2.get(SchemaUtils.BASIC_ARRAY_DATA_FIELD_NAME)).probeId; +// +// return Long.compare(firstProbeId, secondProbeId); +// } +// }; public final static Comparator UNCOMPRESSED_PROBE_ID_COMPARATOR = new Comparator() { @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/GT_encoding.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/GT_encoding.java new file mode 100644 index 00000000000..05c0742a4a6 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/GT_encoding.java @@ -0,0 +1,38 @@ +package org.broadinstitute.hellbender.tools.variantdb.arrays; + +public enum GT_encoding { + HOM_REF("R"), + HET0_1("X"), + HOM_VAR("A"), + HET1_2("Y"), + HOM_ALT2("B"), + MISSING("."); + + String value; + GT_encoding(String v) { + value = v; + } + String getValue() { + return value; + } + + public static GT_encoding getGTEncodingFromValue(String value) { + GT_encoding response = MISSING; + if (value != null) { + if (value.equalsIgnoreCase(HOM_REF.value)) { + response = HOM_REF; + } else if (value.equalsIgnoreCase(HET0_1.value)) { + response = HET0_1; + } else if (value.equalsIgnoreCase(HOM_VAR.value)) { + response = HOM_VAR; + } else if (value.equalsIgnoreCase(HET1_2.value)) { + response = HET1_2; + } else if (value.equalsIgnoreCase(HOM_ALT2.value)) { + response = HOM_ALT2; + } else if (value.equalsIgnoreCase(MISSING.value)) { + response = MISSING; + } + } + return response; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java index dfb6fb72018..6f856144542 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayFieldEnum.java @@ -36,26 +36,25 @@ public enum RawArrayFieldEnum { GT_encoded { // Required public String getColumnValue(final VariantContext variant) { List alleleIndexes = CommonCode.getGTAlleleIndexes(variant); - - RawArrayTsvCreator.GT_encoding gt = RawArrayTsvCreator.GT_encoding.MISSING; + GT_encoding gt = GT_encoding.MISSING; if (alleleIndexes.size() == 2) { Set uniqueAlleleIndexes = new HashSet<>(alleleIndexes); if (uniqueAlleleIndexes.size() == 1) { // we know it's HOM something if (uniqueAlleleIndexes.contains(0)) { - gt = RawArrayTsvCreator.GT_encoding.HOM_REF; + gt = GT_encoding.HOM_REF; } else if (uniqueAlleleIndexes.contains(1)) { - gt = RawArrayTsvCreator.GT_encoding.HOM_VAR; + gt = GT_encoding.HOM_VAR; } else if (uniqueAlleleIndexes.contains(2)) { - gt = RawArrayTsvCreator.GT_encoding.HOM_ALT2; + gt = GT_encoding.HOM_ALT2; } } else { // we know its het if (uniqueAlleleIndexes.containsAll(new HashSet<>(Arrays.asList(0, 1)))) { - gt = RawArrayTsvCreator.GT_encoding.HET0_1; + gt = GT_encoding.HET0_1; } else if (uniqueAlleleIndexes.containsAll(new HashSet<>(Arrays.asList(1, 2)))) - gt = RawArrayTsvCreator.GT_encoding.HET1_2; + gt = GT_encoding.HET1_2; } } else { logger.warn("Found " + alleleIndexes.size() + " alleles instead of 2. Not processing variant \t" + variant); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java index a5d92643c32..60afedf54f2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/RawArrayTsvCreator.java @@ -34,23 +34,6 @@ public final class RawArrayTsvCreator { private final Map probeDataByName; private static String RAW_FILETYPE_PREFIX = "raw_"; - enum GT_encoding { - HOM_REF("R"), - HET0_1("X"), - HOM_VAR("A"), - HET1_2("Y"), - HOM_ALT2("B"), - MISSING("."); - - String value; - GT_encoding(String v) { - value = v; - } - String getValue() { - return value; - } - } - public RawArrayTsvCreator(final String sampleName, final String sampleId, final String tableNumberPrefix, final Map probeDataByName, final File outputDirectory) { this.sampleId = sampleId; this.probeDataByName = probeDataByName; From 30bab6dba69d61b9cb39d8e6a4c84d283d7feea2 Mon Sep 17 00:00:00 2001 From: Andrea Haessly Date: Thu, 17 Sep 2020 16:50:32 -0400 Subject: [PATCH 2/2] updated from PR feedback --- .../hellbender/tools/variantdb/arrays/ArrayExtractCohort.java | 2 +- .../tools/variantdb/arrays/ArrayExtractCohortEngine.java | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java index 02cb94655aa..8bd0d03f07f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohort.java @@ -100,7 +100,7 @@ public enum QueryMode { @Argument( fullName = "gt-only", - doc = "If true, only get the genotype info", + doc = "If true, only get the genotype info. Otherwise include NORMX, NORMY, BAF, and LRR", optional = true) private boolean gtDataOnly = false; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java index 1dd44d767d7..48365d5bb1f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/variantdb/arrays/ArrayExtractCohortEngine.java @@ -312,7 +312,6 @@ private VariantContext createVariantContextFromSampleRecord(final ProbeInfo prob genotypeAlleles.add(alleles.get(2)); break; case MISSING: - default: genotypeAlleles.add(Allele.NO_CALL); genotypeAlleles.add(Allele.NO_CALL); break; @@ -374,7 +373,7 @@ private VariantContext createVariantContextFromSampleRecordLegacyGT(final ProbeI builder.start(startPosition); builder.id(probeInfo.name); - List alleles = createAllelesFromProbeInfo(probeInfo); + List alleles = new ArrayList<>(); Allele ref = Allele.create(probeInfo.ref, true); alleles.add(ref);