From 1a290ae9be031306523c1b4c753da5bed18b065f Mon Sep 17 00:00:00 2001 From: jonathoncohen98 <51167978+jonathoncohen98@users.noreply.github.com> Date: Mon, 5 Aug 2019 11:48:32 -0400 Subject: [PATCH] Issue #5910: Extract GenotypeGVCFs engine into publicly accessible class/function (#6004) --- .../tools/walkers/GenotypeGVCFs.java | 468 +--------------- .../tools/walkers/GenotypeGVCFsEngine.java | 511 ++++++++++++++++++ .../tools/walkers/GenotypeGVCFsUnitTest.java | 16 +- 3 files changed, 546 insertions(+), 449 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java index de7ba42bcb3..dcc28da1edf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs.java @@ -1,36 +1,35 @@ package org.broadinstitute.hellbender.tools.walkers; -import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.Locatable; -import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; import org.broadinstitute.barclay.argparser.*; import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.*; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection; import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup; -import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.VariantLocusWalker; import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBImport; -import org.broadinstitute.hellbender.tools.walkers.annotator.*; -import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality; -import org.broadinstitute.hellbender.tools.walkers.genotyper.*; -import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; +import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation; +import org.broadinstitute.hellbender.tools.walkers.annotator.StandardAnnotation; +import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection; -import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; -import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; -import org.broadinstitute.hellbender.utils.genotyper.SampleList; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; -import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; import java.io.File; -import java.util.*; -import java.util.stream.Collectors; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Set; /** * Perform joint genotyping on one or more samples pre-called with HaplotypeCaller @@ -98,9 +97,6 @@ public final class GenotypeGVCFs extends VariantLocusWalker { public static final String ALL_SITES_SHORT_NAME = "all-sites"; public static final String KEEP_COMBINED_LONG_NAME = "keep-combined-raw-annotations"; public static final String KEEP_COMBINED_SHORT_NAME = "keep-combined"; - private static final String GVCF_BLOCK = "GVCFBlock"; - private VCFHeader outputHeader; - @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc="File to which variants should be written", optional=false) @@ -166,24 +162,18 @@ public final class GenotypeGVCFs extends VariantLocusWalker { @ArgumentCollection private final DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - // the genotyping engine - private GenotypingEngine genotypingEngine; // the annotation engine private VariantAnnotatorEngine annotationEngine; private ReferenceConfidenceVariantContextMerger merger; - // the INFO field annotation key names to remove - private final List infoFieldAnnotationKeyNamesToRemove = new ArrayList<>(); - - // INFO Header names that require alt alleles - final LinkedHashSet infoHeaderAltAllelesLineNames = new LinkedHashSet<>(); - private VariantContextWriter vcfWriter; /** these are used when {@link #onlyOutputCallsStartingInIntervals) is true */ private List intervals; + private GenotypeGVCFsEngine gvcfEngine; + /** * Get the largest interval per contig that contains the intervals specified on the command line. * @param getIntervals intervals to be transformed @@ -214,6 +204,7 @@ public List> getDefaultVariantAnnotationGroups() { @Override public void onTraversalStart() { + if (somaticInput) { logger.warn("Note that the Mutect2 reference confidence mode is in BETA -- the likelihoods model and output format are subject to change in subsequent versions."); } @@ -233,83 +224,27 @@ public void onTraversalStart() { intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) : Collections.emptyList(); - final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder? - annotationEngine = new VariantAnnotatorEngine(makeVariantAnnotations(), dbsnp.dbsnp, Collections.emptyList(), false, keepCombined); - // Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list - for ( final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations() ) { - if ( annotation instanceof RankSumTest || - annotation instanceof AS_RMSMappingQuality || - annotation instanceof RMSMappingQuality) { - final List keyNames = annotation.getKeyNames(); - if ( !keyNames.isEmpty() ) { - infoFieldAnnotationKeyNamesToRemove.add(keyNames.get(0)); - } - } - } - - // We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations. - genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY)); - merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput); - if ( includeNonVariants ) { - // Save INFO header names that require alt alleles - for ( final VCFHeaderLine headerLine : inputVCFHeader.getMetaDataInInputOrder() ) { - if (headerLine instanceof VCFInfoHeaderLine ) { - if (((VCFInfoHeaderLine) headerLine).getCountType() == VCFHeaderLineCount.A) { - infoHeaderAltAllelesLineNames.add(((VCFInfoHeaderLine) headerLine).getID()); - } - } - } - } - - setupVCFWriter(inputVCFHeader, samples); - } - - private static boolean annotationShouldBeSkippedForHomRefSites(VariantAnnotation annotation) { - return annotation instanceof RankSumTest || annotation instanceof RMSMappingQuality || annotation instanceof AS_RMSMappingQuality; - } - - private void setupVCFWriter(final VCFHeader inputVCFHeader, final SampleList samples) { - final Set headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder()); - headerLines.addAll(getDefaultToolVCFHeaderLines()); - - // Remove GCVFBlocks - headerLines.removeIf(vcfHeaderLine -> vcfHeaderLine.getKey().startsWith(GVCF_BLOCK)); - - headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions(false)); - headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); + //methods that cannot be called in engine bc its protected + Set defaultToolVCFHeaderLines = getDefaultToolVCFHeaderLines(); + vcfWriter = createVCFWriter(outputFile); - // add headers for annotations added by this tool - headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); - headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); - headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY)); - headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags - if (keepCombined) { - headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_QUAL_KEY)); - } - if ( dbsnp.dbsnp != null ) { - VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); - } + //create engine object + gvcfEngine = new GenotypeGVCFsEngine(annotationEngine, genotypeArgs, includeNonVariants, inputVCFHeader); - vcfWriter = createVCFWriter(outputFile); + //call initialize method in engine class that creates VCFWriter object and writes a header to it + vcfWriter = gvcfEngine.setupVCFWriter(defaultToolVCFHeaderLines, keepCombined, dbsnp, vcfWriter); - final Set sampleNameSet = samples.asSetOfSamples(); - outputHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet)); - vcfWriter.writeHeader(outputHeader); } @Override public void apply(final Locatable loc, List variants, ReadsContext reads, ReferenceContext ref, FeatureContext features) { - final List variantsToProcess = getVariantSubsetToProcess(loc, variants); + final VariantContext regenotypedVC = gvcfEngine.callRegion(loc, variants, ref, features, merger, somaticInput, tlodThreshold, afTolerance); - ref.setWindow(10, 10); //TODO this matches the gatk3 behavior but may be unnecessary - final VariantContext mergedVC = merger.merge(variantsToProcess, loc, includeNonVariants ? ref.getBase() : null, !includeNonVariants, false); - final VariantContext regenotypedVC = somaticInput ? regenotypeSomaticVC(mergedVC, ref, features, includeNonVariants) : - regenotypeVC(mergedVC, ref, features, includeNonVariants); if (regenotypedVC != null) { final SimpleInterval variantStart = new SimpleInterval(regenotypedVC.getContig(), regenotypedVC.getStart(), regenotypedVC.getStart()); if (!GATKVariantContextUtils.isSpanningDeletionOnly(regenotypedVC) && @@ -319,355 +254,6 @@ public void apply(final Locatable loc, List variants, ReadsConte } } - // If includeNonVariants is set, we're using group-by-locus traversal. To match GATK3 GenotypeGVCFs, - // see if there is a variant in the overlapping group that starts exactly at the locus start position, and if so - // prioritize and process only that variant. Otherwise process all of the overlapping variants. - private List getVariantSubsetToProcess(final Locatable loc, List preProcessedVariants) { - if (includeNonVariants) { - final List matchingStart = - preProcessedVariants.stream().filter(vc -> vc.getStart() == loc.getStart()).collect(Collectors.toList()); - if (matchingStart.size() == 0) { - return preProcessedVariants; - } - else if (matchingStart.size() == 1) { - return matchingStart; - } - // since this tool only accepts a single input source, there should never be - // more than one variant at a given starting locus - throw new IllegalStateException( - String.format( - "Variant input contains more than one variant starting at location: %s", - new SimpleInterval(matchingStart.get(0)))); - } else { - return preProcessedVariants; - } - } - - /** - * Re-genotype (and re-annotate) a combined genomic VC - * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites - */ - private VariantContext regenotypeVC(final VariantContext originalVC, final ReferenceContext ref, final FeatureContext features, boolean includeNonVariants) { - Utils.nonNull(originalVC); - - final VariantContext result; - - if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { - // only re-genotype polymorphic sites - final VariantContext regenotypedVC = calculateGenotypes(originalVC); - if (regenotypedVC == null || (!GATKVariantContextUtils.isProperlyPolymorphic(regenotypedVC) && !includeNonVariants)) { - return null; - } - if (GATKVariantContextUtils.isProperlyPolymorphic(regenotypedVC) || includeNonVariants) { - // Note that reversetrimAlleles must be performed after the annotations are finalized because the reducible annotation data maps - // were generated and keyed on the un reverseTrimmed alleles from the starting VariantContexts. Thus reversing the order will make - // it difficult to recover the data mapping due to the keyed alleles no longer being present in the variant context. - final VariantContext withGenotypingAnnotations = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC); - final VariantContext withAnnotations = annotationEngine.finalizeAnnotations(withGenotypingAnnotations, originalVC); - final int[] relevantIndices = regenotypedVC.getAlleles().stream().mapToInt(a -> originalVC.getAlleles().indexOf(a)).toArray(); - final VariantContext trimmed = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations); - final GenotypesContext updatedGTs = subsetAlleleSpecificFormatFields(outputHeader, trimmed.getGenotypes(), relevantIndices); - result = new VariantContextBuilder(trimmed).genotypes(updatedGTs).make(); - } else if (includeNonVariants) { - result = originalVC; - } else { - return null; - } - } else { - result = originalVC; - } - - - // if it turned monomorphic then we either need to ignore or fix such sites - // Note that the order of these actions matters and is different for polymorphic and monomorphic sites. - // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later. - // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. - // We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes. - if (result.isPolymorphicInSamples()) { - // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later. - final VariantContext reannotated = annotationEngine.annotateContext(result, features, ref, null, a -> true); - return new VariantContextBuilder(reannotated).genotypes(cleanupGenotypeAnnotations(reannotated, false)).make(); - } else if (includeNonVariants) { - // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. - VariantContext reannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make(); - reannotated = annotationEngine.annotateContext(reannotated, features, ref, null, GenotypeGVCFs::annotationShouldBeSkippedForHomRefSites); - return removeNonRefAlleles(reannotated); - } else { - return null; - } - } - - private VariantContext calculateGenotypes(VariantContext vc){ - /* - * Query the VariantContext for the appropriate model. If type == MIXED, one would want to use model = BOTH. - * However GenotypingEngine.getAlleleFrequencyPriors throws an exception if you give it anything but a SNP or INDEL model. - */ - final GenotypeLikelihoodsCalculationModel model = vc.getType() == VariantContext.Type.INDEL - ? GenotypeLikelihoodsCalculationModel.INDEL - : GenotypeLikelihoodsCalculationModel.SNP; - return genotypingEngine.calculateGenotypes(vc, model, null); - } - - /** - * Remove NON-REF alleles from the variant context - * - * @param vc the variant context - * @return variant context with the NON-REF alleles removed if multiallelic or replaced with NO-CALL alleles if biallelic - */ - private VariantContext removeNonRefAlleles(final VariantContext vc) { - - // If NON_REF is the only alt allele, ignore this site - final List newAlleles = new ArrayList<>(); - // Only keep alleles that are not NON-REF - for ( final Allele allele : vc.getAlleles() ) { - if ( !allele.equals(Allele.NON_REF_ALLELE) ) { - newAlleles.add(allele); - } - } - - // If no alt allele, then remove INFO fields that require alt alleles - if ( newAlleles.size() == 1 ) { - final VariantContextBuilder builder = new VariantContextBuilder(vc).alleles(newAlleles); - for ( final String name : infoHeaderAltAllelesLineNames ) { - builder.rmAttributes(Arrays.asList(name)); - } - return builder.make(); - } else { - return vc; - } - } - - private GenotypesContext subsetAlleleSpecificFormatFields(final VCFHeader outputHeader, final GenotypesContext originalGs, final int[] relevantIndices) { - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - for (final Genotype g : originalGs) { - final GenotypeBuilder gb = new GenotypeBuilder(g); - final Set keys = g.getExtendedAttributes().keySet(); - for (final String key : keys) { - final VCFFormatHeaderLine headerLine = outputHeader.getFormatHeaderLine(key); - final Object attribute; - if (headerLine.getCountType().equals(VCFHeaderLineCount.INTEGER) && headerLine.getCount() == 1) { - attribute = g.getAnyAttribute(key); - } - else { - attribute = ReferenceConfidenceVariantContextMerger.generateAnnotationValueVector(headerLine.getCountType(), - GATKProtectedVariantContextUtils.attributeToList(g.getAnyAttribute(key)), relevantIndices); - } - gb.attribute(key, attribute); - } - newGTs.add(gb.make()); - } - return newGTs; - } - - /** - * Re-genotype (and re-annotate) a combined genomic VC - * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites - */ - private VariantContext regenotypeSomaticVC(final VariantContext originalVC, final ReferenceContext ref, final FeatureContext features, boolean includeNonVariants) { - Utils.nonNull(originalVC); - - final VariantContext result; - if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { - result = callSomaticGenotypes(originalVC); - } else if (includeNonVariants) { - result = originalVC; - } else { - result = null; - } - return result; - } - - - /** - * Drop low quality alleles and call genotypes - * CombineGVCFs will convert calls to no-call (of varying ploidy, as is the case in somatic) - * - * @param vc input VariantContext with no-called genotypes - * @return a VC with called genotypes and low quality alleles removed, may be null - */ - private VariantContext callSomaticGenotypes(final VariantContext vc) { - final List newGenotypes = new ArrayList<>(); - final GenotypesContext genotypes = vc.getGenotypes(); - final double[] perAlleleLikelihoodSums = new double[vc.getAlleles().size()]; //needs the ref for the subsetting utils - - for(final Genotype g : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(g); - final double[] tlodArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0); - final double[] variantAFArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0.0); - double variantAFtotal = 0; - final List calledAlleles = new ArrayList<>(); - for(int i = 0; i < vc.getAlleles().size()-1; i++) { - variantAFtotal += variantAFArray[i]; - if (tlodArray[i] > tlodThreshold) { - calledAlleles.add(vc.getAlternateAllele(i)); - perAlleleLikelihoodSums[i+1] += tlodArray[i]; - } - } - //hack for weird Mutect2 ploidy -- if the variant is non-homoplasmic, call the reference allele too - if(variantAFtotal < 1-afTolerance && (!g.hasAD() || g.getAD()[0] > 0)) { - - calledAlleles.add(0, vc.getReference()); - } - //"ploidy" gets set according to the size of the alleles List in the Genotype - gb.alleles(calledAlleles); - newGenotypes.add(gb.make()); - } - - final VariantContextBuilder builder = new VariantContextBuilder(vc); - final VariantContext regenotypedVC = builder.genotypes(newGenotypes).make(); - - final int maxAltAlleles = ((UnifiedArgumentCollection)genotypingEngine.getConfiguration()).genotypeArgs.MAX_ALTERNATE_ALLELES; - List allelesToKeep; - - //we need to make sure all alleles pass the tlodThreshold - allelesToKeep = new ArrayList<>(perAlleleLikelihoodSums.length-1); - allelesToKeep.add(vc.getReference()); - for (int i = 1; i < perAlleleLikelihoodSums.length; i++) { - if (perAlleleLikelihoodSums[i] > tlodThreshold) { - allelesToKeep.add(vc.getAlternateAllele(i-1)); - } - } - - if (regenotypedVC.getAlternateAlleles().size() > maxAltAlleles) { - allelesToKeep = AlleleSubsettingUtils.filterToMaxNumberOfAltAllelesBasedOnScores(maxAltAlleles, allelesToKeep, perAlleleLikelihoodSums); - } - - if (allelesToKeep.size() == 1) { - return null; - } - - //if we didn't drop alleles then we're done! - if (allelesToKeep.size() == regenotypedVC.getAlleles().size()) { - return regenotypedVC; - } - - final int[] relevantIndices = allelesToKeep.stream().mapToInt(a -> regenotypedVC.getAlleles().indexOf(a)).toArray(); - - //do another pass over genotypes to drop the alleles that aren't called - final GenotypesContext reducedGenotypes = AlleleSubsettingUtils.subsetSomaticAlleles(outputHeader, regenotypedVC.getGenotypes(), allelesToKeep, relevantIndices); - final VariantContext subsetVC = builder.alleles(allelesToKeep).genotypes(reducedGenotypes).make(); - final VariantContext trimmedVC = GATKVariantContextUtils.trimAlleles(subsetVC, true, true); - if (GATKVariantContextUtils.isProperlyPolymorphic(trimmedVC)) { - return trimmedVC; - } - else { - return null; - } - } - - - /** - * Add genotyping-based annotations to the new VC - * - * @param originalAttributes the non-null annotations from the original VC - * @param newVC the new non-null VC - * @return a non-null VC - */ - private VariantContext addGenotypingAnnotations(final Map originalAttributes, final VariantContext newVC) { - // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations and any other annotations generated by the genotyper. - final Map attrs = new LinkedHashMap<>(originalAttributes); - attrs.put(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); - attrs.put(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); - if (newVC.hasAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)) { - attrs.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)); - } - if (newVC.hasAttribute(GATKVCFConstants.AS_QUAL_KEY)) { - attrs.put(GATKVCFConstants.AS_QUAL_KEY, newVC.getAttribute(GATKVCFConstants.AS_QUAL_KEY)); - } - return new VariantContextBuilder(newVC).attributes(attrs).make(); - } - - - /** - * Cleans up genotype-level annotations that need to be updated. - * 1. move MIN_DP to DP if present - * 2. propagate DP to AD if not present - * 3. remove SB if present - * 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes - * 5. move GQ to RGQ if the site is monomorphic - * - * @param vc the VariantContext with the Genotypes to fix - * @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic - * @return a new set of Genotypes - */ - @VisibleForTesting - static List cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs) { - final GenotypesContext oldGTs = vc.getGenotypes(); - final List recoveredGs = new ArrayList<>(oldGTs.size()); - for ( final Genotype oldGT : oldGTs ) { - final Map attrs = new HashMap<>(oldGT.getExtendedAttributes()); - - final GenotypeBuilder builder = new GenotypeBuilder(oldGT); - int depth = oldGT.hasDP() ? oldGT.getDP() : 0; - - // move the MIN_DP to DP - if ( oldGT.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY) ) { - depth = parseInt(oldGT.getAnyAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)); - builder.DP(depth); - attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY); - } - - attrs.remove(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY); - - // update PGT for hom vars - if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) { - attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, PHASED_HOM_VAR_STRING); - } - - // create AD if it's not there - if ( !oldGT.hasAD() && vc.isVariant() ) { - final int[] AD = new int[vc.getNAlleles()]; - AD[0] = depth; - builder.AD(AD); - } - - if ( createRefGTs ) { - // move the GQ to RGQ - if (oldGT.hasGQ()) { - builder.noGQ(); - attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ()); - } - - //keep 0 depth samples and 0 GQ samples as no-call - if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) { - final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference()); - builder.alleles(refAlleles); - } - - // also, the PLs are technically no longer usable - builder.noPL(); - } - - recoveredGs.add(builder.noAttributes().attributes(attrs).make()); - } - return recoveredGs; - } - - private static int parseInt(Object attribute){ - if( attribute instanceof String) { - return Integer.parseInt((String)attribute); - } else if ( attribute instanceof Number){ - return ((Number) attribute).intValue(); - } else { - throw new IllegalArgumentException("Expected a Number or a String but found something else."); - } - } - - /** - * Creates a UnifiedArgumentCollection with appropriate values filled in from the arguments in this walker - * @return a complete UnifiedArgumentCollection - */ - private UnifiedArgumentCollection createUAC() { - final UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); - uac.genotypeArgs = new GenotypeCalculationArgumentCollection(genotypeArgs); - - //whether to emit non-variant sites is not contained in genotypeArgs and must be passed to uac separately - //Note: GATK3 uses OutputMode.EMIT_ALL_CONFIDENT_SITES when includeNonVariants is requested - //GATK4 uses EMIT_ALL_SITES to ensure LowQual sites are emitted. - uac.outputMode = includeNonVariants ? OutputMode.EMIT_ALL_SITES : OutputMode.EMIT_VARIANTS_ONLY; - return uac; - } - @Override public void closeTool() { if ( vcfWriter != null) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java new file mode 100644 index 00000000000..abc370a23dd --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java @@ -0,0 +1,511 @@ +package org.broadinstitute.hellbender.tools.walkers; + + +import com.google.common.annotations.VisibleForTesting; +import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.*; +import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality; +import org.broadinstitute.hellbender.tools.walkers.genotyper.*; +import org.broadinstitute.hellbender.tools.walkers.genotyper.afcalc.GeneralPloidyFailOverAFCalculatorProvider; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; +import org.broadinstitute.hellbender.utils.genotyper.SampleList; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * Engine class to allow for other classes to replicate the behavior of GenotypeGVCFs. See {@link GenotypeGVCFs} for details + * + * Usage: + * -Pass the genotype args into the constructor, which will the initialize the engine completely + * Get the appropriate writer and write the appropriate header via {@link #setupVCFWriter} + * Repeatedly call {@link #callRegion} to call variants in each region, and add them to your writer + */ + +public class GenotypeGVCFsEngine +{ + private static final String GVCF_BLOCK = "GVCFBlock"; + + //the annotation engine + private VariantAnnotatorEngine annotationEngine = null; + + //the genotyping engine + private GenotypingEngine genotypingEngine = null; + + // the INFO field annotation key names to remove + private final List infoFieldAnnotationKeyNamesToRemove = new ArrayList<>(); + + private GenotypeCalculationArgumentCollection genotypeArgs; + + // INFO Header names that require alt alleles + final LinkedHashSet infoHeaderAltAllelesLineNames = new LinkedHashSet<>(); + + private boolean includeNonVariants; + + private VCFHeader outputHeader; + + private SampleList samples; + + final VCFHeader inputVCFHeader; + + /** + * Create and initialize a new GenotypeGVCFsEngine given a collection of GenotypeGVCF arguments and a VCF header + * + * @param annotationEngine variantAnnotatorEngine with annotations to process already added + * @param genotypeArgs command-line arguments for the GenotypeGVCFs caller + * @param includeNonVariants true to save INFO header names that require alt alleles + * @param inputVCFHeader header for the VCF + */ + public GenotypeGVCFsEngine(VariantAnnotatorEngine annotationEngine, GenotypeCalculationArgumentCollection genotypeArgs, boolean includeNonVariants, VCFHeader inputVCFHeader) + { + this.annotationEngine = annotationEngine; + this.genotypeArgs = genotypeArgs; + this.includeNonVariants = includeNonVariants; + this.inputVCFHeader = inputVCFHeader; + initialize(); + } + + private void initialize() + { + samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); //todo should this be getSampleNamesInOrder? + + // Request INFO field annotations inheriting from RankSumTest and RMSAnnotation added to remove list + for ( final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations() ) { + if ( annotation instanceof RankSumTest || + annotation instanceof AS_RMSMappingQuality || + annotation instanceof RMSMappingQuality) { + final List keyNames = annotation.getKeyNames(); + if ( !keyNames.isEmpty() ) { + infoFieldAnnotationKeyNamesToRemove.add(keyNames.get(0)); + } + } + } + + // We only want the engine to generate the AS_QUAL key if we are using AlleleSpecific annotations. + genotypingEngine = new MinimalGenotypingEngine(createUAC(), samples, new GeneralPloidyFailOverAFCalculatorProvider(genotypeArgs), annotationEngine.isRequestedReducibleRawKey(GATKVCFConstants.AS_QUAL_KEY)); + + if ( includeNonVariants ) { + // Save INFO header names that require alt alleles + for ( final VCFHeaderLine headerLine : inputVCFHeader.getMetaDataInInputOrder() ) { + if (headerLine instanceof VCFInfoHeaderLine ) { + if (((VCFInfoHeaderLine) headerLine).getCountType() == VCFHeaderLineCount.A) { + infoHeaderAltAllelesLineNames.add(((VCFInfoHeaderLine) headerLine).getID()); + } + } + } + } + } + + public VariantContext callRegion(Locatable loc, List variants, ReferenceContext ref, FeatureContext features, ReferenceConfidenceVariantContextMerger merger, boolean somaticInput, double tlodThreshold, double afTolerance) //do work for apply + { + final List variantsToProcess = getVariantSubsetToProcess(loc, variants); + + ref.setWindow(10, 10); //TODO this matches the gatk3 behavior but may be unnecessary + final VariantContext mergedVC = merger.merge(variantsToProcess, loc, includeNonVariants ? ref.getBase() : null, !includeNonVariants, false); + final VariantContext regenotypedVC = somaticInput ? regenotypeSomaticVC(mergedVC, ref, features, includeNonVariants, tlodThreshold, afTolerance) : + regenotypeVC(mergedVC, ref, features, includeNonVariants); + + return regenotypedVC; + } + + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + private VariantContext regenotypeVC(final VariantContext originalVC, final ReferenceContext ref, final FeatureContext features, boolean includeNonVariants) { + Utils.nonNull(originalVC); + + final VariantContext result; + + if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { + // only re-genotype polymorphic sites + final VariantContext regenotypedVC = calculateGenotypes(originalVC); + if (regenotypedVC == null || (!GATKVariantContextUtils.isProperlyPolymorphic(regenotypedVC) && !includeNonVariants)) { + return null; + } + if (GATKVariantContextUtils.isProperlyPolymorphic(regenotypedVC) || includeNonVariants) { + // Note that reversetrimAlleles must be performed after the annotations are finalized because the reducible annotation data maps + // were generated and keyed on the un reverseTrimmed alleles from the starting VariantContexts. Thus reversing the order will make + // it difficult to recover the data mapping due to the keyed alleles no longer being present in the variant context. + final VariantContext withGenotypingAnnotations = addGenotypingAnnotations(originalVC.getAttributes(), regenotypedVC); + final VariantContext withAnnotations = annotationEngine.finalizeAnnotations(withGenotypingAnnotations, originalVC); + final int[] relevantIndices = regenotypedVC.getAlleles().stream().mapToInt(a -> originalVC.getAlleles().indexOf(a)).toArray(); + final VariantContext trimmed = GATKVariantContextUtils.reverseTrimAlleles(withAnnotations); + final GenotypesContext updatedGTs = subsetAlleleSpecificFormatFields(outputHeader, trimmed.getGenotypes(), relevantIndices); + result = new VariantContextBuilder(trimmed).genotypes(updatedGTs).make(); + } else if (includeNonVariants) { + result = originalVC; + } else { + return null; + } + } else { + result = originalVC; + } + + + // if it turned monomorphic then we either need to ignore or fix such sites + // Note that the order of these actions matters and is different for polymorphic and monomorphic sites. + // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later. + // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. + // We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes. + if (result.isPolymorphicInSamples()) { + // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later. + final VariantContext reannotated = annotationEngine.annotateContext(result, features, ref, null, a -> true); + return new VariantContextBuilder(reannotated).genotypes(cleanupGenotypeAnnotations(reannotated, false)).make(); + } else if (includeNonVariants) { + // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine. + VariantContext reannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make(); + reannotated = annotationEngine.annotateContext(reannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites); + return removeNonRefAlleles(reannotated); + } else { + return null; + } + } + + /** + * Remove NON-REF alleles from the variant context + * + * @param vc the variant context + * @return variant context with the NON-REF alleles removed if multiallelic or replaced with NO-CALL alleles if biallelic + */ + private VariantContext removeNonRefAlleles(final VariantContext vc) { + + // If NON_REF is the only alt allele, ignore this site + final List newAlleles = new ArrayList<>(); + // Only keep alleles that are not NON-REF + for ( final Allele allele : vc.getAlleles() ) { + if ( !allele.equals(Allele.NON_REF_ALLELE) ) { + newAlleles.add(allele); + } + } + + // If no alt allele, then remove INFO fields that require alt alleles + if ( newAlleles.size() == 1 ) { + final VariantContextBuilder builder = new VariantContextBuilder(vc).alleles(newAlleles); + for ( final String name : infoHeaderAltAllelesLineNames ) { + builder.rmAttributes(Arrays.asList(name)); + } + return builder.make(); + } else { + return vc; + } + } + + private static boolean annotationShouldBeSkippedForHomRefSites(VariantAnnotation annotation) { + return annotation instanceof RankSumTest || annotation instanceof RMSMappingQuality || annotation instanceof AS_RMSMappingQuality; + } + + private GenotypesContext subsetAlleleSpecificFormatFields(final VCFHeader outputHeader, final GenotypesContext originalGs, final int[] relevantIndices) { + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + for (final Genotype g : originalGs) { + final GenotypeBuilder gb = new GenotypeBuilder(g); + final Set keys = g.getExtendedAttributes().keySet(); + for (final String key : keys) { + final VCFFormatHeaderLine headerLine = outputHeader.getFormatHeaderLine(key); + final Object attribute; + if (headerLine.getCountType().equals(VCFHeaderLineCount.INTEGER) && headerLine.getCount() == 1) { + attribute = g.getAnyAttribute(key); + } + else { + attribute = ReferenceConfidenceVariantContextMerger.generateAnnotationValueVector(headerLine.getCountType(), + GATKProtectedVariantContextUtils.attributeToList(g.getAnyAttribute(key)), relevantIndices); + } + gb.attribute(key, attribute); + } + newGTs.add(gb.make()); + } + return newGTs; + } + + /** + * Add genotyping-based annotations to the new VC + * + * @param originalAttributes the non-null annotations from the original VC + * @param newVC the new non-null VC + * @return a non-null VC + */ + private VariantContext addGenotypingAnnotations(final Map originalAttributes, final VariantContext newVC) { + // we want to carry forward the attributes from the original VC but make sure to add the MLE-based annotations and any other annotations generated by the genotyper. + final Map attrs = new LinkedHashMap<>(originalAttributes); + attrs.put(GATKVCFConstants.MLE_ALLELE_COUNT_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + attrs.put(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY, newVC.getAttribute(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + if (newVC.hasAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)) { + attrs.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, newVC.getAttribute(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)); + } + if (newVC.hasAttribute(GATKVCFConstants.AS_QUAL_KEY)) { + attrs.put(GATKVCFConstants.AS_QUAL_KEY, newVC.getAttribute(GATKVCFConstants.AS_QUAL_KEY)); + } + return new VariantContextBuilder(newVC).attributes(attrs).make(); + } + + private VariantContext calculateGenotypes(VariantContext vc){ + /* + * Query the VariantContext for the appropriate model. If type == MIXED, one would want to use model = BOTH. + * However GenotypingEngine.getAlleleFrequencyPriors throws an exception if you give it anything but a SNP or INDEL model. + */ + final GenotypeLikelihoodsCalculationModel model = vc.getType() == VariantContext.Type.INDEL + ? GenotypeLikelihoodsCalculationModel.INDEL + : GenotypeLikelihoodsCalculationModel.SNP; + return genotypingEngine.calculateGenotypes(vc, model, null); + } + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + private VariantContext regenotypeSomaticVC(final VariantContext originalVC, final ReferenceContext ref, final FeatureContext features, boolean includeNonVariants, double tlodThreshold, double afTolerance) { + Utils.nonNull(originalVC); + + final VariantContext result; + if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) { + result = callSomaticGenotypes(originalVC, tlodThreshold, afTolerance); + } else if (includeNonVariants) { + result = originalVC; + } else { + result = null; + } + return result; + } + + /** + * Drop low quality alleles and call genotypes + * CombineGVCFs will convert calls to no-call (of varying ploidy, as is the case in somatic) + * + * @param vc input VariantContext with no-called genotypes + * @return a VC with called genotypes and low quality alleles removed, may be null + */ + private VariantContext callSomaticGenotypes(final VariantContext vc, double tlodThreshold, double afTolerance) { + final List newGenotypes = new ArrayList<>(); + final GenotypesContext genotypes = vc.getGenotypes(); + final double[] perAlleleLikelihoodSums = new double[vc.getAlleles().size()]; //needs the ref for the subsetting utils + + for(final Genotype g : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(g); + final double[] tlodArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, () -> null, 0.0); + final double[] variantAFArray = GATKProtectedVariantContextUtils.getAttributeAsDoubleArray(g, GATKVCFConstants.ALLELE_FRACTION_KEY, () -> null, 0.0); + double variantAFtotal = 0; + final List calledAlleles = new ArrayList<>(); + for(int i = 0; i < vc.getAlleles().size()-1; i++) { + variantAFtotal += variantAFArray[i]; + if (tlodArray[i] > tlodThreshold) { + calledAlleles.add(vc.getAlternateAllele(i)); + perAlleleLikelihoodSums[i+1] += tlodArray[i]; + } + } + //hack for weird Mutect2 ploidy -- if the variant is non-homoplasmic, call the reference allele too + if(variantAFtotal < 1-afTolerance && (!g.hasAD() || g.getAD()[0] > 0)) { + + calledAlleles.add(0, vc.getReference()); + } + //"ploidy" gets set according to the size of the alleles List in the Genotype + gb.alleles(calledAlleles); + newGenotypes.add(gb.make()); + } + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + final VariantContext regenotypedVC = builder.genotypes(newGenotypes).make(); + + final int maxAltAlleles = ((UnifiedArgumentCollection)genotypingEngine.getConfiguration()).genotypeArgs.MAX_ALTERNATE_ALLELES; + List allelesToKeep; + + //we need to make sure all alleles pass the tlodThreshold + allelesToKeep = new ArrayList<>(perAlleleLikelihoodSums.length-1); + allelesToKeep.add(vc.getReference()); + for (int i = 1; i < perAlleleLikelihoodSums.length; i++) { + if (perAlleleLikelihoodSums[i] > tlodThreshold) { + allelesToKeep.add(vc.getAlternateAllele(i-1)); + } + } + + if (regenotypedVC.getAlternateAlleles().size() > maxAltAlleles) { + allelesToKeep = AlleleSubsettingUtils.filterToMaxNumberOfAltAllelesBasedOnScores(maxAltAlleles, allelesToKeep, perAlleleLikelihoodSums); + } + + if (allelesToKeep.size() == 1) { + return null; + } + + //if we didn't drop alleles then we're done! + if (allelesToKeep.size() == regenotypedVC.getAlleles().size()) { + return regenotypedVC; + } + + final int[] relevantIndices = allelesToKeep.stream().mapToInt(a -> regenotypedVC.getAlleles().indexOf(a)).toArray(); + + //do another pass over genotypes to drop the alleles that aren't called + final GenotypesContext reducedGenotypes = AlleleSubsettingUtils.subsetSomaticAlleles(outputHeader, regenotypedVC.getGenotypes(), allelesToKeep, relevantIndices); + final VariantContext subsetVC = builder.alleles(allelesToKeep).genotypes(reducedGenotypes).make(); + final VariantContext trimmedVC = GATKVariantContextUtils.trimAlleles(subsetVC, true, true); + if (GATKVariantContextUtils.isProperlyPolymorphic(trimmedVC)) { + return trimmedVC; + } + else { + return null; + } + } + + // If includeNonVariants is set, we're using group-by-locus traversal. To match GATK3 GenotypeGVCFs, + // see if there is a variant in the overlapping group that starts exactly at the locus start position, and if so + // prioritize and process only that variant. Otherwise process all of the overlapping variants. + private List getVariantSubsetToProcess(final Locatable loc, List preProcessedVariants) { + if (includeNonVariants) { + final List matchingStart = + preProcessedVariants.stream().filter(vc -> vc.getStart() == loc.getStart()).collect(Collectors.toList()); + if (matchingStart.size() == 0) { + return preProcessedVariants; + } + else if (matchingStart.size() == 1) { + return matchingStart; + } + // since this tool only accepts a single input source, there should never be + // more than one variant at a given starting locus + throw new IllegalStateException( + String.format( + "Variant input contains more than one variant starting at location: %s", + new SimpleInterval(matchingStart.get(0)))); + } else { + return preProcessedVariants; + } + } + + /** + * Creates a UnifiedArgumentCollection with appropriate values filled in from the arguments in this walker + * @return a complete UnifiedArgumentCollection + */ + private UnifiedArgumentCollection createUAC() { + final UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + uac.genotypeArgs = new GenotypeCalculationArgumentCollection(genotypeArgs); + + //whether to emit non-variant sites is not contained in genotypeArgs and must be passed to uac separately + //Note: GATK3 uses OutputMode.EMIT_ALL_CONFIDENT_SITES when includeNonVariants is requested + //GATK4 uses EMIT_ALL_SITES to ensure LowQual sites are emitted. + uac.outputMode = includeNonVariants ? OutputMode.EMIT_ALL_SITES : OutputMode.EMIT_VARIANTS_ONLY; + return uac; + } + + /** + * Create a VCF header in the writer + * + * @param vcfWriter + * @return a VCF writer + + */ + public VariantContextWriter setupVCFWriter(Set defaultToolVCFHeaderLines, boolean keepCombined, DbsnpArgumentCollection dbsnp, VariantContextWriter vcfWriter) { + final Set headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder()); + headerLines.addAll(defaultToolVCFHeaderLines); + + // Remove GCVFBlocks + headerLines.removeIf(vcfHeaderLine -> vcfHeaderLine.getKey().startsWith(GVCF_BLOCK)); + + headerLines.addAll(annotationEngine.getVCFAnnotationDescriptions(false)); + headerLines.addAll(genotypingEngine.getAppropriateVCFInfoHeaders()); + + // add headers for annotations added by this tool + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_COUNT_KEY)); + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); // needed for gVCFs without DP tags + if (keepCombined) { + headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_QUAL_KEY)); + } + if ( dbsnp.dbsnp != null ) { + VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY); + } + + final Set sampleNameSet = samples.asSetOfSamples(); + outputHeader = new VCFHeader(headerLines, new TreeSet<>(sampleNameSet)); + vcfWriter.writeHeader(outputHeader); + + return vcfWriter; + } + + + /** + * Cleans up genotype-level annotations that need to be updated. + * 1. move MIN_DP to DP if present + * 2. propagate DP to AD if not present + * 3. remove SB if present + * 4. change the PGT value from "0|1" to "1|1" for homozygous variant genotypes + * 5. move GQ to RGQ if the site is monomorphic + * + * @param vc the VariantContext with the Genotypes to fix + * @param createRefGTs if true we will also create proper hom ref genotypes since we assume the site is monomorphic + * @return a new set of Genotypes + */ + @VisibleForTesting + static List cleanupGenotypeAnnotations(final VariantContext vc, final boolean createRefGTs) { + final GenotypesContext oldGTs = vc.getGenotypes(); + final List recoveredGs = new ArrayList<>(oldGTs.size()); + for ( final Genotype oldGT : oldGTs ) { + final Map attrs = new HashMap<>(oldGT.getExtendedAttributes()); + + final GenotypeBuilder builder = new GenotypeBuilder(oldGT); + int depth = oldGT.hasDP() ? oldGT.getDP() : 0; + + // move the MIN_DP to DP + if ( oldGT.hasExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY) ) { + depth = parseInt(oldGT.getAnyAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)); + builder.DP(depth); + attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY); + } + + attrs.remove(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY); + + // update PGT for hom vars + if ( oldGT.isHomVar() && oldGT.hasExtendedAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY) ) { + attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, GenotypeGVCFs.PHASED_HOM_VAR_STRING); + } + + // create AD if it's not there + if ( !oldGT.hasAD() && vc.isVariant() ) { + final int[] AD = new int[vc.getNAlleles()]; + AD[0] = depth; + builder.AD(AD); + } + + if ( createRefGTs ) { + // move the GQ to RGQ + if (oldGT.hasGQ()) { + builder.noGQ(); + attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ()); + } + + //keep 0 depth samples and 0 GQ samples as no-call + if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) { + final List refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference()); + builder.alleles(refAlleles); + } + + // also, the PLs are technically no longer usable + builder.noPL(); + } + + recoveredGs.add(builder.noAttributes().attributes(attrs).make()); + } + return recoveredGs; + } + + + private static int parseInt(Object attribute){ + if( attribute instanceof String) { + return Integer.parseInt((String)attribute); + } else if ( attribute instanceof Number){ + return ((Number) attribute).intValue(); + } else { + throw new IllegalArgumentException("Expected a Number or a String but found something else."); + } + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java index 80ddc4c9f2c..c809120bd7e 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsUnitTest.java @@ -58,7 +58,7 @@ public Object[][] variantContexts() { @Test(dataProvider= "variantContexts") public void testCleanupGenotypeAnnotations(VariantContext vc, boolean createRefGTs, List expected){ - final List genotypes = GenotypeGVCFs.cleanupGenotypeAnnotations(vc, createRefGTs); + final List genotypes = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(vc, createRefGTs); VariantContextTestUtils.assertGenotypesAreEqual(genotypes.get(0), expected.get(0)); } @@ -92,15 +92,15 @@ public Object[][] getMinDPData(){ @Test(dataProvider = "getMinDPData") public void testMinDPReplacedWithDP(VariantContext vc, int expectedDepth){ - Assert.assertEquals(GenotypeGVCFs.cleanupGenotypeAnnotations(vc, false).get(0).getDP(), expectedDepth); - Assert.assertNull(GenotypeGVCFs.cleanupGenotypeAnnotations(vc, false).get(0).getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)); + Assert.assertEquals(GenotypeGVCFsEngine.cleanupGenotypeAnnotations(vc, false).get(0).getDP(), expectedDepth); + Assert.assertNull(GenotypeGVCFsEngine.cleanupGenotypeAnnotations(vc, false).get(0).getExtendedAttribute(GATKVCFConstants.MIN_DP_FORMAT_KEY)); } @Test public void testSBRemoved(){ final VariantContext vcWithSB = getHetWithGenotype(generateGenotypes(b -> b.attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, new int[]{6, 11, 11, 10}))); Assert.assertNotNull(vcWithSB.getGenotype("Sample_0").getAnyAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)); - final Genotype afterCleanup = GenotypeGVCFs.cleanupGenotypeAnnotations(vcWithSB, true).get(0); + final Genotype afterCleanup = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(vcWithSB, true).get(0); Assert.assertNull(afterCleanup.getExtendedAttribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY)); } @@ -108,7 +108,7 @@ public void testSBRemoved(){ public void testADCreated(){ final VariantContext noAD = getHetWithGenotype(generateGenotypes(ADD_DP)); Assert.assertNull(noAD.getGenotype("Sample_0").getAD()); - final Genotype afterCleanup = GenotypeGVCFs.cleanupGenotypeAnnotations(noAD, true).get(0); + final Genotype afterCleanup = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(noAD, true).get(0); Assert.assertEquals(afterCleanup.getAD(), new int[]{DP, 0}); } @@ -117,9 +117,9 @@ public void testPhasingUpdate(){ final VariantContext withPhasing = getHetWithGenotype(generateGenotypes(b -> b.attribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, "bad") .alleles(Arrays.asList(ALT,ALT)), b -> b.attribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, "something").alleles(Arrays.asList(REF,REF)))); - final Genotype homVarAfterCleanup = GenotypeGVCFs.cleanupGenotypeAnnotations(withPhasing, true).get(0); + final Genotype homVarAfterCleanup = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(withPhasing, true).get(0); Assert.assertEquals(homVarAfterCleanup.getAnyAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY), GenotypeGVCFs.PHASED_HOM_VAR_STRING); - final Genotype homRefAfterCleaning = GenotypeGVCFs.cleanupGenotypeAnnotations(withPhasing, true).get(1); + final Genotype homRefAfterCleaning = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(withPhasing, true).get(1); Assert.assertEquals(homRefAfterCleaning.getAnyAttribute(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY), "something"); } @@ -128,7 +128,7 @@ public void testRGQ0IsNoCall(){ final List noCall = GATKVariantContextUtils.noCallAlleles(2); final VariantContext gq0 = getHetWithGenotype(generateGenotypes(b -> b.GQ(0).DP(10).alleles(noCall), // GQ = 0 (b -> b.DP(10).alleles(noCall)))); //no GQ - final List genotypes = GenotypeGVCFs.cleanupGenotypeAnnotations(gq0, true); + final List genotypes = GenotypeGVCFsEngine.cleanupGenotypeAnnotations(gq0, true); for( Genotype genotype : genotypes ){ Assert.assertEquals(genotype.getAlleles(), noCall); }