diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java index dc01955334e..4464d26f763 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerArgumentCollection.java @@ -20,6 +20,8 @@ public abstract class AssemblyBasedCallerArgumentCollection { public static final String BAM_WRITER_TYPE_LONG_NAME = "bam-writer-type"; public static final String DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME = "dont-use-soft-clipped-bases"; public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing"; + public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance"; + public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score"; public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman"; @@ -111,4 +113,14 @@ public ReadThreadingAssembler createReadThreadingAssembler() { @Advanced @Argument(fullName="emit-ref-confidence", shortName="ERC", doc="(BETA feature) Mode for emitting reference confidence scores", optional = true) public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE; + + protected abstract int getDefaultMaxMnpDistance(); + + /** + * Two or more phased substitutions separated by this distance or less are merged into MNPs. + */ + @Advanced + @Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME, + doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true) + public int maxMnpDistance = getDefaultMaxMnpDistance(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java index c86a8fb1366..c8fbea471ca 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java @@ -13,6 +13,7 @@ import org.broadinstitute.hellbender.engine.AlignmentContext; import org.broadinstitute.hellbender.engine.AssemblyRegion; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.hellbender.utils.QualityUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -29,10 +30,7 @@ import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.hellbender.utils.pileup.ReadPileup; -import org.broadinstitute.hellbender.utils.read.AlignmentUtils; -import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator; -import org.broadinstitute.hellbender.utils.read.ReadUtils; +import org.broadinstitute.hellbender.utils.read.*; import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; @@ -49,6 +47,7 @@ public final class AssemblyBasedCallerUtils { static final int REFERENCE_PADDING_FOR_ASSEMBLY = 500; + public static final int NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO = 5; public static final String SUPPORTED_ALLELES_TAG="XA"; public static final String CALLABLE_REGION_TAG = "CR"; public static final String ALIGNMENT_REGION_TAG = "AR"; @@ -275,7 +274,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region, final byte[] fullReferenceWithPadding = region.getAssemblyRegionReference(referenceReader, REFERENCE_PADDING_FOR_ASSEMBLY); final SimpleInterval paddedReferenceLoc = getPaddedReferenceLoc(region, REFERENCE_PADDING_FOR_ASSEMBLY, referenceReader); - final Haplotype referenceHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader); + final Haplotype refHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader); final ReadErrorCorrector readErrorCorrector = argumentCollection.assemblerArgs.errorCorrectReads ? new ReadErrorCorrector(argumentCollection.assemblerArgs.kmerLengthForReadErrorCorrection, @@ -286,9 +285,12 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region, null; try { - final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, referenceHaplotype, fullReferenceWithPadding, - paddedReferenceLoc, givenAlleles, readErrorCorrector, header, - aligner); + final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, refHaplotype, fullReferenceWithPadding, + paddedReferenceLoc, readErrorCorrector, header, aligner); + if (!givenAlleles.isEmpty()) { + addGivenAlleles(region.getExtendedSpan().getStart(), givenAlleles, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet); + } + assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly); assemblyResultSet.debugDump(logger); return assemblyResultSet; @@ -305,6 +307,57 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region, } } + @VisibleForTesting + static void addGivenAlleles(final int assemblyRegionStart, final List givenAlleles, final int maxMnpDistance, + final SmithWatermanAligner aligner, final Haplotype refHaplotype, final AssemblyResultSet assemblyResultSet) { + final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); + final Map assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream() + .collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext))); + + final List assembledHaplotypes = assemblyResultSet.getHaplotypeList(); + for (final VariantContext givenVC : givenAlleles) { + final VariantContext assembledVC = assembledVariants.get(givenVC.getStart()); + final int givenVCRefLength = givenVC.getReference().length(); + final Allele longerRef = (assembledVC == null || givenVCRefLength > assembledVC.getReference().length()) ? givenVC.getReference() : assembledVC.getReference(); + final List unassembledGivenAlleles; + if (assembledVC == null) { + unassembledGivenAlleles = givenVC.getAlternateAlleles(); + } else { + // map all alleles to the longest common reference + final Set assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() : + ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef)); + final Set givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() : + ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef)); + unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList()); + } + + // choose the highest-scoring haplotypes along with the reference for building force-calling haplotypes + final List baseHaplotypes = unassembledGivenAlleles.isEmpty() ? Collections.emptyList() : assembledHaplotypes.stream() + .sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed()) + .limit(NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO) + .collect(Collectors.toList()); + + for (final Allele givenAllele : unassembledGivenAlleles) { + for (final Haplotype baseHaplotype : baseHaplotypes) { + // make sure this allele doesn't collide with a variant on the haplotype + if (baseHaplotype.getEventMap()!= null && baseHaplotype.getEventMap().getVariantContexts().stream().anyMatch(vc -> vc.overlaps(givenVC))) { + continue; + } + + final Haplotype insertedHaplotype = baseHaplotype.insertAllele(longerRef, givenAllele, activeRegionStart + givenVC.getStart() - assemblyRegionStart, givenVC.getStart()); + if (insertedHaplotype != null) { // can be null if the requested allele can't be inserted into the haplotype + final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(), insertedHaplotype.getBases(), aligner); + insertedHaplotype.setCigar(cigar); + insertedHaplotype.setGenomeLocation(refHaplotype.getGenomeLocation()); + insertedHaplotype.setAlignmentStartHapwrtRef(activeRegionStart); + assemblyResultSet.add(insertedHaplotype); + } + } + } + } + assemblyResultSet.regenerateVariationEvents(maxMnpDistance); + } + /** * Annotates reads in ReadLikelihoods with alignment region (the ref region spanned by the haplotype the read is aligned to) and * callable region (the ref region over which a caller is using these ReadLikelihoods to call variants) @@ -817,4 +870,5 @@ private static VariantContext phaseVC(final VariantContext vc, final String ID, } return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make(); } + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java index 39ed79558ae..4bcbd257ecb 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java @@ -43,6 +43,7 @@ public final class AssemblyResultSet { private boolean wasTrimmed = false; private final SortedSet kmerSizes; private SortedSet variationEvents; + private OptionalInt lastMaxMnpDistanceUsed = OptionalInt.empty(); private boolean debug; private static final Logger logger = LogManager.getLogger(AssemblyResultSet.class); @@ -92,6 +93,7 @@ public AssemblyResultSet trimTo(final AssemblyRegion trimmedAssemblyRegion) { result.setRegionForGenotyping(trimmedAssemblyRegion); result.setFullReferenceWithPadding(fullReferenceWithPadding); result.setPaddedReferenceLoc(paddedReferenceLoc); + result.variationPresent = haplotypes.stream().anyMatch(Haplotype::isNonReference); if (result.refHaplotype == null) { throw new IllegalStateException("missing reference haplotype in the trimmed set"); } @@ -510,14 +512,24 @@ private void updateReferenceHaplotype(final Haplotype newHaplotype) { */ public SortedSet getVariationEvents(final int maxMnpDistance) { ParamUtils.isPositiveOrZero(maxMnpDistance, "maxMnpDistance may not be negative."); - if (variationEvents == null) { - final List haplotypeList = getHaplotypeList(); - EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance); - variationEvents = EventMap.getAllVariantContexts(haplotypeList); + + final boolean sameMnpDistance = lastMaxMnpDistanceUsed.isPresent() && maxMnpDistance == lastMaxMnpDistanceUsed.getAsInt(); + lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance); + + if (variationEvents == null || !sameMnpDistance || haplotypes.stream().anyMatch(hap -> hap.isNonReference() && hap.getEventMap() == null)) { + regenerateVariationEvents(maxMnpDistance); } return variationEvents; } + public void regenerateVariationEvents(int maxMnpDistance) { + final List haplotypeList = getHaplotypeList(); + EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance); + variationEvents = EventMap.getAllVariantContexts(haplotypeList); + lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance); + variationPresent = haplotypeList.stream().anyMatch(Haplotype::isNonReference); + } + public void setDebug(boolean debug) { this.debug = debug; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index d13d522b825..8bf42828e43 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -21,8 +21,6 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{ private static final long serialVersionUID = 1L; - public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance"; - public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands"; public static final String GQ_BAND_SHORT_NAME = "GQB"; public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality"; @@ -31,6 +29,9 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume @ArgumentCollection public StandardCallerArgumentCollection standardArgs = new StandardCallerArgumentCollection(); + @Override + protected int getDefaultMaxMnpDistance() { return 0; } + @Override protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection() { return new HaplotypeCallerReadThreadingAssemblerArgumentCollection(); @@ -133,15 +134,6 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu @Argument(fullName = "dont-genotype", doc = "Perform assembly but do not genotype variants", optional = true) public boolean dontGenotype = false; - /** - * Two or more phased substitutions separated by this distance or less are merged into MNPs. - */ - @Advanced - @Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME, - doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs. " + - "WARNING: When used in GVCF mode, resulting GVCFs cannot be joint-genotyped.", optional = true) - public int maxMnpDistance = 0; - /** * As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior. */ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 80e2adc372f..360b05ee749 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -542,9 +542,6 @@ public List callRegion(final AssemblyRegion region, final Featur final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance); - // TODO - line bellow might be unnecessary : it might be that assemblyResult will always have those alleles anyway - // TODO - so check and remove if that is the case: - allVariationEvents.addAll(givenAlleles); final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 4955bc4738b..b8f74d1c2ae 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -32,7 +32,6 @@ public final class ReadThreadingAssembler { private static final Logger logger = LogManager.getLogger(ReadThreadingAssembler.class); private static final int DEFAULT_NUM_PATHS_PER_GRAPH = 128; - private static final int GGA_MODE_ARTIFICIAL_COUNTS = 1000; private static final int KMER_SIZE_ITERATION_INCREASE = 10; private static final int MAX_KMER_ITERATIONS_TO_ATTEMPT = 6; @@ -94,20 +93,18 @@ public ReadThreadingAssembler(final int maxAllowedPathsForReadThreadingAssembler /** * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads - * @param aligner * @param assemblyRegion AssemblyRegion object holding the reads which are to be used during assembly * @param refHaplotype reference haplotype object * @param fullReferenceWithPadding byte array holding the reference sequence with padding * @param refLoc GenomeLoc object corresponding to the reference sequence with padding - * @param givenAlleles the alleles to inject into the haplotypes during GGA mode * @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used. + * @param aligner {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence * @return the resulting assembly-result-set */ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final SimpleInterval refLoc, - final List givenAlleles, final ReadErrorCorrector readErrorCorrector, final SAMFileHeader header, final SmithWatermanAligner aligner) { @@ -120,9 +117,6 @@ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, Utils.validateArg( fullReferenceWithPadding.length == refLoc.size(), "Reference bases and reference loc must be the same size."); ParamUtils.isPositiveOrZero(pruneFactor, "Pruning factor cannot be negative"); - // create the list of artificial haplotypes that should be added to the graph for GGA mode - final List givenHaplotypes = composeGivenHaplotypes(refHaplotype, givenAlleles, assemblyRegion.getExtendedSpan()); - // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them final List correctedReads; if ( readErrorCorrector != null ) { @@ -145,7 +139,7 @@ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, resultSet.add(refHaplotype); final Map assemblyResultByGraph = new HashMap<>(); // create the graphs by calling our subclass assemble method - for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, givenHaplotypes, header, aligner) ) { + for ( final AssemblyResult result : assemble(correctedReads, refHaplotype, header, aligner) ) { if ( result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION ) { // do some QC on the graph sanityCheckGraph(result.getGraph(), refHaplotype); @@ -155,6 +149,7 @@ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, } } + // add assembled alt haplotypes to the {@code resultSet} findBestPaths(nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet, aligner); // print the graphs if the appropriate debug option has been turned on @@ -163,37 +158,6 @@ public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, return resultSet; } - /** - * Create the list of artificial GGA-mode haplotypes by injecting each of the provided alternate alleles into the reference haplotype - * - * @param refHaplotype the reference haplotype - * @param givenHaplotypes the list of alternate alleles in VariantContexts - * @param activeRegionWindow the window containing the reference haplotype - * - * @return a non-null list of haplotypes - */ - private static List composeGivenHaplotypes(final Haplotype refHaplotype, final List givenHaplotypes, final SimpleInterval activeRegionWindow) { - Utils.nonNull(refHaplotype, "the reference haplotype cannot be null"); - Utils.nonNull(givenHaplotypes, "given haplotypes cannot be null"); - Utils.nonNull(activeRegionWindow, "active region window cannot be null"); - Utils.validateArg(activeRegionWindow.size() == refHaplotype.length(), "inconsistent reference haplotype and active region window"); - - final Set returnHaplotypes = new LinkedHashSet<>(); - final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); - - for( final VariantContext compVC : givenHaplotypes ) { - Utils.validateArg(compVC.overlaps(activeRegionWindow), " some variant provided does not overlap with active region window"); - for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); - if( insertedRefHaplotype != null ) { // can be null if the requested allele can't be inserted into the haplotype - returnHaplotypes.add(insertedRefHaplotype); - } - } - } - - return new ArrayList<>(returnHaplotypes); - } - private List findBestPaths(final Collection graphs, final Haplotype refHaplotype, final SimpleInterval refLoc, final SimpleInterval activeRegionWindow, final Map assemblyResultByGraph, final AssemblyResultSet assemblyResultSet, final SmithWatermanAligner aligner) { // add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes @@ -370,18 +334,18 @@ private static void addResult(final Collection results, final As * Given reads and a reference haplotype give us graphs to use for constructing * non-reference haplotypes. * - * @param aligner * @param reads the reads we're going to assemble * @param refHaplotype the reference haplotype + * @param aligner {@link SmithWatermanAligner} used to align dangling ends in assembly graphs to the reference sequence * @return a non-null list of reads */ @VisibleForTesting - List assemble(final List reads, final Haplotype refHaplotype, final List givenHaplotypes, final SAMFileHeader header, final SmithWatermanAligner aligner) { + List assemble(final List reads, final Haplotype refHaplotype, final SAMFileHeader header, final SmithWatermanAligner aligner) { final List results = new LinkedList<>(); // first, try using the requested kmer sizes for ( final int kmerSize : kmerSizes ) { - addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, dontIncreaseKmerSizesForCycles, allowNonUniqueKmersInRef, header, aligner)); + addResult(results, createGraph(reads, refHaplotype, kmerSize, dontIncreaseKmerSizesForCycles, allowNonUniqueKmersInRef, header, aligner)); } // if none of those worked, iterate over larger sizes if allowed to do so @@ -391,7 +355,7 @@ List assemble(final List reads, final Haplotype refHap while ( results.isEmpty() && numIterations <= MAX_KMER_ITERATIONS_TO_ATTEMPT ) { // on the last attempt we will allow low complexity graphs final boolean lastAttempt = numIterations == MAX_KMER_ITERATIONS_TO_ATTEMPT; - addResult(results, createGraph(reads, refHaplotype, kmerSize, givenHaplotypes, lastAttempt, lastAttempt, header, aligner)); + addResult(results, createGraph(reads, refHaplotype, kmerSize, lastAttempt, lastAttempt, header, aligner)); kmerSize += KMER_SIZE_ITERATION_INCREASE; numIterations++; } @@ -407,19 +371,17 @@ private static int arrayMaxInt(final List array) { /** * Creates the sequence graph for the given kmerSize * - * @param aligner * @param reads reads to use * @param refHaplotype reference haplotype * @param kmerSize kmer size - * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs * @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers + * @param aligner {@link SmithWatermanAligner} used to align dangling ends to the reference sequence * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity) */ private AssemblyResult createGraph(final Iterable reads, final Haplotype refHaplotype, final int kmerSize, - final Iterable activeAlleleHaplotypes, final boolean allowLowComplexityGraphs, final boolean allowNonUniqueKmersInRef, final SAMFileHeader header, @@ -443,12 +405,6 @@ private AssemblyResult createGraph(final Iterable reads, // add the reference sequence to the graph rtgraph.addSequence("ref", refHaplotype.getBases(), true); - // add the artificial GGA haplotypes to the graph - int hapCount = 0; - for ( final Haplotype h : activeAlleleHaplotypes ) { - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false); - } - // Next pull kmers out of every read and throw them on the graph for( final GATKRead read : reads ) { rtgraph.addRead(read, header); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java index 6a4369a2a7b..dfd339b6271 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/M2ArgumentCollection.java @@ -42,8 +42,6 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final String DOWNSAMPLING_STRIDE_SHORT_NAME = "stride"; public static final String MAX_SUSPICIOUS_READS_PER_ALIGNMENT_START_LONG_NAME = "max-suspicious-reads-per-alignment-start"; public static final String NORMAL_LOG_10_ODDS_LONG_NAME = "normal-lod"; - public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance"; - public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist"; public static final String IGNORE_ITR_ARTIFACTS_LONG_NAME = "ignore-itr-artifacts"; public static final String MITOCHONDRIA_MODE_LONG_NAME = "mitochondria-mode"; public static final String CALLABLE_DEPTH_LONG_NAME = "callable-depth"; @@ -64,6 +62,10 @@ public class M2ArgumentCollection extends AssemblyBasedCallerArgumentCollection public static final double DEFAULT_MITO_PRUNING_LOG_ODDS_THRESHOLD = MathUtils.log10ToLog(-4); + @Override + protected int getDefaultMaxMnpDistance() { return 1; } + + @Override protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection(){ return new MutectReadThreadingAssemblerArgumentCollection(); } @@ -215,14 +217,6 @@ public double getInitialLogOdds() { @Argument(fullName = NORMAL_LOG_10_ODDS_LONG_NAME, optional = true, doc = "Log 10 odds threshold for calling normal variant non-germline.") public double normalLog10Odds = DEFAULT_NORMAL_LOG_10_ODDS; - /** - * Two or more phased substitutions separated by this distance or less are merged into MNPs. - */ - @Advanced - @Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME, - doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true) - public int maxMnpDistance = 1; - /** * When opposite ends of a fragment are inverted tandem repeats of each other, the sequence past one end may be copied onto the other * during library prep. By default, Mutect2 identifies and clips these artifacts, which are especially prevalent when diff --git a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java index 7ebdc280824..6c55fb0a8fe 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/EventMap.java @@ -61,7 +61,7 @@ public EventMap(final Collection stateForTesting) { /** * * @param maxMnpDistance Phased substitutions separated by this distance or less are merged into MNPs. More than - * two substitutions occuring in the same alignment block (ie the same M/X/EQ CIGAR element) + * two substitutions occurring in the same alignment block (ie the same M/X/EQ CIGAR element) * are merged until a substitution is separated from the previous one by a greater distance. * That is, if maxMnpDistance = 1, substitutions at 10,11,12,14,15,17 are partitioned into a MNP * at 10-12, a MNP at 14-15, and a SNP at 17. May not be negative. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java index 411c0b6e5e8..4b855ca841d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtilsUnitTest.java @@ -21,6 +21,7 @@ import java.util.stream.Collectors; import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -967,4 +968,107 @@ public void testCreateHaplotypeMapping(final VariantContext vc, final Set assemble(final ReadThreadingAssembler assembler, final b final AssemblyRegion activeRegion = new AssemblyRegion(loc, null, true, 0, header); activeRegion.addAll(reads); // logger.warn("Assembling " + activeRegion + " with " + engine); - final AssemblyResultSet assemblyResultSet = assembler.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, Collections.emptyList(), null, header, + final AssemblyResultSet assemblyResultSet = assembler.runLocalAssembly(activeRegion, refHaplotype, refBases, loc, null, header, SmithWatermanJavaAligner.getInstance()); return assemblyResultSet.getHaplotypeList(); } @@ -259,7 +259,7 @@ public SeqGraph assemble() { assembler.setRecoverDanglingBranches(false); // needed to pass some of the tests assembler.setDebugGraphTransformations(true); assembler.setDebugGraphOutputPath(createTempDir("debugGraphs")); - final SeqGraph graph = assembler.assemble(reads, refHaplotype, Collections.emptyList(), header, SmithWatermanJavaAligner + final SeqGraph graph = assembler.assemble(reads, refHaplotype, header, SmithWatermanJavaAligner .getInstance()).get(0).getGraph(); if ( DEBUG ) graph.printGraph(new File("test.dot"), 0); return graph; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java index d672bb93409..372035d357c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2IntegrationTest.java @@ -394,7 +394,7 @@ public void testMnps() throws Exception { "-L", "20:10019000-10022000", "-O", outputVcf.getAbsolutePath(), "-" + M2ArgumentCollection.EMISSION_LOG_SHORT_NAME, "15", - "-" + M2ArgumentCollection.MAX_MNP_DISTANCE_SHORT_NAME, Integer.toString(maxMnpDistance)); + "-" + AssemblyBasedCallerArgumentCollection.MAX_MNP_DISTANCE_SHORT_NAME, Integer.toString(maxMnpDistance)); runCommandLine(args); checkMnpOutput(maxMnpDistance, outputVcf); @@ -465,6 +465,33 @@ public void testGivenAllelesMode() throws Exception { } } + /** + * Here we give Mutect2 ridiculous kmer settings in order to force assembly to fail. + * @throws Exception + */ + @Test + public void testGivenAllelesModeWithCycles() throws Exception { + Utils.resetRandomGenerator(); + final File unfilteredVcf = createTempFile("unfiltered", ".vcf"); + + final File givenAllelesVcf = new File(toolsTestDir, "mutect/gga_mode.vcf"); + final List args = Arrays.asList("-I", NA12878_20_21_WGS_bam, + "-R", b37_reference_20_21, + "-L", "20:9998500-10010000", + "-O", unfilteredVcf.getAbsolutePath(), + "--alleles", givenAllelesVcf.getAbsolutePath(), + "--kmer-size", "1", + "--dont-increase-kmer-sizes-for-cycles"); + runCommandLine(args); + + final Map> altAllelesByPosition = VariantContextTestUtils.streamVcf(unfilteredVcf) + .collect(Collectors.toMap(vc -> vc.getStart(), vc -> vc.getAlternateAlleles())); + for (final VariantContext vc : new FeatureDataSource(givenAllelesVcf)) { + final List altAllelesAtThisLocus = altAllelesByPosition.get(vc.getStart()); + vc.getAlternateAlleles().forEach(a -> Assert.assertTrue(altAllelesAtThisLocus.contains(a))); + } + } + // make sure that GGA mode with given alleles that normally wouldn't be called due to complete lack of coverage // doesn't run into any edge case bug involving empty likelihoods matrices @Test diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf index 40687acef1f..845770843a8 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf @@ -27,9 +27,9 @@ 20 10000694 . G A 1029.60 . AC=1;AF=0.500;AN=2;BaseQRankSum=3.599;DP=82;ExcessHet=3.0103;FS=1.822;MLEAC=1;MLEAF=0.500;MQ=48.66;MQRankSum=-5.114;QD=12.56;ReadPosRankSum=0.657;SOR=0.566 GT:AD:DP:GQ:PL 0/1:45,37:82:99:1037,0,1601 20 10001436 . A AAGGCT 2326.06 . AC=2;AF=1.00;AN=2;DP=55;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=41.07;QD=25.36;SOR=3.442 GT:AD:DP:GQ:PL 1/1:0,50:50:99:2340,156,0 20 10001661 . T C 3621.03 . AC=2;AF=1.00;AN=2;DP=81;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.33;QD=28.73;SOR=1.193 GT:AD:DP:GQ:PL 1/1:0,81:81:99:3635,249,0 -20 10004094 . A T 0 LowQual DP=53;FS=0.000;MLEAC=0;MLEAF=NaN;MQ=52.75;SOR=0.693 GT ./. +20 10004094 . A T 0 LowQual AC=0;AF=0.00;AN=2;DP=53;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0.00;MQ=52.75;SOR=0.527 GT:AD:DP:GQ:PL 0/0:20,0:20:45:0,45,237 20 10004769 . TAAAACTATGC T 1010.60 . AC=1;AF=0.500;AN=2;BaseQRankSum=1.603;DP=78;ExcessHet=3.0103;FS=3.758;MLEAC=1;MLEAF=0.500;MQ=52.60;MQRankSum=-6.625;QD=15.79;ReadPosRankSum=1.879;SOR=1.306 GT:AD:DP:GQ:PL 0/1:37,27:64:99:1018,0,1471 -20 10004771 . A *,T 0 LowQual AC=1,0;AF=0.500,0.00;AN=2;BaseQRankSum=-1.270;DP=72;ExcessHet=3.0103;FS=2.397;MLEAC=1,0;MLEAF=0.500,0.00;MQ=51.93;MQRankSum=-6.325;QD=-0.00;ReadPosRankSum=2.185;SOR=1.115 GT:AD:DP:GQ:PL 0/1:32,27,0:59:99:1027,0,1345,1162,1305,2852 +20 10004771 . A *,T 0 LowQual AC=1,0;AF=0.500,0.00;AN=2;BaseQRankSum=-1.270;DP=72;ExcessHet=3.0103;FS=2.397;MLEAC=1,0;MLEAF=0.500,0.00;MQ=51.93;MQRankSum=-6.325;QD=-0.00;ReadPosRankSum=2.185;SOR=1.115 GT:AD:DP:GQ:PL 0/1:32,27,0:59:99:1027,0,1345,1167,1315,2965 20 10006819 . AAAAC T 0 LowQual AC=0;AF=0.00;AN=2;DP=75;ExcessHet=3.0103;FS=0.000;MLEAC=0;MLEAF=0.00;MQ=61.14;SOR=0.572 GT:AD:DP:GQ:PL 0/0:75,0:75:99:0,244,2147483647 20 10006823 . C *,G 0 LowQual AC=0,0;AF=0.00,0.00;AN=2;DP=73;ExcessHet=3.0103;FS=0.000;MLEAC=0,0;MLEAF=0.00,0.00;MQ=61.18;SOR=0.069 GT:AD:DP:GQ:PL 0/0:17,0,0:17:51:0,229,2147483647,51,819,590 20 10008738 . GGTTTGTTT GGTTTGTTTGTTT,GGTTTGTTTGTTTGTTT,G 0 LowQual AC=0,0,0;AF=0.00,0.00,0.00;AN=2;DP=50;ExcessHet=3.0103;FS=0.000;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;MQ=37.07;SOR=0.488 GT:AD:DP:GQ:PL 0/0:50,0,0,0:50:99:0,154,2147483647,154,2147483647,2147483647,154,2147483647,2147483647,2147483647 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf.idx b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf.idx index e4390f3a689..8240045d087 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf.idx and b/src/test/resources/org/broadinstitute/hellbender/tools/haplotypecaller/expected.testGenotypeGivenAllelesMode.gatk4.vcf.idx differ