Skip to content

Commit

Permalink
Fix several issues with M2 and HC force-calling mode (#5874)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbenjamin authored Apr 18, 2019
1 parent 86a71a1 commit 5381421
Show file tree
Hide file tree
Showing 13 changed files with 242 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ public abstract class AssemblyBasedCallerArgumentCollection {
public static final String BAM_WRITER_TYPE_LONG_NAME = "bam-writer-type";
public static final String DONT_USE_SOFT_CLIPPED_BASES_LONG_NAME = "dont-use-soft-clipped-bases";
public static final String DO_NOT_RUN_PHYSICAL_PHASING_LONG_NAME = "do-not-run-physical-phasing";
public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";

public static final String MIN_BASE_QUALITY_SCORE_LONG_NAME = "min-base-quality-score";
public static final String SMITH_WATERMAN_LONG_NAME = "smith-waterman";
Expand Down Expand Up @@ -111,4 +113,14 @@ public ReadThreadingAssembler createReadThreadingAssembler() {
@Advanced
@Argument(fullName="emit-ref-confidence", shortName="ERC", doc="(BETA feature) Mode for emitting reference confidence scores", optional = true)
public ReferenceConfidenceMode emitReferenceConfidence = ReferenceConfidenceMode.NONE;

protected abstract int getDefaultMaxMnpDistance();

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs.", optional = true)
public int maxMnpDistance = getDefaultMaxMnpDistance();
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.AssemblyRegion;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand All @@ -29,10 +30,7 @@
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
Expand All @@ -49,6 +47,7 @@
public final class AssemblyBasedCallerUtils {

static final int REFERENCE_PADDING_FOR_ASSEMBLY = 500;
public static final int NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO = 5;
public static final String SUPPORTED_ALLELES_TAG="XA";
public static final String CALLABLE_REGION_TAG = "CR";
public static final String ALIGNMENT_REGION_TAG = "AR";
Expand Down Expand Up @@ -275,7 +274,7 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,

final byte[] fullReferenceWithPadding = region.getAssemblyRegionReference(referenceReader, REFERENCE_PADDING_FOR_ASSEMBLY);
final SimpleInterval paddedReferenceLoc = getPaddedReferenceLoc(region, REFERENCE_PADDING_FOR_ASSEMBLY, referenceReader);
final Haplotype referenceHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader);
final Haplotype refHaplotype = createReferenceHaplotype(region, paddedReferenceLoc, referenceReader);

final ReadErrorCorrector readErrorCorrector = argumentCollection.assemblerArgs.errorCorrectReads ?
new ReadErrorCorrector(argumentCollection.assemblerArgs.kmerLengthForReadErrorCorrection,
Expand All @@ -286,9 +285,12 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
null;

try {
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, referenceHaplotype, fullReferenceWithPadding,
paddedReferenceLoc, givenAlleles, readErrorCorrector, header,
aligner);
final AssemblyResultSet assemblyResultSet = assemblyEngine.runLocalAssembly(region, refHaplotype, fullReferenceWithPadding,
paddedReferenceLoc, readErrorCorrector, header, aligner);
if (!givenAlleles.isEmpty()) {
addGivenAlleles(region.getExtendedSpan().getStart(), givenAlleles, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet);
}

assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly);
assemblyResultSet.debugDump(logger);
return assemblyResultSet;
Expand All @@ -305,6 +307,57 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
}
}

@VisibleForTesting
static void addGivenAlleles(final int assemblyRegionStart, final List<VariantContext> givenAlleles, final int maxMnpDistance,
final SmithWatermanAligner aligner, final Haplotype refHaplotype, final AssemblyResultSet assemblyResultSet) {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext)));

final List<Haplotype> assembledHaplotypes = assemblyResultSet.getHaplotypeList();
for (final VariantContext givenVC : givenAlleles) {
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
final int givenVCRefLength = givenVC.getReference().length();
final Allele longerRef = (assembledVC == null || givenVCRefLength > assembledVC.getReference().length()) ? givenVC.getReference() : assembledVC.getReference();
final List<Allele> unassembledGivenAlleles;
if (assembledVC == null) {
unassembledGivenAlleles = givenVC.getAlternateAlleles();
} else {
// map all alleles to the longest common reference
final Set<Allele> assembledAlleleSet = new HashSet<>(longerRef.length() == assembledVC.getReference().length() ? assembledVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
}

// choose the highest-scoring haplotypes along with the reference for building force-calling haplotypes
final List<Haplotype> baseHaplotypes = unassembledGivenAlleles.isEmpty() ? Collections.emptyList() : assembledHaplotypes.stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.limit(NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList());

for (final Allele givenAllele : unassembledGivenAlleles) {
for (final Haplotype baseHaplotype : baseHaplotypes) {
// make sure this allele doesn't collide with a variant on the haplotype
if (baseHaplotype.getEventMap()!= null && baseHaplotype.getEventMap().getVariantContexts().stream().anyMatch(vc -> vc.overlaps(givenVC))) {
continue;
}

final Haplotype insertedHaplotype = baseHaplotype.insertAllele(longerRef, givenAllele, activeRegionStart + givenVC.getStart() - assemblyRegionStart, givenVC.getStart());
if (insertedHaplotype != null) { // can be null if the requested allele can't be inserted into the haplotype
final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(), insertedHaplotype.getBases(), aligner);
insertedHaplotype.setCigar(cigar);
insertedHaplotype.setGenomeLocation(refHaplotype.getGenomeLocation());
insertedHaplotype.setAlignmentStartHapwrtRef(activeRegionStart);
assemblyResultSet.add(insertedHaplotype);
}
}
}
}
assemblyResultSet.regenerateVariationEvents(maxMnpDistance);
}

/**
* Annotates reads in ReadLikelihoods with alignment region (the ref region spanned by the haplotype the read is aligned to) and
* callable region (the ref region over which a caller is using these ReadLikelihoods to call variants)
Expand Down Expand Up @@ -817,4 +870,5 @@ private static VariantContext phaseVC(final VariantContext vc, final String ID,
}
return new VariantContextBuilder(vc).genotypes(phasedGenotypes).make();
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ public final class AssemblyResultSet {
private boolean wasTrimmed = false;
private final SortedSet<Integer> kmerSizes;
private SortedSet<VariantContext> variationEvents;
private OptionalInt lastMaxMnpDistanceUsed = OptionalInt.empty();
private boolean debug;
private static final Logger logger = LogManager.getLogger(AssemblyResultSet.class);

Expand Down Expand Up @@ -92,6 +93,7 @@ public AssemblyResultSet trimTo(final AssemblyRegion trimmedAssemblyRegion) {
result.setRegionForGenotyping(trimmedAssemblyRegion);
result.setFullReferenceWithPadding(fullReferenceWithPadding);
result.setPaddedReferenceLoc(paddedReferenceLoc);
result.variationPresent = haplotypes.stream().anyMatch(Haplotype::isNonReference);
if (result.refHaplotype == null) {
throw new IllegalStateException("missing reference haplotype in the trimmed set");
}
Expand Down Expand Up @@ -510,14 +512,24 @@ private void updateReferenceHaplotype(final Haplotype newHaplotype) {
*/
public SortedSet<VariantContext> getVariationEvents(final int maxMnpDistance) {
ParamUtils.isPositiveOrZero(maxMnpDistance, "maxMnpDistance may not be negative.");
if (variationEvents == null) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);

final boolean sameMnpDistance = lastMaxMnpDistanceUsed.isPresent() && maxMnpDistance == lastMaxMnpDistanceUsed.getAsInt();
lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance);

if (variationEvents == null || !sameMnpDistance || haplotypes.stream().anyMatch(hap -> hap.isNonReference() && hap.getEventMap() == null)) {
regenerateVariationEvents(maxMnpDistance);
}
return variationEvents;
}

public void regenerateVariationEvents(int maxMnpDistance) {
final List<Haplotype> haplotypeList = getHaplotypeList();
EventMap.buildEventMapsForHaplotypes(haplotypeList, fullReferenceWithPadding, paddedReferenceLoc, debug, maxMnpDistance);
variationEvents = EventMap.getAllVariantContexts(haplotypeList);
lastMaxMnpDistanceUsed = OptionalInt.of(maxMnpDistance);
variationPresent = haplotypeList.stream().anyMatch(Haplotype::isNonReference);
}

public void setDebug(boolean debug) {
this.debug = debug;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
private static final long serialVersionUID = 1L;

public static final String MAX_MNP_DISTANCE_LONG_NAME = "max-mnp-distance";
public static final String MAX_MNP_DISTANCE_SHORT_NAME = "mnp-dist";
public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "correct-overlapping-quality";
Expand All @@ -31,6 +29,9 @@ public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgume
@ArgumentCollection
public StandardCallerArgumentCollection standardArgs = new StandardCallerArgumentCollection();

@Override
protected int getDefaultMaxMnpDistance() { return 0; }

@Override
protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgumentCollection() {
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
Expand Down Expand Up @@ -133,15 +134,6 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
@Argument(fullName = "dont-genotype", doc = "Perform assembly but do not genotype variants", optional = true)
public boolean dontGenotype = false;

/**
* Two or more phased substitutions separated by this distance or less are merged into MNPs.
*/
@Advanced
@Argument(fullName = MAX_MNP_DISTANCE_LONG_NAME, shortName = MAX_MNP_DISTANCE_SHORT_NAME,
doc = "Two or more phased substitutions separated by this distance or less are merged into MNPs. " +
"WARNING: When used in GVCF mode, resulting GVCFs cannot be joint-genotyped.", optional = true)
public int maxMnpDistance = 0;

/**
* As of GATK 3.3, HaplotypeCaller outputs physical (read-based) information (see version 3.3 release notes and documentation for details). This argument disables that behavior.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -542,9 +542,6 @@ public List<VariantContext> callRegion(final AssemblyRegion region, final Featur
final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities);

final SortedSet<VariantContext> allVariationEvents = untrimmedAssemblyResult.getVariationEvents(hcArgs.maxMnpDistance);
// TODO - line bellow might be unnecessary : it might be that assemblyResult will always have those alleles anyway
// TODO - so check and remove if that is the case:
allVariationEvents.addAll(givenAlleles);

final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(region, allVariationEvents);

Expand Down
Loading

0 comments on commit 5381421

Please sign in to comment.