diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java index 309cedbf527..0dd65d47b67 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java @@ -15,6 +15,7 @@ import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT; @@ -232,8 +233,20 @@ public static int getExpectedCopyNumber(final Genotype genotype) { return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0); } - public Set getCarrierSamples() { - return genotypes.stream().filter(this::isCarrier).map(Genotype::getSampleName).collect(Collectors.toSet()); + public Set getCarrierSampleSet() { + return getCarrierSampleStream().collect(Collectors.toSet()); + } + + public List getCarrierGenotypeList() { + return getCarrierGenotypeStream().collect(Collectors.toList()); + } + + private Stream getCarrierSampleStream() { + return getCarrierGenotypeStream().map(Genotype::getSampleName); + } + + private Stream getCarrierGenotypeStream() { + return genotypes.stream().filter(this::isCarrier); } public boolean isDepthOnly() { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java index 10b50b51ed0..57f2108f0d3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtils.java @@ -7,10 +7,11 @@ import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser; +import org.broadinstitute.hellbender.tools.sv.cluster.PloidyTable; import org.broadinstitute.hellbender.utils.IntervalUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.variant.VariantContextGetters; import java.util.*; import java.util.stream.Collectors; @@ -64,71 +65,70 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, record.getContigB()); builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2); } - if (!svtype.equals(StructuralVariantType.BND)) { + if (svtype.equals(StructuralVariantType.INS)) { builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength()); } if (svtype.equals(StructuralVariantType.BND) || svtype.equals(StructuralVariantType.INV)) { builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record)); } + final GenotypesContext genotypes = GenotypesContext.create(record.getGenotypes().size()); + for (final Genotype g : record.getGenotypes()) { + genotypes.add(sanitizeEmptyGenotype(g)); + } + // htsjdk vcf encoder does not allow genotypes to have empty alleles + builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList())); + return builder; + } - // Generate alleles for DEL genotypes, which can be inferred from expected and actual copy numbers - final List newGenotypes = new ArrayList<>(record.getGenotypes().size()); - if (svtype.equals(StructuralVariantType.DEL)) { - for (final Genotype g : record.getGenotypes()) { - Utils.validate(altAlleles.size() == 1, "Encountered deletion with multiple ALT alleles"); - Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT), - "Deletion genotype missing " + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT + " field"); - Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT), - "Deletion genotype missing " + GATKSVVCFConstants.COPY_NUMBER_FORMAT + " field"); - final int expectedCopyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0); - final int copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0); - final int numAltAlleles = expectedCopyNumber - copyNumber; - Utils.validate(numAltAlleles >= 0, "Invalid copy number " + copyNumber + - " for deletion genotype with expected copy number " + expectedCopyNumber); - final List genotypeAlleles = new ArrayList<>(expectedCopyNumber); - for (int i = 0; i < copyNumber; i++) { - genotypeAlleles.add(refAllele); - } - for (int i = copyNumber; i < numAlleles; i++) { - genotypeAlleles.add(altAlleles.get(0)); - } - newGenotypes.add(new GenotypeBuilder(g).alleles(genotypeAlleles).make()); - } - builder.genotypes(newGenotypes); + /** + * Adds NO_CALL allele if empty + */ + private static Genotype sanitizeEmptyGenotype(final Genotype g) { + if (g.getAlleles().isEmpty()) { + return new GenotypeBuilder(g).alleles(Collections.singletonList(Allele.NO_CALL)).make(); } else { - builder.genotypes(record.getGenotypes()); + return g; } - - return builder; } /** - * Creates a new {@link GenotypesContext} object augmented with the given sample set. Samples with existing - * genotypes are not touched. Samples without genotypes are assigned the provided sets of alleles and attributes. - * @param genotypes base genotypes + * Populates genotypes for samples not present in the given record. Samples with existing genotypes are not + * touched. Samples without genotypes are assigned one according to the provided default reference allele + * and ploidy, specified by the {@link GATKSVVCFConstants#EXPECTED_COPY_NUMBER_FORMAT} value. If the record + * represents a CNV, the {@link GATKSVVCFConstants#COPY_NUMBER_FORMAT} is also set. + * @param record record containing the genotypes * @param samples samples which the resulting genotypes must contain (existing samples are ignored) - * @param alleles alleles to apply to all new genotypes, must be length equal to the ploidy for the sample - * @param attributes attributes to apply to all new genotypes + * @param refAlleleDefault default allele to use for samples without genotypes + * @param ploidyTable ploidy table, which must contain at least all samples with missing genotypes * @return genotypes augmented with missing samples */ - public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final GenotypesContext genotypes, + public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final SVCallRecord record, final Set samples, - final List alleles, - final Map attributes) { - Utils.nonNull(genotypes); + final boolean refAlleleDefault, + final PloidyTable ploidyTable) { + Utils.nonNull(record); Utils.nonNull(samples); + final GenotypesContext genotypes = record.getGenotypes(); final Set missingSamples = Sets.difference(samples, genotypes.getSampleNames()); if (missingSamples.isEmpty()) { return genotypes; } final ArrayList newGenotypes = new ArrayList<>(genotypes.size() + missingSamples.size()); newGenotypes.addAll(genotypes); + final String contig = record.getContigA(); + final List altAlleles = record.getAltAlleles(); + final Allele refAllele = record.getRefAllele(); + final boolean isCNV = record.isSimpleCNV(); for (final String sample : missingSamples) { final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sample); - if (attributes != null) { - genotypeBuilder.attributes(attributes); + final int ploidy = ploidyTable.get(sample, contig); + genotypeBuilder.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy); + if (isCNV) { + genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, ploidy); + genotypeBuilder.alleles(CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(altAlleles, refAllele, ploidy, ploidy)); + } else { + genotypeBuilder.alleles(Collections.nCopies(ploidy, refAlleleDefault ? refAllele : Allele.NO_CALL)); } - genotypeBuilder.alleles(alleles); newGenotypes.add(genotypeBuilder.make()); } return GenotypesContext.create(newGenotypes); @@ -393,10 +393,18 @@ public static boolean containsAltAllele(final Genotype g) { return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele); } + public static boolean isAltGenotype(final Genotype g) { + return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele); + } + public static boolean isAltAllele(final Allele allele) { return allele != null && !allele.isNoCall() && !allele.isReference(); } + public static boolean isNonRefAllele(final Allele allele) { + return allele != null && !allele.isReference(); + } + // TODO this is sort of hacky but the Allele compareTo() method doesn't give stable ordering public static List sortAlleles(final Collection alleles) { return alleles.stream().sorted(Comparator.nullsFirst(Comparator.comparing(Allele::getDisplayString))).collect(Collectors.toList()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVLinkage.java index 2059dafbc62..978878e3e0d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVLinkage.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CNVLinkage.java @@ -59,8 +59,8 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) { // In the single-sample case, match copy number strictly if we're looking at the same sample // TODO repeated check for CN attributes in hasSampleOverlap and getCarrierSamples - final Set carriersA = a.getCarrierSamples(); - final Set carriersB = b.getCarrierSamples(); + final Set carriersA = a.getCarrierSampleSet(); + final Set carriersB = b.getCarrierSampleSet(); if (carriersA.size() == 1 && carriersA.equals(carriersB)) { final Genotype genotypeA = a.getGenotypes().get(carriersA.iterator().next()); final Genotype genotypeB = b.getGenotypes().get(carriersB.iterator().next()); @@ -90,6 +90,9 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) { @Override public int getMaxClusterableStartingPosition(final SVCallRecord call) { final int contigLength = dictionary.getSequence(call.getContigA()).getSequenceLength(); + if (!call.isSimpleCNV()) { + return 0; + } final int maxTheoreticalStart = (int) Math.floor((call.getPositionB() + paddingFraction * (call.getLength() + contigLength)) / (1.0 + paddingFraction)); return Math.min(maxTheoreticalStart, dictionary.getSequence(call.getContigA()).getSequenceLength()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java index 32faa93bf78..16cde33b2a2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapser.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.tools.sv.cluster; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Sets; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.variant.variantcontext.*; import org.apache.commons.lang3.tuple.Pair; @@ -78,6 +79,16 @@ public enum InsertionLengthSummaryStrategy { UNDEFINED } + // TODO Add support for complex (CPX) + private static final Set SUPPORTED_SV_TYPES = Sets.newHashSet( + StructuralVariantType.DEL, + StructuralVariantType.DUP, + StructuralVariantType.CNV, + StructuralVariantType.INS, + StructuralVariantType.INV, + StructuralVariantType.BND + ); + private final AltAlleleSummaryStrategy altAlleleSummaryStrategy; private final BreakpointSummaryStrategy breakpointSummaryStrategy; private final InsertionLengthSummaryStrategy insertionLengthSummaryStrategy; @@ -97,6 +108,7 @@ public CanonicalSVCollapser(final ReferenceSequenceFile reference, @Override public SVCallRecord collapse(final Collection items) { + validateRecords(items); final String id = collapseIds(items); final List algorithms = collapseAlgorithms(items); final StructuralVariantType type = collapseTypes(items); @@ -109,20 +121,56 @@ public SVCallRecord collapse(final Collection items) { final Pair coordinates = collapseInterval(mostPreciseCalls); final int start = coordinates.getKey(); final int end = coordinates.getValue(); - final int length = collapseLength(mostPreciseCalls, start, end, type); + final Integer length = collapseLength(mostPreciseCalls, type); final Allele refAllele = collapseRefAlleles(exampleCall.getContigA(), start); + final List altAlleles = collapseAltAlleles(items); - final int numAlleles = (refAllele == null ? 0 : 1) + altAlleles.size(); + final int numAlleles = 1 + altAlleles.size(); final List alleles = new ArrayList<>(numAlleles); - if (refAllele != null) { - alleles.add(refAllele); - } + alleles.add(refAllele); alleles.addAll(altAlleles); - final List genotypes = collapseAllGenotypes(items, refAllele); - return new SVCallRecord(id, exampleCall.getContigA(), start, exampleCall.getStrandA(), - exampleCall.getContigB(), end, exampleCall.getStrandB(), type, length, algorithms, alleles, genotypes, attributes); + final List genotypes = collapseAllGenotypes(items, refAllele, altAlleles); + final List harmonizedGenotypes = harmonizeAltAlleles(altAlleles, genotypes); + + return new SVCallRecord(id, exampleCall.getContigA(), start, exampleCall.getStrandA(), exampleCall.getContigB(), + end, exampleCall.getStrandB(), type, length, algorithms, alleles, harmonizedGenotypes, attributes); + } + + /** + * Asserts that the given records are valid for collapsing. + */ + protected void validateRecords(final Collection records) { + for (final SVCallRecord r : records) { + Utils.validateArg(SUPPORTED_SV_TYPES.contains(r.getType()), + "Unsupported SV type: " + r.getType()); + } + } + + protected List harmonizeAltAlleles(final List sortedAltAlleles, final List collapsedGenotypes) { + Utils.nonNull(sortedAltAlleles); + Utils.nonNull(collapsedGenotypes); + final Set genotypeAltAlleles = collapsedGenotypes.stream() + .map(Genotype::getAlleles) + .flatMap(List::stream) + .filter(SVCallRecordUtils::isAltAllele) + .collect(Collectors.toSet()); + // Alt alleles match already, or some alts vanished in genotype collapsing (and we keep them) + if (sortedAltAlleles.containsAll(genotypeAltAlleles)) { + return collapsedGenotypes; + } + // One or more subtypes were collapsed - need to replace genotype alt alleles + Utils.validate(sortedAltAlleles.size() == 1, "Multi-allelic variants with subtyped alleles are " + + "not supported."); + final Allele newAlt = sortedAltAlleles.get(0); + return collapsedGenotypes.stream() + .map(g -> new GenotypeBuilder(g).alleles(replaceAltAlleles(g.getAlleles(), newAlt)).make()) + .collect(Collectors.toList()); + } + + private List replaceAltAlleles(final List alleles, final Allele replacement) { + return alleles.stream().map(a -> SVCallRecordUtils.isAltAllele(a) ? replacement : a).collect(Collectors.toList()); } /** @@ -164,6 +212,7 @@ protected List collapseAltAlleles(final Collection items) final List altAlleles = items.stream().map(SVCallRecord::getAltAlleles) .flatMap(List::stream) .distinct() + .sorted() .collect(Collectors.toList()); if (altAlleles.isEmpty()) { return Collections.emptyList(); @@ -239,74 +288,87 @@ private String[] collapseAltAllelesMostSpecific(final List alleles) { } private List collapseAllGenotypes(final Collection items, - final Allele refAllele) { + final Allele refAllele, + final List altAlleles) { return items.stream() .map(SVCallRecord::getGenotypes) .flatMap(GenotypesContext::stream) .collect(Collectors.groupingBy(Genotype::getSampleName)) .values() .stream() - .map(g -> collapseSampleGenotypes(g, refAllele)) + .map(g -> collapseSampleGenotypes(g, refAllele, altAlleles)) .collect(Collectors.toList()); } + @VisibleForTesting + protected List collapseSampleGenotypeAlleles(final Collection genotypes, + final int expectedCopyNumber, + final Integer copyNumber, + final List altAlleles, + final Allele refAllele) { + final List alleles; + if (altAlleles.contains(Allele.SV_SIMPLE_DEL) || altAlleles.contains(Allele.SV_SIMPLE_DUP)) { + // For CNVs, collapse genotype using copy number + alleles = getCNVGenotypeAllelesFromCopyNumber(altAlleles, refAllele, expectedCopyNumber, copyNumber); + } else { + alleles = collapseNonCNVGenotypeAlleles(genotypes, expectedCopyNumber, refAllele); + } + return alleles; + } + /** - * Collapses a set of genotypes from a single sample into a list of alleles for a representative genotype. Ploidy - * is determined with the {@link #collapseExpectedCopyNumber(Collection)} method. The result is the most common non-ref genotype - * alleles list, potentially augmented with additional ref alleles to match the ploidy. + * Collapses a set of genotypes from a single sample into a list of alleles for a representative genotype. The + * result is the most common non-ref genotype alleles list, potentially augmented with additional ref alleles + * to match the ploidy. * @param genotypes genotypes from a single variant and single sample * @param refAllele ref allele for the genotype - * @param sampleAltAlleles unique alt alleles * @return list of alleles for a representative genotype (may be empty) */ - @VisibleForTesting - protected List collapseSampleGenotypeAlleles(final Collection genotypes, - final int expectedCopyNumber, - final Allele refAllele, - final List sampleAltAlleles) { + private List collapseNonCNVGenotypeAlleles(final Collection genotypes, + final int expectedCopyNumber, + final Allele refAllele) { Utils.nonNull(genotypes); - Utils.nonEmpty(genotypes); - // No alt alleles, so return all ref - if (sampleAltAlleles.isEmpty()) { - return Collections.nCopies(expectedCopyNumber, refAllele); + // Trivial case + if (genotypes.size() == 1) { + return genotypes.iterator().next().getAlleles(); } - // Ploidy 0 - if (expectedCopyNumber == 0) { + // Empty case + if (genotypes.isEmpty()) { return Collections.emptyList(); } - // Multi-allelic CNVs - genotypes are stored in copy number attribute - if (sampleAltAlleles.size() == 2 && (sampleAltAlleles.contains(Allele.SV_SIMPLE_DEL) - || sampleAltAlleles.contains(Allele.SV_SIMPLE_DUP))) { - return Collections.nCopies(expectedCopyNumber, Allele.NO_CALL); - } - - // Cannot resolve alleles for DUPs when ploidy is over 1 - if (expectedCopyNumber > 1 && sampleAltAlleles.size() == 1 - && sampleAltAlleles.get(0).equals(Allele.SV_SIMPLE_DUP)) { - return Collections.nCopies(expectedCopyNumber, Allele.NO_CALL); + // Ploidy 0 + // TODO : we may not want to always throw away alleles here. For example, in the case of segmental + // duplications on chromosome Y. + if (expectedCopyNumber == 0) { + return Collections.emptyList(); } // Determine frequency of each genotype final Map, Integer> genotypeCounts = genotypes.stream().map(Genotype::getAlleles) .collect(Collectors.groupingBy(l -> l, Collectors.collectingAndThen(Collectors.toList(), List::size))); // Sort keys somehow, for stability - final List> sortedAlleleLists = genotypeCounts.keySet().stream().sorted(ALLELE_COMPARATOR).collect(Collectors.toList()); + final List> sortedAlleleLists = genotypeCounts.keySet().stream() + .sorted(ALLELE_COMPARATOR).collect(Collectors.toList()); // Find most common non-ref genotype // Ties go to the first result in the list, as determined by the comparator // TODO : implement an allele comparator that can take into account GQ if available List bestGenotypeAltAlleles = null; int bestGenotypeFrequency = 0; for (final List alleles : sortedAlleleLists) { - final List altAlleles = alleles.stream().filter(SVCallRecordUtils::isAltAllele).collect(Collectors.toList()); + final List genotypeAltAlleles = alleles.stream().filter(SVCallRecordUtils::isAltAllele).collect(Collectors.toList()); final int genotypeFrequency = genotypeCounts.get(alleles); - if (!altAlleles.isEmpty() && (bestGenotypeAltAlleles == null || genotypeFrequency > bestGenotypeFrequency)) { - bestGenotypeAltAlleles = altAlleles; + if (!genotypeAltAlleles.isEmpty() && (bestGenotypeAltAlleles == null || genotypeFrequency > bestGenotypeFrequency)) { + bestGenotypeAltAlleles = genotypeAltAlleles; bestGenotypeFrequency = genotypeFrequency; } } + // No non-ref genotypes + if (bestGenotypeAltAlleles == null) { + return sortedAlleleLists.get(0); + } // Create alleles list with the selected alt alleles, and fill in remaining with ref final List alleles = new ArrayList<>(expectedCopyNumber); @@ -331,11 +393,11 @@ private Integer collapseSampleCopyNumber(final Collection genotypes, return nonRefCopyNumberCounts.keySet().iterator().next(); } - // If we have conflicting sv types, let them cancel out to ref + // If we have conflicting sv types, let them cancel out to no-call // TODO expose option to control this behavior final int numLoss = (int) nonRefCopyNumberCounts.keySet().stream().filter(c -> c < expectedCopyNumber).count(); if (numLoss != 0 && numLoss != nonRefCopyNumberCounts.size()) { - return expectedCopyNumber; + return null; } // Sort such that ties go to state closest to ref @@ -374,22 +436,204 @@ private static int compareCopyNumberForCollapsing(final Integer first, final Int * Collapses collection of genotypes belonging to a single sample. */ protected Genotype collapseSampleGenotypes(final Collection genotypes, - final Allele refAllele) { + final Allele refAllele, + final List altAlleles) { final GenotypeBuilder builder = new GenotypeBuilder(genotypes.iterator().next()); - final List sampleAltAlleles = genotypes.stream().map(Genotype::getAlleles) - .flatMap(List::stream) - .filter(SVCallRecordUtils::isAltAllele) - .distinct() - .collect(Collectors.toList()); - final int expectedCopyNumber = collapseExpectedCopyNumber(genotypes); - builder.noAttributes(); - final List collapsedAlleles = collapseSampleGenotypeAlleles(genotypes, expectedCopyNumber, refAllele, sampleAltAlleles); - builder.alleles(collapsedAlleles); + // Reset attributes and collapse extended attributes + builder.noAttributes(); + final int expectedCopyNumber = collapseExpectedCopyNumber(genotypes); builder.attributes(collapseGenotypeAttributes(genotypes, expectedCopyNumber)); + + final Integer copyNumber; + if (altAlleles.contains(Allele.SV_SIMPLE_DEL) || altAlleles.contains(Allele.SV_SIMPLE_DUP)) { + // For CNVs, collapse genotype using copy number + copyNumber = collapseSampleCopyNumber(genotypes, expectedCopyNumber); + builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber); + } else { + copyNumber = null; + } + final List collapsedAlleles = collapseSampleGenotypeAlleles(genotypes, expectedCopyNumber, copyNumber, altAlleles, refAllele); + // Replace ref alleles with the possibly-new one + final List collapsedAllelesNewRef = collapsedAlleles.stream() + .map(a -> (a != null && a.isReference()) ? refAllele : a) + .collect(Collectors.toList()); + builder.alleles(collapsedAllelesNewRef); return builder.make(); } + /** + * Generates genotype alleles, i.e. for the GT field, for CNVs (DEL and/or DUP). Genotypes that cannot be determined + * unambiguously (e.g. at diploid DUP sites) result in {@link Allele#NO_CALL} alleles. + * @param siteAltAlleles unique alt alleles for the sample at the site + * @param refAllele reference allele + * @param expectedCopyNumber expected copy number (i.e. ploidy) + * @param copyNumber copy number state + * @return alleles for the sample at the site + * @throws {@link IllegalArgumentException} if the alt allele(s) are not CNV(s) + */ + public static List getCNVGenotypeAllelesFromCopyNumber(final List siteAltAlleles, + final Allele refAllele, + final int expectedCopyNumber, + final Integer copyNumber) { + Utils.nonNull(siteAltAlleles); + Utils.nonNull(refAllele); + Utils.validateArg(siteAltAlleles.size() <= 2, "No support for variants with over 2 alleles"); + if (copyNumber == null) { + // Had conflicting genotypes + return Collections.nCopies(expectedCopyNumber, Allele.NO_CALL); + } + Utils.validateArg(copyNumber >= 0, "Invalid negative copy number: " + copyNumber); + Utils.validateArg(expectedCopyNumber >= 0, "Invalid expected copy number: " + expectedCopyNumber); + if (siteAltAlleles.isEmpty()) { + return Collections.nCopies(expectedCopyNumber, refAllele); + } else if (siteAltAlleles.size() == 2) { + if (siteAltAlleles.contains(Allele.SV_SIMPLE_DUP) && siteAltAlleles.contains(Allele.SV_SIMPLE_DEL)) { + return getMultiallelicCNVAllelesFromCopyNumber(refAllele, expectedCopyNumber, copyNumber); + } else { + final String messageAlleles = String.join(", ", + siteAltAlleles.stream().map(Allele::getDisplayString).collect(Collectors.toList())); + throw new IllegalArgumentException("Unsupported CNV alt alleles: " + messageAlleles); + } + } + final Allele altAllele = siteAltAlleles.get(0); + if (altAllele.equals(Allele.SV_SIMPLE_DEL)) { + return getDeletionAllelesFromCopyNumber(refAllele, expectedCopyNumber, copyNumber); + } else if (altAllele.equals(Allele.SV_SIMPLE_DUP)) { + return getDuplicationAllelesFromCopyNumber(refAllele, expectedCopyNumber, copyNumber); + } else { + throw new IllegalArgumentException("Unsupported CNV alt allele: " + altAllele.getDisplayString()); + } + } + + /** + * Generates genotype alleles for deletion genotypes from the given copy number. + * @param refAllele reference allele for the site + * @param expectedCopyNumber expected copy number for the genotype + * @param copyNumber copy number for the genotype + * @return genotype alleles + */ + public static List getDeletionAllelesFromCopyNumber(final Allele refAllele, final int expectedCopyNumber, + final int copyNumber) { + Utils.nonNull(refAllele); + Utils.validateArg(copyNumber <= expectedCopyNumber, "Invalid deletion copy number " + + copyNumber + " when ploidy is " + expectedCopyNumber); + Utils.validateArg(expectedCopyNumber >= 0, "Invalid expected copy number: " + expectedCopyNumber); + if (expectedCopyNumber == 0) { + return Collections.emptyList(); + } else if (expectedCopyNumber == copyNumber) { + // Most common in practice - use faster method + return Collections.nCopies(expectedCopyNumber, refAllele); + } else { + final List alleles = new ArrayList<>(expectedCopyNumber); + for (int i = 0; i < copyNumber; i++) { + alleles.add(refAllele); + } + for (int i = copyNumber; i < expectedCopyNumber; i++) { + alleles.add(Allele.SV_SIMPLE_DEL); + } + return alleles; + } + } + + /** + * Generates genotype alleles for duplication genotypes from the given copy number. Genotypes that cannot be + * determined unambiguously (e.g. diploid sites) result in {@link Allele#NO_CALL} alleles. + * @param refAllele reference allele for the site + * @param expectedCopyNumber expected copy number for the genotype + * @param copyNumber copy number for the genotype + * @return genotype alleles + */ + public static List getDuplicationAllelesFromCopyNumber(final Allele refAllele, final int expectedCopyNumber, + final int copyNumber) { + Utils.nonNull(refAllele); + Utils.validateArg(copyNumber >= expectedCopyNumber, "Invalid duplication copy number " + + copyNumber + " when ploidy is " + expectedCopyNumber); + Utils.validateArg(expectedCopyNumber >= 0, "Invalid expected copy number: " + expectedCopyNumber); + if (expectedCopyNumber == copyNumber) { + // Most common in practice - use faster method + return Collections.nCopies(expectedCopyNumber, refAllele); + } else if (expectedCopyNumber == 0) { + return Collections.emptyList(); + } else if (expectedCopyNumber == 1) { + // At this point know it's a DUP, and since it's haploid the phasing is unambiguous + // TODO : May not be a simple DUP, need a more general Allele for possible multi-copy DUP + return Collections.singletonList(Allele.SV_SIMPLE_DUP); + } else if (copyNumber == expectedCopyNumber + 1) { + // Case where we can resolve alleles + return makeBiallelicList(Allele.SV_SIMPLE_DUP, refAllele, 1, expectedCopyNumber); + } else { + // DUP alleles cannot be resolved in other cases + return Collections.nCopies(expectedCopyNumber, Allele.NO_CALL); + } + } + + /** + * Generates genotype alleles for multi-allelic CNV genotypes from the given copy number. Genotypes that cannot be + * determined unambiguously (e.g. diploid sites) result in {@link Allele#NO_CALL} alleles. + * @param refAllele reference allele for the site + * @param expectedCopyNumber expected copy number for the genotype + * @param copyNumber copy number for the genotype + * @return genotype alleles + */ + public static List getMultiallelicCNVAllelesFromCopyNumber(final Allele refAllele, + final int expectedCopyNumber, + final int copyNumber) { + Utils.nonNull(refAllele); + Utils.validateArg(copyNumber >= 0, "Invalid multi-allelic CNV copy number: " + copyNumber); + Utils.validateArg(expectedCopyNumber >= 0, "Invalid expected copy number: " + expectedCopyNumber); + if (expectedCopyNumber == 0) { + return Collections.emptyList(); + } else if (expectedCopyNumber == 1) { + if (copyNumber == 0) { + return Collections.singletonList(Allele.SV_SIMPLE_DEL); + } else if (copyNumber == 1) { + return Collections.singletonList(refAllele); + } else { + // copyNumber >= 2 - haploid dups have non-ambiguous phasing + return Collections.singletonList(Allele.SV_SIMPLE_DUP); + } + } else if (copyNumber == 0) { + return Collections.nCopies(expectedCopyNumber, Allele.SV_SIMPLE_DEL); + } else if (copyNumber == 1) { + return makeBiallelicList(Allele.SV_SIMPLE_DEL, refAllele, expectedCopyNumber - 1, expectedCopyNumber); + } else { + // Alleles cannot be resolved in other cases + return Collections.nCopies(expectedCopyNumber, Allele.NO_CALL); + } + } + + /** + * Creates list for biallelic sites with the specified number of each allele. + * @param alt alt allele + * @param ref ref allele + * @param numAlt number of alt alleles + * @param ploidy number of alleles (total) + * @return resulting list + */ + public static List makeBiallelicList(final Allele alt, final Allele ref, final int numAlt, final int ploidy) { + Utils.nonNull(alt); + Utils.nonNull(ref); + Utils.validateArg(numAlt >= 0, "Negative number of alt alleles"); + Utils.validateArg(ploidy >= 0, "Negative number of ref alleles"); + if (ploidy == 0) { + return Collections.emptyList(); + } else if (numAlt == 0) { + return Collections.nCopies(ploidy, ref); + } else if (numAlt == ploidy) { + return Collections.nCopies(ploidy, alt); + } + final int numRef = ploidy - numAlt; + final List alleles = new ArrayList<>(numAlt + numRef); + for (int i = 0; i < numRef; i++) { + alleles.add(ref); + } + for (int i = 0; i < numAlt; i++) { + alleles.add(alt); + } + return alleles; + } + protected Map collapseGenotypeAttributes(final Collection genotypes, final int expectedCopyNumber) { Utils.nonNull(genotypes); @@ -401,8 +645,11 @@ protected Map collapseGenotypeAttributes(final Collection> entry : genotypeFields.entrySet()) { if (entry.getKey().equals(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { - collapsedAttributes.put(GATKSVVCFConstants.COPY_NUMBER_FORMAT, collapseSampleCopyNumber(genotypes, expectedCopyNumber)); + // Copy number attribute set later using special logic in collapseSampleCopyNumber + // Unit testing is easier with this design + continue; } else if (entry.getKey().equals(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT)) { + // Set after loop continue; } else { collapsedAttributes.put(entry.getKey(), collapseSampleGenotypeAttribute(entry.getKey(), entry.getValue())); @@ -446,22 +693,16 @@ protected Object collapseSingleVariantAttribute(final Set values) { /*** * Calculates new SVLEN value. * @param items records to collapse - * @param newStart collapsed start position - * @param newEnd collapsed end position * @param newType collapsed sv type * @return */ - protected final int collapseLength(final Collection items, final int newStart, final int newEnd, - final StructuralVariantType newType) { + protected final Integer collapseLength(final Collection items, final StructuralVariantType newType) { Utils.nonNull(items); Utils.nonEmpty(items); - final SVCallRecord exampleCall = items.iterator().next(); - if (!exampleCall.isIntrachromosomal()) { - return -1; - } else if (newType.equals(StructuralVariantType.INS)) { + if (newType.equals(StructuralVariantType.INS)) { return collapseInsertionLength(items); } else { - return newEnd - newStart + 1; + return null; } } @@ -539,39 +780,37 @@ protected Pair collapseInterval(final Collection final int[] startPositions = items.stream().mapToInt(SVCallRecord::getPositionA).sorted().toArray(); final int[] endPositions = items.stream().mapToInt(SVCallRecord::getPositionB).sorted().toArray(); //use the mid value of the sorted list so the start and end represent real breakpoint observations - final int medianStart = MathUtils.median(startPositions, Percentile.EstimationType.R_3); - final int medianEnd = MathUtils.median(endPositions, Percentile.EstimationType.R_3); + final int medianStart = MathUtils.median(startPositions, Percentile.EstimationType.R_1); + final int medianEnd = MathUtils.median(endPositions, Percentile.EstimationType.R_1); final int newStart; final int newEnd; + switch (breakpointSummaryStrategy) { + case MEDIAN_START_MEDIAN_END: + newStart = medianStart; + newEnd = medianEnd; + break; + case MIN_START_MAX_END: + newStart = startPositions[0]; + newEnd = endPositions[endPositions.length - 1]; + break; + case MAX_START_MIN_END: + newStart = startPositions[startPositions.length - 1]; + newEnd = endPositions[0]; + break; + case MEAN_START_MEAN_END: + newStart = (int)Math.round(MathUtils.sum(startPositions) / (double) startPositions.length); + newEnd = (int)Math.round(MathUtils.sum(endPositions) / (double) startPositions.length); + break; + default: + throw new UnsupportedOperationException("Unknown breakpoint summary strategy: " + breakpointSummaryStrategy.name()); + } if (exampleCall.getType().equals(StructuralVariantType.INS)) { - // Insertions should be a single locus; also fixes case where end-supporting split reads are to the - // left of start-supporting split reads - final int mean = (medianStart + medianEnd) / 2; - newStart = mean; - newEnd = mean; + // Insertions are a single locus + return Pair.of(newStart, newStart); } else { - switch (breakpointSummaryStrategy) { - case MEDIAN_START_MEDIAN_END: - newStart = medianStart; - newEnd = medianEnd; - break; - case MIN_START_MAX_END: - newStart = startPositions[0]; - newEnd = endPositions[endPositions.length - 1]; - break; - case MAX_START_MIN_END: - newStart = startPositions[startPositions.length - 1]; - newEnd = endPositions[0]; - break; - case MEAN_START_MEAN_END: - newStart = (int)Math.round(MathUtils.sum(startPositions) / (double) startPositions.length); - newEnd = (int)Math.round(MathUtils.sum(endPositions) / (double) startPositions.length); - break; - default: - throw new UnsupportedOperationException("Unknown breakpoint summary strategy: " + breakpointSummaryStrategy.name()); - } + // Do not let end precede start + return Pair.of(newStart, Math.max(newEnd, newStart)); } - return Pair.of(newStart, newEnd); } protected StructuralVariantType collapseTypes(final Collection records) { @@ -603,6 +842,22 @@ public int compare(final Collection o1, final Collection o2) { } Utils.nonNull(o1); Utils.nonNull(o2); + // Sort on number of alts + final long numAlt1 = (int) o1.stream().filter(SVCallRecordUtils::isAltAllele).count(); + final long numAlt2 = (int) o2.stream().filter(SVCallRecordUtils::isAltAllele).count(); + if (numAlt1 < numAlt2) { + return -1; + } else if (numAlt1 > numAlt2) { + return 1; + } + // Sort on number of calls + final long numCalled1 = (int) o1.stream().filter(a -> a != null && !a.isNoCall()).count(); + final long numCalled2 = (int) o2.stream().filter(a -> a != null && !a.isNoCall()).count(); + if (numCalled1 > numCalled2) { + return -1; + } else if (numCalled1 < numCalled2) { + return 1; + } final List sorted1 = SVCallRecordUtils.sortAlleles(o1); final List sorted2 = SVCallRecordUtils.sortAlleles(o2); // Resort to display strings diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java index e2e4416225c..2d2e658693e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVLinkage.java @@ -177,18 +177,20 @@ private static int getMaxClusterableStartingPositionWithParams(final SVCallRecor final SAMSequenceDictionary dictionary) { final String contig = call.getContigA(); final int contigLength = dictionary.getSequence(contig).getSequenceLength(); - // Reciprocal overlap window - final int maxPositionByOverlap; - if (call.isIntrachromosomal()) { - final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call)); - maxPositionByOverlap = Math.min(maxPosition, contigLength); - } else { - maxPositionByOverlap = call.getPositionA(); - } // Breakend proximity window final int maxPositionByWindow = Math.min(call.getPositionA() + params.getWindow(), contigLength); + // Don't use overlap for inter-chromosomal events + if (!call.isIntrachromosomal()) { + return maxPositionByWindow; + } + + // Reciprocal overlap window + final int maxPositionByOverlap; + final int maxPosition = (int) (call.getPositionA() + (1.0 - params.getReciprocalOverlap()) * getLengthForOverlap(call)); + maxPositionByOverlap = Math.min(maxPosition, contigLength); + if (params.requiresOverlapAndProximity()) { return Math.min(maxPositionByOverlap, maxPositionByWindow); } else { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTable.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTable.java new file mode 100644 index 00000000000..bf514583b39 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTable.java @@ -0,0 +1,68 @@ +package org.broadinstitute.hellbender.tools.sv.cluster; + +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.tsv.TableReader; +import org.broadinstitute.hellbender.utils.tsv.TableUtils; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +public class PloidyTable { + + private final Map samplePloidyMap; + + public PloidyTable(final Path path) { + try { + final TableReader reader = TableUtils.reader(path, (columns, exceptionFactory) -> + (dataLine) -> { + final String sample = dataLine.get(0); + final Map dataMap = new HashMap<>(); + for (int i = 1; i < columns.columnCount(); i++) { + final String contig = columns.names().get(i); + final Integer ploidy = dataLine.getInt(i); + dataMap.put(contig, ploidy); + } + return new PloidyRecord(sample, dataMap); + } + ); + samplePloidyMap = reader.stream().collect(Collectors.toMap(PloidyRecord::getSample, r -> r)); + reader.close(); + } catch (final IOException e) { + throw new GATKException("IO error while reading ploidy table", e); + } + } + + public PloidyTable(final Map> samplePloidyMap) { + this.samplePloidyMap = samplePloidyMap.entrySet().stream() + .collect(Collectors.toMap(p -> p.getKey(), p -> new PloidyRecord(p.getKey(), p.getValue()))); + } + + public Integer get(final String sample, final String contig) { + Utils.validateArg(samplePloidyMap.containsKey(sample), "Sample " + sample + " not found in ploidy records"); + return samplePloidyMap.get(sample).getPloidy(contig); + } + + private static final class PloidyRecord { + private final String sample; + private final Map ploidyMap; + + public PloidyRecord(final String sample, final Map ploidyMap) { + this.sample = sample; + this.ploidyMap = ploidyMap; + } + + public String getSample() { + return sample; + } + + public Integer getPloidy(final String contig) { + Utils.validateArg(ploidyMap.containsKey(contig), "No ploidy entry for sample " + sample + + " at contig " + contig); + return ploidyMap.get(contig); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java index 9a38606efaa..1e580a62070 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngine.java @@ -42,6 +42,8 @@ public enum CLUSTERING_TYPE { private int nextItemId; private int nextClusterId; private int lastStart; + private Integer minActiveStartingPosition; + private Integer minActiveStartingPositionItemId; /** * @param clusteringType algorithm choice @@ -60,6 +62,9 @@ public SVClusterEngine(final CLUSTERING_TYPE clusteringType, nextItemId = 0; nextClusterId = 0; lastStart = 0; + minActiveStartingPosition = null; + minActiveStartingPositionItemId = null; + } @@ -67,8 +72,15 @@ public SVClusterEngine(final CLUSTERING_TYPE clusteringType, * Flushes all active clusters, adding them to the output buffer. Results from the output buffer are then copied out * and the buffer is cleared. This should be called between contigs to save memory. */ - public final List getOutput() { + public final List forceFlushAndGetOutput() { flushClusters(); + return getOutput(); + } + + /** + * Gets any available finalized clusters. + */ + public final List getOutput() { final List output = new ArrayList<>(outputBuffer); outputBuffer.clear(); return output; @@ -84,6 +96,10 @@ public SVClusterLinkage getLinkage() { return linkage; } + public Integer getMinActiveStartingPosition() { + return minActiveStartingPosition; + } + /** * Returns true if there are any active or finalized clusters. */ @@ -114,6 +130,10 @@ private final int registerItem(final T item) { lastStart = item.getPositionA(); final int itemId = nextItemId++; idToItemMap.put(itemId, item); + if (minActiveStartingPosition == null || item.getPositionA() < minActiveStartingPosition) { + minActiveStartingPosition = item.getPositionA(); + minActiveStartingPositionItemId = itemId; + } return itemId; } @@ -242,7 +262,43 @@ private final void combineClusters(final Collection clusterIds, final I private final void processCluster(final int clusterIndex) { final Cluster cluster = getCluster(clusterIndex); idToClusterMap.remove(clusterIndex); - outputBuffer.add(collapser.collapse(cluster.getItemIds().stream().map(idToItemMap::get).collect(Collectors.toList()))); + final List clusterItemIds = cluster.getItemIds(); + outputBuffer.add(collapser.collapse(clusterItemIds.stream().map(idToItemMap::get).collect(Collectors.toList()))); + // Clean up item id map + if (clusterItemIds.size() == 1) { + // Singletons won't be present in any other clusters + idToItemMap.remove(clusterItemIds.get(0)); + } else { + // Need to check that items aren't present in any other clusters + final Set activeItemIds = idToClusterMap.values().stream() + .map(Cluster::getItemIds) + .flatMap(List::stream) + .collect(Collectors.toSet()); + final List itemsToRemove = idToItemMap.keySet().stream().filter(i -> !activeItemIds.contains(i)) + .collect(Collectors.toList()); + for (final Integer i : itemsToRemove) { + idToItemMap.remove(i); + } + } + // Update min active start position + if (clusterItemIds.contains(minActiveStartingPositionItemId)) { + findAndSetMinActiveStart(); + } + } + + /** + * Scans active items for the current min active starting position. + */ + private final void findAndSetMinActiveStart() { + minActiveStartingPosition = null; + minActiveStartingPositionItemId = null; + for (final Integer itemId : idToItemMap.keySet()) { + final T item = idToItemMap.get(itemId); + if (minActiveStartingPosition == null || item.getPositionA() < minActiveStartingPosition) { + minActiveStartingPosition = item.getPositionA(); + minActiveStartingPositionItemId = itemId; + } + } } /** @@ -264,6 +320,8 @@ private final void flushClusters() { processCluster(clusterId); } idToItemMap.clear(); + minActiveStartingPosition = null; + minActiveStartingPositionItemId = null; nextItemId = 0; nextClusterId = 0; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineArgumentsCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineArgumentsCollection.java new file mode 100644 index 00000000000..4f23460d2d2 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineArgumentsCollection.java @@ -0,0 +1,107 @@ +package org.broadinstitute.hellbender.tools.sv.cluster; + +import org.broadinstitute.barclay.argparser.Argument; + +import java.io.Serializable; + +/** + * Arguments for use with {@link SVClusterEngine}. + */ +public class SVClusterEngineArgumentsCollection implements Serializable { + private static final long serialVersionUID = 1L; + + protected static final String BASE_INTERVAL_OVERLAP_FRACTION_NAME = "-interval-overlap"; + protected static final String BASE_BREAKEND_WINDOW_NAME = "-breakend-window"; + protected static final String BASE_SAMPLE_OVERLAP_FRACTION_NAME = "-sample-overlap"; + + protected static final String BASE_INTERVAL_OVERLAP_FRACTION_DOC = " interval reciprocal overlap fraction"; + protected static final String BASE_BREAKEND_WINDOW_DOC = " window size for breakend proximity"; + protected static final String BASE_SAMPLE_OVERLAP_FRACTION_DOC = " shared sample overlap fraction"; + + public static final String DEPTH_INTERVAL_OVERLAP_FRACTION_NAME = "depth" + BASE_INTERVAL_OVERLAP_FRACTION_NAME; + public static final String MIXED_INTERVAL_OVERLAP_FRACTION_NAME = "mixed" + BASE_INTERVAL_OVERLAP_FRACTION_NAME; + public static final String PESR_INTERVAL_OVERLAP_FRACTION_NAME = "pesr" + BASE_INTERVAL_OVERLAP_FRACTION_NAME; + + public static final String DEPTH_BREAKEND_WINDOW_NAME = "depth" + BASE_BREAKEND_WINDOW_NAME; + public static final String MIXED_BREAKEND_WINDOW_NAME = "mixed" + BASE_BREAKEND_WINDOW_NAME; + public static final String PESR_BREAKEND_WINDOW_NAME = "pesr" + BASE_BREAKEND_WINDOW_NAME; + + public static final String DEPTH_SAMPLE_OVERLAP_FRACTION_NAME = "depth" + BASE_SAMPLE_OVERLAP_FRACTION_NAME; + public static final String MIXED_SAMPLE_OVERLAP_FRACTION_NAME = "mixed" + BASE_SAMPLE_OVERLAP_FRACTION_NAME; + public static final String PESR_SAMPLE_OVERLAP_FRACTION_NAME = "pesr" + BASE_SAMPLE_OVERLAP_FRACTION_NAME; + + /** + * Minimum interval reciprocal overlap fraction to cluster depth-only/depth-only variant pairs. + */ + @Argument(fullName = DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, + doc="Depth/Depth" + BASE_INTERVAL_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double depthOverlapFraction = CanonicalSVLinkage.DEFAULT_RECIPROCAL_OVERLAP_DEPTH_ONLY; + + /** + * Minimum interval reciprocal overlap fraction to cluster depth-only/PESR variant pairs. + */ + @Argument(fullName = MIXED_INTERVAL_OVERLAP_FRACTION_NAME, + doc="PESR/Depth" + BASE_INTERVAL_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double mixedOverlapFraction = CanonicalSVLinkage.DEFAULT_RECIPROCAL_OVERLAP_MIXED; + + /** + * Minimum interval reciprocal overlap fraction to cluster PESR/PESR variant pairs. + */ + @Argument(fullName = PESR_INTERVAL_OVERLAP_FRACTION_NAME, + doc="PESR/PESR" + BASE_INTERVAL_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double pesrOverlapFraction = CanonicalSVLinkage.DEFAULT_RECIPROCAL_OVERLAP_PESR; + + /** + * Maximum allowed distance between endpoints (in bp) to cluster depth-only/depth-only variant pairs. + */ + @Argument(fullName = DEPTH_BREAKEND_WINDOW_NAME, + doc="Depth/Depth" + BASE_BREAKEND_WINDOW_DOC, optional=true, minValue = 0) + public int depthBreakendWindow = CanonicalSVLinkage.DEFAULT_WINDOW_DEPTH_ONLY; + + /** + * Maximum allowed distance between endpoints (in bp) to cluster depth-only/PESR variant pairs. + */ + @Argument(fullName = MIXED_BREAKEND_WINDOW_NAME, + doc="Depth/PESR" + BASE_BREAKEND_WINDOW_DOC, optional=true, minValue = 0) + public int mixedBreakendWindow = CanonicalSVLinkage.DEFAULT_WINDOW_MIXED; + + /** + * Maximum allowed distance between endpoints (in bp) to cluster PESR/PESR variant pairs. + */ + @Argument(fullName = PESR_BREAKEND_WINDOW_NAME, + doc="PESR/PESR" + BASE_BREAKEND_WINDOW_DOC, optional=true, minValue = 0) + public int pesrBreakendWindow = CanonicalSVLinkage.DEFAULT_WINDOW_PESR; + + /** + * Minimum carrier sample reciprocal overlap fraction to cluster depth-only/depth-only variant pairs. + */ + @Argument(fullName = DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, + doc="Depth/Depth" + BASE_SAMPLE_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double depthSampleOverlapFraction = CanonicalSVLinkage.DEFAULT_SAMPLE_OVERLAP_DEPTH_ONLY; + + /** + * Minimum carrier sample reciprocal overlap fraction to cluster depth-only/PESR variant pairs. + */ + @Argument(fullName = MIXED_SAMPLE_OVERLAP_FRACTION_NAME, + doc="Depth/PESR" + BASE_SAMPLE_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double mixedSampleOverlapFraction = CanonicalSVLinkage.DEFAULT_SAMPLE_OVERLAP_MIXED; + + /** + * Minimum carrier sample reciprocal overlap fraction to cluster PESR/PESR variant pairs. + */ + @Argument(fullName = PESR_SAMPLE_OVERLAP_FRACTION_NAME, + doc="PESR/PESR" + BASE_SAMPLE_OVERLAP_FRACTION_DOC, optional=true, minValue = 0, maxValue = 1) + public double pesrSampleOverlapFraction = CanonicalSVLinkage.DEFAULT_SAMPLE_OVERLAP_PESR; + + public final ClusteringParameters getDepthParameters() { + return ClusteringParameters.createDepthParameters(depthOverlapFraction, depthBreakendWindow, depthSampleOverlapFraction); + } + + public final ClusteringParameters getMixedParameters() { + return ClusteringParameters.createMixedParameters(mixedOverlapFraction, mixedBreakendWindow, mixedSampleOverlapFraction); + } + + public final ClusteringParameters getPESRParameters() { + return ClusteringParameters.createPesrParameters(pesrOverlapFraction, pesrBreakendWindow, pesrSampleOverlapFraction); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java index b7f9021fa04..cc43a04872a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterLinkage.java @@ -63,8 +63,8 @@ protected static int getSampleSetOverlap(final Collection a, final Set 0) { - final Set samplesA = a.getCarrierSamples(); - final Set samplesB = b.getCarrierSamples(); + final Set samplesA = a.getCarrierSampleSet(); + final Set samplesB = b.getCarrierSampleSet(); return hasSampleSetOverlap(samplesA, samplesB, minSampleOverlap); } else { return true; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java index 9519cf93585..b0221ad65c5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentation.java @@ -293,11 +293,11 @@ public Object onTraversalSuccess() { private void processClusters() { if (!defragmenter.isEmpty()) { - final List defragmentedCalls = defragmenter.getOutput(); + final List defragmentedCalls = defragmenter.forceFlushAndGetOutput(); defragmentedCalls.stream().forEachOrdered(clusterEngine::add); } //Jack and Isaac cluster first and then defragment - final List clusteredCalls = clusterEngine.getOutput(); + final List clusteredCalls = clusterEngine.forceFlushAndGetOutput(); write(clusteredCalls); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java new file mode 100644 index 00000000000..602d296528c --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster.java @@ -0,0 +1,501 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; +import org.broadinstitute.hellbender.tools.sv.cluster.*; +import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; + +import java.util.*; +import java.util.stream.Collectors; + +import static org.broadinstitute.hellbender.tools.walkers.sv.JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME; + +/** + *

Clusters structural variants based on coordinates, event type, and supporting algorithms. Primary use cases include:

+ *
    + *
  • + * Clustering SVs produced by multiple callers, based on interval overlap, breakpoint proximity, and sample overlap. + *
  • + *
  • + * Merging multiple SV VCFs with disjoint sets of samples and/or variants. + *
  • + *
  • + * Defragmentation of copy number variants produced with depth-based callers. + *
  • + *
+ * + *

Clustering tasks can be accomplished using one of two algorithms. The SINGLE_LINKAGE algorithm produces clusters + * for which all members cluster with at least one other member. The MAX_CLIQUE algorithm, however, + * requires that all members cluster with every other member. The latter is in general non-polynomial in time and + * space but implemented to minimize computations by traversing variants ordered by start position and efficiently + * finalizing "active" clusters that are determined to be complete.

+ * + *

The tool determines whether two given variants should cluster based following criteria:

+ *
    + *
  • + * Matching SV type. DEL and DUP are considered matching SV types if --enable-cnv is used and merged into + * a multi-allelic CNV type. + *
  • + *
  • + * Matching breakend strands (BND and INV only) + *
  • + *
  • + * Interval reciprocal overlap (inapplicable for BNDs). + *
  • + *
  • + * Distance between corresponding event breakends (breakend window). + *
  • + *
  • + * Sample reciprocal overlap, based on carrier status determined by available genotypes (GT fields). If + * no GT fields are called for a given variant, the tool attempts to find carriers based on copy number + * (CN field) and sample ploidy (as determined by the ECN FORMAT field). + *
  • + *
+ * + *

For CNV defragmentation (DEFRAGMENT_CNV algorithm), the tool uses single-linkage clustering based + * on the following linkage criteria:

+ *
    + *
  • + * Must be a DEL/DUP/CNV and only be supported by a depth algorithm. + *
  • + *
  • + * Matching SV type + *
  • + *
  • + * Overlapping after padding both sides of each variant by the fraction of event length specified by + * --defrag-padding-fraction. + *
  • + *
  • + * Sample overlap fraction (described above) specified by --defrag-sample-overlap. + *
  • + *
+ * + *

Interval overlap, breakend window, and sample overlap parameters are defined for three combinations of event types + * using the ALGORITHMS field, which describes the type of evidence that was used to call the variant:

+ *
    + *
  • + * Depth-only - both variants have solely "depth" ALGORITHMS + *
  • + *
  • + * PESR (paired-end/split-read) - both variants have at least one non-depth entry in ALGORITHMS + *
  • + *
  • + * Mixed - one variant is depth-only and the other is PESR + *
  • + *
+ * + *

Users must supply one or more VCFs containing SVs with the following info fields:

+ * + *
    + *
  • + * SVTYPE - event type (DEL, DUP, CNV, INS, INV, BND) + *
  • + *
  • + * SVLEN - variant length for INS only, if known + *
  • + *
  • + * STRANDS - breakend strands ("++", "+-", "-+", or "--") (BND and INV only) + *
  • + *
  • + * CHROM2 / END2 - mate coordinates (BND only) + *
  • + *
  • + * ALGORITHMS - list of supporting tools/algorithms. These names may be arbitrary, except for depth-based + * callers, which should be listed as "depth". + *
  • + *
+ * + *

In addition, the following FORMAT fields must be defined:

+ * + *
    + *
  • + * GT - genotype alleles + *
  • + *
  • + * ECN - expected copy number (i.e. ploidy) + *
  • + *
  • + * CN - genotype copy number (DEL/DUP/CNV only) + *
  • + *
+ * + *

Note that for CNVs (DEL, DUP, multi-allelic CNV), GT alleles are set according to the CN/ECN fields. In + * some cases, (e.g. diploid DUPs with CN 4), allele phasing cannot be determined unambiguously and GT is set with + * no-call alleles.

+ * + *

The tool generates a new VCF with clusters collapsed into single representative records. By default, a MEMBERS + * field is generated that lists the input variant IDs contained in that record's cluster.

+ * + *

Inputs

+ * + *
    + *
  • + * One or more SV VCFs + *
  • + *
+ * + *

Output

+ * + *
    + *
  • + * Clustered VCF + *
  • + *
+ * + *

Usage example

+ * + *
+ *     gatk SVCluster \
+ *       -V variants.vcf.gz \
+ *       -O clustered.vcf.gz \
+ *       --algorithm SINGLE_LINKAGE
+ * 
+ * + * @author Mark Walker <markw@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Clusters structural variants", + oneLineSummary = "Clusters structural variants", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public final class SVCluster extends MultiVariantWalker { + public static final String PLOIDY_TABLE_LONG_NAME = "ploidy-table"; + public static final String VARIANT_PREFIX_LONG_NAME = "variant-prefix"; + public static final String ENABLE_CNV_LONG_NAME = "enable-cnv"; + public static final String DEFRAG_PADDING_FRACTION_LONG_NAME = "defrag-padding-fraction"; + public static final String DEFRAG_SAMPLE_OVERLAP_LONG_NAME = "defrag-sample-overlap"; + public static final String CONVERT_INV_LONG_NAME = "convert-inv-to-bnd"; + public static final String ALGORITHM_LONG_NAME = "algorithm"; + public static final String FAST_MODE_LONG_NAME = "fast-mode"; + public static final String OMIT_MEMBERS_LONG_NAME = "omit-members"; + public static final String INSERTION_LENGTH_SUMMARY_STRATEGY_LONG_NAME = "insertion-length-summary-strategy"; + public static final String DEFAULT_NO_CALL_LONG_NAME = "default-no-call"; + + /** + * The enum Cluster algorithm. + */ + enum CLUSTER_ALGORITHM { + /** + * Defragment cnv cluster algorithm. + */ + DEFRAGMENT_CNV, + /** + * Single linkage cluster algorithm. + */ + SINGLE_LINKAGE, + /** + * Max clique cluster algorithm. + */ + MAX_CLIQUE + } + + @Argument( + doc = "Output VCF", + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME + ) + private GATKPath outputFile; + + /** + * Expected format is tab-delimited and contains a header with the first column SAMPLE and remaining columns + * contig names. Each row corresponds to a sample, with the sample ID in the first column and contig ploidy + * integers in their respective columns. + */ + @Argument( + doc = "Sample ploidy table (.tsv)", + fullName = PLOIDY_TABLE_LONG_NAME + ) + private GATKPath ploidyTablePath; + + @Argument( + doc = "If supplied, generate variant IDs with this prefix", + fullName = VARIANT_PREFIX_LONG_NAME, + optional = true + ) + private String variantPrefix = null; + + /** + * When enabled, DEL and DUP variants will be clustered together. The resulting records with have an SVTYPE of CNV. + */ + @Argument( + doc = "Enable clustering DEL/DUP variants together as CNVs (does not apply to CNV defragmentation)", + fullName = ENABLE_CNV_LONG_NAME, + optional = true + ) + private boolean enableCnv = false; + + /** + * When enabled, INV records will be converted to a pairs of BNDs prior to clustering. + */ + @Argument( + doc = "Convert inversions to BND records", + fullName = CONVERT_INV_LONG_NAME, + optional = true + ) + private boolean convertInversions = false; + + /** + * Results in substantial space and time costs for large sample sets by clearing genotypes that are not needed for + * clustering, but any associated annotation fields will be set to null in the output. + */ + @Argument( + doc = "Fast mode. Drops hom-ref and no-call genotype fields and emits them as no-calls.", + fullName = FAST_MODE_LONG_NAME, + optional = true + ) + private boolean fastMode = false; + + @Argument( + doc = "Omit cluster member ID annotations", + fullName = OMIT_MEMBERS_LONG_NAME, + optional = true + ) + private boolean omitMembers = false; + + @Argument(fullName = BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, + doc = "Strategy to use for choosing a representative value for a breakpoint cluster.", + optional = true) + private CanonicalSVCollapser.BreakpointSummaryStrategy breakpointSummaryStrategy = + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END; + + @Argument(fullName = JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, + doc = "Strategy to use for choosing a representative alt allele for non-CNV biallelic sites with " + + "different subtypes.", + optional = true) + private CanonicalSVCollapser.AltAlleleSummaryStrategy altAlleleSummaryStrategy = + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE; + + @Argument(fullName = INSERTION_LENGTH_SUMMARY_STRATEGY_LONG_NAME, + doc = "Strategy to use for choosing a representative value for insertion length when unknown.", + optional = true) + private CanonicalSVCollapser.InsertionLengthSummaryStrategy insertionLengthSummaryStrategy = + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN; + + @Argument(fullName = DEFRAG_PADDING_FRACTION_LONG_NAME, + doc = "Padding as a fraction of variant length for CNV defragmentation mode.", + optional = true + ) + private double defragPaddingFraction = CNVLinkage.DEFAULT_PADDING_FRACTION; + + @Argument(fullName = DEFRAG_SAMPLE_OVERLAP_LONG_NAME, + doc = "Minimum sample overlap fraction. Use instead of --depth-sample-overlap in CNV defragmentation mode.", + optional = true + ) + private double defragSampleOverlapFraction = CNVLinkage.DEFAULT_SAMPLE_OVERLAP; + + @Argument(fullName = ALGORITHM_LONG_NAME, + doc = "Clustering algorithm", + optional = true + ) + private CLUSTER_ALGORITHM algorithm = CLUSTER_ALGORITHM.SINGLE_LINKAGE; + + /** + * Default genotypes are assigned when they cannot be inferred from the inputs, such as when VCFs with different + * variants and samples are provided. + */ + @Argument(fullName = DEFAULT_NO_CALL_LONG_NAME, + doc = "Default to no-call GT (e.g. ./.) instead of reference alleles (e.g. 0/0) when a genotype is not" + + " available", + optional = true + ) + private boolean defaultNoCall = false; + + @ArgumentCollection + private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection(); + + private SAMSequenceDictionary dictionary; + private ReferenceSequenceFile reference; + private PloidyTable ploidyTable; + private Comparator recordComparator; + private OutputSortingBuffer outputBuffer; + private VariantContextWriter writer; + private SVClusterEngine clusterEngine; + private Set samples; + private String currentContig; + private int numVariantsBuilt = 0; + + @Override + public boolean requiresReference() { + return true; + } + + @Override + public void onTraversalStart() { + reference = ReferenceUtils.createReferenceReader(referenceArguments.getReferenceSpecifier()); + dictionary = reference.getSequenceDictionary(); + if (dictionary == null) { + throw new UserException("Reference sequence dictionary required"); + } + ploidyTable = new PloidyTable(ploidyTablePath.toPath()); + recordComparator = SVCallRecordUtils.getCallComparator(dictionary); + samples = getSamplesForVariants(); + + if (algorithm == CLUSTER_ALGORITHM.DEFRAGMENT_CNV) { + clusterEngine = SVClusterEngineFactory.createCNVDefragmenter(dictionary, altAlleleSummaryStrategy, + reference, defragPaddingFraction, defragSampleOverlapFraction); + } else if (algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE || algorithm == CLUSTER_ALGORITHM.MAX_CLIQUE) { + final SVClusterEngine.CLUSTERING_TYPE type = algorithm == CLUSTER_ALGORITHM.SINGLE_LINKAGE ? + SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE : SVClusterEngine.CLUSTERING_TYPE.MAX_CLIQUE; + clusterEngine = SVClusterEngineFactory.createCanonical(type, breakpointSummaryStrategy, + altAlleleSummaryStrategy, insertionLengthSummaryStrategy, dictionary, reference, enableCnv, + clusterParameterArgs.getDepthParameters(), clusterParameterArgs.getMixedParameters(), + clusterParameterArgs.getPESRParameters()); + } else { + throw new IllegalArgumentException("Unsupported algorithm: " + algorithm.name()); + } + + outputBuffer = new OutputSortingBuffer(clusterEngine); + writer = createVCFWriter(outputFile); + writer.writeHeader(createHeader()); + currentContig = null; + } + + @Override + public Object onTraversalSuccess() { + write(true); + return super.onTraversalSuccess(); + } + + @Override + public void closeTool() { + super.closeTool(); + if (writer != null) { + writer.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext) { + final SVCallRecord call = SVCallRecordUtils.create(variant); + final SVCallRecord filteredCall; + if (fastMode) { + // Strip out non-carrier genotypes to save memory and compute + final GenotypesContext filteredGenotypes = GenotypesContext.copy(call.getCarrierGenotypeList()); + filteredCall = SVCallRecordUtils.copyCallWithNewGenotypes(call, filteredGenotypes); + } else { + filteredCall = call; + } + + // Update current contig + if (!filteredCall.getContigA().equals(currentContig)) { + currentContig = filteredCall.getContigA(); + logger.info("Processing contig " + currentContig + "..."); + } + + // Add to clustering buffer + if (convertInversions) { + SVCallRecordUtils.convertInversionsToBreakends(filteredCall).forEachOrdered(clusterEngine::add); + } else { + clusterEngine.add(filteredCall); + } + + write(false); + } + + private void write(final boolean force) { + final List records = force ? outputBuffer.forceFlush() : outputBuffer.flush(); + records.stream().map(this::buildVariantContext).forEachOrdered(writer::add); + } + + private VCFHeader createHeader() { + final VCFHeader header = new VCFHeader(getDefaultToolVCFHeaderLines(), samples); + header.setVCFHeaderVersion(VCFHeaderVersion.VCF4_2); + header.setSequenceDictionary(dictionary); + + // Copy from inputs + getHeaderForVariants().getFormatHeaderLines().forEach(header::addMetaDataLine); + getHeaderForVariants().getInfoHeaderLines().forEach(header::addMetaDataLine); + + // Required info lines + header.addMetaDataLine(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVLEN)); + header.addMetaDataLine(GATKSVVCFHeaderLines.getInfoLine(GATKSVVCFConstants.SVTYPE)); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.END2_ATTRIBUTE, 1, + VCFHeaderLineType.Integer, "Second position")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, 1, + VCFHeaderLineType.String, "Second contig")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.STRANDS_ATTRIBUTE, 1, + VCFHeaderLineType.String, "First and second strands")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, + VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Source algorithms")); + if (!omitMembers) { + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, + VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Cluster variant ids")); + } + + // Required format lines + header.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY)); + + return header; + } + + public VariantContext buildVariantContext(final SVCallRecord call) { + // Add genotypes for missing samples + final GenotypesContext filledGenotypes = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles( + call, samples, !defaultNoCall, ploidyTable); + + // Assign new variant ID + final String newId = variantPrefix == null ? call.getId() : String.format("%s%08x", variantPrefix, numVariantsBuilt++); + + // Build new variant + final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(), + call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getLength(), + call.getAlgorithms(), call.getAlleles(), filledGenotypes, call.getAttributes()); + final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall); + if (omitMembers) { + builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY); + } + return builder.make(); + } + + private final class OutputSortingBuffer { + private final TreeSet buffer; + private final SVClusterEngine engine; + + public OutputSortingBuffer(final SVClusterEngine engine) { + this.buffer = new TreeSet<>(SVCallRecordUtils.getCallComparator(dictionary)); + this.engine = engine; + } + + public List flush() { + buffer.addAll(engine.getOutput()); + final Integer minActiveStart = engine.getMinActiveStartingPosition(); + final int minPos = minActiveStart == null ? Integer.MAX_VALUE : minActiveStart; + final List result = buffer.stream() + .filter(record -> !record.getContigA().equals(currentContig) || record.getPositionA() < minPos) + .sorted(recordComparator) + .collect(Collectors.toList()); + buffer.removeAll(result); + return result; + } + + public List forceFlush() { + buffer.addAll(engine.forceFlushAndGetOutput()); + final List result = buffer.stream().sorted(recordComparator).collect(Collectors.toList()); + buffer.clear(); + return result; + } + } + +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java index f2d53234b98..4685bc938c6 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVCallRecordUtilsUnitTest.java @@ -6,6 +6,7 @@ import htsjdk.variant.vcf.VCFConstants; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.cluster.PloidyTable; import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -63,7 +64,6 @@ public Object[][] testGetVariantBuilderData() { .genotypes(GENOTYPE_DEL_1, GENOTYPE_DEL_2) .attribute(VCFConstants.END_KEY, 1999) .attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST) - .attribute(GATKSVVCFConstants.SVLEN, 1000) .attribute(GATKSVVCFConstants.SVTYPE, StructuralVariantType.DEL) .make(), Collections.emptyList() @@ -80,7 +80,6 @@ public Object[][] testGetVariantBuilderData() { .genotypes(GENOTYPE_DEL_3) .attribute(VCFConstants.END_KEY, 1999) .attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST) - .attribute(GATKSVVCFConstants.SVLEN, 1000) .attribute(GATKSVVCFConstants.SVTYPE, StructuralVariantType.DEL) .make(), Collections.emptyList() @@ -97,7 +96,6 @@ public Object[][] testGetVariantBuilderData() { .genotypes(GENOTYPE_INS_1) .attribute(VCFConstants.END_KEY, 1000) .attribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, SVTestUtils.PESR_ONLY_ALGORITHM_LIST) - .attribute(GATKSVVCFConstants.SVLEN, 500) .attribute(GATKSVVCFConstants.SVTYPE, StructuralVariantType.INS) .make(), Collections.emptyList() @@ -155,14 +153,31 @@ public void testFillMissingSamplesWithGenotypes() { final Set samplesNoMissing = Sets.newHashSet(GENOTYPE_DEL_1.getSampleName(), GENOTYPE_DEL_2.getSampleName()); final Set samples = Sets.newHashSet(GENOTYPE_DEL_1.getSampleName(), GENOTYPE_DEL_2.getSampleName(), "sample3", "sample4"); - final List alleles = Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL); - final Map attributes = new HashMap<>(); - attributes.put(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)); - final GenotypesContext resultNoMissing = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(genotypesContext, samplesNoMissing, alleles, attributes); - Assert.assertEquals(resultNoMissing, genotypesContext); + final Map sample1PloidyMap = Collections.singletonMap("chr20", 2); + final Map sample2PloidyMap = Collections.singletonMap("chr20", 2); + final Map sample3PloidyMap = Collections.singletonMap("chr20", 2); + final Map sample4PloidyMap = Collections.singletonMap("chr20", 3); + final Map> rawPloidyMap = new HashMap<>(); + rawPloidyMap.put("sample1", sample1PloidyMap); + rawPloidyMap.put("sample2", sample2PloidyMap); + rawPloidyMap.put("sample3", sample3PloidyMap); + rawPloidyMap.put("sample4", sample4PloidyMap); + final PloidyTable ploidyTable = new PloidyTable(rawPloidyMap); - final GenotypesContext result = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(genotypesContext, samples, alleles, attributes); + final GenotypeBuilder builder1a = new GenotypeBuilder("sample1"); + final SVCallRecord record1 = SVTestUtils.makeRecord("record1", "chr20", 1000, true, + "chr20", 2000, false, StructuralVariantType.DEL, null, + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + Lists.newArrayList(new GenotypeBuilder(GENOTYPE_DEL_1), new GenotypeBuilder(GENOTYPE_DEL_2))); + final GenotypesContext resultNoMissing = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles( + record1, samplesNoMissing, true, ploidyTable); + Assert.assertEquals(resultNoMissing.size(), 2); + for (final Genotype g : genotypesContext) { + VariantContextTestUtils.assertGenotypesAreEqual(g, genotypesContext.get(g.getSampleName())); + } + + final GenotypesContext result = SVCallRecordUtils.populateGenotypesForMissingSamplesWithAlleles(record1, samples, true, ploidyTable); Assert.assertEquals(result.size(), 4); final Genotype g1 = result.get(GENOTYPE_DEL_1.getSampleName()); @@ -170,8 +185,16 @@ public void testFillMissingSamplesWithGenotypes() { final Genotype g3 = result.get("sample3"); final Genotype g4 = result.get("sample4"); - final Genotype g3Expected = new GenotypeBuilder("sample3", alleles).attributes(attributes).make(); - final Genotype g4Expected = new GenotypeBuilder("sample4", alleles).attributes(attributes).make(); + final List alleles3 = Collections.nCopies(2, record1.getRefAllele()); + final Genotype g3Expected = new GenotypeBuilder("sample3", alleles3) + .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 2) + .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2) + .make(); + final List alleles4 = Collections.nCopies(3, record1.getRefAllele()); + final Genotype g4Expected = new GenotypeBuilder("sample4", alleles4) + .attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 3) + .attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 3) + .make(); VariantContextTestUtils.assertGenotypesAreEqual(g1, GENOTYPE_DEL_1); VariantContextTestUtils.assertGenotypesAreEqual(g2, GENOTYPE_DEL_2); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java index b294026c47d..51a92dcc342 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/SVTestUtils.java @@ -355,11 +355,19 @@ public static List buildBiallelicListWithPloidy(final Allele altAllele, } public static SVCallRecord newCallRecordWithAlleles(final List genotypeAlleles, final List variantAlleles, - final StructuralVariantType svtype) { + final StructuralVariantType svtype, final Integer expectedCopyNumber, + final Integer copyNumber) { + GenotypeBuilder builder = new GenotypeBuilder("sample").alleles(genotypeAlleles); + if (expectedCopyNumber != null) { + builder = builder.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, expectedCopyNumber); + } + if (copyNumber != null) { + builder = builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, copyNumber); + } return new SVCallRecord("", "chr1", 100, getValidTestStrandA(svtype), "chr1", 199, getValidTestStrandB(svtype), svtype, 100, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), variantAlleles, - Collections.singletonList(new GenotypeBuilder().alleles(genotypeAlleles).make()), + Collections.singletonList(builder.make()), Collections.emptyMap()); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java index 96c2928f2e1..1791c4d9068 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/BinnedCNVDefragmenterTest.java @@ -38,7 +38,7 @@ public void testCollapser() { Assert.assertEquals(testGenotype1.getExtendedAttributes(), expectedGenotype1.getExtendedAttributes()); final Genotype testGenotype2 = sameBoundsThreeSamples.getGenotypes().get(SVTestUtils.sample2.make().getSampleName()); - final Genotype expectedGenotype2 = SVTestUtils.sample2.alleles(Collections.nCopies(2, Allele.NO_CALL)).make(); + final Genotype expectedGenotype2 = SVTestUtils.sample2.alleles(Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL)).make(); Assert.assertEquals(testGenotype2.getAlleles(), expectedGenotype2.getAlleles()); Assert.assertEquals(testGenotype2.getExtendedAttributes(), expectedGenotype2.getExtendedAttributes()); @@ -98,7 +98,7 @@ public void testAdd() { temp1.add(SVTestUtils.call1); //force new cluster by adding a non-overlapping event temp1.add(SVTestUtils.call3); - final List output1 = temp1.getOutput(); //flushes all clusters + final List output1 = temp1.forceFlushAndGetOutput(); //flushes all clusters Assert.assertEquals(output1.size(), 2); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0)); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call3, output1.get(1)); @@ -108,7 +108,7 @@ public void testAdd() { temp2.add(SVTestUtils.call2); //should overlap after padding //force new cluster by adding a call on another contig temp2.add(SVTestUtils.call4_chr10); - final List output2 = temp2.getOutput(); + final List output2 = temp2.forceFlushAndGetOutput(); Assert.assertEquals(output2.size(), 2); Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); Assert.assertEquals(output2.get(0).getPositionB(), SVTestUtils.call2.getPositionB()); @@ -118,7 +118,7 @@ public void testAdd() { final SVClusterEngine temp3 = SVClusterEngineFactory.createCNVDefragmenter(SVTestUtils.hg38Dict, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, SVTestUtils.hg38Reference, CNVLinkage.DEFAULT_PADDING_FRACTION, CNVLinkage.DEFAULT_SAMPLE_OVERLAP); temp3.add(SVTestUtils.call1); temp3.add(SVTestUtils.sameBoundsSampleMismatch); - final List output3 = temp3.getOutput(); + final List output3 = temp3.forceFlushAndGetOutput(); Assert.assertEquals(output3.size(), 2); } } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java new file mode 100644 index 00000000000..23ff598afe1 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/CanonicalSVCollapserUnitTest.java @@ -0,0 +1,1807 @@ +package org.broadinstitute.hellbender.tools.sv.cluster; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.StructuralVariantType; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; +import org.broadinstitute.hellbender.tools.sv.SVTestUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class CanonicalSVCollapserUnitTest { + + private static final CanonicalSVCollapser collapser = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); + private static final CanonicalSVCollapser collapserMinMax = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); + private static final CanonicalSVCollapser collapserMaxMin = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); + private static final CanonicalSVCollapser collapserMean = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); + private static final CanonicalSVCollapser collapserSpecificAltAllele = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.MOST_SPECIFIC_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); + + private static final CanonicalSVCollapser collapserInsertionMean = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEAN); + private static final CanonicalSVCollapser collapserInsertionMin = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MIN); + private static final CanonicalSVCollapser collapserInsertionMax = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MAX); + private static final CanonicalSVCollapser collapserInsertionUndefined = new CanonicalSVCollapser( + SVTestUtils.hg38Reference, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.UNDEFINED); + + private static final CanonicalSVCollapser.AlleleCollectionCollapserComparator alleleComparator = new CanonicalSVCollapser.AlleleCollectionCollapserComparator(); + + private static final Allele MEI_INSERTION_ALLELE = Allele.create(""); + private static final Allele SVA_INSERTION_ALLELE = Allele.create(""); + private static final Allele LINE_INSERTION_ALLELE = Allele.create(""); + + @DataProvider(name = "expectedCopyNumberTestData") + public Object[][] expectedCopyNumberTestData() { + return new Object[][]{ + // Result should be same value + {new int[]{0}, 0}, + {new int[]{1, 1, 1}, 1}, + {new int[]{2, 2, 2}, 2}, + {new int[]{3, 3, 3, 3}, 3} + }; + } + + @Test(dataProvider= "expectedCopyNumberTestData") + public void testCollapseExpectedCopyNumber(final int[] input, final int result) { + final Collection genotypes = IntStream.of(input).mapToObj(p -> SVTestUtils.buildHomGenotypeWithPloidy(Allele.REF_N, p)).collect(Collectors.toList()); + Assert.assertEquals(collapser.collapseExpectedCopyNumber(genotypes), result); + } + + @DataProvider(name = "collapseRefAllelesTestData") + public Object[][] collapseRefAllelesTestData() { + return new Object[][]{ + {1, Allele.REF_N}, + {100000, Allele.REF_C}, + {100001, Allele.REF_A}, + {100002, Allele.REF_C}, + {100003, Allele.REF_T}, + {248956422, Allele.REF_N}, + }; + } + + @Test(dataProvider= "collapseRefAllelesTestData") + public void testCollapseRefAlleles(final int pos, final Allele result) { + Assert.assertEquals(collapser.collapseRefAlleles("chr1", pos), result); + } + + @DataProvider(name = "harmonizeAltAllelesTestData") + public Object[][] harmonizeAltAllelesTestData() { + return new Object[][]{ + // Basic cases + {Collections.singletonList(Allele.SV_SIMPLE_DEL), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DEL)).make()), + null}, + {Collections.singletonList(Allele.SV_SIMPLE_DEL), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.REF_A)).make()), + null}, + {Collections.emptyList(), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.REF_A)).make()), + null}, + // Multiallelic, without subtypes + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DEL)).make()), + null}, + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.REF_A)).make()), + null}, + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + Lists.newArrayList( + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DEL)).make(), + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DUP)).make() + ), + null}, + // Biallelic, with subtypes + {Collections.singletonList(Allele.SV_SIMPLE_DUP), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make()), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DUP)).make())}, + {Collections.singletonList(Allele.create("")), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DUP)).make()), + Collections.singletonList(new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make())}, + {Collections.singletonList(Allele.create("")), + Lists.newArrayList( + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DUP)).make(), + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make() + ), + Lists.newArrayList( + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make(), + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make() + )}, + {Lists.newArrayList(Allele.create("")), + Lists.newArrayList( + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_INS)).make(), + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make() + ), + Lists.newArrayList( + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make(), + new GenotypeBuilder().alleles(Lists.newArrayList(Allele.REF_A, Allele.create(""))).make() + )}, + }; + } + + @Test(dataProvider= "harmonizeAltAllelesTestData") + public void testHarmonizeAltAlleles(final List altAlleles, final List genotypes, + final List expectedOrNull) { + final List result = collapser.harmonizeAltAlleles(altAlleles, genotypes); + // If null input, just expect the original genotypes + final List expected = expectedOrNull == null ? genotypes : expectedOrNull; + Assert.assertEquals(result.size(), expected.size()); + for (int i = 0; i < expected.size(); i++) { + VariantContextTestUtils.assertGenotypesAreEqual(result.get(i), expected.get(i)); + } + } + + @DataProvider(name = "collapseAltAllelesTestData") + public Object[][] collapseAltAllelesTestData() { + return new Object[][]{ + { + Collections.singletonList(Collections.emptyList()), + StructuralVariantType.DEL, + new Integer[]{0}, + new Integer[]{0}, + Collections.emptyList(), + Collections.emptyList() + }, + { + Collections.singletonList(Collections.singletonList(Allele.REF_N)), + StructuralVariantType.DEL, + new Integer[]{1}, + new Integer[]{1}, + Collections.emptyList(), + Collections.emptyList() + }, + { + Collections.singletonList(Collections.singletonList(Allele.REF_A)), + StructuralVariantType.DEL, + new Integer[]{1}, + new Integer[]{1}, + Collections.emptyList(), + Collections.emptyList() + }, + { + Collections.singletonList(Collections.singletonList(Allele.NO_CALL)), + StructuralVariantType.DEL, + new Integer[]{1}, + new Integer[]{1}, + Collections.emptyList(), + Collections.emptyList() + }, + { + Collections.singletonList(Collections.singletonList(Allele.ALT_A)), + StructuralVariantType.INS, + new Integer[]{1}, + new Integer[]{null}, + Collections.singletonList(Allele.ALT_A), + Collections.singletonList(Allele.ALT_A) + }, + { + Collections.singletonList(Collections.singletonList(Allele.SV_SIMPLE_DEL)), + StructuralVariantType.DEL, + new Integer[]{1}, + new Integer[]{0}, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + }, + { + Lists.newArrayList(Collections.singletonList(Allele.REF_N), Collections.singletonList(Allele.SV_SIMPLE_DEL)), + StructuralVariantType.DEL, + new Integer[]{1, 1}, + new Integer[]{1, 0}, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + }, + { + Lists.newArrayList(Lists.newArrayList(Allele.REF_N, Allele.REF_N), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)), + StructuralVariantType.DEL, + new Integer[]{2, 2}, + new Integer[]{2, 1}, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + }, + { + Lists.newArrayList(Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)), + StructuralVariantType.CNV, + new Integer[]{2, 2}, + new Integer[]{3, 1}, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP)}, + { + Lists.newArrayList(Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(MEI_INSERTION_ALLELE)), + StructuralVariantType.INS, + new Integer[]{1, 1}, + new Integer[]{null, null}, + Collections.singletonList(MEI_INSERTION_ALLELE), + Collections.singletonList(MEI_INSERTION_ALLELE)}, + { + Lists.newArrayList(Collections.singletonList(Allele.SV_SIMPLE_INS), Collections.singletonList(MEI_INSERTION_ALLELE)), + StructuralVariantType.INS, + new Integer[]{1, 1}, + new Integer[]{null, null}, + Collections.singletonList(Allele.SV_SIMPLE_INS), + Collections.singletonList(MEI_INSERTION_ALLELE)}, + { + Lists.newArrayList(Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(SVA_INSERTION_ALLELE)), + StructuralVariantType.INS, + new Integer[]{1, 1}, + new Integer[]{null, null}, + Collections.singletonList(MEI_INSERTION_ALLELE), + Collections.singletonList(SVA_INSERTION_ALLELE)}, + { + Lists.newArrayList(Collections.singletonList(LINE_INSERTION_ALLELE), Collections.singletonList(SVA_INSERTION_ALLELE)), + StructuralVariantType.INS, + new Integer[]{1, 1}, + new Integer[]{null, null}, + Collections.singletonList(MEI_INSERTION_ALLELE), + Collections.singletonList(MEI_INSERTION_ALLELE)}, + }; + } + + @Test(dataProvider= "collapseAltAllelesTestData") + public void collapseAltAllelesTest(final List> recordGenotypeAlleles, + final StructuralVariantType svtype, + final Integer[] expectedCopyNumber, + final Integer[] copyNumber, + final List resultCommon, + final List resultSpecific) { + final List variantAlleles = recordGenotypeAlleles.stream().flatMap(List::stream).distinct().collect(Collectors.toList()); + final List records = IntStream.range(0, recordGenotypeAlleles.size()) + .mapToObj(i -> SVTestUtils.newCallRecordWithAlleles(recordGenotypeAlleles.get(i), variantAlleles, svtype, expectedCopyNumber[i], copyNumber[i])) + .collect(Collectors.toList()); + + final List sortedTestCommon = SVCallRecordUtils.sortAlleles(collapser.collapseAltAlleles(records)); + final List sortedExpectedCommon = SVCallRecordUtils.sortAlleles(resultCommon); + Assert.assertEquals(sortedTestCommon, sortedExpectedCommon); + + final List sortedTestSpecific = SVCallRecordUtils.sortAlleles(collapserSpecificAltAllele.collapseAltAlleles(records)); + final List sortedExpectedSpecific = SVCallRecordUtils.sortAlleles(resultSpecific); + Assert.assertEquals(sortedTestSpecific, sortedExpectedSpecific); + } + + private static final String TEST_KEY_1 = "TEST_KEY_1"; + private static final String TEST_KEY_2 = "TEST_KEY_2"; + + @DataProvider(name = "collapseAttributesTestData") + public Object[][] collapseAttributesTestData() { + return new Object[][]{ + // Null value + { + Collections.singletonList("var1"), + Collections.singletonList(new String[]{TEST_KEY_1}), + Collections.singletonList(new Object[]{null}), + 2, + new String[]{TEST_KEY_1}, + new Object[]{null} + }, + // Single key / value + { + Collections.singletonList("var1"), + Collections.singletonList(new String[]{TEST_KEY_1}), + Collections.singletonList(new Object[]{30}), + 2, + new String[]{TEST_KEY_1}, + new Object[]{30} + }, + // Two samples, null values + { + Lists.newArrayList("var1", "var2"), + Lists.newArrayList( + new String[]{TEST_KEY_1}, + new String[]{TEST_KEY_1} + ), + Lists.newArrayList( + new Object[]{null}, + new Object[]{null}), + 2, + new String[]{TEST_KEY_1}, + new Object[]{null} + }, + // Two samples, same key/value + { + Lists.newArrayList("var1", "var2"), + Lists.newArrayList( + new String[]{TEST_KEY_1}, + new String[]{TEST_KEY_1} + ), + Lists.newArrayList( + new Object[]{30}, + new Object[]{30}), + 2, + new String[]{TEST_KEY_1}, + new Object[]{30} + }, + // Two samples, same key / different value + { + Lists.newArrayList("var1", "var2"), + Lists.newArrayList( + new String[]{TEST_KEY_1}, + new String[]{TEST_KEY_1} + ), + Lists.newArrayList( + new Object[]{30}, + new Object[]{45}), + 2, + new String[]{TEST_KEY_1}, + new Object[]{null} + }, + // Two samples, one with an extra key + { + Lists.newArrayList("var1", "var2"), + Lists.newArrayList( + new String[]{TEST_KEY_1, TEST_KEY_2}, + new String[]{TEST_KEY_1} + ), + Lists.newArrayList( + new Object[]{30, "VALUE2"}, + new Object[]{30}), + 2, + new String[]{TEST_KEY_1, TEST_KEY_2}, + new Object[]{30, "VALUE2"} + }, + }; + } + + @Test(dataProvider= "collapseAttributesTestData") + public void collapseAttributesTest(final List variantIds, + final List keys, + final List values, + final int expectedCopyNumber, + final String[] expectedKeys, + final Object[] expectedValues) { + final List> inputAttributesList = IntStream.range(0, keys.size()) + .mapToObj(i -> SVTestUtils.keyValueArraysToMap(keys.get(i), values.get(i))) + .collect(Collectors.toList()); + final Map expectedAttributes = SVTestUtils.keyValueArraysToMap(expectedKeys, expectedValues); + expectedAttributes.put(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, expectedCopyNumber); + + // Test as genotype attributes + final List genotypes = inputAttributesList.stream() + .map(m -> new GenotypeBuilder().attributes(m).make()) + .collect(Collectors.toList()); + final Map testGenotypeMap = collapser.collapseGenotypeAttributes(genotypes, expectedCopyNumber); + Assert.assertEquals(testGenotypeMap, expectedAttributes); + + // Test as variant attributes + final List variants = IntStream.range(0, inputAttributesList.size()) + .mapToObj(i -> SVTestUtils.newNamedDeletionRecordWithAttributes(variantIds.get(i), inputAttributesList.get(i))) + .collect(Collectors.toList()); + final Map expectedAttributesWithMembers = new HashMap<>(expectedAttributes); + expectedAttributesWithMembers.put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, variantIds); + expectedAttributesWithMembers.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); + final Map testVariantMap = collapser.collapseVariantAttributes(variants); + Assert.assertEquals(testVariantMap, expectedAttributesWithMembers); + } + + private Map createGenotypeTestAttributes(final Integer expectedCopyNumber, final Integer copyNumber, final String testVal) { + final String[] keys = new String[]{ + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, + GATKSVVCFConstants.COPY_NUMBER_FORMAT, + TEST_KEY_1 + }; + final Object[] vals = new Object[]{expectedCopyNumber, copyNumber, testVal}; + return SVTestUtils.keyValueArraysToMap(keys, vals); + } + + private Map createGenotypeTestAttributes(final Integer expectedCopyNumber, final String testVal) { + final String[] keys = new String[]{ + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, + TEST_KEY_1 + }; + final Object[] vals = new Object[]{expectedCopyNumber, testVal}; + return SVTestUtils.keyValueArraysToMap(keys, vals); + } + + private Map createGenotypeTestAttributes(final Integer expectedCopyNumber, final Integer copyNumber) { + final String[] keys = new String[]{ + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, + GATKSVVCFConstants.COPY_NUMBER_FORMAT + }; + final Object[] vals = new Object[]{expectedCopyNumber, copyNumber}; + return SVTestUtils.keyValueArraysToMap(keys, vals); + } + + private Map createGenotypeTestAttributes(final Integer expectedCopyNumber) { + return Collections.singletonMap(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, expectedCopyNumber); + } + + @DataProvider(name = "collapseSampleGenotypesTestData") + public Object[][] collapseSampleGenotypesTestData() { + return new Object[][]{ + // Empty case + { + "sample1", + Collections.singletonList(Collections.emptyList()), + Collections.singletonList(createGenotypeTestAttributes(0)), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.emptyList(), + createGenotypeTestAttributes(0) + ) + }, + // Extra attribute + { + "sample1", + Collections.singletonList(Collections.emptyList()), + Collections.singletonList(createGenotypeTestAttributes(0, "test")), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.emptyList(), + createGenotypeTestAttributes(0, "test") + ) + }, + // Haploid no-call + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.NO_CALL), + Collections.singletonList(Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.singletonList(Allele.NO_CALL), + createGenotypeTestAttributes(1) + ) + }, + // Simple ref haploid + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.singletonList(Allele.REF_N), + createGenotypeTestAttributes(1) + ) + }, + // Simple ref haploid, different ref allele + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_T), + Collections.singletonList(Allele.REF_T) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_T, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.singletonList(Allele.REF_T), + createGenotypeTestAttributes(1) + ) + }, + // Simple ref haploid, with no-call + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_T), + Collections.singletonList(Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_T, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Collections.singletonList(Allele.REF_T), + createGenotypeTestAttributes(1) + ) + }, + // Simple ref diploid + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + createGenotypeTestAttributes(2) + ) + }, + // Simple ref diploid, with no-call + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.emptyList(), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + createGenotypeTestAttributes(2) + ) + }, + // Simple INS cases + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(1) + ) + }, + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.NO_CALL), + Collections.singletonList(Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(1) + ) + }, + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.SV_SIMPLE_INS), + Collections.singletonList(Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1), + createGenotypeTestAttributes(1) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(1) + ) + }, + // het preferred over hom ref + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(2) + ) + }, + // het preferred over hom var + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(2) + ) + }, + // hom more frequent + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(2) + ) + }, + // het preferred over both hom ref always and hom var when tied + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(2) + ) + }, + // hom is most frequent non-ref + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2), + createGenotypeTestAttributes(2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(2) + ) + }, + // triploid - 1 alt + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(3) + ) + }, + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(3) + ) + }, + // triploid - 2 alt + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(3) + ) + }, + // triploid - 3 alt + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3), + createGenotypeTestAttributes(3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_INS), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), + createGenotypeTestAttributes(3) + ) + }, + // Simple DEL + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 0) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(1, 0) + ) + }, + // Simple DEL, falling back on copy number info when GT not available + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 0) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(1, 0) + ) + }, + // Simple DEL diploid het + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 1) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(2, 1) + ) + }, + // Simple DEL diploid hom var + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 0) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(2, 0) + ) + }, + // Simple DEL diploid hom ref + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + createGenotypeTestAttributes(2, 2) + ) + }, + // Simple DEL triploid with 1 alt + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3, 3), + createGenotypeTestAttributes(3, 2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DEL), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(3, 2) + ) + }, + + // Simple DUP, haploid + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_DUP) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(1, 2) + ) + }, + // Simple DUP, haploid, falling back on copy number info when GT not available and inferring the GT + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(1, 2) + ) + }, + // Simple DUP, haploid, copy number 3 but phasing is still unambiguous + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_DUP) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(1, 3) + ) + }, + // Simple DUP diploid het + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 3) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(2, 3) + ) + }, + // Simple DUP diploid hom var - has ambiguous alleles + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 4) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(2, 4) + ) + }, + // Simple DUP diploid hom ref + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 2) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + createGenotypeTestAttributes(2, 2) + ) + }, + // Simple DUP triploid with 1 alt - unambiguous alleles + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DUP) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3, 3), + createGenotypeTestAttributes(3, 4) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(3, 4) + ) + }, + // Simple DUP triploid with 2 alts - ambiguous alleles + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3, 3), + createGenotypeTestAttributes(3, 5) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(3, 5) + ) + }, + // Simple DUP triploid where 1-alt genotype should be prioritized + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DUP), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3, 3), + createGenotypeTestAttributes(3, 4), + createGenotypeTestAttributes(3, 5) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(3, 4) + ) + }, + // Simple DUP triploid where 2-alt genotype should be prioritized + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.REF_N, Allele.SV_SIMPLE_DUP), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(3, 3), + createGenotypeTestAttributes(3, 4), + createGenotypeTestAttributes(3, 5), + createGenotypeTestAttributes(3, 5) + ), + Allele.REF_N, + Collections.singletonList(Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(3, 5) + ) + }, + + + // Multi-allelic CNV, haploid hom ref + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.REF_N) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 1) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N), + createGenotypeTestAttributes(1, 1) + ) + }, + // Multi-allelic CNV, haploid del + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 0) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(1, 0) + ) + }, + // Multi-allelic CNV, haploid dup + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.REF_N), + Collections.singletonList(Allele.SV_SIMPLE_DUP) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 1), + createGenotypeTestAttributes(1, 2) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DUP), + createGenotypeTestAttributes(1, 2) + ) + }, + // Multi-allelic CNV, diploid hom ref (ambiguous alleles) + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 2) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(2, 2) + ) + }, + // Multi-allelic CNV, diploid del het + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 1) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(2, 1) + ) + }, + // Multi-allelic CNV, diploid del hom + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 0) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), + createGenotypeTestAttributes(2, 0) + ) + }, + // Multi-allelic CNV, diploid dup het (ambiguous alleles) + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 3) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(2, 3) + ) + }, + // Multi-allelic CNV, diploid dup hom (ambiguous alleles) + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.REF_N), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 2), + createGenotypeTestAttributes(2, 4) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(2, 4) + ) + }, + // Multi-allelic CNV, conflicting del and dup genotypes should result in non-call, haploid + { + "sample1", + Lists.newArrayList( + Collections.singletonList(Allele.SV_SIMPLE_DUP), + Collections.singletonList(Allele.SV_SIMPLE_DEL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(1, 2), + createGenotypeTestAttributes(1, 0) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL), + createGenotypeTestAttributes(1, (Integer) null) + ) + }, + // Multi-allelic CNV, conflicting del and dup genotypes should result in non-call, diploid + { + "sample1", + Lists.newArrayList( + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) + ), + Lists.newArrayList( + createGenotypeTestAttributes(2, 1), + createGenotypeTestAttributes(2, 3) + ), + Allele.REF_N, + Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), + GenotypeBuilder.create( + "sample1", + Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL), + createGenotypeTestAttributes(2, (Integer) null) + ) + } + }; + } + + @Test(dataProvider="collapseSampleGenotypesTestData") + public void collapseSampleGenotypesTest(final String sampleId, + final List> alleles, + final List> attributes, + final Allele refAllele, + final List altAlleles, + final Genotype expected) { + final List genotypes = IntStream.range(0, alleles.size()) + .mapToObj(i -> GenotypeBuilder.create(sampleId, alleles.get(i), attributes.get(i))) + .collect(Collectors.toList()); + final Genotype test = collapser.collapseSampleGenotypes(genotypes, refAllele, altAlleles); + VariantContextTestUtils.assertGenotypesAreEqual(test, expected); + } + + @DataProvider(name = "makeBiallelicListTestData") + public Object[][] makeBiallelicListTestData() { + return new Object[][]{ + // No alleles + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 0, 0, Collections.emptyList()}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 0, 1, Collections.singletonList(Allele.REF_A)}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 1, 1, Collections.singletonList(Allele.SV_SIMPLE_DEL)}, + {Allele.SV_SIMPLE_INS, Allele.REF_A, 1, 1, Collections.singletonList(Allele.SV_SIMPLE_INS)}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 1, 2, Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DEL)}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 2, 2, Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL)}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 2, 2, Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL)}, + {Allele.SV_SIMPLE_DEL, Allele.REF_A, 2, 3, Lists.newArrayList(Allele.REF_A, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL)}, + }; + } + + @Test(dataProvider="makeBiallelicListTestData") + public void makeBiallelicListTest(final Allele alt, final Allele ref, final int numAlt, + final int ploidy, final List result) { + final List test = CanonicalSVCollapser.makeBiallelicList(alt, ref, numAlt, ploidy); + Assert.assertEquals(test.stream().sorted().collect(Collectors.toList()), + result.stream().sorted().collect(Collectors.toList())); + } + + @DataProvider(name = "collapseLengthTestData") + public Object[][] collapseLengthTestData() { + return new Object[][]{ + { + new Integer[]{1000}, + new String[]{"chr1"}, + new StructuralVariantType[]{StructuralVariantType.DEL}, + StructuralVariantType.DEL, + null + }, + { + new Integer[]{1000, 1000}, + new String[]{"chr1", "chr1"}, + new StructuralVariantType[]{StructuralVariantType.DUP, StructuralVariantType.DUP}, + StructuralVariantType.DUP, + null + }, + { + new Integer[]{300, 400}, + new String[]{"chr1", "chr1"}, + new StructuralVariantType[]{StructuralVariantType.DEL, StructuralVariantType.DUP}, + StructuralVariantType.CNV, + null + }, + { + new Integer[]{300, 400}, + new String[]{"chr1", "chr1"}, + new StructuralVariantType[]{StructuralVariantType.INS, StructuralVariantType.INS}, + StructuralVariantType.INS, + 300 + }, + { + new Integer[]{300, 400, 500}, + new String[]{"chr1", "chr1", "chr1"}, + new StructuralVariantType[]{StructuralVariantType.INS, StructuralVariantType.INS, StructuralVariantType.INS}, + StructuralVariantType.INS, + 400 + }, + { + new Integer[]{null}, + new String[]{"chr2"}, + new StructuralVariantType[]{StructuralVariantType.BND}, + StructuralVariantType.BND, + null + }, + { + new Integer[]{null, null}, + new String[]{"chr2", "chr2"}, + new StructuralVariantType[]{StructuralVariantType.BND, StructuralVariantType.BND}, + StructuralVariantType.BND, + null + } + }; + } + + @Test(dataProvider= "collapseLengthTestData") + public void collapseLengthTest(final Integer[] lengths, final String[] chrom2, final StructuralVariantType[] svtypes, + final StructuralVariantType newType, final Integer expectedLength) { + final List records = IntStream.range(0, lengths.length).mapToObj(i -> SVTestUtils.newCallRecordWithLengthAndTypeAndChrom2(lengths[i], svtypes[i], chrom2[i])).collect(Collectors.toList()); + Assert.assertEquals(collapser.collapseLength(records, newType), expectedLength); + } + + @DataProvider(name = "collapseInsertionLengthTestData") + public Object[][] collapseInsertionLengthTestData() { + return new Object[][]{ + { + new Integer[]{}, + null, + null, + null, + null, + null + }, + { + new Integer[]{null}, + null, + null, + null, + null, + null + }, + { + new Integer[]{100}, + 100, + 100, + 100, + 100, + 100 + }, + { + new Integer[]{null, 100}, + 100, + 100, + 100, + 100, + 100 + }, + { + new Integer[]{100, null}, + 100, + 100, + 100, + 100, + 100 + }, + { + new Integer[]{200, 100}, + 100, + 150, + 100, + 200, + null + }, + { + new Integer[]{200, 100, 400}, + 200, + 234, + 100, + 400, + null + } + }; + } + + @Test(dataProvider= "collapseInsertionLengthTestData") + public void collapseInsertionLengthTest(final Integer[] lengths, + final Integer expectedMedian, + final Integer expectedMean, + final Integer expectedMin, + final Integer expectedMax, + final Integer expectedUndefined) { + final List records = IntStream.range(0, lengths.length).mapToObj(i -> SVTestUtils.newCallRecordWithLengthAndTypeAndChrom2(lengths[i], StructuralVariantType.INS, "chr1")).collect(Collectors.toList()); + Assert.assertEquals(collapser.collapseInsertionLength(records), expectedMedian); + Assert.assertEquals(collapserInsertionMean.collapseInsertionLength(records), expectedMean); + Assert.assertEquals(collapserInsertionMin.collapseInsertionLength(records), expectedMin); + Assert.assertEquals(collapserInsertionMax.collapseInsertionLength(records), expectedMax); + Assert.assertEquals(collapserInsertionUndefined.collapseInsertionLength(records), expectedUndefined); + + } + + @DataProvider(name = "collapseIdsTestData") + public Object[][] collapseIdsTestData() { + return new Object[][]{ + {Collections.singletonList("var1"), "var1"}, + {Lists.newArrayList("var1", "var2"), "var1"}, + {Lists.newArrayList("var2", "var1"), "var1"}, + }; + } + + @Test(dataProvider= "collapseIdsTestData") + public void collapseIdsTest(final List ids, final String expectedResult) { + final List records = ids.stream().map(SVTestUtils::newDeletionCallRecordWithId).collect(Collectors.toList()); + final String result = collapser.collapseIds(records); + Assert.assertEquals(result, expectedResult); + } + + @DataProvider(name = "getMostPreciseCallsTestData") + public Object[][] getMostPreciseCallsTestData() { + return new Object[][]{ + { + new String[]{"depth1"}, + Collections.singletonList( + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) + ), + Sets.newHashSet("depth1") + }, + { + new String[]{"depth1", "depth2"}, + Lists.newArrayList( + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) + ), + Sets.newHashSet("depth1", "depth2") + }, + { + new String[]{"pesr1"}, + Collections.singletonList( + SVTestUtils.PESR_ONLY_ALGORITHM_LIST + ), + Sets.newHashSet("pesr1") + }, + { + new String[]{"pesr1", "pesr2"}, + Lists.newArrayList( + SVTestUtils.PESR_ONLY_ALGORITHM_LIST, + SVTestUtils.PESR_ONLY_ALGORITHM_LIST + ), + Sets.newHashSet("pesr1", "pesr2") + }, + { + new String[]{"depth1", "pesr1"}, + Lists.newArrayList( + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), + SVTestUtils.PESR_ONLY_ALGORITHM_LIST + ), + Sets.newHashSet("pesr1"), + }, + { + new String[]{"mixed1", "depth1"}, + Lists.newArrayList( + Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) + ), + Sets.newHashSet("mixed1"), + }, + { + new String[]{"mixed1", "pesr1"}, + Lists.newArrayList( + Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), + SVTestUtils.PESR_ONLY_ALGORITHM_LIST + ), + Sets.newHashSet("mixed1", "pesr1"), + }, + }; + } + + @Test(dataProvider= "getMostPreciseCallsTestData") + public void getMostPreciseCallsTest(final String[] ids, final List> algorithms, final Set expectedIds) { + final List records = IntStream.range(0, ids.length).mapToObj(i -> SVTestUtils.newDeletionCallRecordWithIdAndAlgorithms(ids[i], algorithms.get(i))).collect(Collectors.toList()); + final List resultIds = collapser.getRecordsWithMostPreciseBreakpoints(records).stream().map(SVCallRecord::getId).collect(Collectors.toList()); + Assert.assertEquals(resultIds.size(), expectedIds.size()); + Assert.assertTrue(expectedIds.containsAll(resultIds)); + } + + @DataProvider(name = "collapseIntervalTestData") + public Object[][] collapseIntervalTestData() { + return new Object[][]{ + // 1 variant + { + new int[]{1001}, // starts + new int[]{1100}, // ends + StructuralVariantType.DEL, + new int[]{1001, 1100}, // median strategy + new int[]{1001, 1100}, // min-max strategy + new int[]{1001, 1100}, // max-min strategy + new int[]{1001, 1100} // mean strategy + }, + // 2 variants + { + new int[]{1001, 1011}, + new int[]{1100, 1110}, + StructuralVariantType.DEL, + new int[]{1001, 1100}, + new int[]{1001, 1110}, + new int[]{1011, 1100}, + new int[]{1006, 1105} + }, + // 3 variants + { + new int[]{1001, 1011, 1021}, + new int[]{1100, 1110, 1121}, + StructuralVariantType.DUP, + new int[]{1011, 1110}, + new int[]{1001, 1121}, + new int[]{1021, 1100}, + new int[]{1011, 1110} + }, + // 3 variants, ends overlap starts + { + new int[]{1001, 1011, 1021}, + new int[]{1031, 1011, 1021}, + StructuralVariantType.DUP, + new int[]{1011, 1021}, + new int[]{1001, 1031}, + new int[]{1021, 1021}, // min end before max start + new int[]{1011, 1021} + }, + // BND + { + new int[]{1001, 1011, 1021}, + new int[]{1100, 1110, 1121}, + StructuralVariantType.BND, + new int[]{1011, 1110}, + new int[]{1001, 1121}, + new int[]{1021, 1100}, + new int[]{1011, 1110} + }, + // INS, same start/end + { + new int[]{1001, 1011, 1021}, + new int[]{1001, 1011, 1021}, + StructuralVariantType.INS, + new int[]{1011, 1011}, + new int[]{1001, 1001}, + new int[]{1021, 1021}, + new int[]{1011, 1011} + }, + // INS, different start/end + { + new int[]{1001, 1011, 1021}, + new int[]{1011, 1021, 1031}, + StructuralVariantType.INS, + new int[]{1011, 1011}, + new int[]{1001, 1001}, + new int[]{1021, 1021}, + new int[]{1011, 1011} + } + }; + } + + @Test(dataProvider= "collapseIntervalTestData") + public void collapseIntervalTest(final int[] starts, final int[] ends, final StructuralVariantType svtype, + final int[] expectedMedian, final int[] expectedMinMax, final int[] expectedMaxMin, + final int[] expectedMean) { + final List records = IntStream.range(0, starts.length) + .mapToObj(i -> SVTestUtils.newCallRecordWithIntervalAndType(starts[i], ends[i], svtype)).collect(Collectors.toList()); + + final Pair resultMedian = collapser.collapseInterval(records); + Assert.assertEquals((int) resultMedian.getKey(), expectedMedian[0]); + Assert.assertEquals((int) resultMedian.getValue(), expectedMedian[1]); + + final Pair resultMinMax = collapserMinMax.collapseInterval(records); + Assert.assertEquals((int) resultMinMax.getKey(), expectedMinMax[0]); + Assert.assertEquals((int) resultMinMax.getValue(), expectedMinMax[1]); + + final Pair resultMaxMin = collapserMaxMin.collapseInterval(records); + Assert.assertEquals((int) resultMaxMin.getKey(), expectedMaxMin[0]); + Assert.assertEquals((int) resultMaxMin.getValue(), expectedMaxMin[1]); + + final Pair resultMean = collapserMean.collapseInterval(records); + Assert.assertEquals((int) resultMean.getKey(), expectedMean[0]); + Assert.assertEquals((int) resultMean.getValue(), expectedMean[1]); + } + + @DataProvider(name = "collapseTypesTestData") + public Object[][] collapseTypesTestData() { + return new Object[][]{ + {Collections.singletonList(StructuralVariantType.DEL), StructuralVariantType.DEL}, + {Collections.singletonList(StructuralVariantType.DUP), StructuralVariantType.DUP}, + {Collections.singletonList(StructuralVariantType.INS), StructuralVariantType.INS}, + {Collections.singletonList(StructuralVariantType.INV), StructuralVariantType.INV}, + {Collections.singletonList(StructuralVariantType.BND), StructuralVariantType.BND}, + {Collections.singletonList(StructuralVariantType.CNV), StructuralVariantType.CNV}, + {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DUP), StructuralVariantType.CNV}, + {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DEL), StructuralVariantType.DEL}, + {Lists.newArrayList(StructuralVariantType.DUP, StructuralVariantType.DUP), StructuralVariantType.DUP}, + {Lists.newArrayList(StructuralVariantType.INS, StructuralVariantType.INS), StructuralVariantType.INS}, + {Lists.newArrayList(StructuralVariantType.INV, StructuralVariantType.INV), StructuralVariantType.INV}, + {Lists.newArrayList(StructuralVariantType.BND, StructuralVariantType.BND), StructuralVariantType.BND}, + {Lists.newArrayList(StructuralVariantType.CNV, StructuralVariantType.CNV), StructuralVariantType.CNV}, + {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DUP, StructuralVariantType.CNV), StructuralVariantType.CNV} + }; + } + + @Test(dataProvider= "collapseTypesTestData") + public void collapseTypesTest(final List types, final StructuralVariantType expectedResult) { + final List records = types.stream().map(t -> SVTestUtils.newCallRecordWithIntervalAndType(1, 100, t)).collect(Collectors.toList()); + Assert.assertEquals(collapser.collapseTypes(records), expectedResult); + } + + @DataProvider(name = "collapseAlgorithmsTestData") + public Object[][] collapseAlgorithmsTestData() { + return new Object[][]{ + // depth only + {Collections.singletonList(Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)}, + {Lists.newArrayList(Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)), + Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)}, + + // pesr only + {Collections.singletonList(SVTestUtils.PESR_ONLY_ALGORITHM_LIST), SVTestUtils.PESR_ONLY_ALGORITHM_LIST}, + {Collections.singletonList(Lists.newArrayList("pesrA", "pesrB")), Lists.newArrayList("pesrA", "pesrB")}, + {Collections.singletonList(Lists.newArrayList("pesrB", "pesrA")), Lists.newArrayList("pesrA", "pesrB")}, + {Lists.newArrayList(Collections.singletonList("pesrA"), Collections.singletonList("pesrB")), Lists.newArrayList("pesrA", "pesrB")}, + {Lists.newArrayList(Collections.singletonList("pesrB"), Collections.singletonList("pesrA")), Lists.newArrayList("pesrA", "pesrB")}, + {Lists.newArrayList(Lists.newArrayList("pesrA", "pesrB"), Collections.singletonList("pesrB")), Lists.newArrayList("pesrA", "pesrB")}, + {Lists.newArrayList(Lists.newArrayList("pesrB", "pesrA"), Collections.singletonList("pesrA")), Lists.newArrayList("pesrA", "pesrB")}, + + // mixed depth/pesr + {Lists.newArrayList(SVTestUtils.PESR_ONLY_ALGORITHM_LIST, SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST), + Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)}, + {Lists.newArrayList(Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), + Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)), + Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)} + }; + } + + @Test(dataProvider= "collapseAlgorithmsTestData") + public void collapseAlgorithmsTest(final List> algorithmLists, final List expectedResult) { + final List records = algorithmLists.stream().map(list -> SVTestUtils.newDeletionCallRecordWithIdAndAlgorithms("", list)).collect(Collectors.toList()); + Assert.assertEquals(collapser.collapseAlgorithms(records), expectedResult); + } + + @DataProvider(name = "alleleCollapserComparatorTestData") + public Object[][] alleleCollapserComparatorTestData() { + return new Object[][]{ + {Collections.emptyList(), Collections.emptyList(), 0}, + {Collections.singletonList(Allele.NO_CALL), Collections.singletonList(Allele.NO_CALL), 0}, + {Collections.singletonList(Allele.REF_N), Collections.singletonList(Allele.REF_N), 0}, + {Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL), 0}, + {Lists.newArrayList(Allele.SV_SIMPLE_DUP, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.SV_SIMPLE_DUP, Allele.SV_SIMPLE_DUP), 0}, + // When otherwise equal up to common length, shorter list first should return 1 <=> longer list first should return -1 + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL), 1}, + {Collections.singletonList(Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1}, + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Collections.singletonList(Allele.SV_SIMPLE_DEL), 1}, + {Collections.singletonList(Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1}, + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), 1}, + {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1} + }; + } + + @Test(dataProvider= "alleleCollapserComparatorTestData") + public void alleleCollapserComparatorTest(final List allelesA, final List allelesB, final int expectedResult) { + Assert.assertEquals(alleleComparator.compare(allelesA, allelesB), expectedResult); + } + + @Test(expectedExceptions = { IllegalArgumentException.class }) + public void testUnsupportedCNVAltAlleles() throws IllegalArgumentException { + final List siteAltAlleles = Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_INS); + CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(siteAltAlleles, Allele.REF_A, 2, 1); + } + + @Test(expectedExceptions = { IllegalArgumentException.class }) + public void testUnsupportedCNVAltAllele() throws IllegalArgumentException { + final List siteAltAlleles = Collections.singletonList(Allele.SV_SIMPLE_INS); + CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(siteAltAlleles, Allele.REF_A, 2, 2); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTableTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTableTest.java new file mode 100644 index 00000000000..9ab7b255488 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/PloidyTableTest.java @@ -0,0 +1,54 @@ +package org.broadinstitute.hellbender.tools.sv.cluster; + +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Map; + +public class PloidyTableTest extends CommandLineProgramTest { + + @Test + public void testCreateTableFromFile() { + final Path tablePath = Paths.get( getTestDataDir() + "/walkers/sv/SVCluster/1kgp.batch1.ploidy.tsv"); + final PloidyTable table = new PloidyTable(tablePath); + + Assert.assertEquals(table.get("HG00096", "chr1").intValue(), 2); + Assert.assertEquals(table.get("HG00096", "chrX").intValue(), 1); + Assert.assertEquals(table.get("HG00096", "chrY").intValue(), 1); + + Assert.assertEquals(table.get("NA19143", "chr1").intValue(), 2); + Assert.assertEquals(table.get("NA19143", "chrX").intValue(), 2); + Assert.assertEquals(table.get("NA19143", "chrY").intValue(), 0); + + Assert.assertEquals(table.get("NA21133", "chr1").intValue(), 2); + Assert.assertEquals(table.get("NA21133", "chrX").intValue(), 1); + Assert.assertEquals(table.get("NA21133", "chrY").intValue(), 1); + } + + @Test + public void testCreateTableFromMap() { + final Map> map = new HashMap<>(); + + final Map sampleMap1 = new HashMap<>(); + sampleMap1.put("chr1", 2); + sampleMap1.put("chrX", 2); + map.put("sample1", sampleMap1); + + final Map sampleMap2 = new HashMap<>(); + sampleMap2.put("chr1", 2); + sampleMap2.put("chrX", 1); + map.put("sample2", sampleMap2); + + final PloidyTable table = new PloidyTable(map); + + Assert.assertEquals(table.get("sample1", "chr1").intValue(), 2); + Assert.assertEquals(table.get("sample1", "chrX").intValue(), 2); + Assert.assertEquals(table.get("sample2", "chr1").intValue(), 2); + Assert.assertEquals(table.get("sample2", "chrX").intValue(), 1); + } + +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java index 6f8a3034294..039053310f0 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVClusterEngineTest.java @@ -41,11 +41,32 @@ public void testCollapser() { Assert.assertTrue(flattened.getAlgorithms().containsAll(SVTestUtils.depthAndStuff.getAlgorithms())); Assert.assertTrue(flattened.getAlgorithms().containsAll(SVTestUtils.depthOnly.getAlgorithms())); Assert.assertTrue(flattened.getAlgorithms().containsAll(SVTestUtils.call2.getAlgorithms())); - //should have all the genotypes + SVTestUtils.assertContainsAllIgnoreRefAlleleBase(flattened.getGenotypes(), SVTestUtils.depthAndStuff.getGenotypes(), true); SVTestUtils.assertContainsAllIgnoreRefAlleleBase(flattened.getGenotypes(), SVTestUtils.depthOnly.getGenotypes(), true); SVTestUtils.assertContainsAllIgnoreRefAlleleBase(flattened.getGenotypes(), SVTestUtils.call2.getGenotypes(), true); - //TODO: add test for insertion cluster + + // Test subtyped insertions + final List subtypedRecords = Lists.newArrayList( + SVTestUtils.newCallRecordWithAlleles( + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), + StructuralVariantType.INS, + 2, + null), + SVTestUtils.newCallRecordWithAlleles( + Lists.newArrayList(Allele.REF_N, Allele.create("")), + Lists.newArrayList(Allele.REF_N, Allele.create("")), + StructuralVariantType.INS, + 2, + null) + ); + final SVCallRecord subtypedFlattened = engine.getCollapser().collapse(subtypedRecords); + Assert.assertEquals(subtypedFlattened.getAlleles().size(), 2); + Assert.assertEquals(subtypedFlattened.getAltAlleles().size(), 1); + final List collapsedAlleles = subtypedFlattened.getGenotypes().stream().map(Genotype::getAlleles) + .flatMap(Collection::stream).distinct().sorted().collect(Collectors.toList()); + Assert.assertEquals(subtypedFlattened.getAlleles(), collapsedAlleles); } @Test @@ -315,7 +336,7 @@ public void testAddVaryPositions(final int positionA1, final int positionB1, engine.add(call1); engine.add(call2); engine.add(call3); - Assert.assertEquals(engine.getOutput().size(), result); + Assert.assertEquals(engine.forceFlushAndGetOutput().size(), result); } @Test @@ -327,7 +348,7 @@ public void testAdd() { Assert.assertFalse(temp1.isEmpty()); //force new cluster by adding a non-overlapping event temp1.add(SVTestUtils.call3); - final List output1 = temp1.getOutput(); //flushes all clusters + final List output1 = temp1.forceFlushAndGetOutput(); //flushes all clusters Assert.assertTrue(temp1.isEmpty()); Assert.assertEquals(output1.size(), 2); SVTestUtils.assertEqualsExceptMembershipAndGT(SVTestUtils.call1, output1.get(0)); @@ -338,7 +359,7 @@ public void testAdd() { temp2.add(SVTestUtils.overlapsCall1); //force new cluster by adding a call on another contig temp2.add(SVTestUtils.call4_chr10); - final List output2 = temp2.getOutput(); + final List output2 = temp2.forceFlushAndGetOutput(); Assert.assertEquals(output2.size(), 2); //median of two items ends up being the second item here Assert.assertEquals(output2.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); @@ -349,7 +370,7 @@ public void testAdd() { final SVClusterEngine temp3 = SVTestUtils.getNewDefaultSingleLinkageEngine(); temp3.add(SVTestUtils.call1); temp3.add(SVTestUtils.sameBoundsSampleMismatch); - final List output3 = temp3.getOutput(); + final List output3 = temp3.forceFlushAndGetOutput(); Assert.assertEquals(output3.size(), 1); Assert.assertEquals(output3.get(0).getPositionA(), SVTestUtils.call1.getPositionA()); Assert.assertEquals(output3.get(0).getPositionB(), SVTestUtils.call1.getPositionB()); @@ -367,7 +388,7 @@ public void testAddMaxCliqueLarge() { final int end = start + length - 1; engine.add(SVTestUtils.newCallRecordWithIntervalAndType(start, end, StructuralVariantType.DEL)); } - final List result = engine.getOutput(); + final List result = engine.forceFlushAndGetOutput(); Assert.assertEquals(result.size(), 50); for (final SVCallRecord resultRecord : result) { Assert.assertTrue(resultRecord.getAttributes().containsKey(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); @@ -441,7 +462,7 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), alleles, GenotypesContext.copy(genotypesWithCopyNumber), Collections.emptyMap()); - final Set resultWithCopyNumber = recordWithCopyNumber.getCarrierSamples(); + final Set resultWithCopyNumber = recordWithCopyNumber.getCarrierSampleSet(); Assert.assertEquals(resultWithCopyNumber, expectedResult); @@ -456,7 +477,7 @@ public void testGetCarrierSamplesBiallelic(final int ploidy, final Allele refAll "chr1", 1999, SVTestUtils.getValidTestStrandB(svtype), svtype, 1000, Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), alleles, GenotypesContext.copy(genotypesWithGenotype), Collections.emptyMap()); - final Set resultWithGenotype = recordWithGenotype.getCarrierSamples(); + final Set resultWithGenotype = recordWithGenotype.getCarrierSampleSet(); Assert.assertEquals(resultWithGenotype, expectedResult); } @@ -472,7 +493,7 @@ public void testLargeRandom() { } final SVClusterEngine engine = SVTestUtils.getNewDefaultMaxCliqueEngine(); records.stream().sorted(SVCallRecordUtils.getCallComparator(SVTestUtils.hg38Dict)).forEach(engine::add); - final List output = engine.getOutput(); + final List output = engine.forceFlushAndGetOutput(); Assert.assertEquals(output.size(), 2926); } } \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVCollapserTest.java b/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVCollapserTest.java deleted file mode 100644 index 7c1819c8ca5..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/sv/cluster/SVCollapserTest.java +++ /dev/null @@ -1,964 +0,0 @@ -package org.broadinstitute.hellbender.tools.sv.cluster; - -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; -import htsjdk.variant.variantcontext.StructuralVariantType; -import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang3.tuple.Pair; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.tools.sv.SVCallRecord; -import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; -import org.broadinstitute.hellbender.tools.sv.SVTestUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -public class SVCollapserTest { - - private static final CanonicalSVCollapser collapser = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); - private static final CanonicalSVCollapser collapserMinMax = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MIN_START_MAX_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); - private static final CanonicalSVCollapser collapserMaxMin = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MAX_START_MIN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); - private static final CanonicalSVCollapser collapserMean = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEAN_START_MEAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); - private static final CanonicalSVCollapser collapserSpecificAltAllele = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.MOST_SPECIFIC_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN); - - private static final CanonicalSVCollapser collapserInsertionMean = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEAN); - private static final CanonicalSVCollapser collapserInsertionMin = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MIN); - private static final CanonicalSVCollapser collapserInsertionMax = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.MAX); - private static final CanonicalSVCollapser collapserInsertionUndefined = new CanonicalSVCollapser( - SVTestUtils.hg38Reference, - CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, - CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, - CanonicalSVCollapser.InsertionLengthSummaryStrategy.UNDEFINED); - - private static final CanonicalSVCollapser.AlleleCollectionCollapserComparator alleleComparator = new CanonicalSVCollapser.AlleleCollectionCollapserComparator(); - - private static final Allele MEI_INSERTION_ALLELE = Allele.create(""); - private static final Allele SVA_INSERTION_ALLELE = Allele.create(""); - private static final Allele LINE_INSERTION_ALLELE = Allele.create(""); - - @DataProvider(name = "expectedCopyNumberTestData") - public Object[][] expectedCopyNumberTestData() { - return new Object[][]{ - // Result should be same value - {new int[]{0}, 0}, - {new int[]{1, 1, 1}, 1}, - {new int[]{2, 2, 2}, 2}, - {new int[]{3, 3, 3, 3}, 3} - }; - } - - @Test(dataProvider= "expectedCopyNumberTestData") - public void testCollapseExpectedCopyNumber(final int[] input, final int result) { - final Collection genotypes = IntStream.of(input).mapToObj(p -> SVTestUtils.buildHomGenotypeWithPloidy(Allele.REF_N, p)).collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseExpectedCopyNumber(genotypes), result); - } - - @DataProvider(name = "collapseRefAllelesTestData") - public Object[][] collapseRefAllelesTestData() { - return new Object[][]{ - {1, Allele.REF_N}, - {100000, Allele.REF_C}, - {100001, Allele.REF_A}, - {100002, Allele.REF_C}, - {100003, Allele.REF_T}, - {248956422, Allele.REF_N}, - }; - } - - @Test(dataProvider= "collapseRefAllelesTestData") - public void testCollapseRefAlleles(final int pos, final Allele result) { - Assert.assertEquals(collapser.collapseRefAlleles("chr1", pos), result); - } - - @DataProvider(name = "collapseAltAllelesTestData") - public Object[][] collapseAltAllelesTestData() { - return new Object[][]{ - {Collections.singletonList(Collections.emptyList()), StructuralVariantType.DEL, Collections.emptyList(), Collections.emptyList()}, - {Collections.singletonList(Collections.singletonList(Allele.REF_N)), StructuralVariantType.DEL, Collections.emptyList(), Collections.emptyList()}, - {Collections.singletonList(Collections.singletonList(Allele.REF_A)), StructuralVariantType.DEL, Collections.emptyList(), Collections.emptyList()}, - {Collections.singletonList(Collections.singletonList(Allele.NO_CALL)), StructuralVariantType.DEL, Collections.emptyList(), Collections.emptyList()}, - {Collections.singletonList(Collections.singletonList(Allele.ALT_A)), StructuralVariantType.INS, Collections.singletonList(Allele.ALT_A), Collections.singletonList(Allele.ALT_A)}, - {Collections.singletonList(Collections.singletonList(Allele.SV_SIMPLE_DEL)), StructuralVariantType.DEL, Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL)}, - {Lists.newArrayList(Collections.singletonList(Allele.REF_N), Collections.singletonList(Allele.SV_SIMPLE_DEL)), StructuralVariantType.DEL, Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL)}, - {Lists.newArrayList(Lists.newArrayList(Allele.REF_N, Allele.REF_N), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)), StructuralVariantType.DEL, Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL)}, - {Lists.newArrayList(Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL)), StructuralVariantType.CNV, Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP)}, - {Lists.newArrayList(Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(MEI_INSERTION_ALLELE)), StructuralVariantType.INS, Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(MEI_INSERTION_ALLELE)}, - {Lists.newArrayList(Collections.singletonList(Allele.SV_SIMPLE_INS), Collections.singletonList(MEI_INSERTION_ALLELE)), StructuralVariantType.INS, Collections.singletonList(Allele.SV_SIMPLE_INS), Collections.singletonList(MEI_INSERTION_ALLELE)}, - {Lists.newArrayList(Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(SVA_INSERTION_ALLELE)), StructuralVariantType.INS, Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(SVA_INSERTION_ALLELE)}, - {Lists.newArrayList(Collections.singletonList(LINE_INSERTION_ALLELE), Collections.singletonList(SVA_INSERTION_ALLELE)), StructuralVariantType.INS, Collections.singletonList(MEI_INSERTION_ALLELE), Collections.singletonList(MEI_INSERTION_ALLELE)}, - }; - } - - @Test(dataProvider= "collapseAltAllelesTestData") - public void collapseAltAllelesTest(final List> recordGenotypeAlleles, - final StructuralVariantType svtype, - final List resultCommon, - final List resultSpecific) { - final List variantAlleles = recordGenotypeAlleles.stream().flatMap(List::stream).distinct().collect(Collectors.toList()); - final List records = recordGenotypeAlleles.stream() - .map(a -> SVTestUtils.newCallRecordWithAlleles(a, variantAlleles, svtype)) - .collect(Collectors.toList()); - - final List sortedTestCommon = SVCallRecordUtils.sortAlleles(collapser.collapseAltAlleles(records)); - final List sortedExpectedCommon = SVCallRecordUtils.sortAlleles(resultCommon); - Assert.assertEquals(sortedTestCommon, sortedExpectedCommon); - - final List sortedTestSpecific = SVCallRecordUtils.sortAlleles(collapserSpecificAltAllele.collapseAltAlleles(records)); - final List sortedExpectedSpecific = SVCallRecordUtils.sortAlleles(resultSpecific); - Assert.assertEquals(sortedTestSpecific, sortedExpectedSpecific); - } - - @DataProvider(name = "collapseSampleAllelesTestData") - public Object[][] collapseSampleAllelesTestData() { - return new Object[][]{ - // empty - { - Collections.singletonList( - Collections.emptyList() - ), - Collections.emptyList(), - 0, - Collections.emptyList() - }, - // null - { - Collections.singletonList( - Collections.singletonList(null) - ), - Collections.emptyList(), - 1, - Collections.singletonList(Allele.REF_N) - }, - // REF - { - Collections.singletonList( - Collections.singletonList(Allele.REF_N) - ), - Collections.emptyList(), - 1, - Collections.singletonList(Allele.REF_N) - }, - // INS - { - Collections.singletonList( - Collections.singletonList(Allele.SV_SIMPLE_INS) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 1, - Collections.singletonList(Allele.SV_SIMPLE_INS) - }, - // INS, INS - { - Lists.newArrayList( - Collections.singletonList(Allele.SV_SIMPLE_INS), - Collections.singletonList(Allele.SV_SIMPLE_INS) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 1, - Collections.singletonList(Allele.SV_SIMPLE_INS) - }, - // INS, REF - { - Lists.newArrayList( - Collections.singletonList(Allele.SV_SIMPLE_INS), - Collections.singletonList(Allele.REF_N) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 1, - Collections.singletonList(Allele.SV_SIMPLE_INS) - }, - // REF/INS, REF/REF - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.REF_N, Allele.REF_N) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) - }, - // INS/INS, REF/REF - { - Lists.newArrayList( - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.REF_N, Allele.REF_N) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS) - }, - // REF/INS, REF/INS - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) - }, - // REF/INS, REF/INS, INS/INS - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) - }, - // REF/INS, INS/INS, INS/INS - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS), - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.SV_SIMPLE_INS, Allele.SV_SIMPLE_INS) - }, - // REF/INS, REF - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS), - Collections.singletonList(Allele.REF_N) - ), - Collections.singletonList(Allele.SV_SIMPLE_INS), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_INS) - }, - { - Collections.singletonList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) - ), - Lists.newArrayList(Allele.SV_SIMPLE_DEL), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) - }, - // DEL should have GT filled out - { - Collections.singletonList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) - ), - Collections.singletonList(Allele.SV_SIMPLE_DEL), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) - }, - { - Collections.singletonList( - Collections.singletonList(Allele.SV_SIMPLE_DEL) - ), - Collections.singletonList(Allele.SV_SIMPLE_DEL), - 1, - Collections.singletonList(Allele.SV_SIMPLE_DEL) - }, - // If collapsed ploidy > 1, fill out GT with REF calls - { - Collections.singletonList( - Collections.singletonList(Allele.SV_SIMPLE_DEL) - ), - Collections.singletonList(Allele.SV_SIMPLE_DEL), - 2, - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL) - }, - // DUPs with ploidy 1 should have GT filled out - { - Lists.newArrayList( - Collections.singletonList(Allele.SV_SIMPLE_DUP), - Collections.singletonList(Allele.REF_N) - ), - Lists.newArrayList(Allele.SV_SIMPLE_DUP), - 1, - Lists.newArrayList(Allele.SV_SIMPLE_DUP) - }, - // DUPs with ploidy >1 should be converted to no-call GTs - { - Collections.singletonList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP) - ), - Lists.newArrayList(Allele.SV_SIMPLE_DUP), - 2, - Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) - }, - // CNVs should get converted to no-call GTs with correct ploidy, preferring higher ploidy - { - Lists.newArrayList( - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DEL), - Lists.newArrayList(Allele.REF_N, Allele.SV_SIMPLE_DUP) - ), - Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), - 2, - Lists.newArrayList(Allele.NO_CALL, Allele.NO_CALL) - }, - }; - } - - @Test(dataProvider= "collapseSampleAllelesTestData") - public void collapseSampleAllelesTest(final List> alleles, - final List sampleAltAlleles, - final int expectedCopyNumber, - final List result) { - final Collection genotypes = alleles.stream().map(a -> new GenotypeBuilder().alleles(a).make()).collect(Collectors.toList()); - final List sortedTest = SVCallRecordUtils.sortAlleles(collapser.collapseSampleGenotypeAlleles(genotypes, expectedCopyNumber, Allele.REF_N, sampleAltAlleles)); - final List sortedResult = SVCallRecordUtils.sortAlleles(result); - Assert.assertEquals(sortedTest, sortedResult); - } - - @DataProvider(name = "collapseAttributesTestData") - public Object[][] collapseAttributesTestData() { - return new Object[][]{ - // Null value - { - Collections.singletonList("var1"), - Collections.singletonList(new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}), - Collections.singletonList(new Object[]{null}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new Object[]{null}, - false - }, - // Single key / value - { - Collections.singletonList("var1"), - Collections.singletonList(new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}), - Collections.singletonList(new Object[]{30}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new Object[]{30}, - false - }, - // Two samples, null values - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY} - ), - Lists.newArrayList( - new Object[]{null}, - new Object[]{null}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new Object[]{null}, - false - }, - // Two samples, same key/value - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY} - ), - Lists.newArrayList( - new Object[]{30}, - new Object[]{30}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new Object[]{30}, - false - }, - // Two samples, same key / different value - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY} - ), - Lists.newArrayList( - new Object[]{30}, - new Object[]{45}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY}, - new Object[]{null}, - false - }, - // Two samples, one with an extra key - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY, "KEY2"}, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY} - ), - Lists.newArrayList( - new Object[]{30, "VALUE2"}, - new Object[]{30}), - 2, - new String[]{VCFConstants.GENOTYPE_QUALITY_KEY, "KEY2"}, - new Object[]{30, "VALUE2"}, - false - }, - // CNVs - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{2}, - new Object[]{2}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{2}, - true - }, - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{1}, - new Object[]{2}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{1}, - true - }, - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{3}, - new Object[]{2}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{3}, - true - }, - // Tie by frequency goes to het - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{1}, - new Object[]{0}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{1}, - true - }, - // Hom wins by freq - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{1}, - new Object[]{0}, - new Object[]{0}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{0}, - true - }, - // Mixed DEL/DUP cancel out - { - Lists.newArrayList("var1", "var2"), - Lists.newArrayList( - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT} - ), - Lists.newArrayList( - new Object[]{1}, - new Object[]{3}), - 2, - new String[]{GATKSVVCFConstants.COPY_NUMBER_FORMAT}, - new Object[]{2}, - true - } - }; - } - - @Test(dataProvider= "collapseAttributesTestData") - public void collapseAttributesTest(final List variantIds, - final List keys, - final List values, - final int expectedCopyNumber, - final String[] expectedKeys, - final Object[] expectedValues, - final boolean isCNVTest) { - final List> inputAttributesList = IntStream.range(0, keys.size()) - .mapToObj(i -> SVTestUtils.keyValueArraysToMap(keys.get(i), values.get(i))) - .collect(Collectors.toList()); - final Map expectedAttributes = SVTestUtils.keyValueArraysToMap(expectedKeys, expectedValues); - expectedAttributes.put(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, expectedCopyNumber); - - // Test as genotype attributes - final List genotypes = inputAttributesList.stream() - .map(m -> new GenotypeBuilder().attributes(m).make()) - .collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseGenotypeAttributes(genotypes, expectedCopyNumber), expectedAttributes); - - if (!isCNVTest) { - // Test as variant attributes - final List variants = IntStream.range(0, inputAttributesList.size()) - .mapToObj(i -> SVTestUtils.newNamedDeletionRecordWithAttributes(variantIds.get(i), inputAttributesList.get(i))) - .collect(Collectors.toList()); - final Map expectedAttributesWithMembers = new HashMap<>(expectedAttributes); - expectedAttributesWithMembers.put(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, variantIds); - expectedAttributesWithMembers.remove(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); - Assert.assertEquals(collapser.collapseVariantAttributes(variants), expectedAttributesWithMembers); - } - } - - @DataProvider(name = "collapseLengthTestData") - public Object[][] collapseLengthTestData() { - return new Object[][]{ - { - new Integer[]{1000}, - new String[]{"chr1"}, - new StructuralVariantType[]{StructuralVariantType.DEL}, - 1, - 1000, - StructuralVariantType.DEL, - 1000 - }, - { - new Integer[]{1000, 1000}, - new String[]{"chr1", "chr1"}, - new StructuralVariantType[]{StructuralVariantType.DUP, StructuralVariantType.DUP}, - 1, - 1000, - StructuralVariantType.DUP, - 1000 - }, - { - new Integer[]{300, 400}, - new String[]{"chr1", "chr1"}, - new StructuralVariantType[]{StructuralVariantType.DEL, StructuralVariantType.DUP}, - 1001, - 1350, - StructuralVariantType.CNV, - 350 - }, - { - new Integer[]{300, 400}, - new String[]{"chr1", "chr1"}, - new StructuralVariantType[]{StructuralVariantType.INS, StructuralVariantType.INS}, - 1, - 1, - StructuralVariantType.INS, - 300 - }, - { - new Integer[]{300, 400, 500}, - new String[]{"chr1", "chr1", "chr1"}, - new StructuralVariantType[]{StructuralVariantType.INS, StructuralVariantType.INS, StructuralVariantType.INS}, - 1, - 1, - StructuralVariantType.INS, - 400 - }, - { - new Integer[]{null}, - new String[]{"chr2"}, - new StructuralVariantType[]{StructuralVariantType.BND}, - 1, - 1, - StructuralVariantType.BND, - -1 - }, - { - new Integer[]{null, null}, - new String[]{"chr2", "chr2"}, - new StructuralVariantType[]{StructuralVariantType.BND, StructuralVariantType.BND}, - 1, - 1, - StructuralVariantType.BND, - -1 - } - }; - } - - @Test(dataProvider= "collapseLengthTestData") - public void collapseLengthTest(final Integer[] lengths, final String[] chrom2, final StructuralVariantType[] svtypes, - final int newStart, final int newEnd, final StructuralVariantType newType, final int expectedLength) { - final List records = IntStream.range(0, lengths.length).mapToObj(i -> SVTestUtils.newCallRecordWithLengthAndTypeAndChrom2(lengths[i], svtypes[i], chrom2[i])).collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseLength(records, newStart, newEnd, newType), expectedLength); - } - - @DataProvider(name = "collapseInsertionLengthTestData") - public Object[][] collapseInsertionLengthTestData() { - return new Object[][]{ - { - new Integer[]{}, - null, - null, - null, - null, - null - }, - { - new Integer[]{null}, - null, - null, - null, - null, - null - }, - { - new Integer[]{100}, - 100, - 100, - 100, - 100, - 100 - }, - { - new Integer[]{null, 100}, - 100, - 100, - 100, - 100, - 100 - }, - { - new Integer[]{100, null}, - 100, - 100, - 100, - 100, - 100 - }, - { - new Integer[]{200, 100}, - 100, - 150, - 100, - 200, - null - }, - { - new Integer[]{200, 100, 400}, - 200, - 234, - 100, - 400, - null - } - }; - } - - @Test(dataProvider= "collapseInsertionLengthTestData") - public void collapseInsertionLengthTest(final Integer[] lengths, - final Integer expectedMedian, - final Integer expectedMean, - final Integer expectedMin, - final Integer expectedMax, - final Integer expectedUndefined) { - final List records = IntStream.range(0, lengths.length).mapToObj(i -> SVTestUtils.newCallRecordWithLengthAndTypeAndChrom2(lengths[i], StructuralVariantType.INS, "chr1")).collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseInsertionLength(records), expectedMedian); - Assert.assertEquals(collapserInsertionMean.collapseInsertionLength(records), expectedMean); - Assert.assertEquals(collapserInsertionMin.collapseInsertionLength(records), expectedMin); - Assert.assertEquals(collapserInsertionMax.collapseInsertionLength(records), expectedMax); - Assert.assertEquals(collapserInsertionUndefined.collapseInsertionLength(records), expectedUndefined); - - } - - @DataProvider(name = "collapseIdsTestData") - public Object[][] collapseIdsTestData() { - return new Object[][]{ - {Collections.singletonList("var1"), "var1"}, - {Lists.newArrayList("var1", "var2"), "var1"}, - {Lists.newArrayList("var2", "var1"), "var1"}, - }; - } - - @Test(dataProvider= "collapseIdsTestData") - public void collapseIdsTest(final List ids, final String expectedResult) { - final List records = ids.stream().map(SVTestUtils::newDeletionCallRecordWithId).collect(Collectors.toList()); - final String result = collapser.collapseIds(records); - Assert.assertEquals(result, expectedResult); - } - - @DataProvider(name = "getMostPreciseCallsTestData") - public Object[][] getMostPreciseCallsTestData() { - return new Object[][]{ - { - new String[]{"depth1"}, - Collections.singletonList( - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) - ), - Sets.newHashSet("depth1") - }, - { - new String[]{"depth1", "depth2"}, - Lists.newArrayList( - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) - ), - Sets.newHashSet("depth1", "depth2") - }, - { - new String[]{"pesr1"}, - Collections.singletonList( - SVTestUtils.PESR_ONLY_ALGORITHM_LIST - ), - Sets.newHashSet("pesr1") - }, - { - new String[]{"pesr1", "pesr2"}, - Lists.newArrayList( - SVTestUtils.PESR_ONLY_ALGORITHM_LIST, - SVTestUtils.PESR_ONLY_ALGORITHM_LIST - ), - Sets.newHashSet("pesr1", "pesr2") - }, - { - new String[]{"depth1", "pesr1"}, - Lists.newArrayList( - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), - SVTestUtils.PESR_ONLY_ALGORITHM_LIST - ), - Sets.newHashSet("pesr1"), - }, - { - new String[]{"mixed1", "depth1"}, - Lists.newArrayList( - Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM) - ), - Sets.newHashSet("mixed1"), - }, - { - new String[]{"mixed1", "pesr1"}, - Lists.newArrayList( - Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), - SVTestUtils.PESR_ONLY_ALGORITHM_LIST - ), - Sets.newHashSet("mixed1", "pesr1"), - }, - }; - } - - @Test(dataProvider= "getMostPreciseCallsTestData") - public void getMostPreciseCallsTest(final String[] ids, final List> algorithms, final Set expectedIds) { - final List records = IntStream.range(0, ids.length).mapToObj(i -> SVTestUtils.newDeletionCallRecordWithIdAndAlgorithms(ids[i], algorithms.get(i))).collect(Collectors.toList()); - final List resultIds = collapser.getRecordsWithMostPreciseBreakpoints(records).stream().map(SVCallRecord::getId).collect(Collectors.toList()); - Assert.assertEquals(resultIds.size(), expectedIds.size()); - Assert.assertTrue(expectedIds.containsAll(resultIds)); - } - - @DataProvider(name = "collapseIntervalTestData") - public Object[][] collapseIntervalTestData() { - return new Object[][]{ - // 1 variant - { - new int[]{1001}, // starts - new int[]{1100}, // ends - StructuralVariantType.DEL, - new int[]{1001, 1100}, // median strategy - new int[]{1001, 1100}, // min-max strategy - new int[]{1001, 1100}, // max-min strategy - new int[]{1001, 1100} // mean strategy - }, - // 2 variants - { - new int[]{1001, 1011}, - new int[]{1100, 1110}, - StructuralVariantType.DEL, - new int[]{1001, 1100}, - new int[]{1001, 1110}, - new int[]{1011, 1100}, - new int[]{1006, 1105} - }, - // 3 variants - { - new int[]{1001, 1011, 1021}, - new int[]{1100, 1110, 1121}, - StructuralVariantType.DUP, - new int[]{1011, 1110}, - new int[]{1001, 1121}, - new int[]{1021, 1100}, - new int[]{1011, 1110} - }, - // BND - { - new int[]{1001, 1011, 1021}, - new int[]{1100, 1110, 1121}, - StructuralVariantType.BND, - new int[]{1011, 1110}, - new int[]{1001, 1121}, - new int[]{1021, 1100}, - new int[]{1011, 1110} - }, - // INS, same start/end - { - new int[]{1001, 1011, 1021}, - new int[]{1001, 1011, 1021}, - StructuralVariantType.INS, - new int[]{1011, 1011}, - new int[]{1011, 1011}, - new int[]{1011, 1011}, - new int[]{1011, 1011} - }, - // INS, different start/end - { - new int[]{1001, 1011, 1021}, - new int[]{1011, 1021, 1031}, - StructuralVariantType.INS, - new int[]{1016, 1016}, - new int[]{1016, 1016}, - new int[]{1016, 1016}, - new int[]{1016, 1016} - } - }; - } - - @Test(dataProvider= "collapseIntervalTestData") - public void collapseIntervalTest(final int[] starts, final int[] ends, final StructuralVariantType svtype, - final int[] expectedMedian, final int[] expectedMinMax, final int[] expectedMaxMin, - final int[] expectedMean) { - final List records = IntStream.range(0, starts.length) - .mapToObj(i -> SVTestUtils.newCallRecordWithIntervalAndType(starts[i], ends[i], svtype)).collect(Collectors.toList()); - - final Pair resultMedian = collapser.collapseInterval(records); - Assert.assertEquals((int) resultMedian.getKey(), expectedMedian[0]); - Assert.assertEquals((int) resultMedian.getValue(), expectedMedian[1]); - - final Pair resultMinMax = collapserMinMax.collapseInterval(records); - Assert.assertEquals((int) resultMinMax.getKey(), expectedMinMax[0]); - Assert.assertEquals((int) resultMinMax.getValue(), expectedMinMax[1]); - - final Pair resultMaxMin = collapserMaxMin.collapseInterval(records); - Assert.assertEquals((int) resultMaxMin.getKey(), expectedMaxMin[0]); - Assert.assertEquals((int) resultMaxMin.getValue(), expectedMaxMin[1]); - - final Pair resultMean = collapserMean.collapseInterval(records); - Assert.assertEquals((int) resultMean.getKey(), expectedMean[0]); - Assert.assertEquals((int) resultMean.getValue(), expectedMean[1]); - } - - @DataProvider(name = "collapseTypesTestData") - public Object[][] collapseTypesTestData() { - return new Object[][]{ - {Collections.singletonList(StructuralVariantType.DEL), StructuralVariantType.DEL}, - {Collections.singletonList(StructuralVariantType.DUP), StructuralVariantType.DUP}, - {Collections.singletonList(StructuralVariantType.INS), StructuralVariantType.INS}, - {Collections.singletonList(StructuralVariantType.INV), StructuralVariantType.INV}, - {Collections.singletonList(StructuralVariantType.BND), StructuralVariantType.BND}, - {Collections.singletonList(StructuralVariantType.CNV), StructuralVariantType.CNV}, - {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DUP), StructuralVariantType.CNV}, - {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DEL), StructuralVariantType.DEL}, - {Lists.newArrayList(StructuralVariantType.DUP, StructuralVariantType.DUP), StructuralVariantType.DUP}, - {Lists.newArrayList(StructuralVariantType.INS, StructuralVariantType.INS), StructuralVariantType.INS}, - {Lists.newArrayList(StructuralVariantType.INV, StructuralVariantType.INV), StructuralVariantType.INV}, - {Lists.newArrayList(StructuralVariantType.BND, StructuralVariantType.BND), StructuralVariantType.BND}, - {Lists.newArrayList(StructuralVariantType.CNV, StructuralVariantType.CNV), StructuralVariantType.CNV}, - {Lists.newArrayList(StructuralVariantType.DEL, StructuralVariantType.DUP, StructuralVariantType.CNV), StructuralVariantType.CNV} - }; - } - - @Test(dataProvider= "collapseTypesTestData") - public void collapseTypesTest(final List types, final StructuralVariantType expectedResult) { - final List records = types.stream().map(t -> SVTestUtils.newCallRecordWithIntervalAndType(1, 100, t)).collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseTypes(records), expectedResult); - } - - @DataProvider(name = "collapseAlgorithmsTestData") - public Object[][] collapseAlgorithmsTestData() { - return new Object[][]{ - // depth only - {Collections.singletonList(Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)}, - {Lists.newArrayList(Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM), Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)), - Collections.singletonList(GATKSVVCFConstants.DEPTH_ALGORITHM)}, - - // pesr only - {Collections.singletonList(SVTestUtils.PESR_ONLY_ALGORITHM_LIST), SVTestUtils.PESR_ONLY_ALGORITHM_LIST}, - {Collections.singletonList(Lists.newArrayList("pesrA", "pesrB")), Lists.newArrayList("pesrA", "pesrB")}, - {Collections.singletonList(Lists.newArrayList("pesrB", "pesrA")), Lists.newArrayList("pesrA", "pesrB")}, - {Lists.newArrayList(Collections.singletonList("pesrA"), Collections.singletonList("pesrB")), Lists.newArrayList("pesrA", "pesrB")}, - {Lists.newArrayList(Collections.singletonList("pesrB"), Collections.singletonList("pesrA")), Lists.newArrayList("pesrA", "pesrB")}, - {Lists.newArrayList(Lists.newArrayList("pesrA", "pesrB"), Collections.singletonList("pesrB")), Lists.newArrayList("pesrA", "pesrB")}, - {Lists.newArrayList(Lists.newArrayList("pesrB", "pesrA"), Collections.singletonList("pesrA")), Lists.newArrayList("pesrA", "pesrB")}, - - // mixed depth/pesr - {Lists.newArrayList(SVTestUtils.PESR_ONLY_ALGORITHM_LIST, SVTestUtils.DEPTH_ONLY_ALGORITHM_LIST), - Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)}, - {Lists.newArrayList(Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM), - Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)), - Lists.newArrayList(GATKSVVCFConstants.DEPTH_ALGORITHM, SVTestUtils.PESR_ALGORITHM)} - }; - } - - @Test(dataProvider= "collapseAlgorithmsTestData") - public void collapseAlgorithmsTest(final List> algorithmLists, final List expectedResult) { - final List records = algorithmLists.stream().map(list -> SVTestUtils.newDeletionCallRecordWithIdAndAlgorithms("", list)).collect(Collectors.toList()); - Assert.assertEquals(collapser.collapseAlgorithms(records), expectedResult); - } - - @DataProvider(name = "alleleCollapserComparatorTestData") - public Object[][] alleleCollapserComparatorTestData() { - return new Object[][]{ - {Collections.emptyList(), Collections.emptyList(), 0}, - {Collections.singletonList(Allele.NO_CALL), Collections.singletonList(Allele.NO_CALL), 0}, - {Collections.singletonList(Allele.REF_N), Collections.singletonList(Allele.REF_N), 0}, - {Collections.singletonList(Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL), 0}, - {Lists.newArrayList(Allele.SV_SIMPLE_DUP, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.SV_SIMPLE_DUP, Allele.SV_SIMPLE_DUP), 0}, - // When otherwise equal up to common length, shorter list first should return 1 <=> longer list first should return -1 - {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), Collections.singletonList(Allele.SV_SIMPLE_DEL), 1}, - {Collections.singletonList(Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1}, - {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Collections.singletonList(Allele.SV_SIMPLE_DEL), 1}, - {Collections.singletonList(Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1}, - {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), 1}, - {Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL), Lists.newArrayList(Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DEL, Allele.SV_SIMPLE_DUP), -1} - }; - } - - @Test(dataProvider= "alleleCollapserComparatorTestData") - public void alleleCollapserComparatorTest(final List allelesA, final List allelesB, final int expectedResult) { - Assert.assertEquals(alleleComparator.compare(allelesA, allelesB), expectedResult); - } -} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java index cef03b06a7c..2172881795f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/JointGermlineCNVSegmentationIntegrationTest.java @@ -169,7 +169,8 @@ public void testDefragmentation() { final Pair> defragmentedEvents = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); //events get combined into a single event of 62113369 bp Assert.assertEquals(defragmentedEvents.getRight().size(), 1); - Assert.assertEquals(defragmentedEvents.getRight().get(0).getAttributeAsInt(GATKSVVCFConstants.SVLEN,0), 62113369); + final VariantContext testEvent = defragmentedEvents.getRight().get(0); + Assert.assertEquals(testEvent.getEnd() - testEvent.getStart() + 1, 62113369); final File output2 = createTempFile("notDefragmented",".vcf"); final ArgumentsBuilder args2 = new ArgumentsBuilder() @@ -227,8 +228,8 @@ public void testOverlappingEvents(final List inputVcfs) { Assert.assertTrue(vc.getAlternateAlleles().size() == 1 && vc.getAlternateAllele(0).equals(GATKSVVCFConstants.DEL_ALLELE)); Assert.assertTrue(vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)); Assert.assertFalse(vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, "").contains(",")); //no zero ACs for uncalled alts - Assert.assertTrue(vc.hasAttribute(GATKSVVCFConstants.SVTYPE) && vc.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals("DEL")); - Assert.assertTrue(vc.hasAttribute(GATKSVVCFConstants.SVLEN)); + Assert.assertTrue(vc.hasAttribute(GATKSVVCFConstants.SVTYPE)); + Assert.assertEquals(vc.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null), "DEL"); } //in NA11829 variant events are not overlapping, so there should be a CN2 homRef in between diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java new file mode 100644 index 00000000000..aa95dbc808f --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVClusterIntegrationTest.java @@ -0,0 +1,546 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFHeader; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.sv.SVCallRecord; +import org.broadinstitute.hellbender.tools.sv.SVCallRecordUtils; +import org.broadinstitute.hellbender.tools.sv.cluster.*; +import org.broadinstitute.hellbender.utils.IntervalUtils; +import org.broadinstitute.hellbender.utils.reference.ReferenceUtils; +import org.broadinstitute.hellbender.utils.variant.VariantContextGetters; +import org.spark_project.guava.collect.Lists; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +public class SVClusterIntegrationTest extends CommandLineProgramTest { + + private static final String REFERENCE_PATH = GATKBaseTest.hg38Reference; + + @Test + public void testDefragmentation() { + final File output = createTempFile("defragmented", ".vcf"); + + final String inputVcfPath = getToolTestDataDir() + "1kgp_test.cnvs.vcf.gz"; + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .addVCF(inputVcfPath) + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.DEFRAGMENT_CNV) + .add(SVCluster.DEFRAG_PADDING_FRACTION_LONG_NAME, 0.25) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0.5); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final VCFHeader inputHeader = VariantContextTestUtils.getVCFHeader(inputVcfPath); + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder(), inputHeader.getSampleNamesInOrder()); + Assert.assertEquals(header.getSequenceDictionary().size(), inputHeader.getSequenceDictionary().size()); + + Assert.assertEquals(records.size(), 338); + + // Check for one record + boolean foundExpectedDefragmentedRecord = false; + for (final VariantContext variant : records) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertFalse(variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null).isEmpty()); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + Assert.assertTrue(variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null).contains(GATKSVVCFConstants.DEPTH_ALGORITHM)); + if (variant.getStart() == 25928335 && variant.getEnd() == 29403000) { + foundExpectedDefragmentedRecord = true; + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP); + for (final Genotype g : variant.getGenotypes()) { + if (g.getSampleName().equals("HG00129")) { + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 3); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2); + } else { + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 2); + } + } + break; + } + } + Assert.assertTrue(foundExpectedDefragmentedRecord); + } + + @DataProvider(name = "testMergeData") + public Object[][] testMergeData() { + return new Object[][]{ + {true}, + {false} + }; + } + + @Test(dataProvider= "testMergeData") + public void testMergeHelper(final boolean omitMembers) { + final File output = createTempFile("merged", ".vcf"); + ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addFlag(StandardArgumentDefinitions.DISABLE_SEQUENCE_DICT_VALIDATION_NAME) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .addVCF(getToolTestDataDir() + "1kgp_test.cnvs.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.wham.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 1) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 0); + if (omitMembers) { + args = args.addFlag(SVCluster.OMIT_MEMBERS_LONG_NAME); + } + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder(), Lists.newArrayList("HG00096", "HG00129", "HG00140", "NA18945", "NA18956")); + + Assert.assertEquals(records.size(), 1793); + + // Check for one record + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + Assert.assertEquals(!variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY), omitMembers); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx00000065")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getStart(), 8835844); + Assert.assertEquals(variant.getEnd(), 8835844); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 1); + Assert.assertEquals(algorithms.get(0), "manta"); + final int svlen = variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, -1); + Assert.assertEquals(svlen, 301); + Assert.assertEquals(variant.getReference(), Allele.REF_C); + if (!omitMembers) { + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 2); + } + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_INS); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.INS); + for (final Genotype g : variant.getGenotypes()) { + if (g.getSampleName().equals("HG00096") || g.getSampleName().equals("HG00129")) { + Assert.assertTrue(g.isHet()); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2); + } else { + Assert.assertTrue(g.isHomRef()); + } + } + } + } + Assert.assertEquals(expectedRecordsFound, 1); + } + + @Test + public void testClusterSingleLinkage() { + final File output = createTempFile("single_linkage_cluster", ".vcf"); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "1kgp_test.cnvs.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.wham.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 500); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder(), Lists.newArrayList("HG00096", "HG00129", "HG00140", "NA18945", "NA18956")); + + Assert.assertEquals(records.size(), 1338); + + // Check for one record + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx0000001c")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr20"); + Assert.assertEquals(variant.getStart(), 3067778); + Assert.assertEquals(variant.getEnd(), 3067860); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 2); + Assert.assertTrue(algorithms.contains("manta")); + Assert.assertTrue(algorithms.contains("wham")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 4); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP); + for (final Genotype g : variant.getGenotypes()) { + if (g.getSampleName().equals("HG00096") || g.getSampleName().equals("HG00129")) { + Assert.assertTrue(g.isHet()); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 3); + } else { + Assert.assertTrue(g.isHomRef()); + } + } + } + } + Assert.assertEquals(expectedRecordsFound, 1); + } + + + // Ensure the output buffer works correctly + @Test + public void testAgainstSimpleImplementation() { + final File output = createTempFile("single_linkage_cluster", ".vcf"); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(JointGermlineCNVSegmentation.BREAKPOINT_SUMMARY_STRATEGY_LONG_NAME, CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END) + .add(JointGermlineCNVSegmentation.ALT_ALLELE_SUMMARY_STRATEGY_LONG_NAME, CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE) + .add(SVCluster.INSERTION_LENGTH_SUMMARY_STRATEGY_LONG_NAME, CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 500); + + final List vcfInputFilenames = Lists.newArrayList( + "1kgp_test.cnvs.vcf.gz", + "HG00096.manta.vcf.gz", + "HG00096.wham.vcf.gz", + "HG00129.manta.vcf.gz", + "HG00129.wham.vcf.gz", + "HG00140.manta.vcf.gz", + "HG00140.wham.vcf.gz" + ); + vcfInputFilenames.stream().forEach(v -> args.addVCF(getToolTestDataDir() + v)); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> testVcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + + final ReferenceSequenceFile referenceSequenceFile = ReferenceUtils.createReferenceReader(new GATKPath(REFERENCE_PATH)); + final ClusteringParameters depthParameters = ClusteringParameters.createDepthParameters(0.5, 2000, 0); + final ClusteringParameters mixedParameters = ClusteringParameters.createMixedParameters(0.1, 2000, 0); + final ClusteringParameters pesrParameters = ClusteringParameters.createPesrParameters(0.1, 500, 0); + final SVClusterEngine engine = SVClusterEngineFactory.createCanonical( + SVClusterEngine.CLUSTERING_TYPE.SINGLE_LINKAGE, + CanonicalSVCollapser.BreakpointSummaryStrategy.MEDIAN_START_MEDIAN_END, + CanonicalSVCollapser.AltAlleleSummaryStrategy.COMMON_SUBTYPE, + CanonicalSVCollapser.InsertionLengthSummaryStrategy.MEDIAN, + referenceSequenceFile.getSequenceDictionary(), + referenceSequenceFile, + false, + depthParameters, + mixedParameters, + pesrParameters); + + vcfInputFilenames.stream() + .flatMap(vcfFilename -> VariantContextTestUtils.readEntireVCFIntoMemory(getToolTestDataDir() + vcfFilename).getValue().stream()) + .sorted(IntervalUtils.getDictionaryOrderComparator(referenceSequenceFile.getSequenceDictionary())) + .map(SVCallRecordUtils::create) + .forEach(engine::add); + + final Comparator recordComparator = SVCallRecordUtils.getCallComparator(referenceSequenceFile.getSequenceDictionary()); + final List expectedVariants = engine.forceFlushAndGetOutput().stream() + .sorted(recordComparator) + .map(SVCallRecordUtils::getVariantBuilder) + .map(VariantContextBuilder::make) + .collect(Collectors.toList()); + final List testVariants = testVcf.getValue(); + + Assert.assertEquals(testVariants.size(), expectedVariants.size()); + for (int i = 0; i < testVariants.size(); i++) { + final VariantContext testVariant = testVariants.get(i); + final VariantContext expectedVariant = expectedVariants.get(i); + Assert.assertEquals(testVariant.getContig(), expectedVariant.getContig()); + Assert.assertEquals(testVariant.getStart(), expectedVariant.getStart()); + Assert.assertEquals(testVariant.getEnd(), expectedVariant.getEnd()); + Assert.assertEquals(testVariant.getAlleles(), expectedVariant.getAlleles()); + Assert.assertEquals(testVariant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, "test_default"), expectedVariant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, "expected_default")); + Assert.assertEquals(testVariant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, "test_default"), expectedVariant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, "expected_default")); + Assert.assertEquals(testVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "test_default"), expectedVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "expected_default")); + } + } + + @DataProvider(name = "testClusterMaxCliqueData") + public Object[][] testClusterMaxCliqueData() { + return new Object[][]{ + {true}, + {false} + }; + } + + @Test(dataProvider= "testClusterMaxCliqueData") + public void testClusterMaxClique(final boolean fastMode) { + final File output = createTempFile("max_clique_cluster", ".vcf"); + ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "1kgp_test.cnvs.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00096.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.wham.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.manta.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00140.wham.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.MAX_CLIQUE) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 500); + if (fastMode) { + args = args.addFlag(SVCluster.FAST_MODE_LONG_NAME); + } + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder(), Lists.newArrayList("HG00096", "HG00129", "HG00140", "NA18945", "NA18956")); + + Assert.assertEquals(records.size(), 1353); + + // Check for one record + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx000001ad")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr20"); + Assert.assertEquals(variant.getStart(), 28654436); + Assert.assertEquals(variant.getEnd(), 28719092); + Assert.assertFalse(variant.hasAttribute(GATKSVVCFConstants.SVLEN)); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 1); + Assert.assertTrue(algorithms.contains("manta")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 2); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_INV); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.INV); + final String strands = variant.getAttributeAsString(GATKSVVCFConstants.STRANDS_ATTRIBUTE, null); + Assert.assertEquals(strands, "--"); + for (final Genotype g : variant.getGenotypes()) { + if (g.getSampleName().equals("HG00140") || g.getSampleName().equals("HG00129")) { + Assert.assertTrue(g.isHom()); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2); + } else { + Assert.assertTrue(g.isHomRef()); + } + } + } + } + Assert.assertEquals(expectedRecordsFound, 1); + } + + @Test + public void testClusterSampleOverlap() { + final File output = createTempFile("cluster_sample_overlap", ".vcf"); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "1kgp_test.batch1.pesr.chr22.vcf.gz") + .addVCF(getToolTestDataDir() + "1kgp_test.batch1.depth.chr22.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 500); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder().size(), 156); + + Assert.assertEquals(records.size(), 1705); + + // Check for one record + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx0000041c")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chr22"); + Assert.assertEquals(variant.getStart(), 38898103); + Assert.assertEquals(variant.getEnd(), 38902679); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 2); + Assert.assertTrue(algorithms.contains(GATKSVVCFConstants.DEPTH_ALGORITHM)); + Assert.assertTrue(algorithms.contains("manta")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 2); + Assert.assertEquals(variant.getReference(), Allele.REF_A); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DEL); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DEL); + final int nonRefGenotypeCount = (int) variant.getGenotypes().stream().filter(g -> SVCallRecordUtils.isAltGenotype(g)).count(); + Assert.assertEquals(nonRefGenotypeCount, 71); + final int alleleCount = (int) variant.getGenotypes().stream().flatMap(g -> g.getAlleles().stream()).filter(SVCallRecordUtils::isAltAllele).count(); + Assert.assertEquals(alleleCount, 87); + final Genotype g = variant.getGenotype("HG00129"); + Assert.assertTrue(g.isHet()); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 2); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 1); + } + } + Assert.assertEquals(expectedRecordsFound, 1); + } + + + @Test + public void testAllosome() { + final File output = createTempFile("allosome", ".vcf"); + final ArgumentsBuilder args = new ArgumentsBuilder() + .addOutput(output) + .addVCF(getToolTestDataDir() + "HG00096.manta.allo.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00129.manta.allo.vcf.gz") + .addVCF(getToolTestDataDir() + "HG00150.manta.allo.vcf.gz") + .add(SVCluster.PLOIDY_TABLE_LONG_NAME, getToolTestDataDir() + "1kgp.batch1.ploidy.tsv") + .add(SVCluster.VARIANT_PREFIX_LONG_NAME, "SVx") + .add(SVCluster.ALGORITHM_LONG_NAME, SVCluster.CLUSTER_ALGORITHM.SINGLE_LINKAGE) + .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, REFERENCE_PATH) + .add(SVClusterEngineArgumentsCollection.DEPTH_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.DEPTH_INTERVAL_OVERLAP_FRACTION_NAME, 0.5) + .add(SVClusterEngineArgumentsCollection.DEPTH_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.MIXED_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.MIXED_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.MIXED_BREAKEND_WINDOW_NAME, 2000) + .add(SVClusterEngineArgumentsCollection.PESR_SAMPLE_OVERLAP_FRACTION_NAME, 0) + .add(SVClusterEngineArgumentsCollection.PESR_INTERVAL_OVERLAP_FRACTION_NAME, 0.1) + .add(SVClusterEngineArgumentsCollection.PESR_BREAKEND_WINDOW_NAME, 500); + + runCommandLine(args, SVCluster.class.getSimpleName()); + + final Pair> vcf = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); + final VCFHeader header = vcf.getKey(); + final List records = vcf.getValue(); + + Assert.assertEquals(header.getSampleNamesInOrder(), Lists.newArrayList("HG00096", "HG00129", "HG00150")); + + Assert.assertEquals(records.size(), 536); + + // Check for one record + int expectedRecordsFound = 0; + for (final VariantContext variant : records) { + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY)); + Assert.assertTrue(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE)); + if (variant.getID().equals("SVx00000203")) { + expectedRecordsFound++; + Assert.assertEquals(variant.getContig(), "chrY"); + Assert.assertEquals(variant.getStart(), 10676436); + Assert.assertEquals(variant.getEnd(), 10694243); + final List algorithms = variant.getAttributeAsStringList(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE, null); + Assert.assertEquals(algorithms.size(), 1); + Assert.assertTrue(algorithms.contains("manta")); + final List members = variant.getAttributeAsStringList(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY, null); + Assert.assertEquals(members.size(), 1); + final List alts = variant.getAlternateAlleles(); + Assert.assertEquals(alts.size(), 1); + Assert.assertEquals(alts.get(0), Allele.SV_SIMPLE_DUP); + Assert.assertEquals(variant.getStructuralVariantType(), StructuralVariantType.DUP); + for (final Genotype g : variant.getGenotypes()) { + if (g.getSampleName().equals("HG00096")) { + Assert.assertTrue(g.isHomVar()); + Assert.assertEquals(g.getAlleles().size(), 1); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 1); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 3); + } else if (g.getSampleName().equals("HG00129")) { + Assert.assertTrue(g.isHomRef()); + Assert.assertEquals(g.getAlleles().size(), 1); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 1); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 1); + } else if (g.getSampleName().equals("HG00150")) { + Assert.assertTrue(g.isNoCall()); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, -1), 0); + Assert.assertEquals(VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1), 0); + } + } + } + } + Assert.assertEquals(expectedRecordsFound, 1); + } + +} \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp.batch1.ploidy.tsv b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp.batch1.ploidy.tsv new file mode 100644 index 00000000000..60479abd3d3 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp.batch1.ploidy.tsv @@ -0,0 +1,157 @@ +SAMPLE chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY +HG00096 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00129 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00140 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00150 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00187 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00239 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00277 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00288 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00337 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00349 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00375 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00410 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00457 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00557 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00599 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00625 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00701 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG00740 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG00844 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01060 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01085 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01112 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01275 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01325 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01344 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01356 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01384 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01393 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01396 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01474 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01507 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01572 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01607 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01709 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01747 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01790 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01794 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01799 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01861 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01874 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01880 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01885 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG01958 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG01982 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02002 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02010 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02019 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02020 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02069 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02085 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02186 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02221 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02235 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02272 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02275 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02299 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02332 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02367 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02374 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02489 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02490 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02491 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02586 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02588 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02611 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02620 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02642 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02648 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG02658 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02855 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG02953 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03007 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03009 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03085 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03099 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03100 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03111 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03369 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03370 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03436 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03449 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03472 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03476 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03556 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03604 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03649 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03684 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03694 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03709 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03722 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03727 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03744 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03756 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03789 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG03850 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03864 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03872 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG03888 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG04118 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +HG04158 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG04161 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +HG04183 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA06984 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA10847 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA11894 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA12340 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA12489 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA12872 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18499 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18507 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18530 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18539 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18549 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18553 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18560 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18638 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18923 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18941 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18945 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA18956 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA18995 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19001 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19035 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19062 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19102 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19143 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19184 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19350 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19351 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19377 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19443 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19449 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19661 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19678 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19679 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19684 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19746 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA19795 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19818 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA19913 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20126 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20320 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20321 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20346 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20509 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20510 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20522 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20752 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20764 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20802 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20845 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA20869 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA20895 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 +NA21102 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA21122 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 +NA21133 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz new file mode 100644 index 00000000000..ebb50391145 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz.tbi new file mode 100644 index 00000000000..b929dadc1a4 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.depth.chr22.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz new file mode 100644 index 00000000000..c6e4ceda564 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz.tbi new file mode 100644 index 00000000000..a22a10018de Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.batch1.pesr.chr22.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz new file mode 100644 index 00000000000..ee0b5acb92c Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz.tbi new file mode 100644 index 00000000000..da396392f6a Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.cnvs.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz new file mode 100644 index 00000000000..4698640b14b Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz.tbi new file mode 100644 index 00000000000..460e645767e Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/1kgp_test.merged.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz new file mode 100644 index 00000000000..4f6306dcf4e Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz.tbi new file mode 100644 index 00000000000..906ec1a4e7a Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.allo.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz new file mode 100644 index 00000000000..bae9ecbdf2a Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz.tbi new file mode 100644 index 00000000000..bd998827c96 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.manta.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz new file mode 100644 index 00000000000..9f05ba81d2a Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz.tbi new file mode 100644 index 00000000000..d0fba88fc92 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00096.wham.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz new file mode 100644 index 00000000000..3d8918d0774 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz.tbi new file mode 100644 index 00000000000..751a02d2e8f Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.allo.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz new file mode 100644 index 00000000000..da5538f1c71 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz.tbi new file mode 100644 index 00000000000..9c95d94c4c3 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.manta.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz new file mode 100644 index 00000000000..5ec9c9edaa0 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz.tbi new file mode 100644 index 00000000000..313730dd830 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00129.wham.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz new file mode 100644 index 00000000000..9fb134287f2 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz.tbi new file mode 100644 index 00000000000..4c1c94f925b Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.manta.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz new file mode 100644 index 00000000000..72ea2503c55 Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz.tbi new file mode 100644 index 00000000000..cd3b6e96a5e Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00140.wham.vcf.gz.tbi differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz new file mode 100644 index 00000000000..0e920011c1d Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz.tbi new file mode 100644 index 00000000000..6314a4966de Binary files /dev/null and b/src/test/resources/org/broadinstitute/hellbender/tools/walkers/sv/SVCluster/HG00150.manta.allo.vcf.gz.tbi differ