Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SVCluster tool #7541

Merged
merged 8 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;

Expand Down Expand Up @@ -232,8 +233,20 @@ public static int getExpectedCopyNumber(final Genotype genotype) {
return VariantContextGetters.getAttributeAsInt(genotype, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
}

public Set<String> getCarrierSamples() {
return genotypes.stream().filter(this::isCarrier).map(Genotype::getSampleName).collect(Collectors.toSet());
public Set<String> getCarrierSampleSet() {
return getCarrierSampleStream().collect(Collectors.toSet());
}

public List<Genotype> getCarrierGenotypeList() {
return getCarrierGenotypeStream().collect(Collectors.toList());
}

private Stream<String> getCarrierSampleStream() {
return getCarrierGenotypeStream().map(Genotype::getSampleName);
}

private Stream<Genotype> getCarrierGenotypeStream() {
return genotypes.stream().filter(this::isCarrier);
}

public boolean isDepthOnly() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
import htsjdk.variant.vcf.VCFConstants;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.sv.cluster.CanonicalSVCollapser;
import org.broadinstitute.hellbender.tools.sv.cluster.PloidyTable;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;

import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -64,71 +65,70 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, record.getContigB());
builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2);
}
if (!svtype.equals(StructuralVariantType.BND)) {
if (svtype.equals(StructuralVariantType.INS)) {
builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength());
}
if (svtype.equals(StructuralVariantType.BND) || svtype.equals(StructuralVariantType.INV)) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
final GenotypesContext genotypes = GenotypesContext.create(record.getGenotypes().size());
for (final Genotype g : record.getGenotypes()) {
genotypes.add(sanitizeEmptyGenotype(g));
}
// htsjdk vcf encoder does not allow genotypes to have empty alleles
builder.genotypes(record.getGenotypes().stream().map(SVCallRecordUtils::sanitizeEmptyGenotype).collect(Collectors.toList()));
return builder;
}

// Generate alleles for DEL genotypes, which can be inferred from expected and actual copy numbers
final List<Genotype> newGenotypes = new ArrayList<>(record.getGenotypes().size());
if (svtype.equals(StructuralVariantType.DEL)) {
for (final Genotype g : record.getGenotypes()) {
Utils.validate(altAlleles.size() == 1, "Encountered deletion with multiple ALT alleles");
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT),
"Deletion genotype missing " + GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT + " field");
Utils.validate(g.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT),
"Deletion genotype missing " + GATKSVVCFConstants.COPY_NUMBER_FORMAT + " field");
final int expectedCopyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, 0);
final int copyNumber = VariantContextGetters.getAttributeAsInt(g, GATKSVVCFConstants.COPY_NUMBER_FORMAT, 0);
final int numAltAlleles = expectedCopyNumber - copyNumber;
Utils.validate(numAltAlleles >= 0, "Invalid copy number " + copyNumber +
" for deletion genotype with expected copy number " + expectedCopyNumber);
final List<Allele> genotypeAlleles = new ArrayList<>(expectedCopyNumber);
for (int i = 0; i < copyNumber; i++) {
genotypeAlleles.add(refAllele);
}
for (int i = copyNumber; i < numAlleles; i++) {
genotypeAlleles.add(altAlleles.get(0));
}
newGenotypes.add(new GenotypeBuilder(g).alleles(genotypeAlleles).make());
}
builder.genotypes(newGenotypes);
/**
* Adds NO_CALL allele if empty
*/
private static Genotype sanitizeEmptyGenotype(final Genotype g) {
if (g.getAlleles().isEmpty()) {
return new GenotypeBuilder(g).alleles(Collections.singletonList(Allele.NO_CALL)).make();
} else {
builder.genotypes(record.getGenotypes());
return g;
}

return builder;
}

/**
* Creates a new {@link GenotypesContext} object augmented with the given sample set. Samples with existing
* genotypes are not touched. Samples without genotypes are assigned the provided sets of alleles and attributes.
* @param genotypes base genotypes
* Populates genotypes for samples not present in the given record. Samples with existing genotypes are not
* touched. Samples without genotypes are assigned one according to the provided default reference allele
* and ploidy, specified by the {@link GATKSVVCFConstants#EXPECTED_COPY_NUMBER_FORMAT} value. If the record
* represents a CNV, the {@link GATKSVVCFConstants#COPY_NUMBER_FORMAT} is also set.
* @param record record containing the genotypes
* @param samples samples which the resulting genotypes must contain (existing samples are ignored)
* @param alleles alleles to apply to all new genotypes, must be length equal to the ploidy for the sample
* @param attributes attributes to apply to all new genotypes
* @param refAlleleDefault default allele to use for samples without genotypes
* @param ploidyTable ploidy table, which must contain at least all samples with missing genotypes
* @return genotypes augmented with missing samples
*/
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final GenotypesContext genotypes,
public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(final SVCallRecord record,
final Set<String> samples,
final List<Allele> alleles,
final Map<String, Object> attributes) {
Utils.nonNull(genotypes);
final boolean refAlleleDefault,
final PloidyTable ploidyTable) {
Utils.nonNull(record);
Utils.nonNull(samples);
final GenotypesContext genotypes = record.getGenotypes();
final Set<String> missingSamples = Sets.difference(samples, genotypes.getSampleNames());
if (missingSamples.isEmpty()) {
return genotypes;
}
final ArrayList<Genotype> newGenotypes = new ArrayList<>(genotypes.size() + missingSamples.size());
newGenotypes.addAll(genotypes);
final String contig = record.getContigA();
final List<Allele> altAlleles = record.getAltAlleles();
final Allele refAllele = record.getRefAllele();
final boolean isCNV = record.isSimpleCNV();
for (final String sample : missingSamples) {
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(sample);
if (attributes != null) {
genotypeBuilder.attributes(attributes);
final int ploidy = ploidyTable.get(sample, contig);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd add some documentation in the method javadoc about how this logic works (and add @param docs for refAlleleDefault and ploidyTable).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

genotypeBuilder.attribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT, ploidy);
if (isCNV) {
genotypeBuilder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, ploidy);
genotypeBuilder.alleles(CanonicalSVCollapser.getCNVGenotypeAllelesFromCopyNumber(altAlleles, refAllele, ploidy, ploidy));
} else {
genotypeBuilder.alleles(Collections.nCopies(ploidy, refAlleleDefault ? refAllele : Allele.NO_CALL));
}
genotypeBuilder.alleles(alleles);
newGenotypes.add(genotypeBuilder.make());
}
return GenotypesContext.create(newGenotypes);
Expand Down Expand Up @@ -393,10 +393,18 @@ public static boolean containsAltAllele(final Genotype g) {
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele);
}

public static boolean isAltGenotype(final Genotype g) {
return g.getAlleles().stream().anyMatch(SVCallRecordUtils::isAltAllele);
}

public static boolean isAltAllele(final Allele allele) {
return allele != null && !allele.isNoCall() && !allele.isReference();
}

public static boolean isNonRefAllele(final Allele allele) {
return allele != null && !allele.isReference();
}

// TODO this is sort of hacky but the Allele compareTo() method doesn't give stable ordering
public static List<Allele> sortAlleles(final Collection<Allele> alleles) {
return alleles.stream().sorted(Comparator.nullsFirst(Comparator.comparing(Allele::getDisplayString))).collect(Collectors.toList());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {

// In the single-sample case, match copy number strictly if we're looking at the same sample
// TODO repeated check for CN attributes in hasSampleOverlap and getCarrierSamples
final Set<String> carriersA = a.getCarrierSamples();
final Set<String> carriersB = b.getCarrierSamples();
final Set<String> carriersA = a.getCarrierSampleSet();
final Set<String> carriersB = b.getCarrierSampleSet();
if (carriersA.size() == 1 && carriersA.equals(carriersB)) {
final Genotype genotypeA = a.getGenotypes().get(carriersA.iterator().next());
final Genotype genotypeB = b.getGenotypes().get(carriersB.iterator().next());
Expand Down Expand Up @@ -90,6 +90,9 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
@Override
public int getMaxClusterableStartingPosition(final SVCallRecord call) {
final int contigLength = dictionary.getSequence(call.getContigA()).getSequenceLength();
if (!call.isSimpleCNV()) {
return 0;
}
final int maxTheoreticalStart = (int) Math.floor((call.getPositionB() + paddingFraction * (call.getLength() + contigLength)) / (1.0 + paddingFraction));
return Math.min(maxTheoreticalStart, dictionary.getSequence(call.getContigA()).getSequenceLength());
}
Expand Down
Loading