Skip to content

Commit

Permalink
Merge pull request #976 from fulcrumgenomics/multiallelic-likelihoods
Browse files Browse the repository at this point in the history
Add support for multi-allelic variants
  • Loading branch information
jdidion authored Apr 15, 2024
2 parents 9a2d510 + 0232972 commit 75de344
Show file tree
Hide file tree
Showing 2 changed files with 186 additions and 60 deletions.
118 changes: 78 additions & 40 deletions src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import com.fulcrumgenomics.vcf.DownsampleVcf.{downsampleAndRegenotype, winnowVar

import scala.math.log10
import scala.util.Random
import scala.tools.nsc.doc.html.HtmlTags

object DownsampleVcf extends LazyLogging {
/** Removes variants that are within a specified distance from a previous variant.
Expand Down Expand Up @@ -97,69 +98,106 @@ object DownsampleVcf extends LazyLogging {
def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = {
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
val newAds = downsampleADs(oldAds, proportion, random)
val Seq(aa, ab, bb) = computePls(newAds)
val Seq(alleleA, alleleB) = gt.alleles.toSeq

val calls = {
if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele)
else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA)
else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB)
else IndexedSeq(alleleA, alleleB)
}
gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
}

/**
* Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e.
* two allele depths).
* @param ads The input depths for the two alleles A and B.
* @return a list of three likelihoods for the alleles AA, AB, and BB.
*/
def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = {
require(ads.length == 2, "there must be exactly two allele depths")
val likelihoods = Likelihoods(ads(0), ads(1))
IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt)
val likelihoods = Likelihoods(newAds)
val pls = likelihoods.pls
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
}

object Likelihoods {
/** Computes the likelihoods for each possible genotype.
*
/**Converts a sequence of log-likelihoods to phred-scale by 1) multiplying each by -10, 2)
* subtracting from each the min value so the smallest value is 0, and 3) rounding to the
* nearest integer.
*/
def logToPhredLikelihoods(logLikelihoods: IndexedSeq[Double]): IndexedSeq[Int] = {
val rawPL = logLikelihoods.map(gl => gl * -10)
val minPL = rawPL.min
rawPL.map(pl => (pl - minPL).round.toInt)
}

/** Computes the likelihoods for each possible biallelic genotype.
* @param alleleDepthA the reference allele depth
* @param alleleDepthB the alternate allele depth
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the likelihoods of AA, AB, and BB
*/
def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = {
def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): IndexedSeq[Double] = {
val aGivenAA = log10(1 - epsilon)
val aGivenBB = log10(epsilon)
val aGivenAB = log10((1 - epsilon) / 2)

val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB))
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA))
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB)

val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB)
IndexedSeq(rawGlAA, rawGlAB, rawGlBB)
}

Likelihoods(
aa = rawGlAA - minGL,
ab = rawGlAB - minGL,
bb = rawGlBB - minGL
/** Computes the likelihoods for each possible genotype given a sequence of read depths for any
* number of alleles.
* @param alleleDepths the sequence of allele depths in the order specified in the VCF
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the log likelihoods of all possible genotypes in the
* order specified in VFC spec for the GL/PL tags.
*/
def generalized(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): IndexedSeq[Double] = {
val numAlleles = alleleDepths.length
// probabilities associated with each possible genotype for a pair of alleles
val logProbs: Array[Double] = Array(
math.log10(epsilon),
math.log10((1 - epsilon) / 2),
math.log10(1 - epsilon)
)
// compute genotype log-likelihoods
(0 until numAlleles).flatMap(b =>
(0 to b).map(a =>
(0 until numAlleles).map(allele =>
logProbs(Array(a, b).count(_ == allele)) * alleleDepths(allele)
).sum
)
)
}

def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
val numAlleles = alleleDepths.length
require(numAlleles >= 2, "at least two alleles are required to calculate genotype likelihoods")
Likelihoods(numAlleles, generalized(alleleDepths, epsilon))
}
}

/** Stores the log10(likelihoods) for all possible bi-allelic genotypes.
*
* @param aa likelihood of AA
* @param ab likelihood of AB
* @param bb likelihood of BB
/** Stores the log10(likelihoods) for all possible genotypes.
* @param numAlleles the number of alleles the variant has
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
*/
case class Likelihoods(aa: Double, ab: Double, bb: Double) {
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) {
/**
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
* @return a list of phred-scaled likelihooodS for AA, AB, BB.
*/
def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt)
def pls: IndexedSeq[Int] = {
Likelihoods.logToPhredLikelihoods(genotypeLikelihoods)
}

def mostLikelyGenotype: Option[(Int, Int)] = {
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
minIndexes.length match {
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")
case 1 => {
val genotypes =
for (b <- 0 until numAlleles; a <- 0 to b)
yield (a, b)
Some(genotypes(minIndexes.head._2))
}
case _ => None // if multiple genotypes are most likely, don't make a call
}
}

def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = {
mostLikelyGenotype match {
case None => IndexedSeq(NoCallAllele, NoCallAllele)
case Some((a, b)) => IndexedSeq(alleles(a), alleles(b))
}
}
}
}

Expand Down
128 changes: 108 additions & 20 deletions src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric
import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele
import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant}
import com.fulcrumgenomics.testing.UnitSpec
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype}
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype}

import scala.util.Random

Expand Down Expand Up @@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec {
"DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in {
val ads = IndexedSeq[Int](0, 100)
val expected = IndexedSeq(1996, 301, 0)
val newlikelihoods = computePls(ads)
val newlikelihoods = Likelihoods(ads).pls
newlikelihoods should contain theSameElementsInOrderAs expected
}

Expand All @@ -196,51 +196,89 @@ class DownsampleVcfTest extends UnitSpec {
*/

"DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in {
val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0)
val likelihood = Likelihoods(IndexedSeq(0, 0))
val expected = IndexedSeq[Int](0, 0, 0)
likelihood.pls.length shouldBe expected.length
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return correct results for basic cases" in {
val e = 0.01
val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq(
(IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)),
(IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)),
(IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)),
(IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))),
(IndexedSeq(0, 0, 1), IndexedSeq(e, e, e, 0.5, 0.5, 1 - e)),
)
cases.foreach { case (input, output) =>
val likelihood = Likelihoods(input, e)
val logOutput = output.map(p => math.log10(p))
likelihood.pls.length shouldBe logOutput.length
likelihood.pls should contain theSameElementsInOrderAs DownsampleVcf.Likelihoods.logToPhredLikelihoods(logOutput)
}
}

it should "return the same results for biallelic and generalized algorithm" in {
val e = 0.01
val cases: IndexedSeq[(IndexedSeq[Int], IndexedSeq[Double])] = IndexedSeq(
(IndexedSeq(1, 0), IndexedSeq(1 - e, 0.5, e)),
(IndexedSeq(0, 1), IndexedSeq(e, 0.5, 1 - e)),
(IndexedSeq(1, 1), IndexedSeq((1 - e) * e, 0.25, (1 - e) * e)),
(IndexedSeq(2, 0), IndexedSeq(math.pow((1 - e), 2), 0.25, math.pow(e, 2))),
)
cases.foreach { case (input, output) =>
val biallelic = Likelihoods(2, DownsampleVcf.Likelihoods.biallelic(input(0), input(1), e))
val generalized = Likelihoods(2, DownsampleVcf.Likelihoods.generalized(input, e))
biallelic.pls should contain theSameElementsInOrderAs generalized.pls
}
}

it should "return a likelihood of 0 for AA if there are only ref alleles observed" in {
val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0)
val likelihood = Likelihoods(IndexedSeq(10, 0))
val expected = IndexedSeq[Int](0, 30, 200)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for BB if there are only alt alleles observed" in {
val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10)
val likelihood = Likelihoods(IndexedSeq(0, 10))
val expected = IndexedSeq[Int](200, 30, 0)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in {
val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5)
val likelihood = Likelihoods(IndexedSeq(5, 5))
val expected = IndexedSeq[Int](70, 0, 70)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for AA if the AD A >> AD B" in {
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2)
likelihood.pls(0) == 0
val likelihood = Likelihoods(IndexedSeq(15, 2))
assert(likelihood.pls(0) == 0)
}

it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in {
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17)
likelihood.pls(1) == 0
val likelihood = Likelihoods(IndexedSeq(15, 17))
assert(likelihood.pls(1) == 0)
}

it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in {
val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30)
likelihood.pls(2) == 0
val likelihood = Likelihoods(IndexedSeq(3, 30))
assert(likelihood.pls(2) == 0)
}

it should "return correct values when there are very few reads" in {
Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
}

it should "return correct values for multi-allelic variants" in {
Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0)
Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200)
Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339)
}


Expand All @@ -251,10 +289,10 @@ class DownsampleVcfTest extends UnitSpec {
Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))),
sample=sample,
calls=IndexedSeq[Allele](Allele(ref), Allele(alt)),
attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1))))
attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads))
)
}


"DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01)
Expand Down Expand Up @@ -298,6 +336,30 @@ class DownsampleVcfTest extends UnitSpec {
newGeno.calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes
*/
private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = {
val likelihoods = Likelihoods(ads)
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
val calls = likelihoods.mostLikelyCall(alleles.toSeq)
Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls))
}

it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("T"), Allele("G"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on Variant
*/
Expand All @@ -306,7 +368,7 @@ class DownsampleVcfTest extends UnitSpec {
Variant(chrom="1",
pos=10,
alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)),
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample))
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample))
)
}

Expand Down Expand Up @@ -345,6 +407,32 @@ class DownsampleVcfTest extends UnitSpec {
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes
*/
private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = {
val likelihoods = Likelihoods(ads)
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
Variant(chrom="1",
pos=10,
alleles=alleles,
genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample)))
}

it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in {
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in {
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("T"), Allele("G"))
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

private val sample = "test1"
private val builder = VcfBuilder(samples=Seq(sample))
builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(),
Expand Down

0 comments on commit 75de344

Please sign in to comment.